diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 0000000000..fd0115020f --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,8 @@ +# make format with black 24.10.0 on 2024-10-19 +49d99f17799aa4a4235126bbdfb45f85e6499b27 +# make sort_imports +a307a57651296e576d55f9159869dc9130da2b7c +# make remove_unused_import +1c6217e11a2815476ede1cc3ee584d5da3d5242a +# make pyupgrade (3.9+) +00f54f8417b3a0bc5d1ea46650aa2e6ce4792159 diff --git a/Makefile b/Makefile index 6304babd3e..185cb86f0a 100644 --- a/Makefile +++ b/Makefile @@ -341,7 +341,7 @@ diff_mypy: diff-cover --fail-under=100 --compare-branch origin/master cobertura.xml pyupgrade: $(PYSOURCES) - pyupgrade --exit-zero-even-if-changed --py37-plus $^ + pyupgrade --exit-zero-even-if-changed --py39-plus $^ flake8: $(PYSOURCES) flake8 --ignore=E501,W293,W291,E265,E302,E722,E126,E303,E261,E201,E202,W503,W504,W391,E128,E301,E127,E502,E129,E262,E111,E117,E306,E203,E231,E226,E741,E122,E251,E305,E701,E222,E225,E241,E305,E123,E121,E703,E704,E125,E402 $^ diff --git a/contrib/mypy-stubs/configargparse/__init__.pyi b/contrib/mypy-stubs/configargparse/__init__.pyi index 0c4451c512..4cdda2609c 100644 --- a/contrib/mypy-stubs/configargparse/__init__.pyi +++ b/contrib/mypy-stubs/configargparse/__init__.pyi @@ -1,5 +1,5 @@ +from .configargparse import SUPPRESS as SUPPRESS from .configargparse import ArgParser as ArgParser from .configargparse import ArgumentParser as ArgumentParser from .configargparse import Namespace as Namespace -from .configargparse import SUPPRESS as SUPPRESS from .configargparse import YAMLConfigFileParser as YAMLConfigFileParser diff --git a/contrib/mypy-stubs/configargparse/configargparse.pyi b/contrib/mypy-stubs/configargparse/configargparse.pyi index eda8ef5368..8df083e9d2 100644 --- a/contrib/mypy-stubs/configargparse/configargparse.pyi +++ b/contrib/mypy-stubs/configargparse/configargparse.pyi @@ -23,7 +23,6 @@ class ArgumentParser(argparse.ArgumentParser): def _default_config_files(self) -> List[Any]: ... @property def _ignore_unknown_config_file_keys(self) -> Any: ... - def __init__(self, *args: Any, **kwargs: Any) -> None: ... # There may be a better way of type hinting this without a type: ignore, but mypy gets unhappy pretty much no matter what as the signatures for parse_args doesn't match with its superclass in argparse def parse_args(self, args: Sequence[str] | None = None, namespace: Namespace | None = None, config_file_contents: str | None = None, env_vars: Any = None) -> Namespace: ... # type: ignore[override] diff --git a/contrib/mypy-stubs/dill/_dill.pyi b/contrib/mypy-stubs/dill/_dill.pyi index 5c96a30d60..ed912a3b12 100644 --- a/contrib/mypy-stubs/dill/_dill.pyi +++ b/contrib/mypy-stubs/dill/_dill.pyi @@ -1,8 +1,12 @@ import sys -from typing import IO, Any, Callable, Iterable, Protocol, Union +from typing import Any, Callable, Iterable, Protocol, final from _typeshed import ReadableBuffer -from typing_extensions import TypeAlias, final + +if sys.version_info < (3, 10): + from typing_extensions import TypeAlias +else: + from typing import TypeAlias class _ReadableFileobj(Protocol): def read(self, __n: int) -> bytes: ... @@ -17,6 +21,7 @@ if sys.version_info >= (3, 8): def __init__(self, buffer: ReadableBuffer) -> None: ... def raw(self) -> memoryview: ... def release(self) -> None: ... + _BufferCallback: TypeAlias = Callable[[PickleBuffer], Any] | None def dump( diff --git a/requirements-dev.txt b/requirements-dev.txt index e9a1e463bc..9ee989911e 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -9,7 +9,7 @@ astroid>=3,<4 sphinx-autodoc-typehints>=1.24.0,<3 sphinxcontrib-autoprogram==0.1.9 cwltest>=2.2.20211116163652 -mypy==1.12.1 +mypy==1.13.0 types-aws-xray-sdk types-boto<2.49.18.20241020 types-Flask-Cors diff --git a/setup.py b/setup.py index 27e830de63..400af2b15f 100755 --- a/setup.py +++ b/setup.py @@ -21,6 +21,7 @@ SETUP_DIR = os.path.dirname(__file__) README = os.path.join(SETUP_DIR, "README.rst") + def get_requirements(extra=None): """ Load the requirements for the given extra. @@ -32,7 +33,9 @@ def get_requirements(extra=None): with open(filename) as fp: # Parse out as one per line, dropping comments - return [l.split('#')[0].strip() for l in fp.readlines() if l.split('#')[0].strip()] + return [ + l.split("#")[0].strip() for l in fp.readlines() if l.split("#")[0].strip() + ] def run_setup(): @@ -58,90 +61,101 @@ def run_setup(): "google", "kubernetes", "wdl", - "server" + "server", ] for extra in non_htcondor_extras: extras_require[extra] = get_requirements(extra) all_reqs += extras_require[extra] # We exclude htcondor from "all" because it can't be on Mac extras_require['htcondor:sys_platform!="darwin"'] = get_requirements("htcondor") - extras_require['mesos'] = get_requirements("mesos") + extras_require["mesos"] = get_requirements("mesos") all_reqs += get_requirements("mesos") extras_require["all"] = all_reqs setup( - name='toil', + name="toil", version=version.distVersion, long_description=open(README).read(), long_description_content_type="text/x-rst", - description='Pipeline management software for clusters.', - author='Benedict Paten and the Toil community', - author_email='toil-community@googlegroups.com', + description="Pipeline management software for clusters.", + author="Benedict Paten and the Toil community", + author_email="toil-community@googlegroups.com", url="https://github.com/DataBiosphere/toil", classifiers=[ - 'Development Status :: 5 - Production/Stable', - 'Environment :: Console', - 'Intended Audience :: Developers', - 'Intended Audience :: Science/Research', - 'Intended Audience :: Healthcare Industry', - 'License :: OSI Approved :: Apache Software License', - 'Natural Language :: English', - 'Operating System :: MacOS :: MacOS X', - 'Operating System :: POSIX', - 'Operating System :: POSIX :: Linux', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10', - 'Programming Language :: Python :: 3.11', - 'Programming Language :: Python :: 3.12', - 'Topic :: Scientific/Engineering', - 'Topic :: Scientific/Engineering :: Bio-Informatics', - 'Topic :: Scientific/Engineering :: Astronomy', - 'Topic :: Scientific/Engineering :: Atmospheric Science', - 'Topic :: Scientific/Engineering :: Information Analysis', - 'Topic :: Scientific/Engineering :: Medical Science Apps.', - 'Topic :: System :: Distributed Computing', - 'Topic :: Utilities'], + "Development Status :: 5 - Production/Stable", + "Environment :: Console", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "Intended Audience :: Healthcare Industry", + "License :: OSI Approved :: Apache Software License", + "Natural Language :: English", + "Operating System :: MacOS :: MacOS X", + "Operating System :: POSIX", + "Operating System :: POSIX :: Linux", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Bio-Informatics", + "Topic :: Scientific/Engineering :: Astronomy", + "Topic :: Scientific/Engineering :: Atmospheric Science", + "Topic :: Scientific/Engineering :: Information Analysis", + "Topic :: Scientific/Engineering :: Medical Science Apps.", + "Topic :: System :: Distributed Computing", + "Topic :: Utilities", + ], license="Apache License v2.0", python_requires=">=3.9", install_requires=install_requires, extras_require=extras_require, - package_dir={'': 'src'}, - packages=find_packages(where='src'), + package_dir={"": "src"}, + packages=find_packages(where="src"), package_data={ - '': ['*.yml', '*.yaml', 'cloud-config', '*.cwl'], + "": ["*.yml", "*.yaml", "cloud-config", "*.cwl"], }, # Unfortunately, the names of the entry points are hard-coded elsewhere in the code base so # you can't just change them here. Luckily, most of them are pretty unique strings, and thus # easy to search for. entry_points={ - 'console_scripts': [ - 'toil = toil.utils.toilMain:main', - '_toil_worker = toil.worker:main', - 'cwltoil = toil.cwl.cwltoil:cwltoil_was_removed [cwl]', - 'toil-cwl-runner = toil.cwl.cwltoil:main [cwl]', - 'toil-wdl-runner = toil.wdl.wdltoil:main [wdl]', - 'toil-wes-cwl-runner = toil.server.cli.wes_cwl_runner:main [server]', - '_toil_mesos_executor = toil.batchSystems.mesos.executor:main [mesos]', - '_toil_contained_executor = toil.batchSystems.contained_executor:executor']}) + "console_scripts": [ + "toil = toil.utils.toilMain:main", + "_toil_worker = toil.worker:main", + "cwltoil = toil.cwl.cwltoil:cwltoil_was_removed [cwl]", + "toil-cwl-runner = toil.cwl.cwltoil:main [cwl]", + "toil-wdl-runner = toil.wdl.wdltoil:main [wdl]", + "toil-wes-cwl-runner = toil.server.cli.wes_cwl_runner:main [server]", + "_toil_mesos_executor = toil.batchSystems.mesos.executor:main [mesos]", + "_toil_contained_executor = toil.batchSystems.contained_executor:executor", + ] + }, + ) def import_version(): """Return the module object for src/toil/version.py, generate from the template if required.""" - if not os.path.exists('src/toil/version.py'): + if not os.path.exists("src/toil/version.py"): for req in get_requirements("cwl"): # Determine cwltool version from requirements file if req.startswith("cwltool=="): - cwltool_version = req[len("cwltool=="):] + cwltool_version = req[len("cwltool==") :] break # Use the template to generate src/toil/version.py import version_template - with NamedTemporaryFile(mode='w', dir='src/toil', prefix='version.py.', delete=False) as f: - f.write(version_template.expand_(others={ - # expose the dependency versions that we may need to access in Toil - 'cwltool_version': cwltool_version, - })) - os.rename(f.name, 'src/toil/version.py') - - loader = SourceFileLoader('toil.version', 'src/toil/version.py') + + with NamedTemporaryFile( + mode="w", dir="src/toil", prefix="version.py.", delete=False + ) as f: + f.write( + version_template.expand_( + others={ + # expose the dependency versions that we may need to access in Toil + "cwltool_version": cwltool_version, + } + ) + ) + os.rename(f.name, "src/toil/version.py") + + loader = SourceFileLoader("toil.version", "src/toil/version.py") mod = types.ModuleType(loader.name) loader.exec_module(mod) return mod diff --git a/src/toil/__init__.py b/src/toil/__init__.py index 5a737c873f..ab45c0d7c4 100644 --- a/src/toil/__init__.py +++ b/src/toil/__init__.py @@ -11,21 +11,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import errno import logging import os import re import socket import sys -import time from datetime import datetime -from typing import TYPE_CHECKING, Optional, Tuple +from typing import TYPE_CHECKING, Optional import requests from docker.errors import ImageNotFound from toil.lib.memoize import memoize -from toil.lib.retry import retry +from toil.lib.retry import retry as retry from toil.version import currentCommit if TYPE_CHECKING: @@ -43,15 +41,15 @@ def which(cmd, mode=os.F_OK | os.X_OK, path=None) -> Optional[str]: `mode` defaults to os.F_OK | os.X_OK. `path` defaults to the result of os.environ.get("PATH"), or can be overridden with a custom search path. - + :returns: The path found, or None. """ + # Check that a given file can be accessed with the correct mode. # Additionally check that `file` is not a directory, as on Windows # directories pass the os.access check. def _access_check(fn, mode): - return (os.path.exists(fn) and os.access(fn, mode) - and not os.path.isdir(fn)) + return os.path.exists(fn) and os.access(fn, mode) and not os.path.isdir(fn) # If we're given a path with a directory part, look it up directly rather # than referring to PATH directories. This includes checking relative to the @@ -106,17 +104,19 @@ def toilPackageDirPath() -> str: The return value is guaranteed to end in '/toil'. """ result = os.path.dirname(os.path.realpath(__file__)) - if not result.endswith('/toil'): + if not result.endswith("/toil"): raise RuntimeError("The top-level toil package is not named Toil.") return result def inVirtualEnv() -> bool: """Test if we are inside a virtualenv or Conda virtual environment.""" - return ('VIRTUAL_ENV' in os.environ or - 'CONDA_DEFAULT_ENV' in os.environ or - hasattr(sys, 'real_prefix') or - (hasattr(sys, 'base_prefix') and sys.base_prefix != sys.prefix)) + return ( + "VIRTUAL_ENV" in os.environ + or "CONDA_DEFAULT_ENV" in os.environ + or hasattr(sys, "real_prefix") + or (hasattr(sys, "base_prefix") and sys.base_prefix != sys.prefix) + ) def resolveEntryPoint(entryPoint: str) -> str: @@ -125,7 +125,7 @@ def resolveEntryPoint(entryPoint: str) -> str: :returns: The path found, which may be an absolute or a relative path. """ - if os.environ.get("TOIL_CHECK_ENV", None) == 'True' and inVirtualEnv(): + if os.environ.get("TOIL_CHECK_ENV", None) == "True" and inVirtualEnv(): path = os.path.join(os.path.dirname(sys.executable), entryPoint) # Inside a virtualenv we try to use absolute paths to the entrypoints. if os.path.isfile(path): @@ -134,7 +134,9 @@ def resolveEntryPoint(entryPoint: str) -> str: # if Toil is installed in a virtualenv on the leader, it must be installed in # a virtualenv located at the same path on each worker as well. if not os.access(path, os.X_OK): - raise RuntimeError("Cannot access the Toil virtualenv. If installed in a virtualenv on a cluster, make sure that the virtualenv path is the same for the leader and workers.") + raise RuntimeError( + "Cannot access the Toil virtualenv. If installed in a virtualenv on a cluster, make sure that the virtualenv path is the same for the leader and workers." + ) return path # Otherwise, we aren't in a virtualenv, or we're in a virtualenv but Toil # came in via --system-site-packages, or we think the virtualenv might not @@ -154,10 +156,15 @@ def physicalMemory() -> int: True """ try: - return os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES') + return os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_PHYS_PAGES") except ValueError: import subprocess - return int(subprocess.check_output(['sysctl', '-n', 'hw.memsize']).decode('utf-8').strip()) + + return int( + subprocess.check_output(["sysctl", "-n", "hw.memsize"]) + .decode("utf-8") + .strip() + ) def physicalDisk(directory: str) -> int: @@ -181,15 +188,22 @@ def applianceSelf(forceDockerAppliance: bool = False) -> str: Setting TOIL_APPLIANCE_SELF will not be necessary in most cases. """ import toil.version - registry = lookupEnvVar(name='docker registry', - envName='TOIL_DOCKER_REGISTRY', - defaultValue=toil.version.dockerRegistry) - name = lookupEnvVar(name='docker name', - envName='TOIL_DOCKER_NAME', - defaultValue=toil.version.dockerName) - appliance = lookupEnvVar(name='docker appliance', - envName='TOIL_APPLIANCE_SELF', - defaultValue=registry + '/' + name + ':' + toil.version.dockerTag) + + registry = lookupEnvVar( + name="docker registry", + envName="TOIL_DOCKER_REGISTRY", + defaultValue=toil.version.dockerRegistry, + ) + name = lookupEnvVar( + name="docker name", + envName="TOIL_DOCKER_NAME", + defaultValue=toil.version.dockerName, + ) + appliance = lookupEnvVar( + name="docker appliance", + envName="TOIL_APPLIANCE_SELF", + defaultValue=registry + "/" + name + ":" + toil.version.dockerTag, + ) checkDockerSchema(appliance) @@ -211,9 +225,11 @@ def customDockerInitCmd() -> str: :returns: The custom command, or an empty string is returned if the environment variable is not set. """ - command = lookupEnvVar(name='user-defined custom docker init command', - envName='TOIL_CUSTOM_DOCKER_INIT_COMMAND', - defaultValue='') + command = lookupEnvVar( + name="user-defined custom docker init command", + envName="TOIL_CUSTOM_DOCKER_INIT_COMMAND", + defaultValue="", + ) _check_custom_bash_cmd(command) return command.replace("'", "'\\''") # Ensure any single quotes are escaped. @@ -224,24 +240,28 @@ def customInitCmd() -> str: The custom init command is run prior to running Toil appliance itself in workers and/or the primary node (i.e. this is run one stage before ``TOIL_CUSTOM_DOCKER_INIT_COMMAND``). - + This can be useful for doing any custom initialization on instances (e.g. authenticating to private docker registries). Any single quotes are escaped and the command cannot contain a set of blacklisted chars (newline or tab). returns: the custom command or n empty string is returned if the environment variable is not set. """ - command = lookupEnvVar(name='user-defined custom init command', - envName='TOIL_CUSTOM_INIT_COMMAND', - defaultValue='') + command = lookupEnvVar( + name="user-defined custom init command", + envName="TOIL_CUSTOM_INIT_COMMAND", + defaultValue="", + ) _check_custom_bash_cmd(command) return command.replace("'", "'\\''") # Ensure any single quotes are escaped. def _check_custom_bash_cmd(cmd_str): """Ensure that the Bash command doesn't contain invalid characters.""" - if re.search(r'[\n\r\t]', cmd_str): - raise RuntimeError(f'"{cmd_str}" contains invalid characters (newline and/or tab).') + if re.search(r"[\n\r\t]", cmd_str): + raise RuntimeError( + f'"{cmd_str}" contains invalid characters (newline and/or tab).' + ) def lookupEnvVar(name: str, envName: str, defaultValue: str) -> str: @@ -256,10 +276,14 @@ def lookupEnvVar(name: str, envName: str, defaultValue: str) -> str: try: value = os.environ[envName] except KeyError: - log.info('Using default %s of %s as %s is not set.', name, defaultValue, envName) + log.info( + "Using default %s of %s as %s is not set.", name, defaultValue, envName + ) return defaultValue else: - log.info('Overriding %s of %s with %s from %s.', name, defaultValue, value, envName) + log.info( + "Overriding %s of %s with %s from %s.", name, defaultValue, value, envName + ) return value @@ -278,14 +302,20 @@ def checkDockerImageExists(appliance: str) -> str: return appliance registryName, imageName, tag = parseDockerAppliance(appliance) - if registryName == 'docker.io': - return requestCheckDockerIo(origAppliance=appliance, imageName=imageName, tag=tag) + if registryName == "docker.io": + return requestCheckDockerIo( + origAppliance=appliance, imageName=imageName, tag=tag + ) else: - return requestCheckRegularDocker(origAppliance=appliance, registryName=registryName, imageName=imageName, - tag=tag) + return requestCheckRegularDocker( + origAppliance=appliance, + registryName=registryName, + imageName=imageName, + tag=tag, + ) -def parseDockerAppliance(appliance: str) -> Tuple[str, str, str]: +def parseDockerAppliance(appliance: str) -> tuple[str, str, str]: """ Derive parsed registry, image reference, and tag from a docker image string. @@ -303,21 +333,21 @@ def parseDockerAppliance(appliance: str) -> Tuple[str, str, str]: appliance = appliance.lower() # get the tag - if ':' in appliance: - tag = appliance.split(':')[-1] - appliance = appliance[:-(len(':' + tag))] # remove only the tag + if ":" in appliance: + tag = appliance.split(":")[-1] + appliance = appliance[: -(len(":" + tag))] # remove only the tag else: # default to 'latest' if no tag is specified - tag = 'latest' + tag = "latest" # get the registry and image - registryName = 'docker.io' # default if not specified + registryName = "docker.io" # default if not specified imageName = appliance # will be true if not specified - if '/' in appliance and '.' in appliance.split('/')[0]: - registryName = appliance.split('/')[0] - imageName = appliance[len(registryName):] - registryName = registryName.strip('/') - imageName = imageName.strip('/') + if "/" in appliance and "." in appliance.split("/")[0]: + registryName = appliance.split("/")[0] + imageName = appliance[len(registryName) :] + registryName = registryName.strip("/") + imageName = imageName.strip("/") return registryName, imageName, tag @@ -325,12 +355,14 @@ def parseDockerAppliance(appliance: str) -> Tuple[str, str, str]: def checkDockerSchema(appliance): if not appliance: raise ImageNotFound("No docker image specified.") - elif '://' in appliance: - raise ImageNotFound("Docker images cannot contain a schema (such as '://'): %s" - "" % appliance) + elif "://" in appliance: + raise ImageNotFound( + "Docker images cannot contain a schema (such as '://'): %s" "" % appliance + ) elif len(appliance) > 256: - raise ImageNotFound("Docker image must be less than 256 chars: %s" - "" % appliance) + raise ImageNotFound( + "Docker image must be less than 256 chars: %s" "" % appliance + ) class ApplianceImageNotFound(ImageNotFound): @@ -345,22 +377,28 @@ class ApplianceImageNotFound(ImageNotFound): """ def __init__(self, origAppliance, url, statusCode): - msg = ("The docker image that TOIL_APPLIANCE_SELF specifies (%s) produced " - "a nonfunctional manifest URL (%s). The HTTP status returned was %s. " - "The specifier is most likely unsupported or malformed. " - "Please supply a docker image with the format: " - "'.io/:' or ':' " - "(for official docker.io images). Examples: " - "'quay.io/ucsc_cgl/toil:latest', 'ubuntu:latest', or " - "'broadinstitute/genomes-in-the-cloud:2.0.0'." - "" % (origAppliance, url, str(statusCode))) + msg = ( + "The docker image that TOIL_APPLIANCE_SELF specifies (%s) produced " + "a nonfunctional manifest URL (%s). The HTTP status returned was %s. " + "The specifier is most likely unsupported or malformed. " + "Please supply a docker image with the format: " + "'.io/:' or ':' " + "(for official docker.io images). Examples: " + "'quay.io/ucsc_cgl/toil:latest', 'ubuntu:latest', or " + "'broadinstitute/genomes-in-the-cloud:2.0.0'." + "" % (origAppliance, url, str(statusCode)) + ) super().__init__(msg) + # Cache images we know exist so we don't have to ask the registry about them # all the time. KNOWN_EXTANT_IMAGES = set() -def requestCheckRegularDocker(origAppliance: str, registryName: str, imageName: str, tag: str) -> bool: + +def requestCheckRegularDocker( + origAppliance: str, registryName: str, imageName: str, tag: str +) -> bool: """ Check if an image exists using the requests library. @@ -384,8 +422,9 @@ def requestCheckRegularDocker(origAppliance: str, registryName: str, imageName: # Check the cache first return origAppliance - ioURL = 'https://{webhost}/v2/{pathName}/manifests/{tag}' \ - ''.format(webhost=registryName, pathName=imageName, tag=tag) + ioURL = "https://{webhost}/v2/{pathName}/manifests/{tag}" "".format( + webhost=registryName, pathName=imageName, tag=tag + ) response = requests.head(ioURL) if not response.ok: raise ApplianceImageNotFound(origAppliance, ioURL, response.status_code) @@ -412,17 +451,20 @@ def requestCheckDockerIo(origAppliance: str, imageName: str, tag: str) -> bool: return origAppliance # only official images like 'busybox' or 'ubuntu' - if '/' not in imageName: - imageName = 'library/' + imageName + if "/" not in imageName: + imageName = "library/" + imageName - token_url = 'https://auth.docker.io/token?service=registry.docker.io&scope=repository:{repo}:pull'.format( - repo=imageName) - requests_url = f'https://registry-1.docker.io/v2/{imageName}/manifests/{tag}' + token_url = "https://auth.docker.io/token?service=registry.docker.io&scope=repository:{repo}:pull".format( + repo=imageName + ) + requests_url = f"https://registry-1.docker.io/v2/{imageName}/manifests/{tag}" token = requests.get(token_url) jsonToken = token.json() bearer = jsonToken["token"] - response = requests.head(requests_url, headers={'Authorization': f'Bearer {bearer}'}) + response = requests.head( + requests_url, headers={"Authorization": f"Bearer {bearer}"} + ) if not response.ok: raise ApplianceImageNotFound(origAppliance, requests_url, response.status_code) else: @@ -434,21 +476,18 @@ def logProcessContext(config: "Config") -> None: # toil.version.version (string) cannot be imported at top level because it conflicts with # toil.version (module) and Sphinx doesn't like that. from toil.version import version + log.info("Running Toil version %s on host %s.", version, socket.gethostname()) log.debug("Configuration: %s", config.__dict__) try: - from botocore.credentials import (JSONFileCache, - RefreshableCredentials, - create_credential_resolver) - from botocore.session import Session - - cache_path = '~/.cache/aws/cached_temporary_credentials' - datetime_format = "%Y-%m-%dT%H:%M:%SZ" # incidentally the same as the format used by AWS + cache_path = "~/.cache/aws/cached_temporary_credentials" + datetime_format = ( + "%Y-%m-%dT%H:%M:%SZ" # incidentally the same as the format used by AWS + ) log = logging.getLogger(__name__) - # But in addition to our manual cache, we also are going to turn on boto3's # new built-in caching layer. @@ -461,7 +500,6 @@ def datetime_to_str(dt): """ return dt.strftime(datetime_format) - def str_to_datetime(s): """ Convert a string, explicitly UTC into a naive (implicitly UTC) datetime object. diff --git a/src/toil/batchSystems/__init__.py b/src/toil/batchSystems/__init__.py index a41c18f9ea..5aa56c6141 100644 --- a/src/toil/batchSystems/__init__.py +++ b/src/toil/batchSystems/__init__.py @@ -18,6 +18,7 @@ class DeadlockException(Exception): Exception thrown by the Leader or BatchSystem when a deadlock is encountered due to insufficient resources to run the workflow """ + def __init__(self, msg): self.msg = f"Deadlock encountered: {msg}" super().__init__() diff --git a/src/toil/batchSystems/abstractBatchSystem.py b/src/toil/batchSystems/abstractBatchSystem.py index 6ac82a8c63..e2e50d5bee 100644 --- a/src/toil/batchSystems/abstractBatchSystem.py +++ b/src/toil/batchSystems/abstractBatchSystem.py @@ -18,18 +18,10 @@ import time from abc import ABC, abstractmethod from argparse import ArgumentParser, _ArgumentGroup +from collections.abc import Iterator from contextlib import contextmanager from threading import Condition -from typing import (Any, - ContextManager, - Dict, - Iterator, - List, - NamedTuple, - Optional, - Set, - Union, - cast) +from typing import Any, ContextManager, NamedTuple, Optional, Union, cast from toil.batchSystems.options import OptionSetter from toil.bus import MessageBus, MessageOutbox @@ -45,6 +37,7 @@ # Value to use as exitStatus in UpdatedBatchJobInfo.exitStatus when status is not available. EXIT_STATUS_UNAVAILABLE_VALUE = 255 + class BatchJobExitReason(enum.IntEnum): FINISHED: int = 1 """Successfully finished.""" @@ -65,7 +58,6 @@ class BatchJobExitReason(enum.IntEnum): PARTITION: int = 9 """Job was not able to talk to the leader via the job store, so Toil declared it failed.""" - @classmethod def to_string(cls, value: int) -> str: """ @@ -80,6 +72,7 @@ def to_string(cls, value: int) -> str: except ValueError: return str(value) + class UpdatedBatchJobInfo(NamedTuple): jobID: int exitStatus: int @@ -93,6 +86,7 @@ class UpdatedBatchJobInfo(NamedTuple): exitReason: Optional[BatchJobExitReason] wallTime: Union[float, int, None] + # Information required for worker cleanup on shutdown of the batch system. class WorkerCleanupInfo(NamedTuple): work_dir: Optional[str] @@ -110,8 +104,10 @@ class WorkerCleanupInfo(NamedTuple): 'onSuccess', 'onError', 'never') """ + class AbstractBatchSystem(ABC): """An abstract base class to represent the interface the batch system must provide to Toil.""" + @classmethod @abstractmethod def supportsAutoDeployment(cls) -> bool: @@ -163,7 +159,12 @@ def set_message_bus(self, message_bus: MessageBus) -> None: """ @abstractmethod - def issueBatchJob(self, command: str, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int: + def issueBatchJob( + self, + command: str, + job_desc: JobDescription, + job_environment: Optional[dict[str, str]] = None, + ) -> int: """ Issues a job with the specified command to the batch system and returns a unique job ID number. @@ -180,7 +181,7 @@ def issueBatchJob(self, command: str, job_desc: JobDescription, job_environment: raise NotImplementedError() @abstractmethod - def killBatchJobs(self, jobIDs: List[int]) -> None: + def killBatchJobs(self, jobIDs: list[int]) -> None: """ Kills the given job IDs. After returning, the killed jobs will not appear in the results of getRunningBatchJobIDs. The killed job will not @@ -193,7 +194,7 @@ def killBatchJobs(self, jobIDs: List[int]) -> None: # FIXME: Return value should be a set (then also fix the tests) @abstractmethod - def getIssuedBatchJobIDs(self) -> List[int]: + def getIssuedBatchJobIDs(self) -> list[int]: """ Gets all currently issued jobs @@ -204,7 +205,7 @@ def getIssuedBatchJobIDs(self) -> List[int]: raise NotImplementedError() @abstractmethod - def getRunningBatchJobIDs(self) -> Dict[int, float]: + def getRunningBatchJobIDs(self) -> dict[int, float]: """ Gets a map of jobs as job ID numbers that are currently running (not just waiting) and how long they have been running, in seconds. @@ -292,7 +293,7 @@ def setOptions(cls, setOption: OptionSetter) -> None: returning nothing, used to update run configuration as a side effect. """ - def getWorkerContexts(self) -> List[ContextManager[Any]]: + def getWorkerContexts(self) -> list[ContextManager[Any]]: """ Get a list of picklable context manager objects to wrap worker work in, in order. @@ -308,7 +309,9 @@ def getWorkerContexts(self) -> List[ContextManager[Any]]: class BatchSystemSupport(AbstractBatchSystem): """Partial implementation of AbstractBatchSystem, support methods.""" - def __init__(self, config: Config, maxCores: float, maxMemory: int, maxDisk: int) -> None: + def __init__( + self, config: Config, maxCores: float, maxMemory: int, maxDisk: int + ) -> None: """ Initialize initial state of the object. @@ -330,7 +333,7 @@ def __init__(self, config: Config, maxCores: float, maxMemory: int, maxDisk: int self.maxCores = maxCores self.maxMemory = maxMemory self.maxDisk = maxDisk - self.environment: Dict[str, str] = {} + self.environment: dict[str, str] = {} if config.workflowID is None: raise Exception("config.workflowID must be set") else: @@ -356,9 +359,11 @@ def check_resource_request(self, requirer: Requirer) -> None: greater than allowed """ try: - for resource, requested, available in [('cores', requirer.cores, self.maxCores), - ('memory', requirer.memory, self.maxMemory), - ('disk', requirer.disk, self.maxDisk)]: + for resource, requested, available in [ + ("cores", requirer.cores, self.maxCores), + ("memory", requirer.memory, self.maxMemory), + ("disk", requirer.disk, self.maxDisk), + ]: assert requested is not None if requested > available: raise InsufficientSystemResources(requirer, resource, available) @@ -367,7 +372,7 @@ def check_resource_request(self, requirer: Requirer) -> None: except InsufficientSystemResources as e: # Add more annotation info to the error e.batch_system = self.__class__.__name__ or None - e.source = self.config.workDir if e.resource == 'disk' else None + e.source = self.config.workDir if e.resource == "disk" else None raise e def _check_accelerator_request(self, requirer: Requirer) -> None: @@ -380,9 +385,12 @@ def _check_accelerator_request(self, requirer: Requirer) -> None: """ if len(requirer.accelerators) > 0: # By default we assume we can't fulfill any of these - raise InsufficientSystemResources(requirer, 'accelerators', [], details=[ - 'The batch system does not support any accelerators.' - ]) + raise InsufficientSystemResources( + requirer, + "accelerators", + [], + details=["The batch system does not support any accelerators."], + ) def setEnv(self, name: str, value: Optional[str] = None) -> None: """ @@ -439,7 +447,9 @@ def get_batch_logs_dir(self) -> str: # And if nothing is specified use the workDir. return Toil.getToilWorkDir(self.config.workDir) - def format_std_out_err_path(self, toil_job_id: int, cluster_job_id: str, std: str) -> str: + def format_std_out_err_path( + self, toil_job_id: int, cluster_job_id: str, std: str + ) -> str: """ Format path for batch system standard output/error and other files generated by the batch system itself. @@ -458,7 +468,9 @@ def format_std_out_err_path(self, toil_job_id: int, cluster_job_id: str, std: st if self.config.noStdOutErr: return os.devnull - file_name: str = f'toil_{self.config.workflowID}.{toil_job_id}.{cluster_job_id}.{std}.log' + file_name: str = ( + f"toil_{self.config.workflowID}.{toil_job_id}.{cluster_job_id}.{std}.log" + ) logs_dir: str = self.get_batch_logs_dir() return os.path.join(logs_dir, file_name) @@ -466,7 +478,7 @@ def format_std_out_err_glob(self, toil_job_id: int) -> str: """ Get a glob string that will match all file paths generated by format_std_out_err_path for a job. """ - file_glob: str = f'toil_{self.config.workflowID}.{toil_job_id}.*.log' + file_glob: str = f"toil_{self.config.workflowID}.{toil_job_id}.*.log" logs_dir: str = self.get_batch_logs_dir() return os.path.join(logs_dir, file_glob) @@ -480,23 +492,28 @@ def workerCleanup(info: WorkerCleanupInfo) -> None: :param WorkerCleanupInfo info: A named tuple consisting of all the relevant information for cleaning up the worker. """ - logger.debug('Attempting worker cleanup') + logger.debug("Attempting worker cleanup") assert isinstance(info, WorkerCleanupInfo) assert info.workflow_id is not None workflowDir = Toil.getLocalWorkflowDir(info.workflow_id, info.work_dir) - coordination_dir = Toil.get_local_workflow_coordination_dir(info.workflow_id, info.work_dir, info.coordination_dir) + coordination_dir = Toil.get_local_workflow_coordination_dir( + info.workflow_id, info.work_dir, info.coordination_dir + ) DeferredFunctionManager.cleanupWorker(coordination_dir) workflowDirContents = os.listdir(workflowDir) - AbstractFileStore.shutdownFileStore(info.workflow_id, info.work_dir, info.coordination_dir) - if info.clean_work_dir in ('always', 'onSuccess', 'onError'): + AbstractFileStore.shutdownFileStore( + info.workflow_id, info.work_dir, info.coordination_dir + ) + if info.clean_work_dir in ("always", "onSuccess", "onError"): if workflowDirContents in ([], [cacheDirName(info.workflow_id)]): - logger.debug('Deleting workflow directory %s', workflowDir) + logger.debug("Deleting workflow directory %s", workflowDir) shutil.rmtree(workflowDir, ignore_errors=True) if coordination_dir != workflowDir: # No more coordination to do here either. - logger.debug('Deleting coordination directory %s', coordination_dir) + logger.debug("Deleting coordination directory %s", coordination_dir) shutil.rmtree(coordination_dir, ignore_errors=True) + class NodeInfo: """ The coresUsed attribute is a floating point value between 0 (all cores idle) and 1 (all cores @@ -513,10 +530,17 @@ class NodeInfo: The workers attribute is an integer reflecting the number of workers currently active workers on the node. """ - def __init__(self, coresUsed: float, memoryUsed: float, - coresTotal: float, memoryTotal: int, - requestedCores: float, requestedMemory: int, - workers: int) -> None: + + def __init__( + self, + coresUsed: float, + memoryUsed: float, + coresTotal: float, + memoryTotal: int, + requestedCores: float, + requestedMemory: int, + workers: int, + ) -> None: self.coresUsed = coresUsed self.memoryUsed = memoryUsed @@ -539,7 +563,9 @@ class AbstractScalableBatchSystem(AbstractBatchSystem): """ @abstractmethod - def getNodes(self, preemptible: Optional[bool] = None, timeout: int = 600) -> Dict[str, NodeInfo]: + def getNodes( + self, preemptible: Optional[bool] = None, timeout: int = 600 + ) -> dict[str, NodeInfo]: """ Returns a dictionary mapping node identifiers of preemptible or non-preemptible nodes to NodeInfo objects, one for each node. @@ -584,7 +610,15 @@ def unignoreNode(self, nodeAddress: str) -> None: class InsufficientSystemResources(Exception): - def __init__(self, requirer: Requirer, resource: str, available: Optional[ParsedRequirement] = None, batch_system: Optional[str] = None, source: Optional[str] = None, details: List[str] = []) -> None: + def __init__( + self, + requirer: Requirer, + resource: str, + available: Optional[ParsedRequirement] = None, + batch_system: Optional[str] = None, + source: Optional[str] = None, + details: list[str] = [], + ) -> None: """ Make a new exception about how we couldn't get enough of something. @@ -597,7 +631,7 @@ def __init__(self, requirer: Requirer, resource: str, available: Optional[Parsed :param details: Any extra details about the problem that can be attached to the error. """ - self.job_name : Optional[str] = str(requirer) + self.job_name: Optional[str] = str(requirer) self.resource = resource self.requested = cast(ParsedRequirement, getattr(requirer, resource)) self.available = available @@ -610,38 +644,52 @@ def __str__(self) -> str: Explain the exception. """ - unit = 'bytes of ' if self.resource in ('disk', 'memory') else '' - purpose = ' for temporary space' if self.resource == 'disk' else '' - qualifier = ' free on {self.source}' if self.resource == 'disk' and self.source is not None else '' + unit = "bytes of " if self.resource in ("disk", "memory") else "" + purpose = " for temporary space" if self.resource == "disk" else "" + qualifier = ( + " free on {self.source}" + if self.resource == "disk" and self.source is not None + else "" + ) msg = [] if self.job_name is not None: - msg.append(f'The job {self.job_name} is requesting ') + msg.append(f"The job {self.job_name} is requesting ") else: - msg.append(f'Requesting ') - msg.append(f'{self.requested} {unit}{self.resource}') + msg.append(f"Requesting ") + msg.append(f"{self.requested} {unit}{self.resource}") msg.append(purpose) if self.available is not None: - msg.append(f', more than the maximum of {self.available} {unit}{self.resource}{qualifier} that {self.batch_system or "this batch system"} was configured with') - if self.resource in ('cores', 'memory', 'disk'): - msg.append(f', or enforced by --max{self.resource.capitalize()}') + msg.append( + f', more than the maximum of {self.available} {unit}{self.resource}{qualifier} that {self.batch_system or "this batch system"} was configured with' + ) + if self.resource in ("cores", "memory", "disk"): + msg.append(f", or enforced by --max{self.resource.capitalize()}") else: - msg.append(', but that is not available') - msg.append('.') + msg.append(", but that is not available") + msg.append(".") - if self.resource == 'disk': - msg.append(' Try setting/changing the toil option "--workDir" or changing the base temporary directory by setting TMPDIR.') + if self.resource == "disk": + msg.append( + ' Try setting/changing the toil option "--workDir" or changing the base temporary directory by setting TMPDIR.' + ) for detail in self.details: - msg.append(' ') + msg.append(" ") msg.append(detail) - return ''.join(msg) + return "".join(msg) class AcquisitionTimeoutException(Exception): """To be raised when a resource request times out.""" - def __init__(self, resource: str, requested: Union[int, float, Set[int]], available: Union[int, float, Set[int]]) -> None: + + def __init__( + self, + resource: str, + requested: Union[int, float, set[int]], + available: Union[int, float, set[int]], + ) -> None: """ Creates an instance of this exception that indicates which resource is insufficient for current demands, as well as the resources requested and actually available. @@ -661,7 +709,10 @@ class ResourcePool: Provides a context manager to do something with an amount of resource acquired. """ - def __init__(self, initial_value: int, resource_type: str, timeout: float = 5) -> None: + + def __init__( + self, initial_value: int, resource_type: str, timeout: float = 5 + ) -> None: super().__init__() # We use this condition to signal everyone whenever some resource is released. # We use its associated lock to guard value. @@ -695,8 +746,11 @@ def acquire(self, amount: int) -> None: while amount > self.value: if time.time() - startTime >= self.timeout: # This means the thread timed out waiting for the resource. - raise AcquisitionTimeoutException(resource=self.resource_type, - requested=amount, available=self.value) + raise AcquisitionTimeoutException( + resource=self.resource_type, + requested=amount, + available=self.value, + ) # Allow self.timeout seconds to get the resource, else quit # through the above if condition. This wait + timeout is the # last thing in the loop such that a request that takes longer @@ -737,7 +791,10 @@ class ResourceSet: Provides a context manager to do something with a set of of resources acquired. """ - def __init__(self, initial_value: Set[int], resource_type: str, timeout: float = 5) -> None: + + def __init__( + self, initial_value: set[int], resource_type: str, timeout: float = 5 + ) -> None: super().__init__() # We use this condition to signal everyone whenever some resource is released. # We use its associated lock to guard value. @@ -747,7 +804,7 @@ def __init__(self, initial_value: Set[int], resource_type: str, timeout: float = self.resource_type = resource_type self.timeout = timeout - def acquireNow(self, subset: Set[int]) -> bool: + def acquireNow(self, subset: set[int]) -> bool: """ Reserve the given amount of the given resource. Returns True if successful and False if this is not possible immediately. @@ -759,7 +816,7 @@ def acquireNow(self, subset: Set[int]) -> bool: self.value -= subset return True - def acquire(self, subset: Set[int]) -> None: + def acquire(self, subset: set[int]) -> None: """ Reserve the given amount of the given resource. Raises AcquisitionTimeoutException if this is not possible in under @@ -770,8 +827,11 @@ def acquire(self, subset: Set[int]) -> None: while subset > self.value: if time.time() - startTime >= self.timeout: # This means the thread timed out waiting for the resource. - raise AcquisitionTimeoutException(resource=self.resource_type, - requested=subset, available=self.value) + raise AcquisitionTimeoutException( + resource=self.resource_type, + requested=subset, + available=self.value, + ) # Allow self.timeout seconds to get the resource, else quit # through the above if condition. This wait + timeout is the # last thing in the loop such that a request that takes longer @@ -780,12 +840,12 @@ def acquire(self, subset: Set[int]) -> None: self.condition.wait(timeout=self.timeout) self.value -= subset - def release(self, subset: Set[int]) -> None: + def release(self, subset: set[int]) -> None: with self.condition: self.value |= subset self.condition.notify_all() - def get_free_snapshot(self) -> Set[int]: + def get_free_snapshot(self) -> set[int]: """ Get a snapshot of what items are free right now. May be stale as soon as you get it, but you will need some kind of hint @@ -800,7 +860,7 @@ def __repr__(self) -> str: return "ResourceSet(%s)" % self.value @contextmanager - def acquisitionOf(self, subset: Set[int]) -> Iterator[None]: + def acquisitionOf(self, subset: set[int]) -> Iterator[None]: self.acquire(subset) try: yield diff --git a/src/toil/batchSystems/abstractGridEngineBatchSystem.py b/src/toil/batchSystems/abstractGridEngineBatchSystem.py index 26c19f3065..456bc2066e 100644 --- a/src/toil/batchSystems/abstractGridEngineBatchSystem.py +++ b/src/toil/batchSystems/abstractGridEngineBatchSystem.py @@ -17,17 +17,19 @@ from datetime import datetime from queue import Empty, Queue from threading import Lock, Thread -from typing import Dict, List, Optional, Tuple, Union +from typing import Optional, Union -from toil.common import Config -from toil.batchSystems.abstractBatchSystem import (BatchJobExitReason, - UpdatedBatchJobInfo) +from toil.batchSystems.abstractBatchSystem import ( + BatchJobExitReason, + UpdatedBatchJobInfo, +) from toil.batchSystems.cleanup_support import BatchSystemCleanupSupport from toil.bus import ExternalBatchIdMessage, get_job_kind -from toil.job import JobDescription, AcceleratorRequirement +from toil.common import Config +from toil.job import AcceleratorRequirement, JobDescription from toil.statsAndLogging import TRACE from toil.lib.misc import CalledProcessErrorStderr -from toil.lib.retry import old_retry, DEFAULT_DELAYS, retry +from toil.lib.retry import DEFAULT_DELAYS, old_retry logger = logging.getLogger(__name__) @@ -40,22 +42,34 @@ # Unit name of the job # Environment dict for the job # Accelerator requirements for the job -JobTuple = Tuple[int, float, int, str, str, Dict[str, str], List[AcceleratorRequirement]] +JobTuple = tuple[ + int, float, int, str, str, dict[str, str], list[AcceleratorRequirement] +] + class ExceededRetryAttempts(Exception): def __init__(self): super().__init__("Exceeded retry attempts talking to scheduler.") + class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport): """ A partial implementation of BatchSystemSupport for batch systems run on a standard HPC cluster. By default auto-deployment is not implemented. """ + class GridEngineThreadException(Exception): pass class GridEngineThread(Thread, metaclass=ABCMeta): - def __init__(self, newJobsQueue: Queue, updatedJobsQueue: Queue, killQueue: Queue, killedJobsQueue: Queue, boss: 'AbstractGridEngineBatchSystem') -> None: + def __init__( + self, + newJobsQueue: Queue, + updatedJobsQueue: Queue, + killQueue: Queue, + killedJobsQueue: Queue, + boss: "AbstractGridEngineBatchSystem", + ) -> None: """ Abstract thread interface class. All instances are created with five initial arguments (below). Note the Queue instances passed are empty. @@ -70,19 +84,22 @@ def __init__(self, newJobsQueue: Queue, updatedJobsQueue: Queue, killQueue: Queu """ Thread.__init__(self) self.boss = boss - self.boss.config.statePollingWait = \ + self.boss.config.statePollingWait = ( self.boss.config.statePollingWait or self.boss.getWaitDuration() - self.boss.config.state_polling_timeout = \ - self.boss.config.state_polling_timeout or self.boss.config.statePollingWait * 10 + ) + self.boss.config.state_polling_timeout = ( + self.boss.config.state_polling_timeout + or self.boss.config.statePollingWait * 10 + ) self.newJobsQueue = newJobsQueue self.updatedJobsQueue = updatedJobsQueue self.killQueue = killQueue self.killedJobsQueue = killedJobsQueue - self.waitingJobs: List[JobTuple] = list() + self.waitingJobs: list[JobTuple] = list() self.runningJobs = set() # TODO: Why do we need a lock for this? We have the GIL. self.runningJobsLock = Lock() - self.batchJobIDs: Dict[int, str] = dict() + self.batchJobIDs: dict[int, str] = dict() self._checkOnJobsCache = None self._checkOnJobsTimestamp = None self.exception = None @@ -126,19 +143,28 @@ def createJobs(self, newJob: JobTuple) -> bool: if newJob is not None: self.waitingJobs.append(newJob) # Launch jobs as necessary: - while len(self.waitingJobs) > 0 and \ - len(self.runningJobs) < int(self.boss.config.max_jobs): + while len(self.waitingJobs) > 0 and len(self.runningJobs) < int( + self.boss.config.max_jobs + ): activity = True - jobID, cpu, memory, command, jobName, environment, gpus = self.waitingJobs.pop(0) + jobID, cpu, memory, command, jobName, environment, gpus = ( + self.waitingJobs.pop(0) + ) if self.boss.config.memory_is_product and cpu > 1: memory = memory // cpu # prepare job submission command - subLine = self.prepareSubmission(cpu, memory, jobID, command, jobName, environment, gpus) + subLine = self.prepareSubmission( + cpu, memory, jobID, command, jobName, environment, gpus + ) logger.debug("Running %r", subLine) batchJobID = self.boss.with_retries(self.submitJob, subLine) if self.boss._outbox is not None: - #JobID corresponds to the toil version of the jobID, dif from jobstore idea of the id, batchjobid is what we get from slurm - self.boss._outbox.publish(ExternalBatchIdMessage(jobID, batchJobID, self.boss.__class__.__name__)) + # JobID corresponds to the toil version of the jobID, dif from jobstore idea of the id, batchjobid is what we get from slurm + self.boss._outbox.publish( + ExternalBatchIdMessage( + jobID, batchJobID, self.boss.__class__.__name__ + ) + ) logger.debug("Submitted job %s", str(batchJobID)) @@ -173,7 +199,7 @@ def killJobs(self): # Do the dirty job for jobID in list(killList): if jobID in self.runningJobs: - logger.debug('Killing job: %s', jobID) + logger.debug("Killing job: %s", jobID) # this call should be implementation-specific, all other # code is redundant w/ other implementations @@ -190,12 +216,15 @@ def killJobs(self): batchJobID = self.getBatchSystemID(jobID) exit_code = self.boss.with_retries(self.getJobExitCode, batchJobID) if exit_code is not None: - logger.debug('Adding jobID %s to killedJobsQueue', jobID) + logger.debug("Adding jobID %s to killedJobsQueue", jobID) self.killedJobsQueue.put(jobID) killList.remove(jobID) self.forgetJob(jobID) if len(killList) > 0: - logger.warning("Some jobs weren't killed, trying again in %is.", self.boss.sleepSeconds()) + logger.warning( + "Some jobs weren't killed, trying again in %is.", + self.boss.sleepSeconds(), + ) return True @@ -207,7 +236,9 @@ def checkOnJobs(self): """ if self._checkOnJobsTimestamp: - time_since_last_check = (datetime.now() - self._checkOnJobsTimestamp).total_seconds() + time_since_last_check = ( + datetime.now() - self._checkOnJobsTimestamp + ).total_seconds() if time_since_last_check < self.boss.config.statePollingWait: return self._checkOnJobsCache @@ -221,16 +252,17 @@ def checkOnJobs(self): ) # We got the statuses as a batch for running_job_id, status in zip(running_job_list, statuses): - activity = self._handle_job_status( - running_job_id, status, activity - ) + activity = self._handle_job_status(running_job_id, status, activity) self._checkOnJobsCache = activity self._checkOnJobsTimestamp = datetime.now() return activity def _handle_job_status( - self, job_id: int, status: Union[int, Tuple[int, Optional[BatchJobExitReason]], None], activity: bool + self, + job_id: int, + status: Union[int, tuple[int, Optional[BatchJobExitReason]], None], + activity: bool, ) -> bool: """ Helper method for checkOnJobs to handle job statuses @@ -258,7 +290,7 @@ def _runStep(self): activity = True newJob = self.newJobsQueue.get() if newJob is None: - logger.debug('Received queue sentinel.') + logger.debug("Received queue sentinel.") # Send out kill signals before stopping self.killJobs() return False @@ -269,7 +301,7 @@ def _runStep(self): if self.checkOnJobs(): activity = True if not activity: - logger.log(TRACE, 'No activity, sleeping for %is', self.boss.sleepSeconds()) + logger.log(TRACE, "No activity, sleeping for %is", self.boss.sleepSeconds()) return True def run(self): @@ -286,7 +318,9 @@ def run(self): # signalling exception in the thread as we expect the thread to # always be running for the duration of the workflow - def coalesce_job_exit_codes(self, batch_job_id_list: list) -> List[Union[int, Tuple[int, Optional[BatchJobExitReason]], None]]: + def coalesce_job_exit_codes( + self, batch_job_id_list: list + ) -> list[Union[int, tuple[int, Optional[BatchJobExitReason]], None]]: """ Returns exit codes and possibly exit reasons for a list of jobs, or None if they are running. @@ -299,7 +333,9 @@ def coalesce_job_exit_codes(self, batch_job_id_list: list) -> List[Union[int, Tu statuses = [] try: for batch_job_id in batch_job_id_list: - statuses.append(self.boss.with_retries(self.getJobExitCode, batch_job_id)) + statuses.append( + self.boss.with_retries(self.getJobExitCode, batch_job_id) + ) except CalledProcessErrorStderr as err: # This avoids the nested retry issue where we could issue n^2 retries when the backing scheduler somehow disappears # We catch the internal retry exception and raise something else so the outer retry doesn't retry the entire function again @@ -307,14 +343,16 @@ def coalesce_job_exit_codes(self, batch_job_id_list: list) -> List[Union[int, Tu return statuses @abstractmethod - def prepareSubmission(self, - cpu: int, - memory: int, - jobID: int, - command: str, - jobName: str, - job_environment: Optional[Dict[str, str]] = None, - gpus: Optional[int] = None) -> List[str]: + def prepareSubmission( + self, + cpu: int, + memory: int, + jobID: int, + command: str, + jobName: str, + job_environment: Optional[dict[str, str]] = None, + gpus: Optional[int] = None, + ) -> list[str]: """ Preparation in putting together a command-line string for submitting to batch system (via submitJob().) @@ -364,7 +402,9 @@ def killJob(self, jobID): raise NotImplementedError() @abstractmethod - def getJobExitCode(self, batchJobID) -> Union[int, Tuple[int, Optional[BatchJobExitReason]], None]: + def getJobExitCode( + self, batchJobID + ) -> Union[int, tuple[int, Optional[BatchJobExitReason]], None]: """ Returns job exit code and possibly an instance of abstractBatchSystem.BatchJobExitReason. @@ -380,9 +420,10 @@ def getJobExitCode(self, batchJobID) -> Union[int, Tuple[int, Optional[BatchJobE """ raise NotImplementedError() - def __init__(self, config: Config, maxCores: float, maxMemory: int, maxDisk: int) -> None: - super().__init__( - config, maxCores, maxMemory, maxDisk) + def __init__( + self, config: Config, maxCores: float, maxMemory: int, maxDisk: int + ) -> None: + super().__init__(config, maxCores, maxMemory, maxDisk) self.config = config self.currentJobs = set() @@ -392,8 +433,13 @@ def __init__(self, config: Config, maxCores: float, maxMemory: int, maxDisk: int self.killQueue = Queue() self.killedJobsQueue = Queue() # get the associated thread class here - self.background_thread = self.GridEngineThread(self.newJobsQueue, self.updatedJobsQueue, - self.killQueue, self.killedJobsQueue, self) + self.background_thread = self.GridEngineThread( + self.newJobsQueue, + self.updatedJobsQueue, + self.killQueue, + self.killedJobsQueue, + self, + ) self.background_thread.start() self._getRunningBatchJobIDsTimestamp = None self._getRunningBatchJobIDsCache = {} @@ -409,14 +455,19 @@ def count_needed_gpus(self, job_desc: JobDescription): gpus = 0 if isinstance(job_desc.accelerators, list): for accelerator in job_desc.accelerators: - if accelerator['kind'] == 'gpu': - gpus += accelerator['count'] + if accelerator["kind"] == "gpu": + gpus += accelerator["count"] else: gpus = job_desc.accelerators return gpus - def issueBatchJob(self, command: str, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None): + def issueBatchJob( + self, + command: str, + job_desc: JobDescription, + job_environment: Optional[dict[str, str]] = None, + ): # Avoid submitting internal jobs to the batch queue, handle locally local_id = self.handleLocalJob(command, job_desc) if local_id is not None: @@ -427,10 +478,23 @@ def issueBatchJob(self, command: str, job_desc: JobDescription, job_environment: job_id = self.getNextJobID() self.currentJobs.add(job_id) - self.newJobsQueue.put((job_id, job_desc.cores, job_desc.memory, command, get_job_kind(job_desc.get_names()), - job_environment, gpus)) - logger.debug("Issued the job command: %s with job id: %s and job name %s", command, str(job_id), - get_job_kind(job_desc.get_names())) + self.newJobsQueue.put( + ( + job_id, + job_desc.cores, + job_desc.memory, + command, + get_job_kind(job_desc.get_names()), + job_environment, + gpus, + ) + ) + logger.debug( + "Issued the job command: %s with job id: %s and job name %s", + command, + str(job_id), + get_job_kind(job_desc.get_names()), + ) return job_id def killBatchJobs(self, jobIDs): @@ -440,7 +504,7 @@ def killBatchJobs(self, jobIDs): """ self.killLocalJobs(jobIDs) jobIDs = set(jobIDs) - logger.debug('Jobs to be killed: %r', jobIDs) + logger.debug("Jobs to be killed: %r", jobIDs) for jobID in jobIDs: self.killQueue.put(jobID) while jobIDs: @@ -448,7 +512,9 @@ def killBatchJobs(self, jobIDs): killedJobId = self.killedJobsQueue.get(timeout=10) except Empty: if not self.background_thread.is_alive(): - raise self.GridEngineThreadException("Grid engine thread failed unexpectedly") from self.background_thread.exception + raise self.GridEngineThreadException( + "Grid engine thread failed unexpectedly" + ) from self.background_thread.exception continue if killedJobId is None: break @@ -459,8 +525,11 @@ def killBatchJobs(self, jobIDs): if killedJobId in self.currentJobs: self.currentJobs.remove(killedJobId) if jobIDs: - logger.debug('Some kills (%s) still pending, sleeping %is', len(jobIDs), - self.sleepSeconds()) + logger.debug( + "Some kills (%s) still pending, sleeping %is", + len(jobIDs), + self.sleepSeconds(), + ) def getIssuedBatchJobIDs(self): """ @@ -475,10 +544,11 @@ def getRunningBatchJobIDs(self): Respects statePollingWait and will return cached results if not within time period to talk with the scheduler. """ - if (self._getRunningBatchJobIDsTimestamp and ( - datetime.now() - - self._getRunningBatchJobIDsTimestamp).total_seconds() < - self.config.statePollingWait): + if ( + self._getRunningBatchJobIDsTimestamp + and (datetime.now() - self._getRunningBatchJobIDsTimestamp).total_seconds() + < self.config.statePollingWait + ): batchIds = self._getRunningBatchJobIDsCache else: batchIds = self.with_retries(self.background_thread.getRunningJobIDs) @@ -493,7 +563,9 @@ def getUpdatedBatchJob(self, maxWait): if not self.background_thread.is_alive(): # kill remaining jobs on the thread self.background_thread.killJobs() - raise self.GridEngineThreadException("Unexpected GridEngineThread failure") from self.background_thread.exception + raise self.GridEngineThreadException( + "Unexpected GridEngineThread failure" + ) from self.background_thread.exception if local_tuple: return local_tuple else: @@ -501,7 +573,7 @@ def getUpdatedBatchJob(self, maxWait): item = self.updatedJobsQueue.get(timeout=maxWait) except Empty: return None - logger.debug('UpdatedJobsQueue Item: %s', item) + logger.debug("UpdatedJobsQueue Item: %s", item) self.currentJobs.remove(item.jobID) return item @@ -523,14 +595,20 @@ def shutdown(self) -> None: # Now in one thread, kill all the jobs if len(self.background_thread.runningJobs) > 0: - logger.warning("Cleaning up %s jobs still running at shutdown", len(self.background_thread.runningJobs)) + logger.warning( + "Cleaning up %s jobs still running at shutdown", + len(self.background_thread.runningJobs), + ) for job in self.background_thread.runningJobs: self.killQueue.put(job) self.background_thread.killJobs() def setEnv(self, name, value=None): - if value and ',' in value: - raise ValueError(type(self).__name__ + " does not support commata in environment variable values") + if value and "," in value: + raise ValueError( + type(self).__name__ + + " does not support commata in environment variable values" + ) return super().setEnv(name, value) @classmethod @@ -538,8 +616,7 @@ def getWaitDuration(self): return 1 def sleepSeconds(self, sleeptime=1): - """ Helper function to drop on all state-querying functions to avoid over-querying. - """ + """Helper function to drop on all state-querying functions to avoid over-querying.""" time.sleep(sleeptime) return sleeptime @@ -550,15 +627,21 @@ def with_retries(self, operation, *args, **kwargs): """ for attempt in old_retry( # Don't retry more often than the state polling wait. - delays=[max(delay, self.config.statePollingWait) for delay in DEFAULT_DELAYS], + delays=[ + max(delay, self.config.statePollingWait) for delay in DEFAULT_DELAYS + ], timeout=self.config.state_polling_timeout, - predicate=lambda e: isinstance(e, CalledProcessErrorStderr) + predicate=lambda e: isinstance(e, CalledProcessErrorStderr), ): with attempt: try: return operation(*args, **kwargs) except CalledProcessErrorStderr as err: - logger.error("Errored operation %s, code %d: %s", - operation.__name__, err.returncode, err.stderr) + logger.error( + "Errored operation %s, code %d: %s", + operation.__name__, + err.returncode, + err.stderr, + ) # Raise up to the retry logic, which will retry until timeout raise err diff --git a/src/toil/batchSystems/awsBatch.py b/src/toil/batchSystems/awsBatch.py index 29751f370d..613cfd4105 100644 --- a/src/toil/batchSystems/awsBatch.py +++ b/src/toil/batchSystems/awsBatch.py @@ -34,15 +34,18 @@ import time import uuid from argparse import ArgumentParser, _ArgumentGroup -from typing import Any, Dict, Iterator, List, Optional, Set, Union +from collections.abc import Iterator +from typing import Any, Optional, Union from botocore.exceptions import ClientError from toil import applianceSelf -from toil.batchSystems.abstractBatchSystem import (EXIT_STATUS_UNAVAILABLE_VALUE, - BatchJobExitReason, - InsufficientSystemResources, - UpdatedBatchJobInfo) +from toil.batchSystems.abstractBatchSystem import ( + EXIT_STATUS_UNAVAILABLE_VALUE, + BatchJobExitReason, + InsufficientSystemResources, + UpdatedBatchJobInfo, +) from toil.batchSystems.cleanup_support import BatchSystemCleanupSupport from toil.batchSystems.contained_executor import pack_job from toil.batchSystems.options import OptionSetter @@ -60,9 +63,9 @@ # Map from AWS Batch terminal states to Toil batch job exit reasons -STATE_TO_EXIT_REASON: Dict[str, BatchJobExitReason] = { - 'SUCCEEDED': BatchJobExitReason.FINISHED, - 'FAILED': BatchJobExitReason.FAILED +STATE_TO_EXIT_REASON: dict[str, BatchJobExitReason] = { + "SUCCEEDED": BatchJobExitReason.FINISHED, + "FAILED": BatchJobExitReason.FAILED, } # What's the max polling list size? @@ -73,53 +76,62 @@ # AWS batch won't accept API requests asking for less than this many CPUs. MIN_REQUESTABLE_CORES = 1 + class AWSBatchBatchSystem(BatchSystemCleanupSupport): @classmethod def supportsAutoDeployment(cls) -> bool: return True - def __init__(self, config: Config, maxCores: float, maxMemory: int, maxDisk: int) -> None: + def __init__( + self, config: Config, maxCores: float, maxMemory: int, maxDisk: int + ) -> None: super().__init__(config, maxCores, maxMemory, maxDisk) # Determine region to use. # Either it's set specifically or maybe we can get it from the "best" zone. # TODO: Parse it from a full queue ARN? - self.region = getattr(config, 'aws_batch_region') + self.region = getattr(config, "aws_batch_region") if self.region is None: self.region = get_current_aws_region() if self.region is None: # Can't proceed without a real region - raise RuntimeError('To use AWS Batch, specify --awsBatchRegion or ' - 'TOIL_AWS_REGION or TOIL_AWS_ZONE, or configure ' - 'a default zone in boto') + raise RuntimeError( + "To use AWS Batch, specify --awsBatchRegion or " + "TOIL_AWS_REGION or TOIL_AWS_ZONE, or configure " + "a default zone in boto" + ) # Connect to AWS Batch. # TODO: Use a global AWSConnectionManager so we can share a client # cache with provisioners, etc. - self.client = establish_boto3_session(self.region).client('batch') + self.client = establish_boto3_session(self.region).client("batch") # Determine our batch queue - self.queue = getattr(config, 'aws_batch_queue') + self.queue = getattr(config, "aws_batch_queue") if self.queue is None: # Make sure we actually have a queue - raise RuntimeError("To use AWS Batch, --awsBatchQueue or TOIL_AWS_BATCH_QUEUE must be set") + raise RuntimeError( + "To use AWS Batch, --awsBatchQueue or TOIL_AWS_BATCH_QUEUE must be set" + ) # And the role, if any, jobs should assume - self.job_role_arn = getattr(config, 'aws_batch_job_role_arn') + self.job_role_arn = getattr(config, "aws_batch_job_role_arn") # And the Owner tag value, if any, to apply to things we create - self.owner_tag = os.environ.get('TOIL_OWNER_TAG') + self.owner_tag = os.environ.get("TOIL_OWNER_TAG") # Try and guess what Toil work dir the workers will use. # We need to be able to provision (possibly shared) space there. # TODO: Deduplicate with Kubernetes batch system. self.worker_work_dir = Toil.getToilWorkDir(config.workDir) - if (config.workDir is None and - os.getenv('TOIL_WORKDIR') is None and - self.worker_work_dir == tempfile.gettempdir()): + if ( + config.workDir is None + and os.getenv("TOIL_WORKDIR") is None + and self.worker_work_dir == tempfile.gettempdir() + ): # We defaulted to the system temp directory. But we think the # worker Dockerfiles will make them use /var/lib/toil instead. # TODO: Keep this in sync with the Dockerfile. - self.worker_work_dir = '/var/lib/toil' + self.worker_work_dir = "/var/lib/toil" # We assign job names based on a numerical job ID. This functionality # is managed by the BatchSystemLocalSupport. @@ -136,27 +148,39 @@ def __init__(self, config: Config, maxCores: float, maxMemory: int, maxDisk: int self.job_definition: Optional[str] = None # We need a way to map between our batch system ID numbers, and AWS Batch job IDs from the server. - self.bs_id_to_aws_id: Dict[int, str] = {} - self.aws_id_to_bs_id: Dict[str, int] = {} + self.bs_id_to_aws_id: dict[int, str] = {} + self.aws_id_to_bs_id: dict[str, int] = {} # We need to track if jobs were killed so they don't come out as updated - self.killed_job_aws_ids: Set[str] = set() + self.killed_job_aws_ids: set[str] = set() def setUserScript(self, user_script: Resource) -> None: - logger.debug(f'Setting user script for deployment: {user_script}') + logger.debug(f"Setting user script for deployment: {user_script}") self.user_script = user_script # setEnv is provided by BatchSystemSupport, updates self.environment def _check_accelerator_request(self, requirer: Requirer) -> None: for accelerator in requirer.accelerators: - if accelerator['kind'] != 'gpu' or accelerator.get('brand', 'nvidia') != 'nvidia': + if ( + accelerator["kind"] != "gpu" + or accelerator.get("brand", "nvidia") != "nvidia" + ): # We can only provide GPUs, and of those only nvidia ones. - raise InsufficientSystemResources(requirer, 'accelerators', details=[ - f'The accelerator {accelerator} could not be provided.', - 'AWS Batch can only provide nvidia gpu accelerators.' - ]) - - def issueBatchJob(self, command: str, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int: + raise InsufficientSystemResources( + requirer, + "accelerators", + details=[ + f"The accelerator {accelerator} could not be provided.", + "AWS Batch can only provide nvidia gpu accelerators.", + ], + ) + + def issueBatchJob( + self, + command: str, + job_desc: JobDescription, + job_environment: Optional[dict[str, str]] = None, + ) -> int: # Try the job as local local_id = self.handleLocalJob(command, job_desc) if local_id is not None: @@ -188,41 +212,54 @@ def issueBatchJob(self, command: str, job_desc: JobDescription, job_environment: # Compose a job spec to submit job_spec = { - 'jobName': job_name, - 'jobQueue': self.queue, - 'jobDefinition': self._get_or_create_job_definition(), - 'containerOverrides': { - 'command': command_list, - 'environment': [{'name': k, 'value': v} for k, v in environment.items()], - 'resourceRequirements': [ - {'type': 'MEMORY', 'value': str(max(MIN_REQUESTABLE_MIB, math.ceil(b_to_mib(job_desc.memory))))}, - {'type': 'VCPU', 'value': str(max(MIN_REQUESTABLE_CORES, math.ceil(job_desc.cores)))} - ] - } + "jobName": job_name, + "jobQueue": self.queue, + "jobDefinition": self._get_or_create_job_definition(), + "containerOverrides": { + "command": command_list, + "environment": [ + {"name": k, "value": v} for k, v in environment.items() + ], + "resourceRequirements": [ + { + "type": "MEMORY", + "value": str( + max( + MIN_REQUESTABLE_MIB, + math.ceil(b_to_mib(job_desc.memory)), + ) + ), + }, + { + "type": "VCPU", + "value": str( + max(MIN_REQUESTABLE_CORES, math.ceil(job_desc.cores)) + ), + }, + ], + }, } gpus_needed = 0 for accelerator in job_desc.accelerators: - if accelerator['kind'] == 'gpu': + if accelerator["kind"] == "gpu": # We just assume that all GPUs are equivalent when running # on AWS Batch because there's no way to tell AWS Batch to # send us to one or another. - gpus_needed += accelerator['count'] + gpus_needed += accelerator["count"] # Other accelerators are rejected by check_resource_request if gpus_needed > 0: # We need some GPUs so ask for them. - job_spec['containerOverrides']['resourceRequirements'].append({ - 'type': 'GPU', - 'value': gpus_needed - }) + job_spec["containerOverrides"]["resourceRequirements"].append( + {"type": "GPU", "value": gpus_needed} + ) if self.owner_tag: # We are meant to tag everything with an owner - job_spec['tags'] = {'Owner': self.owner_tag} - + job_spec["tags"] = {"Owner": self.owner_tag} # Launch it and get back the AWS ID that we can use to poll the task. # TODO: retry! response = self.client.submit_job(**job_spec) - aws_id = response['jobId'] + aws_id = response["jobId"] # Tie it to the numeric ID self.bs_id_to_aws_id[bs_id] = aws_id @@ -230,8 +267,10 @@ def issueBatchJob(self, command: str, job_desc: JobDescription, job_environment: if self._outbox is not None: # Specify relationship between toil batch ID and aws ID in message bus - self._outbox.publish(ExternalBatchIdMessage(bs_id, aws_id, self.__class__.__name__)) - logger.debug('Launched job: %s', job_name) + self._outbox.publish( + ExternalBatchIdMessage(bs_id, aws_id, self.__class__.__name__) + ) + logger.debug("Launched job: %s", job_name) return bs_id @@ -250,16 +289,16 @@ def _ensafen_name(input_name: str) -> str: # Do replacements to enhance readability input_name = input_name.replace(" ", "-") # Keep only acceptable characters - kept_chars = [c for c in input_name if c.isalnum() or c == '-' or c == '_'] + kept_chars = [c for c in input_name if c.isalnum() or c == "-" or c == "_"] if len(kept_chars) == 0 or not kept_chars[0].isalnum(): # Make sure we start with something alphanumeric - kept_chars = ['j'] + kept_chars + kept_chars = ["j"] + kept_chars # Keep no more than the limit of them kept_chars = kept_chars[:128] # And re-compose them into a string - return ''.join(kept_chars) + return "".join(kept_chars) - def _get_runtime(self, job_detail: Dict[str, Any]) -> Optional[float]: + def _get_runtime(self, job_detail: dict[str, Any]) -> Optional[float]: """ Internal function. Should not be called outside this class. @@ -269,20 +308,25 @@ def _get_runtime(self, job_detail: Dict[str, Any]) -> Optional[float]: Takes an AWS JobDetail as a dict. """ - if 'status' not in job_detail or job_detail['status'] not in ['STARTING', 'RUNNING', 'SUCCEEDED', 'FAILED']: + if "status" not in job_detail or job_detail["status"] not in [ + "STARTING", + "RUNNING", + "SUCCEEDED", + "FAILED", + ]: # Job is not running yet. logger.info("Runtime unavailable because job is still waiting") return None - if 'startedAt' not in job_detail: + if "startedAt" not in job_detail: # Job has no known start time logger.info("Runtime unavailable because job has no start time") return None - start_ms = job_detail['startedAt'] + start_ms = job_detail["startedAt"] - if 'stoppedAt' in job_detail: - end_ms = job_detail['stoppedAt'] + if "stoppedAt" in job_detail: + end_ms = job_detail["stoppedAt"] else: end_ms = unix_now_ms() @@ -291,7 +335,7 @@ def _get_runtime(self, job_detail: Dict[str, Any]) -> Optional[float]: # Return the time it has been running for. return runtime - def _get_exit_code(self, job_detail: Dict[str, Any]) -> int: + def _get_exit_code(self, job_detail: dict[str, Any]) -> int: """ Internal function. Should not be called outside this class. @@ -299,12 +343,18 @@ def _get_exit_code(self, job_detail: Dict[str, Any]) -> int: EXIT_STATUS_UNAVAILABLE_VALUE if it cannot be gotten. """ - return int(job_detail.get('container', {}).get('exitCode', EXIT_STATUS_UNAVAILABLE_VALUE)) + return int( + job_detail.get("container", {}).get( + "exitCode", EXIT_STATUS_UNAVAILABLE_VALUE + ) + ) def getUpdatedBatchJob(self, maxWait: int) -> Optional[UpdatedBatchJobInfo]: # Remember when we started, for respecting the timeout entry = datetime.datetime.now() - while ((datetime.datetime.now() - entry).total_seconds() < maxWait or not maxWait): + while ( + datetime.datetime.now() - entry + ).total_seconds() < maxWait or not maxWait: result = self.getUpdatedLocalJob(0) if result: return result @@ -315,9 +365,9 @@ def getUpdatedBatchJob(self, maxWait: int) -> Optional[UpdatedBatchJobInfo]: acknowledged = [] for job_detail in self._describe_jobs_in_batches(): - if job_detail.get('status') in ['SUCCEEDED', 'FAILED']: + if job_detail.get("status") in ["SUCCEEDED", "FAILED"]: # This job is done! - aws_id = job_detail['jobId'] + aws_id = job_detail["jobId"] bs_id = self.aws_id_to_bs_id[aws_id] # Acknowledge it @@ -325,7 +375,7 @@ def getUpdatedBatchJob(self, maxWait: int) -> Optional[UpdatedBatchJobInfo]: if aws_id in self.killed_job_aws_ids: # Killed jobs aren't allowed to appear as updated. - logger.debug('Job %s was killed so skipping it', bs_id) + logger.debug("Job %s was killed so skipping it", bs_id) continue # Otherwise, it stopped running and it wasn't our fault. @@ -334,21 +384,33 @@ def getUpdatedBatchJob(self, maxWait: int) -> Optional[UpdatedBatchJobInfo]: runtime = self._get_runtime(job_detail) # Determine if it succeeded - exit_reason = STATE_TO_EXIT_REASON[job_detail['status']] + exit_reason = STATE_TO_EXIT_REASON[job_detail["status"]] # Get its exit code exit_code = self._get_exit_code(job_detail) - if job_detail['status'] == 'FAILED' and 'statusReason' in job_detail: + if ( + job_detail["status"] == "FAILED" + and "statusReason" in job_detail + ): # AWS knows why the job failed, so log the error - logger.error('Job %s failed because: %s', bs_id, job_detail['statusReason']) + logger.error( + "Job %s failed because: %s", + bs_id, + job_detail["statusReason"], + ) # Compose a result - return UpdatedBatchJobInfo(jobID=bs_id, exitStatus=exit_code, wallTime=runtime, exitReason=exit_reason) + return UpdatedBatchJobInfo( + jobID=bs_id, + exitStatus=exit_code, + wallTime=runtime, + exitReason=exit_reason, + ) finally: # Drop all the records for tasks we acknowledged - for (aws_id, bs_id) in acknowledged: + for aws_id, bs_id in acknowledged: del self.aws_id_to_bs_id[aws_id] del self.bs_id_to_aws_id[bs_id] if aws_id in self.killed_job_aws_ids: @@ -357,7 +419,7 @@ def getUpdatedBatchJob(self, maxWait: int) -> Optional[UpdatedBatchJobInfo]: if maxWait: # Wait a bit and poll again - time.sleep(min(maxWait/2, 1.0)) + time.sleep(min(maxWait / 2, 1.0)) else: # Only poll once break @@ -390,7 +452,7 @@ def _try_terminate(self, aws_id: str) -> None: # later. self.killed_job_aws_ids.add(aws_id) # Kill the AWS Batch job - self.client.terminate_job(jobId=aws_id, reason='Killed by Toil') + self.client.terminate_job(jobId=aws_id, reason="Killed by Toil") @retry(errors=[ClientError]) def _wait_until_stopped(self, aws_id: str) -> None: @@ -406,16 +468,19 @@ def _wait_until_stopped(self, aws_id: str) -> None: while True: # Poll the job response = self.client.describe_jobs(jobs=[aws_id]) - jobs = response.get('jobs', []) + jobs = response.get("jobs", []) if len(jobs) == 0: # Job no longer exists at all return job = jobs[0] - if job.get('status') and job['status'] in ['SUCCEEDED', 'FAILED']: + if job.get("status") and job["status"] in ["SUCCEEDED", "FAILED"]: # The job has stopped return # Otherwise the job is still going. Wait for it to stop. - logger.info('Waiting for killed job %s to stop', self.aws_id_to_bs_id.get(aws_id, aws_id)) + logger.info( + "Waiting for killed job %s to stop", + self.aws_id_to_bs_id.get(aws_id, aws_id), + ) time.sleep(2) @retry(errors=[ClientError]) @@ -429,56 +494,76 @@ def _get_or_create_job_definition(self) -> str: if self.job_definition is None: # First work out what volume mounts to make, because the type # system is happiest this way - volumes: List[Dict[str, Union[str, Dict[str, str]]]] = [] - mount_points: List[Dict[str, str]] = [] - for i, shared_path in enumerate({ - '/var/lib/toil', - '/var/lib/docker', - '/var/lib/cwl', - '/var/run/docker.sock', - '/var/run/user', - '/tmp', - self.worker_work_dir - }): + volumes: list[dict[str, Union[str, dict[str, str]]]] = [] + mount_points: list[dict[str, str]] = [] + for i, shared_path in enumerate( + { + "/var/lib/toil", + "/var/lib/docker", + "/var/lib/cwl", + "/var/run/docker.sock", + "/var/run/user", + "/tmp", + self.worker_work_dir, + } + ): # For every path we want to be the same on the host and the # container, choose a name - vol_name = f'mnt{i}' + vol_name = f"mnt{i}" # Make a volume for that path - volumes.append({'name': vol_name, 'host': {'sourcePath': shared_path}}) + volumes.append({"name": vol_name, "host": {"sourcePath": shared_path}}) # Mount the volume at that path - mount_points.append({'containerPath': shared_path, 'sourceVolume': vol_name}) + mount_points.append( + {"containerPath": shared_path, "sourceVolume": vol_name} + ) job_def_spec = { - 'jobDefinitionName': 'toil-' + str(uuid.uuid4()), - 'type': 'container', - 'containerProperties': { - 'image': self.docker_image, - 'volumes': volumes, - 'mountPoints': mount_points, + "jobDefinitionName": "toil-" + str(uuid.uuid4()), + "type": "container", + "containerProperties": { + "image": self.docker_image, + "volumes": volumes, + "mountPoints": mount_points, # Requirements will always be overridden but must be present anyway - 'resourceRequirements': [ - {'type': 'MEMORY', 'value': str(max(MIN_REQUESTABLE_MIB, math.ceil(b_to_mib(self.config.defaultMemory))))}, - {'type': 'VCPU', 'value': str(max(MIN_REQUESTABLE_CORES, math.ceil(self.config.defaultCores)))} + "resourceRequirements": [ + { + "type": "MEMORY", + "value": str( + max( + MIN_REQUESTABLE_MIB, + math.ceil(b_to_mib(self.config.defaultMemory)), + ) + ), + }, + { + "type": "VCPU", + "value": str( + max( + MIN_REQUESTABLE_CORES, + math.ceil(self.config.defaultCores), + ) + ), + }, ], # Be privileged because we can. And we'd like Singularity # to work even if we do have the Docker socket. See # . - 'privileged': True + "privileged": True, }, - 'retryStrategy': {'attempts': 1}, - 'propagateTags': True # This will propagate to ECS task but not to job! + "retryStrategy": {"attempts": 1}, + "propagateTags": True, # This will propagate to ECS task but not to job! } if self.job_role_arn: # We need to give the job a role. # We might not be able to do much job store access without this! - container_properties = job_def_spec['containerProperties'] + container_properties = job_def_spec["containerProperties"] assert isinstance(container_properties, dict) - container_properties['jobRoleArn'] = self.job_role_arn + container_properties["jobRoleArn"] = self.job_role_arn if self.owner_tag: # We are meant to tag everything with an owner - job_def_spec['tags'] = {'Owner': self.owner_tag} + job_def_spec["tags"] = {"Owner": self.owner_tag} response = self.client.register_job_definition(**job_def_spec) - self.job_definition = response['jobDefinitionArn'] + self.job_definition = response["jobDefinitionArn"] return self.job_definition @@ -494,10 +579,10 @@ def _destroy_job_definition(self) -> None: # TODO: How do we tolerate it not existing anymore? self.job_definition = None - def getIssuedBatchJobIDs(self) -> List[int]: + def getIssuedBatchJobIDs(self) -> list[int]: return self.getIssuedLocalJobIDs() + list(self.bs_id_to_aws_id.keys()) - def _describe_jobs_in_batches(self) -> Iterator[Dict[str, Any]]: + def _describe_jobs_in_batches(self) -> Iterator[dict[str, Any]]: """ Internal function. Should not be called outside this class. @@ -506,28 +591,30 @@ def _describe_jobs_in_batches(self) -> Iterator[Dict[str, Any]]: """ # Get all the AWS IDs to poll - to_check = list(aws_and_bs_id[0] for aws_and_bs_id in self.aws_id_to_bs_id.items()) + to_check = list( + aws_and_bs_id[0] for aws_and_bs_id in self.aws_id_to_bs_id.items() + ) while len(to_check) > 0: # Go through jobs we want to poll in batches of the max size check_batch = to_check[-MAX_POLL_COUNT:] # And pop them off the end of the list of jobs to check - to_check = to_check[:-len(check_batch)] + to_check = to_check[: -len(check_batch)] # TODO: retry response = self.client.describe_jobs(jobs=check_batch) # Yield each returned JobDetail - yield from response.get('jobs', []) + yield from response.get("jobs", []) - def getRunningBatchJobIDs(self) -> Dict[int, float]: + def getRunningBatchJobIDs(self) -> dict[int, float]: # We need a dict from job_id (integer) to seconds it has been running bs_id_to_runtime = {} for job_detail in self._describe_jobs_in_batches(): - if job_detail.get('status') == 'RUNNING': + if job_detail.get("status") == "RUNNING": runtime = self._get_runtime(job_detail) - aws_id = job_detail['jobId'] + aws_id = job_detail["jobId"] bs_id = self.aws_id_to_bs_id[aws_id] if runtime: # We can measure a runtime @@ -535,12 +622,17 @@ def getRunningBatchJobIDs(self) -> Dict[int, float]: else: # If we can't find a runtime, we can't say it's running # because we can't say how long it has been running for. - logger.warning("Job %s is %s but has no runtime: %s", bs_id, job_detail['status'], job_detail) + logger.warning( + "Job %s is %s but has no runtime: %s", + bs_id, + job_detail["status"], + job_detail, + ) # Give back the times all our running jobs have been running for. return bs_id_to_runtime - def killBatchJobs(self, job_ids: List[int]) -> None: + def killBatchJobs(self, job_ids: list[int]) -> None: # Kill all the ones that are local self.killLocalJobs(job_ids) @@ -559,14 +651,31 @@ def killBatchJobs(self, job_ids: List[int]) -> None: @classmethod def add_options(cls, parser: Union[ArgumentParser, _ArgumentGroup]) -> None: - parser.add_argument("--awsBatchRegion", dest="aws_batch_region", default=None, env_var="TOIL_AWS_REGION", - help="The AWS region containing the AWS Batch queue to submit to.") - parser.add_argument("--awsBatchQueue", dest="aws_batch_queue", default=None, env_var="TOIL_AWS_BATCH_QUEUE", - help="The name or ARN of the AWS Batch queue to submit to.") - parser.add_argument("--awsBatchJobRoleArn", dest="aws_batch_job_role_arn", default=None, env_var="TOIL_AWS_BATCH_JOB_ROLE_ARN", - help=("The ARN of an IAM role to run AWS Batch jobs as, so they " - "can e.g. access a job store. Must be assumable by " - "ecs-tasks.amazonaws.com.")) + parser.add_argument( + "--awsBatchRegion", + dest="aws_batch_region", + default=None, + env_var="TOIL_AWS_REGION", + help="The AWS region containing the AWS Batch queue to submit to.", + ) + parser.add_argument( + "--awsBatchQueue", + dest="aws_batch_queue", + default=None, + env_var="TOIL_AWS_BATCH_QUEUE", + help="The name or ARN of the AWS Batch queue to submit to.", + ) + parser.add_argument( + "--awsBatchJobRoleArn", + dest="aws_batch_job_role_arn", + default=None, + env_var="TOIL_AWS_BATCH_JOB_ROLE_ARN", + help=( + "The ARN of an IAM role to run AWS Batch jobs as, so they " + "can e.g. access a job store. Must be assumable by " + "ecs-tasks.amazonaws.com." + ), + ) @classmethod def setOptions(cls, setOption: OptionSetter) -> None: diff --git a/src/toil/batchSystems/cleanup_support.py b/src/toil/batchSystems/cleanup_support.py index 4506e47480..b9d3d06e6b 100644 --- a/src/toil/batchSystems/cleanup_support.py +++ b/src/toil/batchSystems/cleanup_support.py @@ -13,16 +13,16 @@ # limitations under the License. import logging from types import TracebackType -from typing import Any, ContextManager, List, Optional, Type +from typing import Any, ContextManager, Optional -from toil.batchSystems.abstractBatchSystem import (BatchSystemSupport, - WorkerCleanupInfo) +from toil.batchSystems.abstractBatchSystem import BatchSystemSupport, WorkerCleanupInfo from toil.batchSystems.local_support import BatchSystemLocalSupport from toil.common import Config, Toil from toil.lib.threading import LastProcessStandingArena logger = logging.getLogger(__name__) + class BatchSystemCleanupSupport(BatchSystemLocalSupport): """ Adds cleanup support when the last running job leaves a node, for batch @@ -33,7 +33,7 @@ class BatchSystemCleanupSupport(BatchSystemLocalSupport): def supportsWorkerCleanup(cls) -> bool: return True - def getWorkerContexts(self) -> List[ContextManager[Any]]: + def getWorkerContexts(self) -> list[ContextManager[Any]]: # Tell worker to register for and invoke cleanup # Create a context manager that has a copy of our cleanup info @@ -44,9 +44,12 @@ def getWorkerContexts(self) -> List[ContextManager[Any]]: contexts.append(context) return contexts - def __init__(self, config: Config, maxCores: float, maxMemory: int, maxDisk: int) -> None: + def __init__( + self, config: Config, maxCores: float, maxMemory: int, maxDisk: int + ) -> None: super().__init__(config, maxCores, maxMemory, maxDisk) + class WorkerCleanupContext: """ Context manager used by :class:`BatchSystemCleanupSupport` to implement @@ -63,7 +66,6 @@ def __init__(self, workerCleanupInfo: WorkerCleanupInfo) -> None: the last to exit the context manager. """ - self.workerCleanupInfo = workerCleanupInfo # Don't set self.arena or MyPy will be upset that sometimes it doesn't have the right type. @@ -71,27 +73,31 @@ def __enter__(self) -> None: # Set up an arena so we know who is the last worker to leave self.arena = LastProcessStandingArena( Toil.get_toil_coordination_dir( - self.workerCleanupInfo.work_dir, - self.workerCleanupInfo.coordination_dir + self.workerCleanupInfo.work_dir, self.workerCleanupInfo.coordination_dir ), - Toil.get_workflow_path_component(self.workerCleanupInfo.workflow_id) + "-cleanup" + Toil.get_workflow_path_component(self.workerCleanupInfo.workflow_id) + + "-cleanup", ) - logger.debug('Entering cleanup arena') + logger.debug("Entering cleanup arena") self.arena.enter() - logger.debug('Cleanup arena entered') + logger.debug("Cleanup arena entered") # This is exactly the signature MyPy demands. # Also, it demands we not say we can return a bool if we return False # always, because it can be smarter about reachability if it knows what # context managers never eat exceptions. So it decides any context manager # that is always falsey but claims to return a bool is an error. - def __exit__(self, type: Optional[Type[BaseException]], value: Optional[BaseException], traceback: Optional[TracebackType]) -> None: - logger.debug('Leaving cleanup arena') + def __exit__( + self, + type: Optional[type[BaseException]], + value: Optional[BaseException], + traceback: Optional[TracebackType], + ) -> None: + logger.debug("Leaving cleanup arena") for _ in self.arena.leave(): # We are the last concurrent worker to finish. # Do batch system cleanup. - logger.debug('Cleaning up worker') + logger.debug("Cleaning up worker") BatchSystemSupport.workerCleanup(self.workerCleanupInfo) # Now the coordination_dir is allowed to no longer exist on the node. - logger.debug('Cleanup arena left') - + logger.debug("Cleanup arena left") diff --git a/src/toil/batchSystems/contained_executor.py b/src/toil/batchSystems/contained_executor.py index 0859bb8e02..9b554cf16e 100644 --- a/src/toil/batchSystems/contained_executor.py +++ b/src/toil/batchSystems/contained_executor.py @@ -22,7 +22,7 @@ import pickle import subprocess import sys -from typing import Any, Dict, List, Optional +from typing import Any, Optional from toil.batchSystems.abstractBatchSystem import EXIT_STATUS_UNAVAILABLE_VALUE from toil.resource import Resource @@ -31,9 +31,13 @@ logger = logging.getLogger(__name__) -def pack_job(command: str, user_script: Optional[Resource] = None, environment: Optional[Dict[str, str]] = None) -> List[str]: +def pack_job( + command: str, + user_script: Optional[Resource] = None, + environment: Optional[dict[str, str]] = None, +) -> list[str]: """ - Create a command that runs the given command in an environment. + Create a command that runs the given command in an environment. :param command: Worker command to run to run the job. :param user_script: User script that will be loaded before the job is run. @@ -45,19 +49,21 @@ def pack_job(command: str, user_script: Optional[Resource] = None, environment: """ # Make a job dict to send to the executor. # TODO: Factor out executor setup from here and Kubernetes and TES - job: Dict[str, Any] = {"command": command} + job: dict[str, Any] = {"command": command} if user_script is not None: # If there's a user script resource be sure to send it along - job['userScript'] = user_script + job["userScript"] = user_script if environment is not None: # We also may have an environment to send. - job['environment'] = environment + job["environment"] = environment # Encode it in a form we can send in a command-line argument. Pickle in # the highest protocol to prevent mixed-Python-version workflows from # trying to work. Make sure it is text so we can ship it via JSON. - encoded_job = base64.b64encode(pickle.dumps(job, pickle.HIGHEST_PROTOCOL)).decode('utf-8') + encoded_job = base64.b64encode(pickle.dumps(job, pickle.HIGHEST_PROTOCOL)).decode( + "utf-8" + ) # Make a command to run it in the executor - command_list = ['_toil_contained_executor', encoded_job] + command_list = ["_toil_contained_executor", encoded_job] return command_list @@ -81,53 +87,51 @@ def executor() -> None: exit_code = EXIT_STATUS_UNAVAILABLE_VALUE if len(sys.argv) != 2: - logger.error('Executor requires exactly one base64-encoded argument') + logger.error("Executor requires exactly one base64-encoded argument") sys.exit(exit_code) # Take in a base64-encoded pickled dict as our first argument and decode it try: # Make sure to encode the text arguments to bytes before base 64 decoding - job = pickle.loads(base64.b64decode(sys.argv[1].encode('utf-8'))) + job = pickle.loads(base64.b64decode(sys.argv[1].encode("utf-8"))) except: exc_info = sys.exc_info() - logger.error('Exception while unpickling task: ', exc_info=exc_info) + logger.error("Exception while unpickling task: ", exc_info=exc_info) sys.exit(exit_code) - if 'environment' in job: + if "environment" in job: # Adopt the job environment into the executor. # This lets us use things like TOIL_WORKDIR when figuring out how to talk to other executors. - logger.debug('Adopting environment: %s', str(job['environment'].keys())) - for var, value in job['environment'].items(): + logger.debug("Adopting environment: %s", str(job["environment"].keys())) + for var, value in job["environment"].items(): os.environ[var] = value # Set JTRES_ROOT and other global state needed for resource # downloading/deployment to work. # TODO: Every worker downloads resources independently. # We should have a way to share a resource directory. - logger.debug('Preparing system for resource download') + logger.debug("Preparing system for resource download") Resource.prepareSystem() try: - if 'userScript' in job: - job['userScript'].register() + if "userScript" in job: + job["userScript"].register() # Start the child process - logger.debug("Invoking command: '%s'", job['command']) - child = subprocess.Popen(job['command'], - preexec_fn=lambda: os.setpgrp(), - shell=True) + logger.debug("Invoking command: '%s'", job["command"]) + child = subprocess.Popen( + job["command"], preexec_fn=lambda: os.setpgrp(), shell=True + ) # Reproduce child's exit code exit_code = child.wait() except: # This will print a traceback for us, since exit() in the finally # will bypass the normal way of getting one. - logger.exception('Encountered exception running child') + logger.exception("Encountered exception running child") finally: - logger.debug('Cleaning up resources') + logger.debug("Cleaning up resources") # TODO: Change resource system to use a shared resource directory for everyone. # Then move this into worker cleanup somehow Resource.cleanSystem() - logger.debug('Shutting down') + logger.debug("Shutting down") sys.exit(exit_code) - - diff --git a/src/toil/batchSystems/gridengine.py b/src/toil/batchSystems/gridengine.py index 17fe1838c7..5666104774 100644 --- a/src/toil/batchSystems/gridengine.py +++ b/src/toil/batchSystems/gridengine.py @@ -17,10 +17,11 @@ import shlex import time from shlex import quote -from typing import Dict, List, Optional +from typing import Optional -from toil.batchSystems.abstractGridEngineBatchSystem import \ - AbstractGridEngineBatchSystem +from toil.batchSystems.abstractGridEngineBatchSystem import ( + AbstractGridEngineBatchSystem, +) from toil.lib.misc import CalledProcessErrorStderr, call_command logger = logging.getLogger(__name__) @@ -32,33 +33,38 @@ class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread): """ Grid Engine-specific AbstractGridEngineWorker methods """ + def getRunningJobIDs(self): times = {} with self.runningJobsLock: currentjobs = {str(self.batchJobIDs[x][0]): x for x in self.runningJobs} stdout = call_command(["qstat"]) - for currline in stdout.split('\n'): + for currline in stdout.split("\n"): items = currline.strip().split() if items: - if items[0] in currentjobs and items[4] == 'r': + if items[0] in currentjobs and items[4] == "r": jobstart = " ".join(items[5:7]) - jobstart = time.mktime(time.strptime(jobstart, "%m/%d/%Y %H:%M:%S")) + jobstart = time.mktime( + time.strptime(jobstart, "%m/%d/%Y %H:%M:%S") + ) times[currentjobs[items[0]]] = time.time() - jobstart return times def killJob(self, jobID): - call_command(['qdel', self.getBatchSystemID(jobID)]) - - def prepareSubmission(self, - cpu: int, - memory: int, - jobID: int, - command: str, - jobName: str, - job_environment: Optional[Dict[str, str]] = None, - gpus: Optional[int] = None): + call_command(["qdel", self.getBatchSystemID(jobID)]) + + def prepareSubmission( + self, + cpu: int, + memory: int, + jobID: int, + command: str, + jobName: str, + job_environment: Optional[dict[str, str]] = None, + gpus: Optional[int] = None, + ): # POSIX qsub # # expects a single script argument, which is supposed to be a file. @@ -67,11 +73,13 @@ def prepareSubmission(self, # hope that the qsub we are using is clever enough to forward along # arguments. Otherwise, some qsubs will go looking for the full # Toil command string as a file. - return self.prepareQsub(cpu, memory, jobID, job_environment) + shlex.split(command) + return self.prepareQsub(cpu, memory, jobID, job_environment) + shlex.split( + command + ) def submitJob(self, subLine): stdout = call_command(subLine) - output = stdout.split('\n')[0].strip() + output = stdout.split("\n")[0].strip() result = int(output) return result @@ -84,8 +92,8 @@ def getJobExitCode(self, sgeJobID): """ # the task is set as part of the job ID if using getBatchSystemID() job, task = (sgeJobID, None) - if '.' in sgeJobID: - job, task = sgeJobID.split('.', 1) + if "." in sgeJobID: + job, task = sgeJobID.split(".", 1) assert task is None, "task ids not currently support by qstat logic below" # First try qstat to see if job is still running, if not get the @@ -101,66 +109,94 @@ def getJobExitCode(self, sgeJobID): if task is not None: args.extend(["-t", str(task)]) stdout = call_command(args) - for line in stdout.split('\n'): + for line in stdout.split("\n"): if line.startswith("failed") and int(line.split()[1]) == 1: return 1 elif line.startswith("exit_status"): - logger.debug('Exit Status: %r', line.split()[1]) + logger.debug("Exit Status: %r", line.split()[1]) return int(line.split()[1]) return None """ Implementation-specific helper methods """ - def prepareQsub(self, - cpu: int, - mem: int, - jobID: int, - job_environment: Optional[Dict[str, str]] = None) -> List[str]: - qsubline = ['qsub', '-V', '-b', 'y', '-terse', '-j', 'y', '-cwd', - '-N', 'toil_job_' + str(jobID)] + + def prepareQsub( + self, + cpu: int, + mem: int, + jobID: int, + job_environment: Optional[dict[str, str]] = None, + ) -> list[str]: + qsubline = [ + "qsub", + "-V", + "-b", + "y", + "-terse", + "-j", + "y", + "-cwd", + "-N", + "toil_job_" + str(jobID), + ] environment = self.boss.environment.copy() if job_environment: environment.update(job_environment) if environment: - qsubline.append('-v') - qsubline.append(','.join(k + '=' + quote(os.environ[k] if v is None else v) - for k, v in environment.items())) + qsubline.append("-v") + qsubline.append( + ",".join( + k + "=" + quote(os.environ[k] if v is None else v) + for k, v in environment.items() + ) + ) reqline = list() - sgeArgs = os.getenv('TOIL_GRIDENGINE_ARGS') + sgeArgs = os.getenv("TOIL_GRIDENGINE_ARGS") if mem is not None: - memStr = str(mem // 1024) + 'K' + memStr = str(mem // 1024) + "K" if not self.boss.config.manualMemArgs: # for UGE instead of SGE; see #2309 - reqline += ['vf=' + memStr, 'h_vmem=' + memStr] + reqline += ["vf=" + memStr, "h_vmem=" + memStr] elif self.boss.config.manualMemArgs and not sgeArgs: - raise ValueError("--manualMemArgs set to True, but TOIL_GRIDGENGINE_ARGS is not set." - "Please set TOIL_GRIDGENGINE_ARGS to specify memory allocation for " - "your system. Default adds the arguments: vf= h_vmem= " - "to qsub.") + raise ValueError( + "--manualMemArgs set to True, but TOIL_GRIDGENGINE_ARGS is not set." + "Please set TOIL_GRIDGENGINE_ARGS to specify memory allocation for " + "your system. Default adds the arguments: vf= h_vmem= " + "to qsub." + ) if len(reqline) > 0: - qsubline.extend(['-hard', '-l', ','.join(reqline)]) + qsubline.extend(["-hard", "-l", ",".join(reqline)]) if sgeArgs: sgeArgs = sgeArgs.split() for arg in sgeArgs: if arg.startswith(("vf=", "h_vmem=", "-pe")): - raise ValueError("Unexpected CPU, memory or pe specifications in TOIL_GRIDGENGINE_ARGs: %s" % arg) + raise ValueError( + "Unexpected CPU, memory or pe specifications in TOIL_GRIDGENGINE_ARGs: %s" + % arg + ) qsubline.extend(sgeArgs) # If cpu == 1 (or None) then don't add PE env variable to the qsub command. # This will allow for use of the serial queue for these jobs. - if (os.getenv('TOIL_GRIDENGINE_PE') is not None) and (cpu is not None) and (cpu > 1) : + if ( + (os.getenv("TOIL_GRIDENGINE_PE") is not None) + and (cpu is not None) + and (cpu > 1) + ): peCpu = int(math.ceil(cpu)) - qsubline.extend(['-pe', os.getenv('TOIL_GRIDENGINE_PE'), str(peCpu)]) + qsubline.extend(["-pe", os.getenv("TOIL_GRIDENGINE_PE"), str(peCpu)]) elif (cpu is not None) and (cpu > 1): - raise RuntimeError("must specify PE in TOIL_GRIDENGINE_PE environment variable when using multiple CPUs. " - "Run qconf -spl and your local documentation for possible values") - - stdoutfile: str = self.boss.format_std_out_err_path(jobID, '$JOB_ID', 'out') - stderrfile: str = self.boss.format_std_out_err_path(jobID, '$JOB_ID', 'err') - qsubline.extend(['-o', stdoutfile, '-e', stderrfile]) + raise RuntimeError( + "must specify PE in TOIL_GRIDENGINE_PE environment variable when using multiple CPUs. " + "Run qconf -spl and your local documentation for possible values" + ) + + stdoutfile: str = self.boss.format_std_out_err_path(jobID, "$JOB_ID", "out") + stderrfile: str = self.boss.format_std_out_err_path(jobID, "$JOB_ID", "err") + qsubline.extend(["-o", stdoutfile, "-e", stderrfile]) return qsubline diff --git a/src/toil/batchSystems/htcondor.py b/src/toil/batchSystems/htcondor.py index 17b718bb75..33a36446d1 100644 --- a/src/toil/batchSystems/htcondor.py +++ b/src/toil/batchSystems/htcondor.py @@ -18,12 +18,13 @@ import time from contextlib import contextmanager from threading import Lock -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Optional import htcondor -from toil.batchSystems.abstractGridEngineBatchSystem import \ - AbstractGridEngineBatchSystem +from toil.batchSystems.abstractGridEngineBatchSystem import ( + AbstractGridEngineBatchSystem, +) from toil.job import AcceleratorRequirement from toil.lib.retry import retry @@ -40,11 +41,14 @@ # *Command to run* (swapped with unit name) # Environment dict for the job # Accelerator requirements for the job -JobTuple = Tuple[int, int, int, int, str, str, Dict[str, str], List[AcceleratorRequirement]] +JobTuple = tuple[ + int, int, int, int, str, str, dict[str, str], list[AcceleratorRequirement] +] # We have one global lock to control access to the HTCondor scheduler schedd_lock = Lock() + class HTCondorBatchSystem(AbstractGridEngineBatchSystem): # When using HTCondor, the Schedd handles scheduling @@ -59,15 +63,31 @@ def createJobs(self, newJob: JobTuple) -> bool: self.waitingJobs.append(newJob) # Queue jobs as necessary: - while len(self.waitingJobs) > 0 and len(self.runningJobs) < int(self.boss.config.max_jobs): + while len(self.waitingJobs) > 0 and len(self.runningJobs) < int( + self.boss.config.max_jobs + ): activity = True - jobID, cpu, memory, disk, jobName, command, environment, accelerators = self.waitingJobs.pop(0) + ( + jobID, + cpu, + memory, + disk, + jobName, + command, + environment, + accelerators, + ) = self.waitingJobs.pop(0) if accelerators: - logger.warning('Scheduling job %s without enforcing accelerator requirement', jobID) + logger.warning( + "Scheduling job %s without enforcing accelerator requirement", + jobID, + ) # Prepare the htcondor.Submit object - submitObj: htcondor.Submit = self.prepareSubmission(cpu, memory, disk, jobID, jobName, command, environment) + submitObj: htcondor.Submit = self.prepareSubmission( + cpu, memory, disk, jobID, jobName, command, environment + ) logger.debug("Submitting %r", submitObj) # Submit job and get batch system ID (i.e. the ClusterId) @@ -85,13 +105,22 @@ def createJobs(self, newJob: JobTuple) -> bool: return activity - def prepareSubmission(self, cpu: int, memory: int, disk: int, jobID: int, jobName: str, command: str, environment: Dict[str, str]) -> htcondor.Submit: + def prepareSubmission( + self, + cpu: int, + memory: int, + disk: int, + jobID: int, + jobName: str, + command: str, + environment: dict[str, str], + ) -> htcondor.Submit: # Note that we don't yet take the accelerators here. # Convert resource requests - cpu = int(math.ceil(cpu)) # integer CPUs - ht_memory = float(memory)/1024 # memory in KB - ht_disk = float(disk)/1024 # disk in KB + cpu = int(math.ceil(cpu)) # integer CPUs + ht_memory = float(memory) / 1024 # memory in KB + ht_disk = float(disk) / 1024 # disk in KB # NOTE: format_std_out_err_path() by default puts files in the Toil # work directory, which defaults to being in the system temporary @@ -101,41 +130,54 @@ def prepareSubmission(self, cpu: int, memory: int, disk: int, jobID: int, jobNam # = Yes in the submit file, so that HTCondor will write the # standard output/error files on the compute node, then transfer # back once the job has completed. - stdoutfile: str = self.boss.format_std_out_err_path(jobID, '$(cluster)', 'out') - stderrfile: str = self.boss.format_std_out_err_path(jobID, '$(cluster)', 'err') - condorlogfile: str = self.boss.format_std_out_err_path(jobID, '$(cluster)', 'events') + stdoutfile: str = self.boss.format_std_out_err_path( + jobID, "$(cluster)", "out" + ) + stderrfile: str = self.boss.format_std_out_err_path( + jobID, "$(cluster)", "err" + ) + condorlogfile: str = self.boss.format_std_out_err_path( + jobID, "$(cluster)", "events" + ) # Execute the entire command as /bin/sh -c "command" # TODO: Transfer the jobStore directory if using a local file store with a relative path. submit_parameters = { - 'executable': '/bin/sh', - 'transfer_executable': 'False', - 'arguments': f'''"-c '{self.duplicate_quotes(command)}'"'''.encode(), # Workaround for HTCondor Python bindings Unicode conversion bug - 'environment': self.getEnvString(environment), - 'getenv': 'True', - 'should_transfer_files': 'Yes', # See note above for stdoutfile, stderrfile - 'output': stdoutfile, - 'error': stderrfile, - 'log': condorlogfile, - 'request_cpus': f'{cpu}', - 'request_memory': f'{ht_memory:.3f}KB', - 'request_disk': f'{ht_disk:.3f}KB', - 'leave_in_queue': '(JobStatus == 4)', - '+IsToilJob': 'True', - '+ToilJobID': f'{jobID}', - '+ToilJobName': f'"{jobName}"', - '+ToilJobKilled': 'False', + "executable": "/bin/sh", + "transfer_executable": "False", + "arguments": f'''"-c '{self.duplicate_quotes(command)}'"'''.encode(), # Workaround for HTCondor Python bindings Unicode conversion bug + "environment": self.getEnvString(environment), + "getenv": "True", + "should_transfer_files": "Yes", # See note above for stdoutfile, stderrfile + "output": stdoutfile, + "error": stderrfile, + "log": condorlogfile, + "request_cpus": f"{cpu}", + "request_memory": f"{ht_memory:.3f}KB", + "request_disk": f"{ht_disk:.3f}KB", + "leave_in_queue": "(JobStatus == 4)", + "+IsToilJob": "True", + "+ToilJobID": f"{jobID}", + "+ToilJobName": f'"{jobName}"', + "+ToilJobKilled": "False", } # Extra parameters for HTCondor - extra_parameters = os.getenv('TOIL_HTCONDOR_PARAMS') + extra_parameters = os.getenv("TOIL_HTCONDOR_PARAMS") if extra_parameters is not None: - logger.debug(f"Extra HTCondor parameters added to submit file from TOIL_HTCONDOR_PARAMS env. variable: {extra_parameters}") - for parameter, value in [parameter_value.split('=', 1) for parameter_value in extra_parameters.split(';')]: + logger.debug( + f"Extra HTCondor parameters added to submit file from TOIL_HTCONDOR_PARAMS env. variable: {extra_parameters}" + ) + for parameter, value in [ + parameter_value.split("=", 1) + for parameter_value in extra_parameters.split(";") + ]: parameter = parameter.strip() value = value.strip() if parameter in submit_parameters: - raise ValueError(f"Some extra parameters are incompatible: {extra_parameters}") + raise ValueError( + f"Some extra parameters are incompatible: {extra_parameters}" + ) submit_parameters[parameter] = value @@ -156,23 +198,24 @@ def submitJob(self, submitObj): def getRunningJobIDs(self): # Get all Toil jobs that are running - requirements = '(JobStatus == 2) && (IsToilJob)' - projection = ['ClusterId', 'ToilJobID', 'EnteredCurrentStatus'] + requirements = "(JobStatus == 2) && (IsToilJob)" + projection = ["ClusterId", "ToilJobID", "EnteredCurrentStatus"] with self.connectSchedd() as schedd: - ads = schedd.xquery(requirements = requirements, - projection = projection) + ads = schedd.xquery(requirements=requirements, projection=projection) # Only consider the Toil jobs that are part of this workflow - batchJobIDs = [batchJobID for (batchJobID, task) in self.batchJobIDs.values()] + batchJobIDs = [ + batchJobID for (batchJobID, task) in self.batchJobIDs.values() + ] job_runtimes = {} for ad in ads: - batchJobID = int(ad['ClusterId']) - jobID = int(ad['ToilJobID']) + batchJobID = int(ad["ClusterId"]) + jobID = int(ad["ToilJobID"]) if not (batchJobID in batchJobIDs): continue # HTCondor stores the start of the runtime as a Unix timestamp - runtime = time.time() - ad['EnteredCurrentStatus'] + runtime = time.time() - ad["EnteredCurrentStatus"] job_runtimes[jobID] = runtime return job_runtimes @@ -183,28 +226,33 @@ def killJob(self, jobID): # Set the job to be killed when its exit status is checked with self.connectSchedd() as schedd: - job_spec = f'(ClusterId == {batchJobID})' - schedd.edit(job_spec, 'ToilJobKilled', 'True') + job_spec = f"(ClusterId == {batchJobID})" + schedd.edit(job_spec, "ToilJobKilled", "True") def getJobExitCode(self, batchJobID): logger.debug(f"Getting exit code for HTCondor job {batchJobID}") status = { - 1: 'Idle', - 2: 'Running', - 3: 'Removed', - 4: 'Completed', - 5: 'Held', - 6: 'Transferring Output', - 7: 'Suspended' + 1: "Idle", + 2: "Running", + 3: "Removed", + 4: "Completed", + 5: "Held", + 6: "Transferring Output", + 7: "Suspended", } - requirements = f'(ClusterId == {batchJobID})' - projection = ['JobStatus', 'ToilJobKilled', 'ExitCode', - 'HoldReason', 'HoldReasonSubCode'] + requirements = f"(ClusterId == {batchJobID})" + projection = [ + "JobStatus", + "ToilJobKilled", + "ExitCode", + "HoldReason", + "HoldReasonSubCode", + ] with self.connectSchedd() as schedd: - ads = schedd.xquery(requirements = requirements, projection = projection) + ads = schedd.xquery(requirements=requirements, projection=projection) # Make sure a ClassAd was returned try: @@ -214,7 +262,8 @@ def getJobExitCode(self, batchJobID): ad = ads.next() except StopIteration: logger.error( - f"No HTCondor ads returned using constraint: {requirements}") + f"No HTCondor ads returned using constraint: {requirements}" + ) raise # Make sure only one ClassAd was returned @@ -227,40 +276,49 @@ def getJobExitCode(self, batchJobID): pass else: logger.warning( - f"Multiple HTCondor ads returned using constraint: {requirements}") + f"Multiple HTCondor ads returned using constraint: {requirements}" + ) - if ad['ToilJobKilled']: + if ad["ToilJobKilled"]: logger.debug(f"HTCondor job {batchJobID} was killed by Toil") # Remove the job from the Schedd and return 1 - job_spec = f'ClusterId == {batchJobID}' + job_spec = f"ClusterId == {batchJobID}" schedd.act(htcondor.JobAction.Remove, job_spec) return 1 - elif status[ad['JobStatus']] == 'Completed': - logger.debug("HTCondor job {} completed with exit code {}".format( - batchJobID, ad['ExitCode'])) + elif status[ad["JobStatus"]] == "Completed": + logger.debug( + "HTCondor job {} completed with exit code {}".format( + batchJobID, ad["ExitCode"] + ) + ) # Remove the job from the Schedd and return its exit code - job_spec = f'ClusterId == {batchJobID}' + job_spec = f"ClusterId == {batchJobID}" schedd.act(htcondor.JobAction.Remove, job_spec) - return int(ad['ExitCode']) + return int(ad["ExitCode"]) - elif status[ad['JobStatus']] == 'Held': - logger.error("HTCondor job {} was held: '{} (sub code {})'".format( - batchJobID, ad['HoldReason'], ad['HoldReasonSubCode'])) + elif status[ad["JobStatus"]] == "Held": + logger.error( + "HTCondor job {} was held: '{} (sub code {})'".format( + batchJobID, ad["HoldReason"], ad["HoldReasonSubCode"] + ) + ) # Remove the job from the Schedd and return 1 - job_spec = f'ClusterId == {batchJobID}' + job_spec = f"ClusterId == {batchJobID}" schedd.act(htcondor.JobAction.Remove, job_spec) return 1 - else: # Job still running or idle or doing something else - logger.debug("HTCondor job {} has not completed (Status: {})".format( - batchJobID, status[ad['JobStatus']])) + else: # Job still running or idle or doing something else + logger.debug( + "HTCondor job {} has not completed (Status: {})".format( + batchJobID, status[ad["JobStatus"]] + ) + ) return None - """ Implementation-specific helper methods """ @@ -294,7 +352,7 @@ def _ping_scheduler(self, schedd: Any) -> None: """ Ping the scheduler, or fail if it persistently cannot be contacted. """ - schedd.xquery(limit = 0) + schedd.xquery(limit=0) @retry(errors=[htcondor.HTCondorIOError]) def _get_schedd_address(self) -> Optional[str]: @@ -305,8 +363,8 @@ def _get_schedd_address(self) -> Optional[str]: """ # TODO: Memoize? Or is the collector meant to field every request? - condor_host = os.getenv('TOIL_HTCONDOR_COLLECTOR') - schedd_name = os.getenv('TOIL_HTCONDOR_SCHEDD') + condor_host = os.getenv("TOIL_HTCONDOR_COLLECTOR") + schedd_name = os.getenv("TOIL_HTCONDOR_SCHEDD") # Get the scheduler's address, if not local schedd_ad: Optional[str] = None @@ -315,17 +373,22 @@ def _get_schedd_address(self) -> Optional[str]: if condor_host and schedd_name: logger.debug( "Connecting to HTCondor Schedd {} using Collector at {}".format( - schedd_name, condor_host)) + schedd_name, condor_host + ) + ) try: schedd_ad = htcondor.Collector(condor_host).locate( - htcondor.DaemonTypes.Schedd, schedd_name) + htcondor.DaemonTypes.Schedd, schedd_name + ) except OSError: logger.error( - f"Could not connect to HTCondor Collector at {condor_host}") + f"Could not connect to HTCondor Collector at {condor_host}" + ) raise except ValueError: logger.error( - f"Could not find HTCondor Schedd with name {schedd_name}") + f"Could not find HTCondor Schedd with name {schedd_name}" + ) raise else: # Otherwise assume the Schedd is on the local machine @@ -359,7 +422,7 @@ def duplicate_quotes(self, value: str) -> str: """ return value.replace("'", "''").replace('"', '""') - def getEnvString(self, overrides: Dict[str, str]) -> str: + def getEnvString(self, overrides: dict[str, str]) -> str: """ Build an environment string that a HTCondor Submit object can use. @@ -384,10 +447,12 @@ def getEnvString(self, overrides: Dict[str, str]) -> str: # The entire string should be encapsulated in double quotes # Each variable should be separated by a single space - return '"' + ' '.join(env_items) + '"' + return '"' + " ".join(env_items) + '"' # Override the issueBatchJob method so HTCondor can be given the disk request - def issueBatchJob(self, command: str, jobNode, job_environment: Optional[Dict[str, str]] = None): + def issueBatchJob( + self, command: str, jobNode, job_environment: Optional[dict[str, str]] = None + ): # Avoid submitting internal jobs to the batch queue, handle locally localID = self.handleLocalJob(command, jobNode) if localID is not None: @@ -398,7 +463,19 @@ def issueBatchJob(self, command: str, jobNode, job_environment: Optional[Dict[st self.currentJobs.add(jobID) # Construct our style of job tuple - self.newJobsQueue.put((jobID, jobNode.cores, jobNode.memory, jobNode.disk, jobNode.jobName, command, - job_environment or {}, jobNode.accelerators)) - logger.debug("Issued the job command: %s with job id: %s ", command, str(jobID)) + self.newJobsQueue.put( + ( + jobID, + jobNode.cores, + jobNode.memory, + jobNode.disk, + jobNode.jobName, + command, + job_environment or {}, + jobNode.accelerators, + ) + ) + logger.debug( + "Issued the job command: %s with job id: %s ", command, str(jobID) + ) return jobID diff --git a/src/toil/batchSystems/kubernetes.py b/src/toil/batchSystems/kubernetes.py index 0fdeeb51d7..daa7ea50ae 100644 --- a/src/toil/batchSystems/kubernetes.py +++ b/src/toil/batchSystems/kubernetes.py @@ -30,22 +30,10 @@ import time import uuid from argparse import ArgumentParser, _ArgumentGroup +from collections.abc import Iterator from queue import Empty, Queue from threading import Condition, Event, RLock, Thread -from typing import (Any, - Callable, - Dict, - Iterator, - List, - Literal, - Optional, - Set, - Tuple, - Type, - TypeVar, - Union, - cast, - overload) +from typing import Any, Callable, Literal, Optional, TypeVar, Union, cast, overload from toil.lib.conversions import opt_strtobool @@ -53,72 +41,79 @@ from typing_extensions import ParamSpec else: from typing import ParamSpec -if sys.version_info >= (3, 8): - from typing import Protocol, TypedDict, runtime_checkable + +if sys.version_info < (3, 11): + from typing_extensions import NotRequired else: - from typing_extensions import Protocol, TypedDict, runtime_checkable -# TODO: When this gets into the standard library, get it from there and drop + from typing import NotRequired + +from typing import Protocol, TypedDict, runtime_checkable + import urllib3 import yaml + # The Right Way to use the Kubernetes module is to `import kubernetes` and then you get all your stuff as like ApiClient. But this doesn't work for the stubs: the stubs seem to only support importing things from the internal modules in `kubernetes` where they are actually defined. See for example . So we just import all the things we use into our global namespace here. -from kubernetes.client import (BatchV1Api, - CoreV1Api, - CustomObjectsApi, - V1Affinity, - V1Container, - V1ContainerStatus, - V1EmptyDirVolumeSource, - V1HostPathVolumeSource, - V1Job, - V1JobCondition, - V1JobSpec, - V1NodeAffinity, - V1NodeSelector, - V1NodeSelectorRequirement, - V1NodeSelectorTerm, - V1ObjectMeta, - V1Pod, - V1PodSpec, - V1PodTemplateSpec, - V1PreferredSchedulingTerm, - V1ResourceRequirements, - V1SecretVolumeSource, - V1Toleration, - V1Volume, - V1VolumeMount, V1SecurityContext) +from kubernetes.client import ( + BatchV1Api, + CoreV1Api, + CustomObjectsApi, + V1Affinity, + V1Container, + V1ContainerStatus, + V1EmptyDirVolumeSource, + V1HostPathVolumeSource, + V1Job, + V1JobCondition, + V1JobSpec, + V1NodeAffinity, + V1NodeSelector, + V1NodeSelectorRequirement, + V1NodeSelectorTerm, + V1ObjectMeta, + V1Pod, + V1PodSpec, + V1PodTemplateSpec, + V1PreferredSchedulingTerm, + V1ResourceRequirements, + V1SecretVolumeSource, + V1SecurityContext, + V1Toleration, + V1Volume, + V1VolumeMount, +) from kubernetes.client.api_client import ApiClient from kubernetes.client.exceptions import ApiException from kubernetes.config.config_exception import ConfigException from kubernetes.config.incluster_config import load_incluster_config -from kubernetes.config.kube_config import (list_kube_config_contexts, - load_kube_config) +from kubernetes.config.kube_config import list_kube_config_contexts, load_kube_config + # TODO: Watch API is not typed yet from kubernetes.watch import Watch # type: ignore -# typing-extensions dependency on Pythons that are new enough. -from typing_extensions import NotRequired from toil import applianceSelf -from toil.batchSystems.abstractBatchSystem import (EXIT_STATUS_UNAVAILABLE_VALUE, - BatchJobExitReason, - InsufficientSystemResources, - ResourcePool, - UpdatedBatchJobInfo) +from toil.batchSystems.abstractBatchSystem import ( + EXIT_STATUS_UNAVAILABLE_VALUE, + BatchJobExitReason, + InsufficientSystemResources, + ResourcePool, + UpdatedBatchJobInfo, +) from toil.batchSystems.cleanup_support import BatchSystemCleanupSupport from toil.batchSystems.contained_executor import pack_job from toil.batchSystems.options import OptionSetter from toil.common import Config, Toil -from toil.options.common import SYS_MAX_SIZE from toil.job import JobDescription, Requirer from toil.lib.conversions import human2bytes from toil.lib.misc import get_user_name, slow_down, utc_now from toil.lib.retry import ErrorCondition, retry +from toil.options.common import SYS_MAX_SIZE from toil.resource import Resource logger = logging.getLogger(__name__) -retryable_kubernetes_errors: List[Union[Type[Exception], ErrorCondition]] = [ +retryable_kubernetes_errors: list[Union[type[Exception], ErrorCondition]] = [ urllib3.exceptions.MaxRetryError, urllib3.exceptions.ProtocolError, - ApiException + ApiException, ] @@ -132,8 +127,10 @@ def is_retryable_kubernetes_error(e: Exception) -> bool: return True return False + # Represents a collection of label or taint keys and their sets of acceptable (or unacceptable) values. -KeyValuesList = List[Tuple[str, List[str]]] +KeyValuesList = list[tuple[str, list[str]]] + class KubernetesBatchSystem(BatchSystemCleanupSupport): @classmethod @@ -150,8 +147,9 @@ class _ApiStorageDict(TypedDict): core: NotRequired[CoreV1Api] customObjects: NotRequired[CustomObjectsApi] - - def __init__(self, config: Config, maxCores: int, maxMemory: int, maxDisk: int) -> None: + def __init__( + self, config: Config, maxCores: int, maxMemory: int, maxDisk: int + ) -> None: super().__init__(config, maxCores, maxMemory, maxDisk) # Re-type the config to make sure it has all the fields we need. @@ -162,8 +160,8 @@ def __init__(self, config: Config, maxCores: int, maxMemory: int, maxDisk: int) # Otherwise if we are at debug log level, we dump every # request/response to Kubernetes, including tokens which we shouldn't # reveal on CI. - logging.getLogger('kubernetes').setLevel(logging.ERROR) - logging.getLogger('requests_oauthlib').setLevel(logging.ERROR) + logging.getLogger("kubernetes").setLevel(logging.ERROR) + logging.getLogger("requests_oauthlib").setLevel(logging.ERROR) # This will hold the last time our Kubernetes credentials were refreshed self.credential_time: Optional[datetime.datetime] = None @@ -171,7 +169,7 @@ def __init__(self, config: Config, maxCores: int, maxMemory: int, maxDisk: int) self._apis: KubernetesBatchSystem._ApiStorageDict = {} # Get our namespace (and our Kubernetes credentials to make sure they exist) - self.namespace: str = self._api('namespace') + self.namespace: str = self._api("namespace") # Decide if we are going to mount a Kubernetes host path as the Toil # work dir in the workers, for shared caching. @@ -190,7 +188,7 @@ def __init__(self, config: Config, maxCores: int, maxMemory: int, maxDisk: int) self.unique_id = uuid.uuid4() # Create a prefix for jobs, starting with our username - self.job_prefix: str = f'{username}-toil-{self.unique_id}-' + self.job_prefix: str = f"{username}-toil-{self.unique_id}-" # Instead of letting Kubernetes assign unique job names, we assign our # own based on a numerical job ID. This functionality is managed by the # BatchSystemLocalSupport. @@ -214,55 +212,61 @@ def __init__(self, config: Config, maxCores: int, maxMemory: int, maxDisk: int) # Try and guess what Toil work dir the workers will use. # We need to be able to provision (possibly shared) space there. self.worker_work_dir: str = Toil.getToilWorkDir(config.workDir) - if (config.workDir is None and - os.getenv('TOIL_WORKDIR') is None and - self.worker_work_dir == tempfile.gettempdir()): + if ( + config.workDir is None + and os.getenv("TOIL_WORKDIR") is None + and self.worker_work_dir == tempfile.gettempdir() + ): # We defaulted to the system temp directory. But we think the # worker Dockerfiles will make them use /var/lib/toil instead. # TODO: Keep this in sync with the Dockerfile. - self.worker_work_dir = '/var/lib/toil' + self.worker_work_dir = "/var/lib/toil" # A Toil-managed Kubernetes cluster will have most of its temp space at # /var/tmp, which is where really large temp files really belong # according to https://systemd.io/TEMPORARY_DIRECTORIES/. So we will # set the default temporary directory to there for all our jobs. - self.environment['TMPDIR'] = '/var/tmp' + self.environment["TMPDIR"] = "/var/tmp" # Get the name of the AWS secret, if any, to mount in containers. - self.aws_secret_name: Optional[str] = os.environ.get("TOIL_AWS_SECRET_NAME", None) + self.aws_secret_name: Optional[str] = os.environ.get( + "TOIL_AWS_SECRET_NAME", None + ) # Set this to True to enable the experimental wait-for-job-update code self.enable_watching: bool = os.environ.get("KUBE_WATCH_ENABLED", False) # This will be a label to select all our jobs. - self.run_id: str = f'toil-{self.unique_id}' + self.run_id: str = f"toil-{self.unique_id}" # Keep track of available resources. - maxMillicores = int(SYS_MAX_SIZE if self.maxCores == SYS_MAX_SIZE else self.maxCores * 1000) - self.resource_sources: List[ResourcePool] = [ + maxMillicores = int( + SYS_MAX_SIZE if self.maxCores == SYS_MAX_SIZE else self.maxCores * 1000 + ) + self.resource_sources: list[ResourcePool] = [ # A pool representing available job slots - ResourcePool(self.config.max_jobs, 'job slots'), + ResourcePool(self.config.max_jobs, "job slots"), # A pool representing available CPU in units of millicores (1 CPU # unit = 1000 millicores) - ResourcePool(maxMillicores, 'cores'), + ResourcePool(maxMillicores, "cores"), # A pool representing available memory in bytes - ResourcePool(self.maxMemory, 'memory'), + ResourcePool(self.maxMemory, "memory"), # A pool representing the available space in bytes - ResourcePool(self.maxDisk, 'disk'), + ResourcePool(self.maxDisk, "disk"), ] # A set of job IDs that are queued (useful for getIssuedBatchJobIDs()) - self._queued_job_ids: Set[int] = set() + self._queued_job_ids: set[int] = set() # Keep track of the acquired resources for each job - self._acquired_resources: Dict[str, List[int]] = {} + self._acquired_resources: dict[str, list[int]] = {} # Queue for jobs to be submitted to the Kubernetes cluster - self._jobs_queue: Queue[Tuple[int, JobDescription, V1PodSpec]] = Queue() + self._jobs_queue: Queue[tuple[int, JobDescription, V1PodSpec]] = Queue() # A set of job IDs that should be killed - self._killed_queue_jobs: Set[int] = set() + self._killed_queue_jobs: set[int] = set() # We use this event to signal shutdown self._shutting_down: Event = Event() @@ -286,7 +290,7 @@ def _pretty_print(self, kubernetes_object: Any) -> str: """ if not kubernetes_object: - return 'None' + return "None" # We need a Kubernetes widget that knows how to translate # its data structures to nice YAML-able dicts. See: @@ -296,7 +300,7 @@ def _pretty_print(self, kubernetes_object: Any) -> str: # Convert to a dict root_dict = api_client.sanitize_for_serialization(kubernetes_object) - def drop_boring(here: Dict[str, Any]) -> None: + def drop_boring(here: dict[str, Any]) -> None: """ Drop boring fields recursively. """ @@ -304,7 +308,7 @@ def drop_boring(here: Dict[str, Any]) -> None: for k, v in here.items(): if isinstance(v, dict): drop_boring(v) - if k in ['managedFields']: + if k in ["managedFields"]: boring_keys.append(k) for k in boring_keys: del here[k] @@ -314,33 +318,43 @@ def drop_boring(here: Dict[str, Any]) -> None: @overload def _api( - self, kind: Literal['batch'], max_age_seconds: float = 5 * 60, errors: Optional[List[int]] = None - ) -> BatchV1Api: - ... + self, + kind: Literal["batch"], + max_age_seconds: float = 5 * 60, + errors: Optional[list[int]] = None, + ) -> BatchV1Api: ... @overload def _api( - self, kind: Literal['core'], max_age_seconds: float = 5 * 60, errors: Optional[List[int]] = None - ) -> CoreV1Api: - ... + self, + kind: Literal["core"], + max_age_seconds: float = 5 * 60, + errors: Optional[list[int]] = None, + ) -> CoreV1Api: ... @overload def _api( - self, kind: Literal['customObjects'], max_age_seconds: float = 5 * 60, errors: Optional[List[int]] = None - ) -> CustomObjectsApi: - ... + self, + kind: Literal["customObjects"], + max_age_seconds: float = 5 * 60, + errors: Optional[list[int]] = None, + ) -> CustomObjectsApi: ... @overload def _api( - self, kind: Literal['namespace'], max_age_seconds: float = 5 * 60 - ) -> str: - ... + self, kind: Literal["namespace"], max_age_seconds: float = 5 * 60 + ) -> str: ... def _api( self, - kind: Union[Literal['batch'], Literal['core'], Literal['customObjects'], Literal['namespace']], + kind: Union[ + Literal["batch"], + Literal["core"], + Literal["customObjects"], + Literal["namespace"], + ], max_age_seconds: float = 5 * 60, - errors: Optional[List[int]] = None + errors: Optional[list[int]] = None, ) -> Union[BatchV1Api, CoreV1Api, CustomObjectsApi, str]: """ The Kubernetes module isn't clever enough to renew its credentials when @@ -373,44 +387,53 @@ def _api( now = utc_now() - if self.credential_time is None or (now - self.credential_time).total_seconds() > max_age_seconds: + if ( + self.credential_time is None + or (now - self.credential_time).total_seconds() > max_age_seconds + ): # Credentials need a refresh try: # Load ~/.kube/config or KUBECONFIG load_kube_config() # Worked. We're using kube config - config_source = 'kube' + config_source = "kube" except ConfigException: # Didn't work. Try pod-based credentials in case we are in a pod. try: load_incluster_config() # Worked. We're using in_cluster config - config_source = 'in_cluster' + config_source = "in_cluster" except ConfigException: - raise RuntimeError('Could not load Kubernetes configuration from ~/.kube/config, $KUBECONFIG, or current pod.') + raise RuntimeError( + "Could not load Kubernetes configuration from ~/.kube/config, $KUBECONFIG, or current pod." + ) # Now fill in the API objects with these credentials - self._apis['batch'] = BatchV1Api() - self._apis['core'] = CoreV1Api() - self._apis['customObjects'] = CustomObjectsApi() + self._apis["batch"] = BatchV1Api() + self._apis["core"] = CoreV1Api() + self._apis["customObjects"] = CustomObjectsApi() # And save the time self.credential_time = now - if kind == 'namespace': + if kind == "namespace": # We just need the namespace string - if config_source == 'in_cluster': + if config_source == "in_cluster": # Our namespace comes from a particular file. - with open("/var/run/secrets/kubernetes.io/serviceaccount/namespace") as fh: + with open( + "/var/run/secrets/kubernetes.io/serviceaccount/namespace" + ) as fh: return fh.read().strip() else: # Find all contexts and the active context. # The active context gets us our namespace. contexts, activeContext = list_kube_config_contexts() if not contexts: - raise RuntimeError("No Kubernetes contexts available in ~/.kube/config or $KUBECONFIG") + raise RuntimeError( + "No Kubernetes contexts available in ~/.kube/config or $KUBECONFIG" + ) # Identify the namespace to work in - namespace = activeContext.get('context', {}).get('namespace', 'default') + namespace = activeContext.get("context", {}).get("namespace", "default") assert isinstance(namespace, str) return namespace @@ -430,11 +453,13 @@ def _api( ErrorCondition( error=ApiException, error_codes=errors, - retry_on_this_condition=False + retry_on_this_condition=False, ) ) decorator = retry(errors=error_list) - wrapper = KubernetesBatchSystem.DecoratorWrapper(api_object, decorator) + wrapper = KubernetesBatchSystem.DecoratorWrapper( + api_object, decorator + ) return cast(Union[BatchV1Api, CoreV1Api, CustomObjectsApi], wrapper) except KeyError: raise RuntimeError(f"Unknown Kubernetes API type: {kind}") @@ -445,7 +470,12 @@ class DecoratorWrapper: """ P = ParamSpec("P") - def __init__(self, to_wrap: Any, decorator: Callable[[Callable[P, Any]], Callable[P, Any]]) -> None: + + def __init__( + self, + to_wrap: Any, + decorator: Callable[[Callable[P, Any]], Callable[P, Any]], + ) -> None: """ Make a wrapper around the given object. When methods on the object are called, they will be called through @@ -469,16 +499,19 @@ def __getattr__(self, name: str) -> Any: return attr ItemT = TypeVar("ItemT") + class _ItemsHaver(Protocol[ItemT]): """ Anything that has a .items that is a list of something. """ + # KubernetesBatchSystem isn't defined until the class executes, so any # up-references to types from there that are in signatures (and not # method code) need to be quoted - items: List["KubernetesBatchSystem.ItemT"] + items: list["KubernetesBatchSystem.ItemT"] CovItemT = TypeVar("CovItemT", covariant=True) + class _WatchEvent(Protocol[CovItemT]): """ An event from a Kubernetes watch stream. @@ -490,23 +523,26 @@ class _WatchEvent(Protocol[CovItemT]): # __getitem__ instead. @overload - def __getitem__(self, name: Literal['type']) -> str: - ... + def __getitem__(self, name: Literal["type"]) -> str: ... @overload - def __getitem__(self, name: Literal['object']) -> "KubernetesBatchSystem.CovItemT": - ... + def __getitem__( + self, name: Literal["object"] + ) -> "KubernetesBatchSystem.CovItemT": ... @overload - def __getitem__(self, name: Literal['raw_object']) -> Dict[str, Any]: - ... + def __getitem__(self, name: Literal["raw_object"]) -> dict[str, Any]: ... - def __getitem__(self, name: Union[Literal['type'], Literal['object'], Literal['raw_object']]) -> Any: - ... + def __getitem__( + self, name: Union[Literal["type"], Literal["object"], Literal["raw_object"]] + ) -> Any: ... P = ParamSpec("P") R = TypeVar("R") - def _stream_until_error(self, method: Callable[P, _ItemsHaver[R]], *args: P.args, **kwargs: P.kwargs) -> Iterator[_WatchEvent[R]]: + + def _stream_until_error( + self, method: Callable[P, _ItemsHaver[R]], *args: P.args, **kwargs: P.kwargs + ) -> Iterator[_WatchEvent[R]]: """ Kubernetes kubernetes.watch.Watch().stream() streams can fail and raise errors. We don't want to have those errors fail the entire workflow, so @@ -572,7 +608,7 @@ def _scheduler(self) -> None: # Loop through all jobs inside the queue and see if any of them # could be launched. - jobs: Queue[Tuple[int, JobDescription, V1PodSpec]] = Queue() + jobs: Queue[tuple[int, JobDescription, V1PodSpec]] = Queue() while True: try: job = self._jobs_queue.get_nowait() @@ -584,7 +620,7 @@ def _scheduler(self) -> None: logger.debug(f"Skipping killed job {job_id}") continue - job_name = f'{self.job_prefix}{job_id}' + job_name = f"{self.job_prefix}{job_id}" result = self._launch_job(job_name, job_desc, spec) if result is False: # Not enough resources to launch this job. @@ -605,7 +641,7 @@ def _scheduler(self) -> None: logger.debug(f"Roughly {self._jobs_queue.qsize} jobs in the queue") def setUserScript(self, userScript: Resource) -> None: - logger.info(f'Setting user script for deployment: {userScript}') + logger.info(f"Setting user script for deployment: {userScript}") self.user_script = userScript # setEnv is provided by BatchSystemSupport, updates self.environment @@ -657,18 +693,21 @@ def set_preemptible(self, preemptible: bool) -> None: # Amazon just uses a label, while Google # # uses a label and a taint. - PREEMPTIBLE_SCHEMES = {'labels': [('eks.amazonaws.com/capacityType', ['SPOT']), - ('cloud.google.com/gke-preemptible', ['true'])], - 'taints': [('cloud.google.com/gke-preemptible', ['true'])]} + PREEMPTIBLE_SCHEMES = { + "labels": [ + ("eks.amazonaws.com/capacityType", ["SPOT"]), + ("cloud.google.com/gke-preemptible", ["true"]), + ], + "taints": [("cloud.google.com/gke-preemptible", ["true"])], + } if preemptible: # We want to seek preemptible labels and tolerate preemptible taints. - self.desired_labels += PREEMPTIBLE_SCHEMES['labels'] - self.tolerated_taints += PREEMPTIBLE_SCHEMES['taints'] + self.desired_labels += PREEMPTIBLE_SCHEMES["labels"] + self.tolerated_taints += PREEMPTIBLE_SCHEMES["taints"] else: # We want to prohibit preemptible labels - self.prohibited_labels += PREEMPTIBLE_SCHEMES['labels'] - + self.prohibited_labels += PREEMPTIBLE_SCHEMES["labels"] def apply(self, pod_spec: V1PodSpec) -> None: """ @@ -679,29 +718,26 @@ def apply(self, pod_spec: V1PodSpec) -> None: # Convert our collections to Kubernetes expressions. # REQUIRE that ALL of these requirements be satisfied - required_selector_requirements: List[V1NodeSelectorRequirement] = [] + required_selector_requirements: list[V1NodeSelectorRequirement] = [] # PREFER that EACH of these terms be satisfied - preferred_scheduling_terms: List[V1PreferredSchedulingTerm] = [] + preferred_scheduling_terms: list[V1PreferredSchedulingTerm] = [] # And this list of tolerations to apply - tolerations: List[V1Toleration] = [] + tolerations: list[V1Toleration] = [] for label, values in self.required_labels: # Collect requirements for the required labels - has_label = V1NodeSelectorRequirement(key=label, - operator='In', - values=values) + has_label = V1NodeSelectorRequirement( + key=label, operator="In", values=values + ) required_selector_requirements.append(has_label) for label, values in self.desired_labels: # Collect preferences for the preferred labels - has_label = V1NodeSelectorRequirement(key=label, - operator='In', - values=values) - term = V1NodeSelectorTerm( - match_expressions=[has_label] + has_label = V1NodeSelectorRequirement( + key=label, operator="In", values=values ) + term = V1NodeSelectorTerm(match_expressions=[has_label]) # Each becomes a separate preference, more is better. - preference = V1PreferredSchedulingTerm(weight=1, - preference=term) + preference = V1PreferredSchedulingTerm(weight=1, preference=term) preferred_scheduling_terms.append(preference) for label, values in self.prohibited_labels: @@ -712,15 +748,14 @@ def apply(self, pod_spec: V1PodSpec) -> None: # # So we create a NotIn for each label and AND them # all together. - not_labeled = V1NodeSelectorRequirement(key=label, - operator='NotIn', - values=values) + not_labeled = V1NodeSelectorRequirement( + key=label, operator="NotIn", values=values + ) required_selector_requirements.append(not_labeled) for taint, values in self.tolerated_taints: for value in values: # Each toleration can tolerate one value - taint_ok = V1Toleration(key=taint, - value=value) + taint_ok = V1Toleration(key=taint, value=value) tolerations.append(taint_ok) # Now combine everything @@ -734,16 +769,22 @@ def apply(self, pod_spec: V1PodSpec) -> None: match_expressions=required_selector_requirements ) # And a selector to hold the term - requirements_selector = V1NodeSelector(node_selector_terms=[requirements_term]) + requirements_selector = V1NodeSelector( + node_selector_terms=[requirements_term] + ) # Make an affinity that prefers the preferences and requires the requirements node_affinity = V1NodeAffinity( - preferred_during_scheduling_ignored_during_execution=preferred_scheduling_terms if preferred_scheduling_terms else None, - required_during_scheduling_ignored_during_execution=requirements_selector + preferred_during_scheduling_ignored_during_execution=( + preferred_scheduling_terms + if preferred_scheduling_terms + else None + ), + required_during_scheduling_ignored_during_execution=requirements_selector, ) # Apply the affinity - pod_spec.affinity = V1Affinity(node_affinity = node_affinity) + pod_spec.affinity = V1Affinity(node_affinity=node_affinity) if tolerations: # Apply the tolerations @@ -751,18 +792,22 @@ def apply(self, pod_spec: V1PodSpec) -> None: def _check_accelerator_request(self, requirer: Requirer) -> None: for accelerator in requirer.accelerators: - if accelerator['kind'] != 'gpu' and 'model' not in accelerator: + if accelerator["kind"] != "gpu" and "model" not in accelerator: # We can only provide GPUs or things with a model right now - raise InsufficientSystemResources(requirer, 'accelerators', details=[ - f'The accelerator {accelerator} could not be provided.', - 'The Toil Kubernetes batch system only knows how to request gpu accelerators or accelerators with a defined model.' - ]) + raise InsufficientSystemResources( + requirer, + "accelerators", + details=[ + f"The accelerator {accelerator} could not be provided.", + "The Toil Kubernetes batch system only knows how to request gpu accelerators or accelerators with a defined model.", + ], + ) def _create_pod_spec( - self, - command: str, - job_desc: JobDescription, - job_environment: Optional[Dict[str, str]] = None + self, + command: str, + job_desc: JobDescription, + job_environment: Optional[dict[str, str]] = None, ) -> V1PodSpec: """ Make the specification for a pod that can execute the given job. @@ -789,9 +834,11 @@ def _create_pod_spec( # OOMing. We also want to provision some extra space so that when # we test _isPodStuckOOM we never get True unless the job has # exceeded job_desc.memory. - requirements_dict = {'cpu': job_desc.cores, - 'memory': job_desc.memory + 1024 * 1024 * 512, - 'ephemeral-storage': job_desc.disk + 1024 * 1024 * 512} + requirements_dict = { + "cpu": job_desc.cores, + "memory": job_desc.memory + 1024 * 1024 * 512, + "ephemeral-storage": job_desc.disk + 1024 * 1024 * 512, + } # Also start on the placement constraints placement = KubernetesBatchSystem.Placement() @@ -801,19 +848,21 @@ def _create_pod_spec( # Add in requirements for accelerators (GPUs). # See https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/ - if accelerator['kind'] == 'gpu': + if accelerator["kind"] == "gpu": # We can't schedule GPUs without a brand, because the # Kubernetes resources are .com/gpu. If no brand is # specified, default to nvidia, which is very popular. - vendor = accelerator.get('brand', 'nvidia') + vendor = accelerator.get("brand", "nvidia") key = f'{vendor}.com/{accelerator["kind"]}' if key not in requirements_dict: requirements_dict[key] = 0 - requirements_dict[key] += accelerator['count'] + requirements_dict[key] += accelerator["count"] - if 'model' in accelerator: + if "model" in accelerator: # TODO: What if the cluster uses some other accelerator model labeling scheme? - placement.required_labels.append(('accelerator', [accelerator['model']])) + placement.required_labels.append( + ("accelerator", [accelerator["model"]]) + ) # TODO: Support AMD's labeling scheme: https://github.com/RadeonOpenCompute/k8s-device-plugin/tree/master/cmd/k8s-node-labeller # That just has each trait of the accelerator as a separate label, but nothing that quite corresponds to a model. @@ -825,14 +874,15 @@ def _create_pod_spec( # the UCSC Kubernetes admins want it that way. For GPUs, Kubernetes # requires them to be equal. limits_dict = requests_dict - resources = V1ResourceRequirements(limits=limits_dict, - requests=requests_dict) + resources = V1ResourceRequirements(limits=limits_dict, requests=requests_dict) # Collect volumes and mounts volumes = [] mounts = [] - def mount_host_path(volume_name: str, host_path: str, mount_path: str, create: bool = False) -> None: + def mount_host_path( + volume_name: str, host_path: str, mount_path: str, create: bool = False + ) -> None: """ Add a host path volume with the given name to mount the given path. @@ -840,10 +890,9 @@ def mount_host_path(volume_name: str, host_path: str, mount_path: str, create: b not exist. Otherwise, when the directory does not exist, the pod will wait for it to come into existence. """ - volume_type = 'DirectoryOrCreate' if create else 'Directory' + volume_type = "DirectoryOrCreate" if create else "Directory" volume_source = V1HostPathVolumeSource(path=host_path, type=volume_type) - volume = V1Volume(name=volume_name, - host_path=volume_source) + volume = V1Volume(name=volume_name, host_path=volume_source) volumes.append(volume) volume_mount = V1VolumeMount(mount_path=mount_path, name=volume_name) mounts.append(volume_mount) @@ -851,49 +900,63 @@ def mount_host_path(volume_name: str, host_path: str, mount_path: str, create: b if self.host_path is not None: # Provision Toil WorkDir from a HostPath volume, to share with other pods. # Create the directory if it doesn't exist already. - mount_host_path('workdir', self.host_path, self.worker_work_dir, create=True) + mount_host_path( + "workdir", self.host_path, self.worker_work_dir, create=True + ) # We also need to mount across /run/lock, where we will put # per-node coordiantion info. # Don't create this; it really should always exist. - mount_host_path('coordination', '/run/lock', '/run/lock') + mount_host_path("coordination", "/run/lock", "/run/lock") else: # Provision Toil WorkDir as an ephemeral volume - ephemeral_volume_name = 'workdir' + ephemeral_volume_name = "workdir" ephemeral_volume_source = V1EmptyDirVolumeSource() - ephemeral_volume = V1Volume(name=ephemeral_volume_name, - empty_dir=ephemeral_volume_source) + ephemeral_volume = V1Volume( + name=ephemeral_volume_name, empty_dir=ephemeral_volume_source + ) volumes.append(ephemeral_volume) - ephemeral_volume_mount = V1VolumeMount(mount_path=self.worker_work_dir, name=ephemeral_volume_name) + ephemeral_volume_mount = V1VolumeMount( + mount_path=self.worker_work_dir, name=ephemeral_volume_name + ) mounts.append(ephemeral_volume_mount) # And don't share coordination directory if self.aws_secret_name is not None: # Also mount an AWS secret, if provided. # TODO: make this generic somehow - secret_volume_name = 's3-credentials' - secret_volume_source = V1SecretVolumeSource(secret_name=self.aws_secret_name) - secret_volume = V1Volume(name=secret_volume_name, - secret=secret_volume_source) + secret_volume_name = "s3-credentials" + secret_volume_source = V1SecretVolumeSource( + secret_name=self.aws_secret_name + ) + secret_volume = V1Volume( + name=secret_volume_name, secret=secret_volume_source + ) volumes.append(secret_volume) - secret_volume_mount = V1VolumeMount(mount_path='/root/.aws', name=secret_volume_name) + secret_volume_mount = V1VolumeMount( + mount_path="/root/.aws", name=secret_volume_name + ) mounts.append(secret_volume_mount) # Make a container definition - container = V1Container(command=command_list, - image=self.docker_image, - name="runner-container", - resources=resources, - volume_mounts=mounts) + container = V1Container( + command=command_list, + image=self.docker_image, + name="runner-container", + resources=resources, + volume_mounts=mounts, + ) # In case security context rules are not allowed to be set, we only apply # a security context at all if we need to turn on privileged mode. if self.config.kubernetes_privileged: - container.security_context = V1SecurityContext(privileged=self.config.kubernetes_privileged) + container.security_context = V1SecurityContext( + privileged=self.config.kubernetes_privileged + ) # Wrap the container in a spec - pod_spec = V1PodSpec(containers=[container], - volumes=volumes, - restart_policy="Never") + pod_spec = V1PodSpec( + containers=[container], volumes=volumes, restart_policy="Never" + ) # Tell the spec where to land placement.apply(pod_spec) @@ -903,7 +966,9 @@ def mount_host_path(volume_name: str, host_path: str, mount_path: str, create: b return pod_spec - def _release_acquired_resources(self, resources: List[int], notify: bool = False) -> None: + def _release_acquired_resources( + self, resources: list[int], notify: bool = False + ) -> None: """ Release all resources acquired for a job. @@ -922,10 +987,7 @@ def _release_acquired_resources(self, resources: List[int], notify: bool = False self._work_available.notify_all() def _launch_job( - self, - job_name: str, - job_desc: JobDescription, - pod_spec: V1PodSpec + self, job_name: str, job_desc: JobDescription, pod_spec: V1PodSpec ) -> bool: """ Try to launch the given job to the Kubernetes cluster. Return False if @@ -933,19 +995,26 @@ def _launch_job( """ # Limit the amount of resources requested at a time. - resource_requests: List[int] = [1, int(job_desc.cores * 1000), job_desc.memory, job_desc.disk] + resource_requests: list[int] = [ + 1, + int(job_desc.cores * 1000), + job_desc.memory, + job_desc.disk, + ] acquired = [] for source, request in zip(self.resource_sources, resource_requests): # For each kind of resource we want, go get it - assert ((isinstance(source, ResourcePool) and isinstance(request, int))) + assert isinstance(source, ResourcePool) and isinstance(request, int) if source.acquireNow(request): acquired.append(request) else: # We can't get everything - self._release_acquired_resources(acquired, + self._release_acquired_resources( + acquired, # Put it back quietly. - notify=False) + notify=False, + ) return False self._acquired_resources[job_name] = acquired @@ -954,9 +1023,11 @@ def _launch_job( # Make metadata to label the job/pod with info. # Don't let the cluster autoscaler evict any Toil jobs. - metadata = V1ObjectMeta(name=job_name, - labels={"toil_run": self.run_id}, - annotations={"cluster-autoscaler.kubernetes.io/safe-to-evict": "false"}) + metadata = V1ObjectMeta( + name=job_name, + labels={"toil_run": self.run_id}, + annotations={"cluster-autoscaler.kubernetes.io/safe-to-evict": "false"}, + ) # Wrap the spec in a template template = V1PodTemplateSpec(spec=pod_spec, metadata=metadata) @@ -964,18 +1035,21 @@ def _launch_job( # Make another spec for the job, asking to run the template with no # backoff/retry. Specify our own TTL to avoid catching the notice # of over-zealous abandoned job cleanup scripts. - job_spec = V1JobSpec(template=template, - backoff_limit=0, - ttl_seconds_after_finished=self.finished_job_ttl) + job_spec = V1JobSpec( + template=template, + backoff_limit=0, + ttl_seconds_after_finished=self.finished_job_ttl, + ) # And make the actual job - job = V1Job(spec=job_spec, - metadata=metadata, - api_version="batch/v1", - kind="Job") + job = V1Job( + spec=job_spec, metadata=metadata, api_version="batch/v1", kind="Job" + ) # Launch the job - launched = self._api('batch', errors=[]).create_namespaced_job(self.namespace, job) + launched = self._api("batch", errors=[]).create_namespaced_job( + self.namespace, job + ) logger.debug(f"Launched job: {job_name}") @@ -983,10 +1057,11 @@ def _launch_job( def _delete_job( self, - job_name: str, *, + job_name: str, + *, propagation_policy: Literal["Foreground", "Background"] = "Foreground", gone_ok: bool = False, - resource_notify: bool = True + resource_notify: bool = True, ) -> None: """ Given the name of a kubernetes job, delete the job and release all @@ -999,11 +1074,9 @@ def _delete_job( the self._work_available condition. """ try: - logger.debug(f'Deleting Kubernetes job {job_name}') - self._api('batch', errors=[404] if gone_ok else []).delete_namespaced_job( - job_name, - self.namespace, - propagation_policy=propagation_policy + logger.debug(f"Deleting Kubernetes job {job_name}") + self._api("batch", errors=[404] if gone_ok else []).delete_namespaced_job( + job_name, self.namespace, propagation_policy=propagation_policy ) finally: # We should always release the acquired resources. @@ -1014,7 +1087,12 @@ def _delete_job( self._release_acquired_resources(resources, notify=resource_notify) del self._acquired_resources[job_name] - def issueBatchJob(self, command: str, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int: + def issueBatchJob( + self, + command: str, + job_desc: JobDescription, + job_environment: Optional[dict[str, str]] = None, + ) -> int: # Try the job as local localID = self.handleLocalJob(command, job_desc) if localID is not None: @@ -1027,7 +1105,9 @@ def issueBatchJob(self, command: str, job_desc: JobDescription, job_environment: self.check_resource_request(job_desc) # Make a pod that describes running the job - pod_spec = self._create_pod_spec(command, job_desc, job_environment=job_environment) + pod_spec = self._create_pod_spec( + command, job_desc, job_environment=job_environment + ) # Make a batch system scope job ID job_id = self.getNextJobID() @@ -1055,6 +1135,7 @@ class _ArgsDict(TypedDict): kwargs, so we can't just set unused ones to None. But we also don't want to duplicate code for every combination of possible present keys. """ + _continue: NotRequired[str] label_selector: NotRequired[str] field_selector: NotRequired[str] @@ -1084,30 +1165,30 @@ def _ourJobObject(self, onlySucceeded: bool = False) -> Iterator[V1Job]: token = None while True: - kwargs: KubernetesBatchSystem._ArgsDict = {'label_selector': f"toil_run={self.run_id}"} + kwargs: KubernetesBatchSystem._ArgsDict = { + "label_selector": f"toil_run={self.run_id}" + } if onlySucceeded: - kwargs['field_selector'] = "status.successful==1" + kwargs["field_selector"] = "status.successful==1" if token is not None: - kwargs['_continue'] = token + kwargs["_continue"] = token - results = self._api('batch', errors=[]).list_namespaced_job( - self.namespace, - **kwargs + results = self._api("batch", errors=[]).list_namespaced_job( + self.namespace, **kwargs ) - + # These jobs belong to us yield from (j for j in results.items if not self._is_deleted(j)) # Remember the continuation token, if any - token = getattr(results.metadata, 'continue', None) + token = getattr(results.metadata, "continue", None) if token is None: # There isn't one. We got everything. break - def _ourPodObject(self) -> Iterator[V1Pod]: """ Yield Kubernetes V1Pod objects that we are responsible for that the @@ -1117,25 +1198,25 @@ def _ourPodObject(self) -> Iterator[V1Pod]: token = None while True: - kwargs: KubernetesBatchSystem._ArgsDict = {'label_selector': f"toil_run={self.run_id}"} + kwargs: KubernetesBatchSystem._ArgsDict = { + "label_selector": f"toil_run={self.run_id}" + } if token is not None: - kwargs['_continue'] = token + kwargs["_continue"] = token - results = self._api('core', errors=[]).list_namespaced_pod( - self.namespace, - **kwargs + results = self._api("core", errors=[]).list_namespaced_pod( + self.namespace, **kwargs ) yield from (j for j in results.items if not self._is_deleted(j)) # Remember the continuation token, if any - token = getattr(results.metadata, 'continue', None) + token = getattr(results.metadata, "continue", None) if token is None: # There isn't one. We got everything. break - def _getPodForJob(self, jobObject: V1Job) -> Optional[V1Pod]: """ Get the pod that belongs to the given job, or None if the job's pod is @@ -1149,22 +1230,26 @@ def _getPodForJob(self, jobObject: V1Job) -> Optional[V1Pod]: """ # Make sure the job has the fields we need - assert(jobObject.metadata is not None) + assert jobObject.metadata is not None token = None while True: - kwargs: KubernetesBatchSystem._ArgsDict = {'label_selector': f'job-name={jobObject.metadata.name}'} + kwargs: KubernetesBatchSystem._ArgsDict = { + "label_selector": f"job-name={jobObject.metadata.name}" + } if token is not None: - kwargs['_continue'] = token - results = self._api('core', errors=[]).list_namespaced_pod(self.namespace, **kwargs) + kwargs["_continue"] = token + results = self._api("core", errors=[]).list_namespaced_pod( + self.namespace, **kwargs + ) for pod in results.items: # Return the first pod we find return pod # Remember the continuation token, if any - token = getattr(results.metadata, 'continue', None) + token = getattr(results.metadata, "continue", None) if token is None: # There isn't one. We got everything. @@ -1188,12 +1273,13 @@ def _getLogForPod(self, podObject: V1Pod) -> str: assert podObject.metadata is not None assert podObject.metadata.name is not None - return self._api('core', errors=[]).read_namespaced_pod_log( - podObject.metadata.name, - namespace=self.namespace + return self._api("core", errors=[]).read_namespaced_pod_log( + podObject.metadata.name, namespace=self.namespace ) - def _isPodStuckOOM(self, podObject: V1Pod, minFreeBytes: float = 1024 * 1024 * 2) -> bool: + def _isPodStuckOOM( + self, podObject: V1Pod, minFreeBytes: float = 1024 * 1024 * 2 + ) -> bool: """ Poll the current memory usage for the pod from the cluster. @@ -1223,14 +1309,18 @@ def _isPodStuckOOM(self, podObject: V1Pod, minFreeBytes: float = 1024 * 1024 * 2 assert podObject.metadata.name is not None # Compose a query to get just the pod we care about - query = f'metadata.name={podObject.metadata.name}' + query = f"metadata.name={podObject.metadata.name}" # Look for it, but manage our own exceptions try: # TODO: When the Kubernetes Python API actually wraps the metrics API, switch to that - response = self._api('customObjects').list_namespaced_custom_object('metrics.k8s.io', 'v1beta1', - self.namespace, 'pods', - field_selector=query) + response = self._api("customObjects").list_namespaced_custom_object( + "metrics.k8s.io", + "v1beta1", + self.namespace, + "pods", + field_selector=query, + ) except Exception as e: # We couldn't talk to the metrics service on this attempt. We don't # retry, but we also don't want to just ignore all errors. We only @@ -1246,7 +1336,7 @@ def _isPodStuckOOM(self, podObject: V1Pod, minFreeBytes: float = 1024 * 1024 * 2 raise # Pull out the items - items = response.get('items', []) + items = response.get("items", []) if len(items) == 0: # If there's no statistics we can't say we're stuck OOM @@ -1255,7 +1345,7 @@ def _isPodStuckOOM(self, podObject: V1Pod, minFreeBytes: float = 1024 * 1024 * 2 # Assume the first result is the right one, because of the selector. # That means we don't need to bother with _continue. # Assume it has exactly one pod, because we made it. - containers = items[0].get('containers', [{}]) + containers = items[0].get("containers", [{}]) if len(containers) == 0: # If there are no containers (because none have started yet?), we can't say we're stuck OOM @@ -1264,26 +1354,37 @@ def _isPodStuckOOM(self, podObject: V1Pod, minFreeBytes: float = 1024 * 1024 * 2 # Otherwise, assume it just has one container. # Grab the memory usage string, like 123Ki, and convert to bytes. # If anything is missing, assume 0 bytes used. - bytesUsed = human2bytes(containers[0].get('usage', {}).get('memory', '0')) + bytesUsed = human2bytes(containers[0].get("usage", {}).get("memory", "0")) # Also get the limit out of the pod object's spec assert podObject.spec is not None assert len(podObject.spec.containers) > 0 assert podObject.spec.containers[0].resources is not None assert podObject.spec.containers[0].resources.limits is not None - assert 'memory' in podObject.spec.containers[0].resources.limits - bytesAllowed = human2bytes(podObject.spec.containers[0].resources.limits['memory']) + assert "memory" in podObject.spec.containers[0].resources.limits + bytesAllowed = human2bytes( + podObject.spec.containers[0].resources.limits["memory"] + ) if bytesAllowed - bytesUsed < minFreeBytes: # This is too much! - logger.warning('Pod %s has used %d of %d bytes of memory; reporting as stuck due to OOM.', - podObject.metadata.name, bytesUsed, bytesAllowed) + logger.warning( + "Pod %s has used %d of %d bytes of memory; reporting as stuck due to OOM.", + podObject.metadata.name, + bytesUsed, + bytesAllowed, + ) return True else: return False - def _isPodStuckWaiting(self, pod_object: V1Pod, reason: Optional[str] = None, timeout: Optional[float] = None) -> bool: + def _isPodStuckWaiting( + self, + pod_object: V1Pod, + reason: Optional[str] = None, + timeout: Optional[float] = None, + ) -> bool: """ Return True if the pod looks to be in a waiting state, and false otherwise. @@ -1307,7 +1408,9 @@ def _isPodStuckWaiting(self, pod_object: V1Pod, reason: Optional[str] = None, ti # Can't be stuck return False - waiting_info = getattr(getattr(container_statuses[0], 'state', None), 'waiting', None) + waiting_info = getattr( + getattr(container_statuses[0], "state", None), "waiting", None + ) if waiting_info is None: # Pod is not waiting return False @@ -1316,15 +1419,17 @@ def _isPodStuckWaiting(self, pod_object: V1Pod, reason: Optional[str] = None, ti # Pod fails reason filter return False - start_time = getattr(pod_object.status, 'start_time', None) - if timeout is not None and (start_time is None or (utc_now() - start_time).total_seconds() < timeout): + start_time = getattr(pod_object.status, "start_time", None) + if timeout is not None and ( + start_time is None or (utc_now() - start_time).total_seconds() < timeout + ): # It hasn't been waiting too long, or we care but don't know how # long it has been waiting return False return True - def _is_deleted(self, kube_thing: Union['V1Job', 'V1Pod']) -> bool: + def _is_deleted(self, kube_thing: Union["V1Job", "V1Pod"]) -> bool: """ Determine if a job or pod is in the process od being deleted, and shouldn't count anymore. @@ -1333,7 +1438,9 @@ def _is_deleted(self, kube_thing: Union['V1Job', 'V1Pod']) -> bool: # Kubernetes "Terminating" is the same as having the deletion_timestamp # set in the metadata of the object. - deletion_timestamp: Optional[datetime.datetime] = getattr(getattr(kube_thing, 'metadata', None), 'deletion_timestamp', None) + deletion_timestamp: Optional[datetime.datetime] = getattr( + getattr(kube_thing, "metadata", None), "deletion_timestamp", None + ) # If the deletion timestamp is set to anything, it is in the process of # being deleted. We will treat that as as good as gone. return deletion_timestamp is not None @@ -1350,8 +1457,7 @@ def _getIDForOurJob(self, jobObject: V1Job) -> int: assert jobObject.metadata is not None assert jobObject.metadata.name is not None - return int(jobObject.metadata.name[len(self.job_prefix):]) - + return int(jobObject.metadata.name[len(self.job_prefix) :]) def getUpdatedBatchJob(self, maxWait: float) -> Optional[UpdatedBatchJobInfo]: @@ -1367,22 +1473,27 @@ def getUpdatedBatchJob(self, maxWait: float) -> Optional[UpdatedBatchJobInfo]: # Otherwise we need to maybe wait. if self.enable_watching and maxWait >= 1: # We can try a watch. Watches can only work in whole seconds. - for event in self._stream_until_error(self._api('batch').list_namespaced_job, - self.namespace, - label_selector=f"toil_run={self.run_id}", - timeout_seconds=math.floor(maxWait)): + for event in self._stream_until_error( + self._api("batch").list_namespaced_job, + self.namespace, + label_selector=f"toil_run={self.run_id}", + timeout_seconds=math.floor(maxWait), + ): # Grab the metadata data, ID, the list of conditions of the current job, and the total pods - jobObject = event['object'] - + jobObject = event["object"] + if self._is_deleted(jobObject): # Job is already deleted, so ignore it. - logger.warning('Kubernetes job %s is deleted; ignore its update', getattr(getattr(jobObject, 'metadata', None), 'name', None)) + logger.warning( + "Kubernetes job %s is deleted; ignore its update", + getattr(getattr(jobObject, "metadata", None), "name", None), + ) continue - + assert jobObject.metadata is not None assert jobObject.metadata.name is not None - - jobID = int(jobObject.metadata.name[len(self.job_prefix):]) + + jobID = int(jobObject.metadata.name[len(self.job_prefix) :]) if jobObject.status is None: # Can't tell what is up with this job. continue @@ -1392,7 +1503,10 @@ def getUpdatedBatchJob(self, maxWait: float) -> Optional[UpdatedBatchJobInfo]: failed_pods = jobObject.status.failed or 0 # Fetch out the condition object that has info about how the job is going. condition: Optional[V1JobCondition] = None - if jobObject.status.conditions is not None and len(jobObject.status.conditions) > 0: + if ( + jobObject.status.conditions is not None + and len(jobObject.status.conditions) > 0 + ): condition = jobObject.status.conditions[0] totalPods = active_pods + succeeded_pods + failed_pods @@ -1402,14 +1516,25 @@ def getUpdatedBatchJob(self, maxWait: float) -> Optional[UpdatedBatchJobInfo]: # Check if there are any active pods if active_pods > 0: - logger.info("%s has %d pods running" % jobObject.metadata.name, active_pods) + logger.info( + "%s has %d pods running" % jobObject.metadata.name, active_pods + ) continue elif succeeded_pods > 0 or failed_pods > 0: # No more active pods in the current job ; must be finished - logger.info("%s RESULTS -> Succeeded: %d Failed:%d Active:%d" % jobObject.metadata.name, - succeeded_pods, failed_pods, active_pods) + logger.info( + "%s RESULTS -> Succeeded: %d Failed:%d Active:%d" + % jobObject.metadata.name, + succeeded_pods, + failed_pods, + active_pods, + ) # Log out success/failure given a reason - logger.info("%s REASON: %s", getattr(condition, 'type', None), getattr(condition, 'reason', None)) + logger.info( + "%s REASON: %s", + getattr(condition, "type", None), + getattr(condition, "reason", None), + ) # Log out reason of failure and pod exit code if failed_pods > 0: @@ -1419,22 +1544,40 @@ def getUpdatedBatchJob(self, maxWait: float) -> Optional[UpdatedBatchJobInfo]: if condition is not None: logger.warning("Failed Job Message: %s", condition.message) pod = self._getPodForJob(jobObject) - statuses: List[V1ContainerStatus] = getattr(getattr(pod, 'status', None), 'container_statuses', []) - if len(statuses) > 0 and statuses[0].state is not None and statuses[0].state.terminated is not None: + statuses: list[V1ContainerStatus] = getattr( + getattr(pod, "status", None), "container_statuses", [] + ) + if ( + len(statuses) > 0 + and statuses[0].state is not None + and statuses[0].state.terminated is not None + ): exitCode = statuses[0].state.terminated.exit_code raw_runtime = 0.0 - if jobObject.status.completion_time is not None and jobObject.status.start_time is not None: - raw_runtime = (jobObject.status.completion_time - jobObject.status.start_time).total_seconds() + if ( + jobObject.status.completion_time is not None + and jobObject.status.start_time is not None + ): + raw_runtime = ( + jobObject.status.completion_time + - jobObject.status.start_time + ).total_seconds() runtime = slow_down(raw_runtime) - result = UpdatedBatchJobInfo(jobID=jobID, exitStatus=exitCode, wallTime=runtime, exitReason=exitReason) + result = UpdatedBatchJobInfo( + jobID=jobID, + exitStatus=exitCode, + wallTime=runtime, + exitReason=exitReason, + ) - if (exitReason == BatchJobExitReason.FAILED) or (succeeded_pods + failed_pods == totalPods): + if (exitReason == BatchJobExitReason.FAILED) or ( + succeeded_pods + failed_pods == totalPods + ): # Cleanup if job is all finished or there was a pod that failed # TODO: use delete_job() to release acquired resources self._delete_job( - jobObject.metadata.name, - propagation_policy='Foreground' + jobObject.metadata.name, propagation_policy="Foreground" ) # Make sure the job is deleted so we won't see it again. self._waitForJobDeath(jobObject.metadata.name) @@ -1442,12 +1585,19 @@ def getUpdatedBatchJob(self, maxWait: float) -> Optional[UpdatedBatchJobInfo]: continue else: # Job is not running/updating ; no active, successful, or failed pods yet - logger.debug("Job {} -> {}".format(jobObject.metadata.name, getattr(condition, 'reason', None))) + logger.debug( + "Job {} -> {}".format( + jobObject.metadata.name, getattr(condition, "reason", None) + ) + ) # Pod could be pending; don't say it's lost. continue else: # Try polling instead - while result is None and (datetime.datetime.now() - entry).total_seconds() < maxWait: + while ( + result is None + and (datetime.datetime.now() - entry).total_seconds() < maxWait + ): # We still have nothing and we haven't hit the timeout. # Poll @@ -1455,12 +1605,11 @@ def getUpdatedBatchJob(self, maxWait: float) -> Optional[UpdatedBatchJobInfo]: if result is None: # Still nothing. Wait a second, or some fraction of our max wait time. - time.sleep(min(maxWait/2, 1.0)) + time.sleep(min(maxWait / 2, 1.0)) # When we get here, either we found something or we ran out of time return result - def _getUpdatedBatchJobImmediately(self) -> Optional[UpdatedBatchJobInfo]: """ Return None if no updated (completed or failed) batch job is currently @@ -1484,25 +1633,25 @@ def _getUpdatedBatchJobImmediately(self) -> Optional[UpdatedBatchJobInfo]: # Find a job that is done, failed, or stuck jobObject = None # Put 'done', 'failed', or 'stuck' here - chosenFor = '' + chosenFor = "" for j in self._ourJobObject(onlySucceeded=True): # Look for succeeded jobs because that's the only filter Kubernetes has jobObject = j - chosenFor = 'done' + chosenFor = "done" if jobObject is None: for j in self._ourJobObject(): # If there aren't any succeeded jobs, scan all jobs # See how many times each failed - failCount = getattr(j.status, 'failed', 0) + failCount = getattr(j.status, "failed", 0) if failCount is None: # Make sure it is an int failCount = 0 if failCount > 0: # Take the first failed one you find jobObject = j - chosenFor = 'failed' + chosenFor = "failed" break if jobObject is None: @@ -1515,23 +1664,30 @@ def _getUpdatedBatchJobImmediately(self) -> Optional[UpdatedBatchJobInfo]: continue # Containers can get stuck in Waiting with reason ImagePullBackOff - if self._isPodStuckWaiting(pod, reason='ImagePullBackoff'): + if self._isPodStuckWaiting(pod, reason="ImagePullBackoff"): # Assume it will never finish, even if the registry comes back or whatever. # We can get into this state when we send in a non-existent image. # See https://github.com/kubernetes/kubernetes/issues/58384 jobObject = j - chosenFor = 'stuck' - logger.warning('Failing stuck job (ImagePullBackoff); did you try to run a non-existent Docker image?' - ' Check TOIL_APPLIANCE_SELF.') + chosenFor = "stuck" + logger.warning( + "Failing stuck job (ImagePullBackoff); did you try to run a non-existent Docker image?" + " Check TOIL_APPLIANCE_SELF." + ) break # Containers can also get stuck in Waiting with reason # ContainerCreating, if for example their mounts don't work. - if self._isPodStuckWaiting(pod, reason='ContainerCreating', timeout=self.pod_timeout): + if self._isPodStuckWaiting( + pod, reason="ContainerCreating", timeout=self.pod_timeout + ): # Assume that it will never finish. jobObject = j - chosenFor = 'stuck' - logger.warning('Failing stuck job (ContainerCreating longer than %s seconds); did you try to mount something impossible?', self.pod_timeout) + chosenFor = "stuck" + logger.warning( + "Failing stuck job (ContainerCreating longer than %s seconds); did you try to mount something impossible?", + self.pod_timeout, + ) break # Pods can also get stuck nearly but not quite out of memory, @@ -1541,7 +1697,7 @@ def _getUpdatedBatchJobImmediately(self) -> Optional[UpdatedBatchJobInfo]: # We found a job that probably should be OOM! Report it as stuck. # Polling function takes care of the logging. jobObject = j - chosenFor = 'stuck' + chosenFor = "stuck" break if jobObject is None: @@ -1549,25 +1705,30 @@ def _getUpdatedBatchJobImmediately(self) -> Optional[UpdatedBatchJobInfo]: return None else: # We actually have something - logger.debug('Identified stopped Kubernetes job %s as %s', getattr(jobObject.metadata, 'name', None), chosenFor) - + logger.debug( + "Identified stopped Kubernetes job %s as %s", + getattr(jobObject.metadata, "name", None), + chosenFor, + ) # Otherwise we got something. # Work out what the job's ID was (whatever came after our name prefix) assert jobObject.metadata is not None assert jobObject.metadata.name is not None - jobID = int(jobObject.metadata.name[len(self.job_prefix):]) + jobID = int(jobObject.metadata.name[len(self.job_prefix) :]) # Grab the pod pod = self._getPodForJob(jobObject) if pod is not None: - if chosenFor == 'done' or chosenFor == 'failed': + if chosenFor == "done" or chosenFor == "failed": # The job actually finished or failed # Get the statuses of the pod's containers - containerStatuses = getattr(getattr(pod, 'status', None), 'container_statuses', None) + containerStatuses = getattr( + getattr(pod, "status", None), "container_statuses", None + ) # Get when the pod started (reached the Kubelet) as a datetime start_time = self._get_start_time(pod, jobObject) @@ -1577,18 +1738,24 @@ def _getUpdatedBatchJobImmediately(self) -> Optional[UpdatedBatchJobInfo]: # This happens when a pod is "Scheduled". But how could a # 'done' or 'failed' pod be merely "Scheduled"? # Complain so we can find out. - logger.warning('Exit code and runtime unavailable; pod has no container statuses') - logger.warning('Pod: %s', str(pod)) + logger.warning( + "Exit code and runtime unavailable; pod has no container statuses" + ) + logger.warning("Pod: %s", str(pod)) exitCode = EXIT_STATUS_UNAVAILABLE_VALUE # Say it stopped now and started when it was scheduled/submitted. # We still need a strictly positive runtime. runtime = slow_down((utc_now() - start_time).total_seconds()) else: # Get the termination info from the pod's main (only) container - terminatedInfo = getattr(getattr(containerStatuses[0], 'state', None), 'terminated', None) + terminatedInfo = getattr( + getattr(containerStatuses[0], "state", None), "terminated", None + ) if terminatedInfo is None: - logger.warning('Exit code and runtime unavailable; pod stopped without container terminating') - logger.warning('Pod: %s', str(pod)) + logger.warning( + "Exit code and runtime unavailable; pod stopped without container terminating" + ) + logger.warning("Pod: %s", str(pod)) exitCode = EXIT_STATUS_UNAVAILABLE_VALUE # Say it stopped now and started when it was scheduled/submitted. # We still need a strictly positive runtime. @@ -1603,34 +1770,42 @@ def _getUpdatedBatchJobImmediately(self) -> Optional[UpdatedBatchJobInfo]: # created. And we need to look at the pod's end time # because the job only gets a completion time if # successful. - runtime = slow_down((terminatedInfo.finished_at - - start_time).total_seconds()) + runtime = slow_down( + (terminatedInfo.finished_at - start_time).total_seconds() + ) - if chosenFor == 'failed': + if chosenFor == "failed": # Warn the user with the failed pod's log # TODO: cut this down somehow? - logger.warning('Log from failed pod: %s', self._getLogForPod(pod)) + logger.warning( + "Log from failed pod: %s", self._getLogForPod(pod) + ) else: # The job has gotten stuck - assert chosenFor == 'stuck' + assert chosenFor == "stuck" # Synthesize an exit code exitCode = EXIT_STATUS_UNAVAILABLE_VALUE # Say it ran from when the job was submitted to when the pod got stuck - runtime = slow_down((utc_now() - self._get_start_time(job=jobObject)).total_seconds()) + runtime = slow_down( + (utc_now() - self._get_start_time(job=jobObject)).total_seconds() + ) else: # The pod went away from under the job. - logging.warning('Exit code and runtime unavailable; pod vanished') + logging.warning("Exit code and runtime unavailable; pod vanished") exitCode = EXIT_STATUS_UNAVAILABLE_VALUE # Say it ran from when the job was submitted to when the pod vanished - runtime = slow_down((utc_now() - self._get_start_time(job=jobObject)).total_seconds()) - + runtime = slow_down( + (utc_now() - self._get_start_time(job=jobObject)).total_seconds() + ) try: # Delete the job and all dependents (pods), hoping to get a 404 if it's magically gone - self._delete_job(jobObject.metadata.name, propagation_policy='Foreground', gone_ok=True) + self._delete_job( + jobObject.metadata.name, propagation_policy="Foreground", gone_ok=True + ) # That just kicks off the deletion process. Foreground doesn't # actually block. See @@ -1646,7 +1821,9 @@ def _getUpdatedBatchJobImmediately(self) -> Optional[UpdatedBatchJobInfo]: # Otherwise everything is fine and the job is gone. # Return the one finished job we found - return UpdatedBatchJobInfo(jobID=jobID, exitStatus=exitCode, wallTime=runtime, exitReason=None) + return UpdatedBatchJobInfo( + jobID=jobID, exitStatus=exitCode, wallTime=runtime, exitReason=None + ) def _waitForJobDeath(self, jobName: str) -> None: """ @@ -1660,7 +1837,9 @@ def _waitForJobDeath(self, jobName: str) -> None: while True: try: # Look for the job - job_object = self._api('batch', errors=[404]).read_namespaced_job(jobName, self.namespace) + job_object = self._api("batch", errors=[404]).read_namespaced_job( + jobName, self.namespace + ) if self._is_deleted(job_object): # The job looks deleted, so we can treat it as not being there. return @@ -1685,59 +1864,80 @@ def shutdown(self) -> None: # Shutdown scheduling thread self._shutting_down.set() with self._work_available: - self._work_available.notify_all() # Wake it up. + self._work_available.notify_all() # Wake it up. self.schedulingThread.join() # Kill all of our jobs and clean up pods that are associated with those jobs try: - logger.debug('Deleting all Kubernetes jobs for toil_run=%s', self.run_id) - self._api('batch', errors=[404]).delete_collection_namespaced_job( + logger.debug("Deleting all Kubernetes jobs for toil_run=%s", self.run_id) + self._api("batch", errors=[404]).delete_collection_namespaced_job( self.namespace, label_selector=f"toil_run={self.run_id}", - propagation_policy='Background' + propagation_policy="Background", + ) + logger.debug( + "Killed jobs with delete_collection_namespaced_job; cleaned up" ) - logger.debug('Killed jobs with delete_collection_namespaced_job; cleaned up') # TODO: should we release all resources? We're shutting down so would it matter? except ApiException as e: if e.status != 404: # Anything other than a 404 is weird here. - logger.error("Exception when calling BatchV1Api->delete_collection_namespaced_job: %s" % e) + logger.error( + "Exception when calling BatchV1Api->delete_collection_namespaced_job: %s" + % e + ) # If batch delete fails, try to delete all remaining jobs individually. - logger.debug('Deleting Kubernetes jobs individually for toil_run=%s', self.run_id) + logger.debug( + "Deleting Kubernetes jobs individually for toil_run=%s", self.run_id + ) for job_id in self._getIssuedNonLocalBatchJobIDs(): - job_name = f'{self.job_prefix}{job_id}' - self._delete_job(job_name, propagation_policy='Background', resource_notify=False) + job_name = f"{self.job_prefix}{job_id}" + self._delete_job( + job_name, propagation_policy="Background", resource_notify=False + ) # Aggregate all pods and check if any pod has failed to cleanup or is orphaned. ourPods = self._ourPodObject() for pod in ourPods: try: - phase = getattr(pod.status, 'phase', None) - if phase == 'Failed': - logger.debug('Failed pod encountered at shutdown:\n%s', self._pretty_print(pod)) - if phase == 'Orphaned': - logger.debug('Orphaned pod encountered at shutdown:\n%s', self._pretty_print(pod)) + phase = getattr(pod.status, "phase", None) + if phase == "Failed": + logger.debug( + "Failed pod encountered at shutdown:\n%s", + self._pretty_print(pod), + ) + if phase == "Orphaned": + logger.debug( + "Orphaned pod encountered at shutdown:\n%s", + self._pretty_print(pod), + ) except: # Don't get mad if that doesn't work. pass if pod.metadata is not None and pod.metadata.name is not None: try: - logger.debug('Cleaning up pod at shutdown: %s', pod.metadata.name) - response = self._api('core', errors=[404]).delete_namespaced_pod( + logger.debug( + "Cleaning up pod at shutdown: %s", pod.metadata.name + ) + response = self._api( + "core", errors=[404] + ).delete_namespaced_pod( pod.metadata.name, self.namespace, - propagation_policy='Background' + propagation_policy="Background", ) except ApiException as e: if e.status != 404: # Anything other than a 404 is weird here. - logger.error("Exception when calling CoreV1Api->delete_namespaced_pod: %s" % e) - + logger.error( + "Exception when calling CoreV1Api->delete_namespaced_pod: %s" + % e + ) - def _getIssuedNonLocalBatchJobIDs(self) -> List[int]: + def _getIssuedNonLocalBatchJobIDs(self) -> list[int]: """ Get the issued batch job IDs that are not for local jobs. """ @@ -1749,29 +1949,35 @@ def _getIssuedNonLocalBatchJobIDs(self) -> List[int]: jobIDs.append(self._getIDForOurJob(job)) return jobIDs - def getIssuedBatchJobIDs(self) -> List[int]: + def getIssuedBatchJobIDs(self) -> list[int]: # Make sure to send the local jobs and queued jobs also with self._mutex: queued_jobs = list(self._queued_job_ids) - return self._getIssuedNonLocalBatchJobIDs() + list(self.getIssuedLocalJobIDs()) + queued_jobs + return ( + self._getIssuedNonLocalBatchJobIDs() + + list(self.getIssuedLocalJobIDs()) + + queued_jobs + ) - def _get_start_time(self, pod: Optional[V1Pod] = None, job: Optional[V1Job] = None) -> datetime.datetime: + def _get_start_time( + self, pod: Optional[V1Pod] = None, job: Optional[V1Job] = None + ) -> datetime.datetime: """ Get an actual or estimated start time for a pod. """ # Get when the pod started (reached the Kubelet) as a datetime - start_time = getattr(getattr(pod, 'status', None), 'start_time', None) + start_time = getattr(getattr(pod, "status", None), "start_time", None) if start_time is None: # If the pod never made it to the kubelet to get a # start_time, say it was when the job was submitted. - start_time = getattr(getattr(job, 'status', None), 'start_time', None) + start_time = getattr(getattr(job, "status", None), "start_time", None) if start_time is None: # If this is still unset, say it was just now. start_time = utc_now() return start_time - def getRunningBatchJobIDs(self) -> Dict[int, float]: + def getRunningBatchJobIDs(self) -> dict[int, float]: # We need a dict from jobID (integer) to seconds it has been running secondsPerJob = dict() for job in self._ourJobObject(): @@ -1782,7 +1988,7 @@ def getRunningBatchJobIDs(self) -> Dict[int, float]: # Jobs whose pods are gone are not running continue - if getattr(pod.status, 'phase', None) == 'Running': + if getattr(pod.status, "phase", None) == "Running": # The job's pod is running # Estimate the runtime @@ -1794,7 +2000,7 @@ def getRunningBatchJobIDs(self) -> Dict[int, float]: secondsPerJob.update(self.getRunningLocalJobIDs()) return secondsPerJob - def killBatchJobs(self, jobIDs: List[int]) -> None: + def killBatchJobs(self, jobIDs: list[int]) -> None: # Kill all the ones that are local self.killLocalJobs(jobIDs) @@ -1803,7 +2009,7 @@ def killBatchJobs(self, jobIDs: List[int]) -> None: # First get the jobs we even issued non-locally issued_on_kubernetes = set(self._getIssuedNonLocalBatchJobIDs()) - deleted_jobs: List[str] = [] + deleted_jobs: list[str] = [] for job_id in jobIDs: # For each job we are supposed to kill @@ -1829,10 +2035,10 @@ def killBatchJobs(self, jobIDs: List[int]) -> None: # Delete the requested job in the foreground. # This doesn't block, but it does delete expeditiously. - self._delete_job(job_name, propagation_policy='Foreground') + self._delete_job(job_name, propagation_policy="Foreground") deleted_jobs.append(job_name) - logger.debug('Killed job by request: %s', job_name) + logger.debug("Killed job by request: %s", job_name) for job_name in deleted_jobs: # Now we need to wait for all the jobs we killed to be gone. @@ -1842,7 +2048,7 @@ def killBatchJobs(self, jobIDs: List[int]) -> None: # the potential deadlock (if the user code needs exclusive access to # a resource) onto the user code, instead of always hanging # whenever we can't certify that a faulty node is no longer running - # the user code. + # the user code. self._waitForJobDeath(job_name) @classmethod @@ -1853,9 +2059,11 @@ def get_default_kubernetes_owner(cls) -> str: # Make a Kubernetes-acceptable version of our username: not too long, # and all lowercase letters, numbers, or - or . - acceptable_chars = set(string.ascii_lowercase + string.digits + '-.') + acceptable_chars = set(string.ascii_lowercase + string.digits + "-.") - return ''.join([c for c in get_user_name().lower() if c in acceptable_chars])[:100] + return "".join([c for c in get_user_name().lower() if c in acceptable_chars])[ + :100 + ] @runtime_checkable class KubernetesConfig(Protocol): @@ -1867,38 +2075,66 @@ class KubernetesConfig(Protocol): have to let the fact that this also has to be a Config just be manually enforced. """ + kubernetes_host_path: Optional[str] kubernetes_owner: str kubernetes_service_account: Optional[str] kubernetes_pod_timeout: float - @classmethod def add_options(cls, parser: Union[ArgumentParser, _ArgumentGroup]) -> None: - parser.add_argument("--kubernetesHostPath", dest="kubernetes_host_path", default=None, env_var="TOIL_KUBERNETES_HOST_PATH", - help="Path on Kubernetes hosts to use as shared inter-pod temp directory. " - "(default: %(default)s)") - parser.add_argument("--kubernetesOwner", dest="kubernetes_owner", default=None, env_var="TOIL_KUBERNETES_OWNER", - help=f"Username to mark Kubernetes jobs with. If the provided value is None, the value will " - f"be generated at runtime. " - f"(Generated default: {cls.get_default_kubernetes_owner()})") - parser.add_argument("--kubernetesServiceAccount", dest="kubernetes_service_account", default=None, env_var="TOIL_KUBERNETES_SERVICE_ACCOUNT", - help="Service account to run jobs as. " - "(default: %(default)s)") - parser.add_argument("--kubernetesPodTimeout", dest="kubernetes_pod_timeout", default=120, env_var="TOIL_KUBERNETES_POD_TIMEOUT", type=float, - help="Seconds to wait for a scheduled Kubernetes pod to start running. " - "(default: %(default)s)") - parser.add_argument("--kubernetesPrivileged", dest="kubernetes_privileged", default=False, env_var="TOIL_KUBERNETES_PRIVILEGED", type=opt_strtobool, - help="Whether to ask worker pods to run in privileged mode. This should be used to access " - "privileged operations, such as FUSE. On Toil-managed clusters with --enableFuse, " - "this is set to True. (default: %(default)s)") - - OptionType = TypeVar('OptionType') + parser.add_argument( + "--kubernetesHostPath", + dest="kubernetes_host_path", + default=None, + env_var="TOIL_KUBERNETES_HOST_PATH", + help="Path on Kubernetes hosts to use as shared inter-pod temp directory. " + "(default: %(default)s)", + ) + parser.add_argument( + "--kubernetesOwner", + dest="kubernetes_owner", + default=None, + env_var="TOIL_KUBERNETES_OWNER", + help=f"Username to mark Kubernetes jobs with. If the provided value is None, the value will " + f"be generated at runtime. " + f"(Generated default: {cls.get_default_kubernetes_owner()})", + ) + parser.add_argument( + "--kubernetesServiceAccount", + dest="kubernetes_service_account", + default=None, + env_var="TOIL_KUBERNETES_SERVICE_ACCOUNT", + help="Service account to run jobs as. " "(default: %(default)s)", + ) + parser.add_argument( + "--kubernetesPodTimeout", + dest="kubernetes_pod_timeout", + default=120, + env_var="TOIL_KUBERNETES_POD_TIMEOUT", + type=float, + help="Seconds to wait for a scheduled Kubernetes pod to start running. " + "(default: %(default)s)", + ) + parser.add_argument( + "--kubernetesPrivileged", + dest="kubernetes_privileged", + default=False, + env_var="TOIL_KUBERNETES_PRIVILEGED", + type=opt_strtobool, + help="Whether to ask worker pods to run in privileged mode. This should be used to access " + "privileged operations, such as FUSE. On Toil-managed clusters with --enableFuse, " + "this is set to True. (default: %(default)s)", + ) + + OptionType = TypeVar("OptionType") + @classmethod def setOptions(cls, setOption: OptionSetter) -> None: setOption("kubernetes_host_path") setOption("kubernetes_owner") - setOption("kubernetes_service_account",) + setOption( + "kubernetes_service_account", + ) setOption("kubernetes_pod_timeout") setOption("kubernetes_privileged") - diff --git a/src/toil/batchSystems/local_support.py b/src/toil/batchSystems/local_support.py index 301e33006d..7ebc80caa4 100644 --- a/src/toil/batchSystems/local_support.py +++ b/src/toil/batchSystems/local_support.py @@ -12,10 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. import logging -from typing import Dict, List, Optional +from typing import Optional -from toil.batchSystems.abstractBatchSystem import (BatchSystemSupport, - UpdatedBatchJobInfo) +from toil.batchSystems.abstractBatchSystem import ( + BatchSystemSupport, + UpdatedBatchJobInfo, +) from toil.batchSystems.singleMachine import SingleMachineBatchSystem from toil.common import Config from toil.job import JobDescription @@ -27,9 +29,13 @@ class BatchSystemLocalSupport(BatchSystemSupport): """Adds a local queue for helper jobs, useful for CWL & others.""" - def __init__(self, config: Config, maxCores: float, maxMemory: int, maxDisk: int) -> None: + def __init__( + self, config: Config, maxCores: float, maxMemory: int, maxDisk: int + ) -> None: super().__init__(config, maxCores, maxMemory, maxDisk) - max_local_jobs = config.max_local_jobs if config.max_local_jobs is not None else cpu_count() + max_local_jobs = ( + config.max_local_jobs if config.max_local_jobs is not None else cpu_count() + ) self.localBatch: SingleMachineBatchSystem = SingleMachineBatchSystem( config, maxCores, maxMemory, maxDisk, max_jobs=max_local_jobs ) @@ -41,8 +47,7 @@ def handleLocalJob(self, command: str, jobDesc: JobDescription) -> Optional[int] Returns the jobID if the jobDesc has been submitted to the local queue, otherwise returns None """ - if (not self.config.run_local_jobs_on_workers - and jobDesc.local): + if not self.config.run_local_jobs_on_workers and jobDesc.local: # Since singleMachine.py doesn't typecheck yet and MyPy is ignoring # it, it will raise errors here unless we add type annotations to # everything we get back from it. The easiest way to do that seems @@ -55,7 +60,7 @@ def handleLocalJob(self, command: str, jobDesc: JobDescription) -> Optional[int] else: return None - def killLocalJobs(self, jobIDs: List[int]) -> None: + def killLocalJobs(self, jobIDs: list[int]) -> None: """ Will kill all local jobs that match the provided jobIDs. @@ -63,14 +68,14 @@ def killLocalJobs(self, jobIDs: List[int]) -> None: """ self.localBatch.killBatchJobs(jobIDs) - def getIssuedLocalJobIDs(self) -> List[int]: + def getIssuedLocalJobIDs(self) -> list[int]: """To be called by getIssuedBatchJobIDs.""" - local_ids: List[int] = self.localBatch.getIssuedBatchJobIDs() + local_ids: list[int] = self.localBatch.getIssuedBatchJobIDs() return local_ids - def getRunningLocalJobIDs(self) -> Dict[int, float]: + def getRunningLocalJobIDs(self) -> dict[int, float]: """To be called by getRunningBatchJobIDs().""" - local_running: Dict[int, float] = self.localBatch.getRunningBatchJobIDs() + local_running: dict[int, float] = self.localBatch.getRunningBatchJobIDs() return local_running def getUpdatedLocalJob(self, maxWait: int) -> Optional[UpdatedBatchJobInfo]: diff --git a/src/toil/batchSystems/lsf.py b/src/toil/batchSystems/lsf.py index fe41cfb687..137cd4a624 100644 --- a/src/toil/batchSystems/lsf.py +++ b/src/toil/batchSystems/lsf.py @@ -25,18 +25,24 @@ import subprocess from datetime import datetime from random import randint -from typing import Dict, List, Optional, Tuple, Union +from typing import Optional, Union from dateutil.parser import parse from dateutil.tz import tzlocal -from toil.batchSystems.abstractBatchSystem import BatchJobExitReason, EXIT_STATUS_UNAVAILABLE_VALUE -from toil.batchSystems.abstractGridEngineBatchSystem import \ - AbstractGridEngineBatchSystem -from toil.batchSystems.lsfHelper import (check_lsf_json_output_supported, - parse_mem_and_cmd_from_output, - parse_memory, - per_core_reservation) +from toil.batchSystems.abstractBatchSystem import ( + EXIT_STATUS_UNAVAILABLE_VALUE, + BatchJobExitReason, +) +from toil.batchSystems.abstractGridEngineBatchSystem import ( + AbstractGridEngineBatchSystem, +) +from toil.batchSystems.lsfHelper import ( + check_lsf_json_output_supported, + parse_mem_and_cmd_from_output, + parse_memory, + per_core_reservation, +) from toil.lib.misc import call_command logger = logging.getLogger(__name__) @@ -50,47 +56,58 @@ class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread): def getRunningJobIDs(self): times = {} with self.runningJobsLock: - currentjobs = {str(self.batchJobIDs[x][0]): x for x in - self.runningJobs} + currentjobs = {str(self.batchJobIDs[x][0]): x for x in self.runningJobs} if check_lsf_json_output_supported: - stdout = call_command(["bjobs","-json","-o", "jobid stat start_time"]) + stdout = call_command(["bjobs", "-json", "-o", "jobid stat start_time"]) bjobs_records = self.parseBjobs(stdout) if bjobs_records: for single_item in bjobs_records: - if single_item['STAT'] == 'RUN' and single_item['JOBID'] in currentjobs: - jobstart = parse(single_item['START_TIME'], default=datetime.now(tzlocal())) - times[currentjobs[single_item['JOBID']]] = datetime.now(tzlocal()) \ - - jobstart + if ( + single_item["STAT"] == "RUN" + and single_item["JOBID"] in currentjobs + ): + jobstart = parse( + single_item["START_TIME"], + default=datetime.now(tzlocal()), + ) + times[currentjobs[single_item["JOBID"]]] = ( + datetime.now(tzlocal()) - jobstart + ) else: times = self.fallbackRunningJobIDs(currentjobs) return times def fallbackRunningJobIDs(self, currentjobs): times = {} - stdout = call_command(["bjobs", "-o", "jobid stat start_time delimiter='|'"]) - for curline in stdout.split('\n'): - items = curline.strip().split('|') - if items[0] in currentjobs and items[1] == 'RUN': + stdout = call_command( + ["bjobs", "-o", "jobid stat start_time delimiter='|'"] + ) + for curline in stdout.split("\n"): + items = curline.strip().split("|") + if items[0] in currentjobs and items[1] == "RUN": jobstart = parse(items[2], default=datetime.now(tzlocal())) - times[currentjobs[items[0]]] = datetime.now(tzlocal()) \ - - jobstart + times[currentjobs[items[0]]] = datetime.now(tzlocal()) - jobstart return times def killJob(self, jobID): - call_command(['bkill', self.getBatchSystemID(jobID)]) - - def prepareSubmission(self, - cpu: int, - memory: int, - jobID: int, - command: str, - jobName: str, - job_environment: Optional[Dict[str, str]] = None, - gpus: Optional[int] = None): - return (self.prepareBsub(cpu, memory, jobID) + [command], - job_environment) # pass job_environment to .submitJob() + call_command(["bkill", self.getBatchSystemID(jobID)]) + + def prepareSubmission( + self, + cpu: int, + memory: int, + jobID: int, + command: str, + jobName: str, + job_environment: Optional[dict[str, str]] = None, + gpus: Optional[int] = None, + ): + return ( + self.prepareBsub(cpu, memory, jobID) + [command], + job_environment, + ) # pass job_environment to .submitJob() def submitJob(self, subLine): subLine, job_environment = subLine @@ -102,7 +119,7 @@ def submitJob(self, subLine): stdout = call_command(subLine, env=combinedEnv) # Example success: Job <39605914> is submitted to default queue . # Example fail: Service class does not exist. Job not submitted. - result_search = re.search('Job <(.*)> is submitted', stdout) + result_search = re.search("Job <(.*)> is submitted", stdout) if result_search: result = int(result_search.group(1)) @@ -138,7 +155,11 @@ def coalesce_job_exit_codes(self, batch_job_id_list: list) -> list: logger.debug("Getting coalesced job exit codes via bjobs") bjobs_records = self.parseBjobs( subprocess.run( - args, check=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, encoding="utf-8" + args, + check=False, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + encoding="utf-8", ).stdout ) if bjobs_records: @@ -161,23 +182,31 @@ def coalesce_job_exit_codes(self, batch_job_id_list: list) -> list: status_resonse.append(None) return status_resonse - def getJobExitCode(self, lsfJobID) -> Union[int, Tuple[int, Optional[BatchJobExitReason]], None]: + def getJobExitCode( + self, lsfJobID + ) -> Union[int, tuple[int, Optional[BatchJobExitReason]], None]: # the task is set as part of the job ID if using getBatchSystemID() if "NOT_SUBMITTED" in lsfJobID: logger.error("bjobs detected job failed to submit") return 1 job, task = (lsfJobID, None) - if '.' in lsfJobID: - job, task = lsfJobID.split('.', 1) + if "." in lsfJobID: + job, task = lsfJobID.split(".", 1) self.parseMaxMem(job) # first try bjobs to find out job state if check_lsf_json_output_supported: - args = ["bjobs", "-json", "-o", - "user exit_code stat exit_reason pend_reason", str(job)] - logger.debug("Checking job exit code for job via bjobs: " - "{}".format(job)) + args = [ + "bjobs", + "-json", + "-o", + "user exit_code stat exit_reason pend_reason", + str(job), + ] + logger.debug( + "Checking job exit code for job via bjobs: " "{}".format(job) + ) stdout = call_command(args) bjobs_records = self.parseBjobs(stdout) if bjobs_records: @@ -186,7 +215,9 @@ def getJobExitCode(self, lsfJobID) -> Union[int, Tuple[int, Optional[BatchJobExi return self.fallbackGetJobExitCode(job) - def parse_bjobs_record(self, bjobs_record: dict, job: int) -> Union[int, Tuple[int, Optional[BatchJobExitReason]], None]: + def parse_bjobs_record( + self, bjobs_record: dict, job: int + ) -> Union[int, tuple[int, Optional[BatchJobExitReason]], None]: """ Helper functions for getJobExitCode and to parse the bjobs status record """ @@ -202,7 +233,8 @@ def parse_bjobs_record(self, bjobs_record: dict, job: int) -> Union[int, Tuple[i pending_info = "\n" + bjobs_record["PEND_REASON"] logger.debug( "bjobs detected job pending with: %s\nfor job: %s", - pending_info, job + pending_info, + job, ) return None if process_status == "EXIT": @@ -221,10 +253,18 @@ def parse_bjobs_record(self, bjobs_record: dict, job: int) -> Union[int, Tuple[i exit_info += f"\nexit reason: {exit_reason}" logger.error( "bjobs detected job failed with: %s\nfor job: %s", - exit_info, job + exit_info, + job, ) if "TERM_MEMLIMIT" in exit_reason: - return (exit_code if exit_code != 0 else EXIT_STATUS_UNAVAILABLE_VALUE, BatchJobExitReason.MEMLIMIT) + return ( + ( + exit_code + if exit_code != 0 + else EXIT_STATUS_UNAVAILABLE_VALUE + ), + BatchJobExitReason.MEMLIMIT, + ) return exit_code if process_status == "RUN": logger.debug( @@ -237,46 +277,53 @@ def parse_bjobs_record(self, bjobs_record: dict, job: int) -> Union[int, Tuple[i return self.getJobExitCodeBACCT(job) - def getJobExitCodeBACCT(self,job) -> Union[int, Tuple[int, Optional[BatchJobExitReason]], None]: + def getJobExitCodeBACCT( + self, job + ) -> Union[int, tuple[int, Optional[BatchJobExitReason]], None]: # if not found in bjobs, then try bacct (slower than bjobs) - logger.debug("bjobs failed to detect job - trying bacct: " - "{}".format(job)) + logger.debug("bjobs failed to detect job - trying bacct: " "{}".format(job)) args = ["bacct", "-l", str(job)] stdout = call_command(args) - process_output = stdout.split('\n') + process_output = stdout.split("\n") for line in process_output: if line.find("Completed ") > -1 or line.find("") > -1: - logger.debug("Detected job completed for job: " - "{}".format(job)) + logger.debug("Detected job completed for job: " "{}".format(job)) return 0 elif line.find("Completed ") > -1 or line.find("") > -1: - logger.error("Detected job failed for job: " - "{}".format(job)) + logger.error("Detected job failed for job: " "{}".format(job)) return 1 - logger.debug("Can't determine exit code for job or job still " - "running: {}".format(job)) + logger.debug( + "Can't determine exit code for job or job still " + "running: {}".format(job) + ) return None - def fallbackGetJobExitCode(self, job) -> Union[int, Tuple[int, Optional[BatchJobExitReason]], None]: + def fallbackGetJobExitCode( + self, job + ) -> Union[int, tuple[int, Optional[BatchJobExitReason]], None]: args = ["bjobs", "-l", str(job)] logger.debug(f"Checking job exit code for job via bjobs (fallback): {job}") stdout = call_command(args) output = stdout.replace("\n ", "") - process_output = output.split('\n') + process_output = output.split("\n") started = 0 for line in process_output: if "Done successfully" in line or "Status " in line: logger.debug(f"bjobs detected job completed for job: {job}") return 0 elif "New job is waiting for scheduling" in line: - logger.debug(f"bjobs detected job pending scheduling for job: {job}") + logger.debug( + f"bjobs detected job pending scheduling for job: {job}" + ) return None elif "PENDING REASONS" in line or "Status " in line: logger.debug(f"bjobs detected job pending for job: {job}") return None elif "Exited with exit code" in line: - exit = int(line[line.find("Exited with exit code ")+22:].split('.')[0]) + exit = int( + line[line.find("Exited with exit code ") + 22 :].split(".")[0] + ) logger.error(f"bjobs detected job exit code {exit} for job {job}") return exit elif "Completed " in line: @@ -293,7 +340,8 @@ def fallbackGetJobExitCode(self, job) -> Union[int, Tuple[int, Optional[BatchJob """ Implementation-specific helper methods """ - def prepareBsub(self, cpu: int, mem: int, jobID: int) -> List[str]: + + def prepareBsub(self, cpu: int, mem: int, jobID: int) -> list[str]: """ Make a bsub commandline to execute. @@ -308,18 +356,15 @@ def prepareBsub(self, cpu: int, mem: int, jobID: int) -> List[str]: if per_core_reservation() and cpu: mem = mem / math.ceil(cpu) mem = parse_memory(mem) - bsubMem = ['-R', - f'select[mem>{mem}] ' - f'rusage[mem={mem}]', - '-M', mem] - bsubCpu = [] if cpu is None else ['-n', str(math.ceil(cpu))] + bsubMem = ["-R", f"select[mem>{mem}] " f"rusage[mem={mem}]", "-M", mem] + bsubCpu = [] if cpu is None else ["-n", str(math.ceil(cpu))] bsubline = ["bsub", "-cwd", ".", "-J", f"toil_job_{jobID}"] bsubline.extend(bsubMem) bsubline.extend(bsubCpu) - stdoutfile: str = self.boss.format_std_out_err_path(jobID, '%J', 'out') - stderrfile: str = self.boss.format_std_out_err_path(jobID, '%J', 'err') - bsubline.extend(['-o', stdoutfile, '-e', stderrfile]) - lsfArgs = os.getenv('TOIL_LSF_ARGS') + stdoutfile: str = self.boss.format_std_out_err_path(jobID, "%J", "out") + stderrfile: str = self.boss.format_std_out_err_path(jobID, "%J", "err") + bsubline.extend(["-o", stdoutfile, "-e", stderrfile]) + lsfArgs = os.getenv("TOIL_LSF_ARGS") if lsfArgs: bsubline.extend(lsfArgs.split()) return bsubline @@ -333,16 +378,16 @@ def parseBjobs(self, bjobs_output_str): bjobs_dict = None bjobs_records = None # Handle Cannot connect to LSF. Please wait ... type messages - dict_start = bjobs_output_str.find('{') - dict_end = bjobs_output_str.rfind('}') + dict_start = bjobs_output_str.find("{") + dict_end = bjobs_output_str.rfind("}") if dict_start != -1 and dict_end != -1: - bjobs_output = bjobs_output_str[dict_start:(dict_end+1)] + bjobs_output = bjobs_output_str[dict_start : (dict_end + 1)] try: bjobs_dict = json.loads(bjobs_output) except json.decoder.JSONDecodeError: logger.error(f"Could not parse bjobs output: {bjobs_output_str}") - if 'RECORDS' in bjobs_dict: - bjobs_records = bjobs_dict['RECORDS'] + if "RECORDS" in bjobs_dict: + bjobs_records = bjobs_dict["RECORDS"] if bjobs_records is None: logger.error(f"Could not find bjobs output json in: {bjobs_output_str}") @@ -358,16 +403,24 @@ def parseMaxMem(self, jobID): output = subprocess.check_output(["bjobs", "-l", str(jobID)], text=True) max_mem, command = parse_mem_and_cmd_from_output(output=output) if not max_mem: - logger.warning(f"[job ID {jobID}] Unable to Collect Maximum Memory Usage: {output}") + logger.warning( + f"[job ID {jobID}] Unable to Collect Maximum Memory Usage: {output}" + ) return if not command: - logger.warning(f"[job ID {jobID}] Cannot Parse Max Memory Due to Missing Command String: {output}") + logger.warning( + f"[job ID {jobID}] Cannot Parse Max Memory Due to Missing Command String: {output}" + ) else: - logger.info(f"[job ID {jobID}, Command {command.group(1)}] Max Memory Used: {max_mem.group(1)}") + logger.info( + f"[job ID {jobID}, Command {command.group(1)}] Max Memory Used: {max_mem.group(1)}" + ) return max_mem except subprocess.CalledProcessError as e: - logger.warning(f"[job ID {jobID}] Unable to Collect Maximum Memory Usage: {e}") + logger.warning( + f"[job ID {jobID}] Unable to Collect Maximum Memory Usage: {e}" + ) def getWaitDuration(self): """We give LSF a second to catch its breath (in seconds)""" diff --git a/src/toil/batchSystems/lsfHelper.py b/src/toil/batchSystems/lsfHelper.py index d2a8c7357f..7e2cac57e0 100755 --- a/src/toil/batchSystems/lsfHelper.py +++ b/src/toil/batchSystems/lsfHelper.py @@ -72,7 +72,7 @@ def apply_conf_file(fn, conf_filename): for env in LSF_CONF_ENV: conf_file = get_conf_file(conf_filename, env) if conf_file: - with open(conf_file, encoding='utf-8') as conf_handle: + with open(conf_file, encoding="utf-8") as conf_handle: value = fn(conf_handle) if value: return value @@ -112,9 +112,9 @@ def apply_bparams(fn): """ cmd = ["bparams", "-a"] try: - output = subprocess.check_output(cmd, stderr=subprocess.STDOUT).decode('utf-8') + output = subprocess.check_output(cmd, stderr=subprocess.STDOUT).decode("utf-8") except subprocess.CalledProcessError as exc: - logger.debug(exc.output.decode('utf-8')) + logger.debug(exc.output.decode("utf-8")) return None return fn(output.split("\n")) @@ -125,9 +125,9 @@ def apply_lsadmin(fn): """ cmd = ["lsadmin", "showconf", "lim"] try: - output = subprocess.check_output(cmd, stderr=subprocess.STDOUT).decode('utf-8') + output = subprocess.check_output(cmd, stderr=subprocess.STDOUT).decode("utf-8") except subprocess.CalledProcessError as exc: - logger.debug(exc.output.decode('utf-8')) + logger.debug(exc.output.decode("utf-8")) return None return fn(output.split("\n")) @@ -161,7 +161,7 @@ def parse_mem_and_cmd_from_output(output: str): # Handle hard wrapping in the middle of words and arbitrary # indents. May drop spaces at the starts of lines that aren't # meant to be part of the indent. - cleaned_up_output = ' '.join(re.sub(r"\n\s*", "", output).split(',')) + cleaned_up_output = " ".join(re.sub(r"\n\s*", "", output).split(",")) max_mem = re.search(r"MAX ?MEM: ?(.*?);", cleaned_up_output) command = re.search(r"Command ?<(.*?)>", cleaned_up_output) return max_mem, command @@ -173,10 +173,10 @@ def get_lsf_version(): """ cmd = ["lsid"] try: - output = subprocess.check_output(cmd).decode('utf-8') + output = subprocess.check_output(cmd).decode("utf-8") except: return None - bjobs_search = re.search('IBM Spectrum LSF Standard (.*),', output) + bjobs_search = re.search("IBM Spectrum LSF Standard (.*),", output) if bjobs_search: lsf_version = bjobs_search.group(1) return lsf_version @@ -188,7 +188,9 @@ def check_lsf_json_output_supported(): """Check if the current LSF system supports bjobs json output.""" try: lsf_version = get_lsf_version() - if lsf_version and (version.parse(lsf_version) >= version.parse(LSF_JSON_OUTPUT_MIN_VERSION)): + if lsf_version and ( + version.parse(lsf_version) >= version.parse(LSF_JSON_OUTPUT_MIN_VERSION) + ): return True except: return False @@ -197,11 +199,11 @@ def check_lsf_json_output_supported(): def parse_memory(mem: float) -> str: """Parse memory parameter.""" - megabytes_of_mem = convert_units(float(mem), src_unit='B', dst_unit='MB') + megabytes_of_mem = convert_units(float(mem), src_unit="B", dst_unit="MB") if megabytes_of_mem < 1: megabytes_of_mem = 1.0 # round as a string here to avoid returning something like 1.231e+12 - return f'{megabytes_of_mem:.0f}MB' + return f"{megabytes_of_mem:.0f}MB" def per_core_reservation(): diff --git a/src/toil/batchSystems/mesos/__init__.py b/src/toil/batchSystems/mesos/__init__.py index 9d443f1ad5..0d58581cc9 100644 --- a/src/toil/batchSystems/mesos/__init__.py +++ b/src/toil/batchSystems/mesos/__init__.py @@ -19,19 +19,23 @@ from toil.provisioners.abstractProvisioner import Shape -TaskData = namedtuple('TaskData', ( - # Time when the task was started - 'startTime', - # Mesos' ID of the agent where task is being run - 'agentID', - # IP of agent where task is being run - 'agentIP', - # Mesos' ID of the executor running the task - 'executorID', - # Memory requirement of the task - 'memory', - # CPU requirement of the task - 'cores')) +TaskData = namedtuple( + "TaskData", + ( + # Time when the task was started + "startTime", + # Mesos' ID of the agent where task is being run + "agentID", + # IP of agent where task is being run + "agentIP", + # Mesos' ID of the executor running the task + "executorID", + # Memory requirement of the task + "memory", + # CPU requirement of the task + "cores", + ), +) class JobQueue: @@ -52,7 +56,11 @@ def insertJob(self, job, jobType): def jobIDs(self): with self.jobLock: - return [job.jobID for queue in list(self.queues.values()) for job in list(queue.queue)] + return [ + job.jobID + for queue in list(self.queues.values()) + for job in list(queue.queue) + ] def nextJobOfType(self, jobType): with self.jobLock: @@ -80,18 +88,22 @@ def __gt__(self, other): return not self.greater_than(other) -ToilJob = namedtuple('ToilJob', ( - # A job ID specific to this batch system implementation - 'jobID', - # What string to display in the mesos UI - 'name', - # A ResourceRequirement tuple describing the resources needed by this job - 'resources', - # The command to be run on the worker node - 'command', - # The resource object representing the user script - 'userScript', - # A dictionary with additional environment variables to be set on the worker process - 'environment', - # A named tuple containing all the required info for cleaning up the worker node - 'workerCleanupInfo')) +ToilJob = namedtuple( + "ToilJob", + ( + # A job ID specific to this batch system implementation + "jobID", + # What string to display in the mesos UI + "name", + # A ResourceRequirement tuple describing the resources needed by this job + "resources", + # The command to be run on the worker node + "command", + # The resource object representing the user script + "userScript", + # A dictionary with additional environment variables to be set on the worker process + "environment", + # A named tuple containing all the required info for cleaning up the worker node + "workerCleanupInfo", + ), +) diff --git a/src/toil/batchSystems/mesos/batchSystem.py b/src/toil/batchSystems/mesos/batchSystem.py index 14981064bc..4869c763ac 100644 --- a/src/toil/batchSystems/mesos/batchSystem.py +++ b/src/toil/batchSystems/mesos/batchSystem.py @@ -22,7 +22,7 @@ import traceback from argparse import ArgumentParser, _ArgumentGroup from queue import Empty, Queue -from typing import Dict, Optional, Union +from typing import Optional, Union from urllib.parse import quote_plus from urllib.request import urlopen @@ -30,11 +30,13 @@ from pymesos import MesosSchedulerDriver, Scheduler, decode_data, encode_data from toil import resolveEntryPoint -from toil.batchSystems.abstractBatchSystem import (EXIT_STATUS_UNAVAILABLE_VALUE, - AbstractScalableBatchSystem, - BatchJobExitReason, - NodeInfo, - UpdatedBatchJobInfo) +from toil.batchSystems.abstractBatchSystem import ( + EXIT_STATUS_UNAVAILABLE_VALUE, + AbstractScalableBatchSystem, + BatchJobExitReason, + NodeInfo, + UpdatedBatchJobInfo, +) from toil.batchSystems.local_support import BatchSystemLocalSupport from toil.batchSystems.mesos import JobQueue, MesosShape, TaskData, ToilJob from toil.batchSystems.options import OptionSetter @@ -46,9 +48,7 @@ log = logging.getLogger(__name__) -class MesosBatchSystem(BatchSystemLocalSupport, - AbstractScalableBatchSystem, - Scheduler): +class MesosBatchSystem(BatchSystemLocalSupport, AbstractScalableBatchSystem, Scheduler): """ A Toil batch system implementation that uses Apache Mesos to distribute toil jobs as Mesos tasks over a cluster of agent nodes. A Mesos framework consists of a scheduler and an @@ -174,7 +174,12 @@ def ignoreNode(self, nodeAddress): def unignoreNode(self, nodeAddress): self.ignoredNodes.remove(nodeAddress) - def issueBatchJob(self, command: str, jobNode: JobDescription, job_environment: Optional[Dict[str, str]] = None): + def issueBatchJob( + self, + command: str, + jobNode: JobDescription, + job_environment: Optional[dict[str, str]] = None, + ): """ Issues the following command returning a unique jobID. Command is the string to run, memory is an int giving the number of bytes the job needs to run in and cores is the number of cpus @@ -189,7 +194,7 @@ def issueBatchJob(self, command: str, jobNode: JobDescription, job_environment: "memory": jobNode.memory, "cores": jobNode.cores, "disk": jobNode.disk, - "preemptible": jobNode.preemptible + "preemptible": jobNode.preemptible, } jobID = self.getNextJobID() @@ -197,13 +202,15 @@ def issueBatchJob(self, command: str, jobNode: JobDescription, job_environment: if job_environment: environment.update(job_environment) - job = ToilJob(jobID=jobID, - name=str(jobNode), - resources=MesosShape(wallTime=0, **mesos_resources), - command=command, - userScript=self.userScript, - environment=environment, - workerCleanupInfo=self.workerCleanupInfo) + job = ToilJob( + jobID=jobID, + name=str(jobNode), + resources=MesosShape(wallTime=0, **mesos_resources), + command=command, + userScript=self.userScript, + environment=environment, + workerCleanupInfo=self.workerCleanupInfo, + ) jobType = job.resources log.debug("Queueing the job %s with job id: %s ...", jobNode, str(jobID)) @@ -285,11 +292,17 @@ def getUpdatedBatchJob(self, maxWait): try: self.intendedKill.remove(item.jobID) except KeyError: - log.debug('Job %s ended with status %i, took %s seconds.', item.jobID, item.exitStatus, - '???' if item.wallTime is None else str(item.wallTime)) + log.debug( + "Job %s ended with status %i, took %s seconds.", + item.jobID, + item.exitStatus, + "???" if item.wallTime is None else str(item.wallTime), + ) return item else: - log.debug('Job %s ended naturally before it could be killed.', item.jobID) + log.debug( + "Job %s ended naturally before it could be killed.", item.jobID + ) def nodeInUse(self, nodeIP: str) -> bool: return nodeIP in self.hostToJobIDs @@ -308,7 +321,7 @@ def _buildExecutor(self): # The executor program is installed as a setuptools entry point by setup.py info = addict.Dict() info.name = "toil" - info.command.value = resolveEntryPoint('_toil_mesos_executor') + info.command.value = resolveEntryPoint("_toil_mesos_executor") info.executor_id.value = "toil-%i" % os.getpid() info.source = pwd.getpwuid(os.getuid()).pw_name return info @@ -318,18 +331,24 @@ def _startDriver(self, config): The Mesos driver thread which handles the scheduler's communication with the Mesos master """ framework = addict.Dict() - framework.user = get_user_name() # We must determine the user name ourselves with pymesos + framework.user = ( + get_user_name() + ) # We must determine the user name ourselves with pymesos framework.name = config.mesos_name framework.principal = framework.name if config.mesos_role is not None: framework.roles = config.mesos_role - framework.capabilities = [dict(type='MULTI_ROLE')] + framework.capabilities = [dict(type="MULTI_ROLE")] # Make the driver which implements most of the scheduler logic and calls back to us for the user-defined parts. # Make sure it will call us with nice namespace-y addicts - self.driver = MesosSchedulerDriver(self, framework, - self._resolveAddress(self.mesos_endpoint), - use_addict=True, implicit_acknowledgements=True) + self.driver = MesosSchedulerDriver( + self, + framework, + self._resolveAddress(self.mesos_endpoint), + use_addict=True, + implicit_acknowledgements=True, + ) self.driver.start() @staticmethod @@ -349,10 +368,10 @@ def _resolveAddress(address): >>> f('127.0.0.1:123') '127.0.0.1:123' """ - address = address.split(':') + address = address.split(":") assert len(address) in (1, 2) address[0] = socket.gethostbyname(address[0]) - return ':'.join(address) + return ":".join(address) def shutdown(self) -> None: self.shutdownLocal() @@ -361,7 +380,7 @@ def shutdown(self) -> None: log.debug("Joining Mesos driver") driver_result = self.driver.join() log.debug("Joined Mesos driver") - if driver_result is not None and driver_result != 'DRIVER_STOPPED': + if driver_result is not None and driver_result != "DRIVER_STOPPED": # TODO: The docs say join should return a code, but it keeps returning # None when apparently successful. So tolerate that here too. raise RuntimeError("Mesos driver failed with %s" % driver_result) @@ -384,11 +403,15 @@ def _parseOffer(self, offer): disk = 0 preemptible = None for attribute in offer.attributes: - if attribute.name == 'preemptible': - assert preemptible is None, "Attribute 'preemptible' occurs more than once." + if attribute.name == "preemptible": + assert ( + preemptible is None + ), "Attribute 'preemptible' occurs more than once." preemptible = strict_bool(attribute.text.value) if preemptible is None: - log.debug('Agent not marked as either preemptible or not. Assuming non-preemptible.') + log.debug( + "Agent not marked as either preemptible or not. Assuming non-preemptible." + ) preemptible = False for resource in offer.resources: if resource.name == "cpus": @@ -415,14 +438,16 @@ def _updateStateToRunning(self, offer, runnableTasks): except KeyError: self.hostToJobIDs[agentIP] = [resourceKey] - self.runningJobMap[int(task.task_id.value)] = TaskData(startTime=time.time(), - agentID=offer.agent_id.value, - agentIP=agentIP, - executorID=task.executor.executor_id.value, - cores=resources.cores, - memory=resources.memory) + self.runningJobMap[int(task.task_id.value)] = TaskData( + startTime=time.time(), + agentID=offer.agent_id.value, + agentIP=agentIP, + executorID=task.executor.executor_id.value, + cores=resources.cores, + memory=resources.memory, + ) del self.taskResources[resourceKey] - log.debug('Launched Mesos task %s.', task.task_id.value) + log.debug("Launched Mesos task %s.", task.task_id.value) def resourceOffers(self, driver, offers): """ @@ -445,10 +470,18 @@ def resourceOffers(self, driver, offers): continue runnableTasks = [] # TODO: In an offer, can there ever be more than one resource with the same name? - offerCores, offerMemory, offerDisk, offerPreemptible = self._parseOffer(offer) - log.debug('Got offer %s for a %spreemptible agent with %.2f MiB memory, %.2f core(s) ' - 'and %.2f MiB of disk.', offer.id.value, '' if offerPreemptible else 'non-', - offerMemory, offerCores, offerDisk) + offerCores, offerMemory, offerDisk, offerPreemptible = self._parseOffer( + offer + ) + log.debug( + "Got offer %s for a %spreemptible agent with %.2f MiB memory, %.2f core(s) " + "and %.2f MiB of disk.", + offer.id.value, + "" if offerPreemptible else "non-", + offerMemory, + offerCores, + offerDisk, + ) remainingCores = offerCores remainingMemory = offerMemory remainingDisk = offerDisk @@ -460,35 +493,47 @@ def resourceOffers(self, driver, offers): # loop. nextToLaunchIndex = 0 # Toil specifies disk and memory in bytes but Mesos uses MiB - while ( not self.jobQueues.typeEmpty(jobType) - # On a non-preemptible node we can run any job, on a preemptible node we - # can only run preemptible jobs: - and (not offerPreemptible or jobType.preemptible) - and remainingCores >= jobType.cores - and remainingDisk >= b_to_mib(jobType.disk) - and remainingMemory >= b_to_mib(jobType.memory)): + while ( + not self.jobQueues.typeEmpty(jobType) + # On a non-preemptible node we can run any job, on a preemptible node we + # can only run preemptible jobs: + and (not offerPreemptible or jobType.preemptible) + and remainingCores >= jobType.cores + and remainingDisk >= b_to_mib(jobType.disk) + and remainingMemory >= b_to_mib(jobType.memory) + ): task = self._prepareToRun(jobType, offer) # TODO: this used to be a conditional but Hannes wanted it changed to an assert # TODO: ... so we can understand why it exists. assert int(task.task_id.value) not in self.runningJobMap runnableTasksOfType.append(task) - log.debug("Preparing to launch Mesos task %s with %.2f cores, %.2f MiB memory, and %.2f MiB disk using offer %s ...", - task.task_id.value, jobType.cores, b_to_mib(jobType.memory), b_to_mib(jobType.disk), offer.id.value) + log.debug( + "Preparing to launch Mesos task %s with %.2f cores, %.2f MiB memory, and %.2f MiB disk using offer %s ...", + task.task_id.value, + jobType.cores, + b_to_mib(jobType.memory), + b_to_mib(jobType.disk), + offer.id.value, + ) remainingCores -= jobType.cores remainingMemory -= b_to_mib(jobType.memory) remainingDisk -= b_to_mib(jobType.disk) nextToLaunchIndex += 1 if not self.jobQueues.typeEmpty(jobType): # report that remaining jobs cannot be run with the current resourcesq: - log.debug('Offer %(offer)s not suitable to run the tasks with requirements ' - '%(requirements)r. Mesos offered %(memory)s memory, %(cores)s cores ' - 'and %(disk)s of disk on a %(non)spreemptible agent.', - dict(offer=offer.id.value, - requirements=jobType.__dict__, - non='' if offerPreemptible else 'non-', - memory=mib_to_b(offerMemory), - cores=offerCores, - disk=mib_to_b(offerDisk))) + log.debug( + "Offer %(offer)s not suitable to run the tasks with requirements " + "%(requirements)r. Mesos offered %(memory)s memory, %(cores)s cores " + "and %(disk)s of disk on a %(non)spreemptible agent.", + dict( + offer=offer.id.value, + requirements=jobType.__dict__, + non="" if offerPreemptible else "non-", + memory=mib_to_b(offerMemory), + cores=offerCores, + disk=mib_to_b(offerDisk), + ), + ) runnableTasks.extend(runnableTasksOfType) # Launch all runnable tasks together so we only call launchTasks once per offer if runnableTasks: @@ -496,21 +541,27 @@ def resourceOffers(self, driver, offers): driver.launchTasks(offer.id, runnableTasks) self._updateStateToRunning(offer, runnableTasks) else: - log.debug('Although there are queued jobs, none of them could be run with offer %s ' - 'extended to the framework.', offer.id) + log.debug( + "Although there are queued jobs, none of them could be run with offer %s " + "extended to the framework.", + offer.id, + ) driver.declineOffer(offer.id) if unableToRun and time.time() > (self.lastTimeOfferLogged + self.logPeriod): self.lastTimeOfferLogged = time.time() - log.debug('Although there are queued jobs, none of them were able to run in ' - 'any of the offers extended to the framework. There are currently ' - '%i jobs running. Enable debug level logging to see more details about ' - 'job types and offers received.', len(self.runningJobMap)) + log.debug( + "Although there are queued jobs, none of them were able to run in " + "any of the offers extended to the framework. There are currently " + "%i jobs running. Enable debug level logging to see more details about " + "job types and offers received.", + len(self.runningJobMap), + ) def _trackOfferedNodes(self, offers): for offer in offers: # All AgentID messages are required to have a value according to the Mesos Protobuf file. - assert 'value' in offer.agent_id + assert "value" in offer.agent_id try: nodeAddress = socket.gethostbyname(offer.hostname) except: @@ -519,7 +570,7 @@ def _trackOfferedNodes(self, offers): self._registerNode(nodeAddress, offer.agent_id.value) preemptible = False for attribute in offer.attributes: - if attribute.name == 'preemptible': + if attribute.name == "preemptible": preemptible = strict_bool(attribute.text.value) if preemptible: try: @@ -532,11 +583,17 @@ def _trackOfferedNodes(self, offers): def _filterOfferedNodes(self, offers): if not self.nodeFilter: return offers - executorInfoOrNone = [self.executors.get(socket.gethostbyname(offer.hostname)) for offer in offers] + executorInfoOrNone = [ + self.executors.get(socket.gethostbyname(offer.hostname)) for offer in offers + ] executorInfos = [_f for _f in executorInfoOrNone if _f] executorsToConsider = list(filter(self.nodeFilter[0], executorInfos)) ipsToConsider = {ex.nodeAddress for ex in executorsToConsider} - return [offer for offer in offers if socket.gethostbyname(offer.hostname) in ipsToConsider] + return [ + offer + for offer in offers + if socket.gethostbyname(offer.hostname) in ipsToConsider + ] def _newMesosTask(self, job, offer): """ @@ -553,30 +610,36 @@ def _newMesosTask(self, job, offer): task.resources.append(addict.Dict()) cpus = task.resources[-1] - cpus.name = 'cpus' - cpus.type = 'SCALAR' + cpus.name = "cpus" + cpus.type = "SCALAR" cpus.scalar.value = job.resources.cores task.resources.append(addict.Dict()) disk = task.resources[-1] - disk.name = 'disk' - disk.type = 'SCALAR' + disk.name = "disk" + disk.type = "SCALAR" if b_to_mib(job.resources.disk) > 1: disk.scalar.value = b_to_mib(job.resources.disk) else: - log.warning("Job %s uses less disk than Mesos requires. Rounding %s up to 1 MiB.", - job.jobID, job.resources.disk) + log.warning( + "Job %s uses less disk than Mesos requires. Rounding %s up to 1 MiB.", + job.jobID, + job.resources.disk, + ) disk.scalar.value = 1 task.resources.append(addict.Dict()) mem = task.resources[-1] - mem.name = 'mem' - mem.type = 'SCALAR' + mem.name = "mem" + mem.type = "SCALAR" if b_to_mib(job.resources.memory) > 1: mem.scalar.value = b_to_mib(job.resources.memory) else: - log.warning("Job %s uses less memory than Mesos requires. Rounding %s up to 1 MiB.", - job.jobID, job.resources.memory) + log.warning( + "Job %s uses less memory than Mesos requires. Rounding %s up to 1 MiB.", + job.jobID, + job.resources.memory, + ) mem.scalar.value = 1 return task @@ -590,19 +653,34 @@ def statusUpdate(self, driver, update): agent sending the status update is lost/fails during that time). """ jobID = int(update.task_id.value) - log.debug("Job %i is in state '%s' due to reason '%s'.", jobID, update.state, update.reason) + log.debug( + "Job %i is in state '%s' due to reason '%s'.", + jobID, + update.state, + update.reason, + ) def jobEnded(_exitStatus, wallTime=None, exitReason=None): """ Notify external observers of the job ending. """ - self.updatedJobsQueue.put(UpdatedBatchJobInfo(jobID=jobID, exitStatus=_exitStatus, wallTime=wallTime, exitReason=exitReason)) + self.updatedJobsQueue.put( + UpdatedBatchJobInfo( + jobID=jobID, + exitStatus=_exitStatus, + wallTime=wallTime, + exitReason=exitReason, + ) + ) agentIP = None try: agentIP = self.runningJobMap[jobID].agentIP except KeyError: - log.warning("Job %i returned exit code %i but isn't tracked as running.", - jobID, _exitStatus) + log.warning( + "Job %i returned exit code %i but isn't tracked as running.", + jobID, + _exitStatus, + ) else: # Mark the job as no longer running. We MUST do this BEFORE # saying we killed the job, or it will be possible for another @@ -612,8 +690,11 @@ def jobEnded(_exitStatus, wallTime=None, exitReason=None): try: self.hostToJobIDs[agentIP].remove(jobID) except KeyError: - log.warning("Job %i returned exit code %i from unknown host.", - jobID, _exitStatus) + log.warning( + "Job %i returned exit code %i from unknown host.", + jobID, + _exitStatus, + ) try: self.killJobIds.remove(jobID) @@ -626,41 +707,62 @@ def jobEnded(_exitStatus, wallTime=None, exitReason=None): # state from other threads. self.killedJobIds.add(jobID) - if update.state == 'TASK_FINISHED': + if update.state == "TASK_FINISHED": # We get the running time of the job via the timestamp, which is in job-local time in seconds labels = update.labels.labels wallTime = None for label in labels: - if label['key'] == 'wallTime': - wallTime = float(label['value']) + if label["key"] == "wallTime": + wallTime = float(label["value"]) break - assert(wallTime is not None) + assert wallTime is not None jobEnded(0, wallTime=wallTime, exitReason=BatchJobExitReason.FINISHED) - elif update.state == 'TASK_FAILED': + elif update.state == "TASK_FAILED": try: exitStatus = int(update.message) except ValueError: exitStatus = EXIT_STATUS_UNAVAILABLE_VALUE - log.warning("Job %i failed with message '%s' due to reason '%s' on executor '%s' on agent '%s'.", - jobID, update.message, update.reason, - update.executor_id, update.agent_id) + log.warning( + "Job %i failed with message '%s' due to reason '%s' on executor '%s' on agent '%s'.", + jobID, + update.message, + update.reason, + update.executor_id, + update.agent_id, + ) else: - log.warning("Job %i failed with exit status %i and message '%s' due to reason '%s' on executor '%s' on agent '%s'.", - jobID, exitStatus, - update.message, update.reason, - update.executor_id, update.agent_id) + log.warning( + "Job %i failed with exit status %i and message '%s' due to reason '%s' on executor '%s' on agent '%s'.", + jobID, + exitStatus, + update.message, + update.reason, + update.executor_id, + update.agent_id, + ) jobEnded(exitStatus, exitReason=BatchJobExitReason.FAILED) - elif update.state == 'TASK_LOST': + elif update.state == "TASK_LOST": log.warning("Job %i is lost.", jobID) jobEnded(EXIT_STATUS_UNAVAILABLE_VALUE, exitReason=BatchJobExitReason.LOST) - elif update.state in ('TASK_KILLED', 'TASK_ERROR'): - log.warning("Job %i is in unexpected state %s with message '%s' due to reason '%s'.", - jobID, update.state, update.message, update.reason) - jobEnded(EXIT_STATUS_UNAVAILABLE_VALUE, - exitReason=(BatchJobExitReason.KILLED if update.state == 'TASK_KILLED' else BatchJobExitReason.ERROR)) - - if 'limitation' in update: + elif update.state in ("TASK_KILLED", "TASK_ERROR"): + log.warning( + "Job %i is in unexpected state %s with message '%s' due to reason '%s'.", + jobID, + update.state, + update.message, + update.reason, + ) + jobEnded( + EXIT_STATUS_UNAVAILABLE_VALUE, + exitReason=( + BatchJobExitReason.KILLED + if update.state == "TASK_KILLED" + else BatchJobExitReason.ERROR + ), + ) + + if "limitation" in update: log.warning("Job limit info: %s" % update.limitation) def frameworkMessage(self, driver, executorId, agentId, message): @@ -671,22 +773,31 @@ def frameworkMessage(self, driver, executorId, agentId, message): # Take it out of base 64 encoding from Protobuf message = decode_data(message).decode() - log.debug('Got framework message from executor %s running on agent %s: %s', - executorId.value, agentId.value, message) + log.debug( + "Got framework message from executor %s running on agent %s: %s", + executorId.value, + agentId.value, + message, + ) message = ast.literal_eval(message) assert isinstance(message, dict) # Handle the mandatory fields of a message - nodeAddress = message.pop('address') + nodeAddress = message.pop("address") executor = self._registerNode(nodeAddress, agentId.value) # Handle optional message fields for k, v in message.items(): - if k == 'nodeInfo': + if k == "nodeInfo": assert isinstance(v, dict) - resources = [taskData for taskData in self.runningJobMap.values() - if taskData.executorID == executorId.value] + resources = [ + taskData + for taskData in self.runningJobMap.values() + if taskData.executorID == executorId.value + ] requestedCores = sum(taskData.cores for taskData in resources) requestedMemory = sum(taskData.memory for taskData in resources) - executor.nodeInfo = NodeInfo(requestedCores=requestedCores, requestedMemory=requestedMemory, **v) + executor.nodeInfo = NodeInfo( + requestedCores=requestedCores, requestedMemory=requestedMemory, **v + ) self.executors[nodeAddress] = executor else: raise RuntimeError("Unknown message field '%s'." % k) @@ -699,10 +810,12 @@ def _registerNode(self, nodeAddress, agentId, nodePort=5051): """ executor = self.executors.get(nodeAddress) if executor is None or executor.agentId != agentId: - executor = self.ExecutorInfo(nodeAddress=nodeAddress, - agentId=agentId, - nodeInfo=None, - lastSeen=time.time()) + executor = self.ExecutorInfo( + nodeAddress=nodeAddress, + agentId=agentId, + nodeInfo=None, + lastSeen=time.time(), + ) self.executors[nodeAddress] = executor else: executor.lastSeen = time.time() @@ -712,9 +825,9 @@ def _registerNode(self, nodeAddress, agentId, nodePort=5051): return executor - def getNodes(self, - preemptible: Optional[bool] = None, - timeout: Optional[int] = None) -> Dict[str, NodeInfo]: + def getNodes( + self, preemptible: Optional[bool] = None, timeout: Optional[int] = None + ) -> dict[str, NodeInfo]: """ Return all nodes that match: - preemptible status (None includes all) @@ -722,7 +835,9 @@ def getNodes(self, """ nodes = dict() for node_ip, executor in self.executors.items(): - if preemptible is None or (preemptible == (executor.agentId not in self.nonPreemptibleNodes)): + if preemptible is None or ( + preemptible == (executor.agentId not in self.nonPreemptibleNodes) + ): if timeout is None or (time.time() - executor.lastSeen < timeout): nodes[node_ip] = executor.nodeInfo return nodes @@ -731,7 +846,7 @@ def reregistered(self, driver, masterInfo): """ Invoked when the scheduler re-registers with a newly elected Mesos master. """ - log.debug('Registered with new master') + log.debug("Registered with new master") def _handleFailedExecutor(self, agentID, executorID=None): """ @@ -746,8 +861,9 @@ def _handleFailedExecutor(self, agentID, executorID=None): Useful for debugging failing executor code. """ - log.warning("Handling failure of executor '%s' on agent '%s'.", - executorID, agentID) + log.warning( + "Handling failure of executor '%s' on agent '%s'.", executorID, agentID + ) try: # Look up the IP. We should always know it unless we get answers @@ -763,22 +879,27 @@ def _handleFailedExecutor(self, agentID, executorID=None): # it, and I can't find a good way to list it, because the API only # seems to report running containers. So we dump all the available # files with /files/debug and look for one that looks right. - filesQueryURL = errorLogURL = "http://%s:%d/files/debug" % \ - (agentAddress, agentPort) + filesQueryURL = errorLogURL = "http://%s:%d/files/debug" % ( + agentAddress, + agentPort, + ) # Download all the root mount points, which are in an object from # mounted name to real name filesDict = json.loads(urlopen(filesQueryURL).read()) - log.debug('Available files: %s', repr(filesDict.keys())) + log.debug("Available files: %s", repr(filesDict.keys())) # Generate filenames for each container pointing to where stderr should be stderrFilenames = [] # And look for the actual agent logs. agentLogFilenames = [] for filename in filesDict: - if (self.frameworkId in filename and agentID in filename and - (executorID is None or executorID in filename)): + if ( + self.frameworkId in filename + and agentID in filename + and (executorID is None or executorID in filename) + ): stderrFilenames.append("%s/stderr" % filename) elif filename.endswith("log"): @@ -793,10 +914,15 @@ def _handleFailedExecutor(self, agentID, executorID=None): # According to # http://mesos.apache.org/documentation/latest/sandbox/ we can use # the web API to fetch the error log. - errorLogURL = "http://%s:%d/files/download?path=%s" % \ - (agentAddress, agentPort, quote_plus(stderrFilename)) + errorLogURL = "http://%s:%d/files/download?path=%s" % ( + agentAddress, + agentPort, + quote_plus(stderrFilename), + ) - log.warning("Attempting to retrieve executor error log: %s", errorLogURL) + log.warning( + "Attempting to retrieve executor error log: %s", errorLogURL + ) for line in urlopen(errorLogURL): # Warn all the lines of the executor's error log @@ -808,8 +934,11 @@ def _handleFailedExecutor(self, agentID, executorID=None): for agentLogFilename in agentLogFilenames: try: - agentLogURL = "http://%s:%d/files/download?path=%s" % \ - (agentAddress, agentPort, quote_plus(agentLogFilename)) + agentLogURL = "http://%s:%d/files/download?path=%s" % ( + agentAddress, + agentPort, + quote_plus(agentLogFilename), + ) log.warning("Attempting to retrieve agent log: %s", agentLogURL) @@ -829,7 +958,7 @@ def executorLost(self, driver, executorId, agentId, status): Invoked when an executor has exited/terminated abnormally. """ - failedId = executorId.get('value', None) + failedId = executorId.get("value", None) log.warning("Executor '%s' reported lost with status '%s'.", failedId, status) @@ -840,20 +969,31 @@ def get_default_mesos_endpoint(cls) -> str: """ Get the default IP/hostname and port that we will look for Mesos at. """ - return f'{get_public_ip()}:5050' + return f"{get_public_ip()}:5050" @classmethod def add_options(cls, parser: Union[ArgumentParser, _ArgumentGroup]) -> None: - parser.add_argument("--mesosEndpoint", "--mesosMaster", dest="mesos_endpoint", default=None, - help=f"The host and port of the Mesos master separated by colon. If the provided value " - f"is None, the value will be generated at runtime. " - f"(Generated default: {cls.get_default_mesos_endpoint})") - parser.add_argument("--mesosFrameworkId", dest="mesos_framework_id", - help="Use a specific Mesos framework ID.") - parser.add_argument("--mesosRole", dest="mesos_role", - help="Use a Mesos role.") - parser.add_argument("--mesosName", dest="mesos_name", default="toil", - help="The Mesos name to use. (default: %(default)s)") + parser.add_argument( + "--mesosEndpoint", + "--mesosMaster", + dest="mesos_endpoint", + default=None, + help=f"The host and port of the Mesos master separated by colon. If the provided value " + f"is None, the value will be generated at runtime. " + f"(Generated default: {cls.get_default_mesos_endpoint})", + ) + parser.add_argument( + "--mesosFrameworkId", + dest="mesos_framework_id", + help="Use a specific Mesos framework ID.", + ) + parser.add_argument("--mesosRole", dest="mesos_role", help="Use a Mesos role.") + parser.add_argument( + "--mesosName", + dest="mesos_name", + default="toil", + help="The Mesos name to use. (default: %(default)s)", + ) @classmethod def setOptions(cls, setOption: OptionSetter): @@ -861,4 +1001,3 @@ def setOptions(cls, setOption: OptionSetter): setOption("mesos_name") setOption("mesos_role") setOption("mesos_framework_id") - diff --git a/src/toil/batchSystems/mesos/executor.py b/src/toil/batchSystems/mesos/executor.py index 831f7c6401..27bd1e04e2 100644 --- a/src/toil/batchSystems/mesos/executor.py +++ b/src/toil/batchSystems/mesos/executor.py @@ -51,14 +51,14 @@ def __init__(self): self.popenLock = threading.Lock() self.runningTasks = {} self.workerCleanupInfo = None - log.debug('Preparing system for resource download') + log.debug("Preparing system for resource download") Resource.prepareSystem() self.address = None self.id = None # Setting this value at this point will ensure that the toil workflow directory will go to # the mesos sandbox if the user hasn't specified --workDir on the command line. - if not os.getenv('TOIL_WORKDIR'): - os.environ['TOIL_WORKDIR'] = os.getcwd() + if not os.getenv("TOIL_WORKDIR"): + os.environ["TOIL_WORKDIR"] = os.getcwd() def registered(self, driver, executorInfo, frameworkInfo, agentInfo): """ @@ -66,11 +66,13 @@ def registered(self, driver, executorInfo, frameworkInfo, agentInfo): """ # Get the ID we have been assigned, if we have it - self.id = executorInfo.executor_id.get('value', None) + self.id = executorInfo.executor_id.get("value", None) log.debug("Registered executor %s with framework", self.id) self.address = socket.gethostbyname(agentInfo.hostname) - nodeInfoThread = threading.Thread(target=self._sendFrameworkMessage, args=[driver], daemon=True) + nodeInfoThread = threading.Thread( + target=self._sendFrameworkMessage, args=[driver], daemon=True + ) nodeInfoThread.start() def reregistered(self, driver, agentInfo): @@ -99,12 +101,12 @@ def killTask(self, driver, taskId): os.killpg(pgid, signal.SIGKILL) def shutdown(self, driver): - log.critical('Shutting down executor ...') + log.critical("Shutting down executor ...") for taskId in list(self.runningTasks.keys()): self.killTask(driver, taskId) Resource.cleanSystem() BatchSystemSupport.workerCleanup(self.workerCleanupInfo) - log.critical('... executor shut down.') + log.critical("... executor shut down.") def error(self, driver, message): """ @@ -123,13 +125,15 @@ def _sendFrameworkMessage(self, driver): message = Expando(address=self.address) psutil.cpu_percent() else: - message.nodeInfo = dict(coresUsed=float(psutil.cpu_percent()) * .01, - memoryUsed=float(psutil.virtual_memory().percent) * .01, - coresTotal=cpu_count(), - memoryTotal=psutil.virtual_memory().total, - workers=len(self.runningTasks)) + message.nodeInfo = dict( + coresUsed=float(psutil.cpu_percent()) * 0.01, + memoryUsed=float(psutil.virtual_memory().percent) * 0.01, + coresTotal=cpu_count(), + memoryTotal=psutil.virtual_memory().total, + workers=len(self.runningTasks), + ) log.debug("Send framework message: %s", message) - driver.sendFrameworkMessage(encode_data(repr(message).encode('utf-8'))) + driver.sendFrameworkMessage(encode_data(repr(message).encode("utf-8"))) # Prevent workers launched together from repeatedly hitting the leader at the same time time.sleep(random.randint(45, 75)) @@ -144,16 +148,21 @@ def runTask(): log.debug("Running task %s", task.task_id.value) startTime = time.time() - sendUpdate(task, 'TASK_RUNNING', wallTime=0) + sendUpdate(task, "TASK_RUNNING", wallTime=0) # try to unpickle the task try: taskData = pickle.loads(decode_data(task.data)) except: exc_info = sys.exc_info() - log.error('Exception while unpickling task: ', exc_info=exc_info) + log.error("Exception while unpickling task: ", exc_info=exc_info) exc_type, exc_value, exc_trace = exc_info - sendUpdate(task, 'TASK_FAILED', wallTime=0, msg=''.join(traceback.format_exception_only(exc_type, exc_value))) + sendUpdate( + task, + "TASK_FAILED", + wallTime=0, + msg="".join(traceback.format_exception_only(exc_type, exc_value)), + ) return # This is where task.data is first invoked. Using this position to setup cleanupInfo @@ -170,23 +179,27 @@ def runTask(): exitStatus = process.wait() wallTime = time.time() - startTime if 0 == exitStatus: - sendUpdate(task, 'TASK_FINISHED', wallTime) + sendUpdate(task, "TASK_FINISHED", wallTime) elif -9 == exitStatus: - sendUpdate(task, 'TASK_KILLED', wallTime) + sendUpdate(task, "TASK_KILLED", wallTime) else: - sendUpdate(task, 'TASK_FAILED', wallTime, msg=str(exitStatus)) + sendUpdate(task, "TASK_FAILED", wallTime, msg=str(exitStatus)) finally: del self.runningTasks[task.task_id.value] except: wallTime = time.time() - startTime exc_info = sys.exc_info() - log.error('Exception while running task:', exc_info=exc_info) + log.error("Exception while running task:", exc_info=exc_info) exc_type, exc_value, exc_trace = exc_info - sendUpdate(task, 'TASK_FAILED', wallTime=wallTime, msg=''.join(traceback.format_exception_only(exc_type, exc_value))) + sendUpdate( + task, + "TASK_FAILED", + wallTime=wallTime, + msg="".join(traceback.format_exception_only(exc_type, exc_value)), + ) wallTime = time.time() - startTime - sendUpdate(task, 'TASK_FINISHED', wallTime) - + sendUpdate(task, "TASK_FINISHED", wallTime) def runJob(job): """ @@ -200,13 +213,13 @@ def runJob(job): log.debug("Invoking command: '%s'", command) # Construct the job's environment jobEnv = dict(os.environ, **job.environment) - log.debug('Using environment variables: %s', jobEnv.keys()) + log.debug("Using environment variables: %s", jobEnv.keys()) with self.popenLock: - return subprocess.Popen(command, - preexec_fn=lambda: os.setpgrp(), - shell=True, env=jobEnv) + return subprocess.Popen( + command, preexec_fn=lambda: os.setpgrp(), shell=True, env=jobEnv + ) - def sendUpdate(task, taskState, wallTime, msg=''): + def sendUpdate(task, taskState, wallTime, msg=""): update = addict.Dict() update.task_id.value = task.task_id.value if self.id is not None: @@ -217,7 +230,7 @@ def sendUpdate(task, taskState, wallTime, msg=''): # Add wallTime as a label. labels = addict.Dict() - labels.labels = [{'key': 'wallTime', 'value': str(wallTime)}] + labels.labels = [{"key": "wallTime", "value": str(wallTime)}] update.labels = labels driver.sendStatusUpdate(update) @@ -239,34 +252,48 @@ def main(): if not os.environ.get("MESOS_AGENT_ENDPOINT"): # Some Mesos setups in our tests somehow lack this variable. Provide a # fake one to maybe convince the executor driver to work. - os.environ["MESOS_AGENT_ENDPOINT"] = os.environ.get("MESOS_SLAVE_ENDPOINT", "127.0.0.1:5051") - log.warning("Had to fake MESOS_AGENT_ENDPOINT as %s" % os.environ["MESOS_AGENT_ENDPOINT"]) + os.environ["MESOS_AGENT_ENDPOINT"] = os.environ.get( + "MESOS_SLAVE_ENDPOINT", "127.0.0.1:5051" + ) + log.warning( + "Had to fake MESOS_AGENT_ENDPOINT as %s" + % os.environ["MESOS_AGENT_ENDPOINT"] + ) # must be set manually to enable toggling of the mesos log level for debugging jenkins # may be useful: https://github.com/DataBiosphere/toil/pull/2338#discussion_r223854931 if False: try: - urlopen("http://%s/logging/toggle?level=1&duration=15mins" % os.environ["MESOS_AGENT_ENDPOINT"]).read() + urlopen( + "http://%s/logging/toggle?level=1&duration=15mins" + % os.environ["MESOS_AGENT_ENDPOINT"] + ).read() log.debug("Toggled agent log level") except Exception: log.debug("Failed to toggle agent log level") # Parse the agent state - agent_state = json.loads(urlopen("http://%s/state" % os.environ["MESOS_AGENT_ENDPOINT"]).read()) - if 'completed_frameworks' in agent_state: + agent_state = json.loads( + urlopen("http://%s/state" % os.environ["MESOS_AGENT_ENDPOINT"]).read() + ) + if "completed_frameworks" in agent_state: # Drop the completed frameworks which grow over time - del agent_state['completed_frameworks'] + del agent_state["completed_frameworks"] log.debug("Agent state: %s", str(agent_state)) log.debug("Virtual memory info in executor: %s" % repr(psutil.virtual_memory())) - if os.path.exists('/sys/fs/cgroup/memory'): + if os.path.exists("/sys/fs/cgroup/memory"): # Mesos can limit memory with a cgroup, so we should report on that. - for (dirpath, dirnames, filenames) in os.walk('/sys/fs/cgroup/memory', followlinks=True): + for dirpath, dirnames, filenames in os.walk( + "/sys/fs/cgroup/memory", followlinks=True + ): for filename in filenames: - if 'limit_in_bytes' not in filename: + if "limit_in_bytes" not in filename: continue - log.debug('cgroup memory info from %s:' % os.path.join(dirpath, filename)) + log.debug( + "cgroup memory info from %s:" % os.path.join(dirpath, filename) + ) try: for line in open(os.path.join(dirpath, filename)): log.debug(line.rstrip()) @@ -275,14 +302,13 @@ def main(): # Mesos can also impose rlimit limits, including on things that really # ought to not be limited, like virtual address space size. - log.debug('DATA rlimit: %s', str(resource.getrlimit(resource.RLIMIT_DATA))) - log.debug('STACK rlimit: %s', str(resource.getrlimit(resource.RLIMIT_STACK))) - log.debug('RSS rlimit: %s', str(resource.getrlimit(resource.RLIMIT_RSS))) - log.debug('AS rlimit: %s', str(resource.getrlimit(resource.RLIMIT_AS))) - + log.debug("DATA rlimit: %s", str(resource.getrlimit(resource.RLIMIT_DATA))) + log.debug("STACK rlimit: %s", str(resource.getrlimit(resource.RLIMIT_STACK))) + log.debug("RSS rlimit: %s", str(resource.getrlimit(resource.RLIMIT_RSS))) + log.debug("AS rlimit: %s", str(resource.getrlimit(resource.RLIMIT_AS))) executor = MesosExecutor() - log.debug('Made executor') + log.debug("Made executor") driver = MesosExecutorDriver(executor, use_addict=True) old_on_event = driver.on_event @@ -296,13 +322,15 @@ def patched_on_event(event): driver.on_event = patched_on_event - log.debug('Made driver') + log.debug("Made driver") driver.start() - log.debug('Started driver') + log.debug("Started driver") driver_result = driver.join() - log.debug('Joined driver') + log.debug("Joined driver") # Tolerate a None in addition to the code the docs suggest we should receive from join() - exit_value = 0 if (driver_result is None or driver_result == 'DRIVER_STOPPED') else 1 + exit_value = ( + 0 if (driver_result is None or driver_result == "DRIVER_STOPPED") else 1 + ) assert len(executor.runningTasks) == 0 sys.exit(exit_value) diff --git a/src/toil/batchSystems/mesos/test/__init__.py b/src/toil/batchSystems/mesos/test/__init__.py index dbf75a8e5c..dcd1a9a568 100644 --- a/src/toil/batchSystems/mesos/test/__init__.py +++ b/src/toil/batchSystems/mesos/test/__init__.py @@ -17,16 +17,18 @@ class MesosTestSupport: """Mixin for test cases that need a running Mesos master and agent on the local host.""" - @retry(intervals=[1, 1, 2, 4, 8, 16, 32, 64, 128], - log_message=(log.info, 'Checking if Mesos is ready...')) + @retry( + intervals=[1, 1, 2, 4, 8, 16, 32, 64, 128], + log_message=(log.info, "Checking if Mesos is ready..."), + ) def wait_for_master(self): - with closing(urlopen('http://127.0.0.1:5050/version')) as content: + with closing(urlopen("http://127.0.0.1:5050/version")) as content: content.read() def _startMesos(self, numCores=None): if numCores is None: numCores = cpu_count() - shutil.rmtree('/tmp/mesos', ignore_errors=True) + shutil.rmtree("/tmp/mesos", ignore_errors=True) self.master = self.MesosMasterThread(numCores) self.master.start() self.agent = self.MesosAgentThread(numCores) @@ -35,7 +37,7 @@ def _startMesos(self, numCores=None): # Bad Things will happen if the master is not yet ready when Toil tries to use it. self.wait_for_master() - log.info('Mesos is ready! Running test.') + log.info("Mesos is ready! Running test.") def _stopProcess(self, process, timeout=10) -> None: """Gracefully stop a process on a timeout, given the Popen object for the process.""" @@ -47,7 +49,7 @@ def _stopProcess(self, process, timeout=10) -> None: waited += 1 if process.poll() is None: # It didn't shut down gracefully - log.warning('Forcibly killing child which ignored SIGTERM') + log.warning("Forcibly killing child which ignored SIGTERM") process.kill() def _stopMesos(self): @@ -71,7 +73,7 @@ def mesosCommand(self): def tryRun(self): self.popen.wait() - log.info('Exiting %s', self.__class__.__name__) + log.info("Exiting %s", self.__class__.__name__) def findMesosBinary(self, names): if isinstance(names, str): @@ -86,7 +88,7 @@ def findMesosBinary(self, names): # Special case for users of PyCharm on OS X. This is where Homebrew installs # it. It's hard to set PATH for PyCharm (or any GUI app) on OS X so let's # make it easy for those poor souls. - return which(name, path='/usr/local/sbin') + return which(name, path="/usr/local/sbin") except StopIteration: pass @@ -94,18 +96,22 @@ def findMesosBinary(self, names): if len(names) == 1: sought = "binary '%s'" % names[0] else: - sought = 'any binary in %s' % str(names) + sought = "any binary in %s" % str(names) - raise RuntimeError("Cannot find %s. Make sure Mesos is installed " - "and it's 'bin' directory is present on the PATH." % sought) + raise RuntimeError( + "Cannot find %s. Make sure Mesos is installed " + "and it's 'bin' directory is present on the PATH." % sought + ) class MesosMasterThread(MesosThread): def mesosCommand(self): - return [self.findMesosBinary('mesos-master'), - '--registry=in_memory', - '--ip=127.0.0.1', - '--port=5050', - '--allocation_interval=500ms'] + return [ + self.findMesosBinary("mesos-master"), + "--registry=in_memory", + "--ip=127.0.0.1", + "--port=5050", + "--allocation_interval=500ms", + ] class MesosAgentThread(MesosThread): def mesosCommand(self): @@ -114,10 +120,12 @@ def mesosCommand(self): # We also make sure to point it explicitly at the right temp work directory, and # to disable systemd support because we have to be root to make systemd make us # things and we probably aren't when testing. - return [self.findMesosBinary(['mesos-agent']), - '--ip=127.0.0.1', - '--master=127.0.0.1:5050', - '--attributes=preemptible:False', - '--resources=cpus(*):%i' % self.numCores, - '--work_dir=/tmp/mesos', - '--no-systemd_enable_support'] + return [ + self.findMesosBinary(["mesos-agent"]), + "--ip=127.0.0.1", + "--master=127.0.0.1:5050", + "--attributes=preemptible:False", + "--resources=cpus(*):%i" % self.numCores, + "--work_dir=/tmp/mesos", + "--no-systemd_enable_support", + ] diff --git a/src/toil/batchSystems/options.py b/src/toil/batchSystems/options.py index bfab8e5718..7ebc0a6ced 100644 --- a/src/toil/batchSystems/options.py +++ b/src/toil/batchSystems/options.py @@ -12,22 +12,19 @@ # See the License for the specific language governing permissions and import logging -import sys from argparse import ArgumentParser, _ArgumentGroup -from typing import Any, Callable, List, Optional, TypeVar, Union +from typing import Any, Callable, Optional, Protocol, TypeVar, Union -if sys.version_info >= (3, 8): - from typing import Protocol -else: - from typing_extensions import Protocol - -from toil.batchSystems.registry import (DEFAULT_BATCH_SYSTEM, - get_batch_system, - get_batch_systems) +from toil.batchSystems.registry import ( + DEFAULT_BATCH_SYSTEM, + get_batch_system, + get_batch_systems, +) from toil.lib.threading import cpu_count logger = logging.getLogger(__name__) + class OptionSetter(Protocol): """ Protocol for the setOption function we get to let us set up CLI options for @@ -36,19 +33,22 @@ class OptionSetter(Protocol): Actual functionality is defined in the Config class. """ - OptionType = TypeVar('OptionType') + OptionType = TypeVar("OptionType") + def __call__( self, option_name: str, parsing_function: Optional[Callable[[Any], OptionType]] = None, check_function: Optional[Callable[[OptionType], Union[None, bool]]] = None, default: Optional[OptionType] = None, - env: Optional[List[str]] = None, - old_names: Optional[List[str]] = None - ) -> bool: - ... + env: Optional[list[str]] = None, + old_names: Optional[list[str]] = None, + ) -> bool: ... + -def set_batchsystem_options(batch_system: Optional[str], set_option: OptionSetter) -> None: +def set_batchsystem_options( + batch_system: Optional[str], set_option: OptionSetter +) -> None: """ Call set_option for all the options for the given named batch system, or all batch systems if no name is provided. @@ -110,11 +110,11 @@ def add_all_batchsystem_options(parser: Union[ArgumentParser, _ArgumentGroup]) - parser.add_argument( "--maxJobs", dest="max_jobs", - default=SYS_MAX_SIZE, # This is *basically* unlimited and saves a lot of Optional[] + default=SYS_MAX_SIZE, # This is *basically* unlimited and saves a lot of Optional[] type=lambda x: int(x) or SYS_MAX_SIZE, help="Specifies the maximum number of jobs to submit to the " - "backing scheduler at once. Not supported on Mesos or " - "AWS Batch. Use 0 for unlimited. Defaults to unlimited.", + "backing scheduler at once. Not supported on Mesos or " + "AWS Batch. Use 0 for unlimited. Defaults to unlimited.", ) parser.add_argument( "--maxLocalJobs", @@ -122,8 +122,8 @@ def add_all_batchsystem_options(parser: Union[ArgumentParser, _ArgumentGroup]) - default=None, type=lambda x: int(x) or 0, help=f"Specifies the maximum number of housekeeping jobs to " - f"run sumultaneously on the local system. Use 0 for " - f"unlimited. Defaults to the number of local cores ({cpu_count()}).", + f"run sumultaneously on the local system. Use 0 for " + f"unlimited. Defaults to the number of local cores ({cpu_count()}).", ) parser.add_argument( "--manualMemArgs", @@ -162,8 +162,8 @@ def add_all_batchsystem_options(parser: Union[ArgumentParser, _ArgumentGroup]) - type=int, default=None, help="Time, in seconds, to wait before doing a scheduler query for job state. " - "Return cached results if within the waiting period. Only works for grid " - "engine batch systems such as gridengine, htcondor, torque, slurm, and lsf." + "Return cached results if within the waiting period. Only works for grid " + "engine batch systems such as gridengine, htcondor, torque, slurm, and lsf.", ) parser.add_argument( "--statePollingTimeout", @@ -171,7 +171,7 @@ def add_all_batchsystem_options(parser: Union[ArgumentParser, _ArgumentGroup]) - type=int, default=1200, help="Time, in seconds, to retry against a broken scheduler. Only works for grid " - "engine batch systems such as gridengine, htcondor, torque, slurm, and lsf." + "engine batch systems such as gridengine, htcondor, torque, slurm, and lsf.", ) parser.add_argument( "--batchLogsDir", @@ -179,15 +179,20 @@ def add_all_batchsystem_options(parser: Union[ArgumentParser, _ArgumentGroup]) - default=None, env_var="TOIL_BATCH_LOGS_DIR", help="Directory to tell the backing batch system to log into. Should be available " - "on both the leader and the workers, if the backing batch system writes logs " - "to the worker machines' filesystems, as many HPC schedulers do. If unset, " - "the Toil work directory will be used. Only works for grid engine batch " - "systems such as gridengine, htcondor, torque, slurm, and lsf." + "on both the leader and the workers, if the backing batch system writes logs " + "to the worker machines' filesystems, as many HPC schedulers do. If unset, " + "the Toil work directory will be used. Only works for grid engine batch " + "systems such as gridengine, htcondor, torque, slurm, and lsf.", ) - parser.add_argument('--memoryIsProduct', dest='memory_is_product', default=False, action="store_true", - help="If the batch system understands requested memory as a product of the requested memory and the number" - "of cores, set this flag to properly allocate memory.") + parser.add_argument( + "--memoryIsProduct", + dest="memory_is_product", + default=False, + action="store_true", + help="If the batch system understands requested memory as a product of the requested memory and the number" + "of cores, set this flag to properly allocate memory.", + ) for name in get_batch_systems(): # All the batch systems are responsible for adding their own options @@ -198,5 +203,5 @@ def add_all_batchsystem_options(parser: Union[ArgumentParser, _ArgumentGroup]) - # Skip anything we can't import continue # Ask the batch system to create its options in the parser - logger.debug('Add options for %s batch system', name) + logger.debug("Add options for %s batch system", name) batch_system_type.add_options(parser) diff --git a/src/toil/batchSystems/registry.py b/src/toil/batchSystems/registry.py index 93ffda64b1..700a9c389a 100644 --- a/src/toil/batchSystems/registry.py +++ b/src/toil/batchSystems/registry.py @@ -16,7 +16,8 @@ import logging import pkgutil import warnings -from typing import TYPE_CHECKING, Callable, Dict, List, Sequence, Tuple, Type +from collections.abc import Sequence +from typing import TYPE_CHECKING, Callable from toil.lib.compatibility import deprecated from toil.lib.memoize import memoize @@ -30,7 +31,10 @@ # Plugin system/API ##### -def add_batch_system_factory(key: str, class_factory: Callable[[], Type['AbstractBatchSystem']]): + +def add_batch_system_factory( + key: str, class_factory: Callable[[], type["AbstractBatchSystem"]] +): """ Adds a batch system to the registry for workflow or plugin-supplied batch systems. @@ -39,6 +43,7 @@ def add_batch_system_factory(key: str, class_factory: Callable[[], Type['Abstrac _registry_keys.append(key) _registry[key] = class_factory + def get_batch_systems() -> Sequence[str]: """ Get the names of all the availsble batch systems. @@ -47,7 +52,8 @@ def get_batch_systems() -> Sequence[str]: return _registry_keys -def get_batch_system(key: str) -> Type['AbstractBatchSystem']: + +def get_batch_system(key: str) -> type["AbstractBatchSystem"]: """ Get a batch system class by name. @@ -58,68 +64,81 @@ def get_batch_system(key: str) -> Type['AbstractBatchSystem']: return _registry[key]() -DEFAULT_BATCH_SYSTEM = 'single_machine' +DEFAULT_BATCH_SYSTEM = "single_machine" ##### # Built-in batch systems ##### + def aws_batch_batch_system_factory(): from toil.batchSystems.awsBatch import AWSBatchBatchSystem + return AWSBatchBatchSystem + def gridengine_batch_system_factory(): from toil.batchSystems.gridengine import GridEngineBatchSystem + return GridEngineBatchSystem def lsf_batch_system_factory(): from toil.batchSystems.lsf import LSFBatchSystem + return LSFBatchSystem def single_machine_batch_system_factory(): from toil.batchSystems.singleMachine import SingleMachineBatchSystem + return SingleMachineBatchSystem def mesos_batch_system_factory(): from toil.batchSystems.mesos.batchSystem import MesosBatchSystem + return MesosBatchSystem def slurm_batch_system_factory(): from toil.batchSystems.slurm import SlurmBatchSystem + return SlurmBatchSystem + def torque_batch_system_factory(): from toil.batchSystems.torque import TorqueBatchSystem + return TorqueBatchSystem def htcondor_batch_system_factory(): from toil.batchSystems.htcondor import HTCondorBatchSystem + return HTCondorBatchSystem def kubernetes_batch_system_factory(): from toil.batchSystems.kubernetes import KubernetesBatchSystem + return KubernetesBatchSystem + ##### # Registry implementation ##### -_registry: Dict[str, Callable[[], Type["AbstractBatchSystem"]]] = { - 'aws_batch' : aws_batch_batch_system_factory, - 'single_machine' : single_machine_batch_system_factory, - 'grid_engine' : gridengine_batch_system_factory, - 'lsf' : lsf_batch_system_factory, - 'mesos' : mesos_batch_system_factory, - 'slurm' : slurm_batch_system_factory, - 'torque' : torque_batch_system_factory, - 'htcondor' : htcondor_batch_system_factory, - 'kubernetes' : kubernetes_batch_system_factory +_registry: dict[str, Callable[[], type["AbstractBatchSystem"]]] = { + "aws_batch": aws_batch_batch_system_factory, + "single_machine": single_machine_batch_system_factory, + "grid_engine": gridengine_batch_system_factory, + "lsf": lsf_batch_system_factory, + "mesos": mesos_batch_system_factory, + "slurm": slurm_batch_system_factory, + "torque": torque_batch_system_factory, + "htcondor": htcondor_batch_system_factory, + "kubernetes": kubernetes_batch_system_factory, } _registry_keys = list(_registry.keys()) @@ -127,6 +146,7 @@ def kubernetes_batch_system_factory(): # add_batch_system_factory() _PLUGIN_NAME_PREFIX = "toil_batch_system_" + @memoize def _load_all_plugins() -> None: """ @@ -139,6 +159,7 @@ def _load_all_plugins() -> None: # If it is a Toil batch system plugin, import it importlib.import_module(name) + ##### # Deprecated API ##### @@ -155,17 +176,24 @@ def __getattr__(name): See . """ if name == "BATCH_SYSTEM_FACTORY_REGISTRY": - warnings.warn("BATCH_SYSTEM_FACTORY_REGISTRY is deprecated; use get_batch_system() or add_batch_system_factory()", DeprecationWarning) + warnings.warn( + "BATCH_SYSTEM_FACTORY_REGISTRY is deprecated; use get_batch_system() or add_batch_system_factory()", + DeprecationWarning, + ) return _registry elif name == "BATCH_SYSTEMS": - warnings.warn("BATCH_SYSTEMS is deprecated; use get_batch_systems()", DeprecationWarning) + warnings.warn( + "BATCH_SYSTEMS is deprecated; use get_batch_systems()", DeprecationWarning + ) return _registry_keys else: raise AttributeError(f"Module {__name__} ahs no attribute {name}") @deprecated(new_function_name="add_batch_system_factory") -def addBatchSystemFactory(key: str, batchSystemFactory: Callable[[], Type['AbstractBatchSystem']]): +def addBatchSystemFactory( + key: str, batchSystemFactory: Callable[[], type["AbstractBatchSystem"]] +): """ Deprecated method to add a batch system. """ @@ -180,7 +208,10 @@ def addBatchSystemFactory(key: str, batchSystemFactory: Callable[[], Type['Abstr # the globals because module-level globals are their own references, so we # can't touch this module's global name bindings from a client module. -def save_batch_system_plugin_state() -> Tuple[List[str], Dict[str, Callable[[], Type['AbstractBatchSystem']]]]: + +def save_batch_system_plugin_state() -> ( + tuple[list[str], dict[str, Callable[[], type["AbstractBatchSystem"]]]] +): """ Return a snapshot of the plugin registry that can be restored to remove added plugins. Useful for testing the plugin system in-process with other @@ -190,7 +221,10 @@ def save_batch_system_plugin_state() -> Tuple[List[str], Dict[str, Callable[[], snapshot = (list(_registry_keys), dict(_registry)) return snapshot -def restore_batch_system_plugin_state(snapshot: Tuple[List[str], Dict[str, Callable[[], Type['AbstractBatchSystem']]]]): + +def restore_batch_system_plugin_state( + snapshot: tuple[list[str], dict[str, Callable[[], type["AbstractBatchSystem"]]]] +): """ Restore the batch system registry state to a snapshot from save_batch_system_plugin_state(). diff --git a/src/toil/batchSystems/singleMachine.py b/src/toil/batchSystems/singleMachine.py index 8e69439f5d..711b8b5cb3 100644 --- a/src/toil/batchSystems/singleMachine.py +++ b/src/toil/batchSystems/singleMachine.py @@ -20,29 +20,36 @@ import time import traceback from argparse import ArgumentParser, _ArgumentGroup +from collections.abc import Sequence from queue import Empty, Queue from threading import Event, Lock, Thread -from typing import Dict, List, Optional, Sequence, Set, Tuple, Union +from typing import Optional, Union import toil from toil import worker as toil_worker -from toil.batchSystems.abstractBatchSystem import (EXIT_STATUS_UNAVAILABLE_VALUE, - BatchSystemSupport, - InsufficientSystemResources, - ResourcePool, - ResourceSet, - UpdatedBatchJobInfo) +from toil.batchSystems.abstractBatchSystem import ( + EXIT_STATUS_UNAVAILABLE_VALUE, + BatchSystemSupport, + InsufficientSystemResources, + ResourcePool, + ResourceSet, + UpdatedBatchJobInfo, +) from toil.batchSystems.options import OptionSetter from toil.bus import ExternalBatchIdMessage from toil.common import Config, Toil -from toil.options.common import SYS_MAX_SIZE, make_open_interval_action -from toil.job import (AcceleratorRequirement, - JobDescription, - Requirer, - accelerator_satisfies) -from toil.lib.accelerators import (get_individual_local_accelerators, - get_restrictive_environment_for_local_accelerators) +from toil.job import ( + AcceleratorRequirement, + JobDescription, + Requirer, + accelerator_satisfies, +) +from toil.lib.accelerators import ( + get_individual_local_accelerators, + get_restrictive_environment_for_local_accelerators, +) from toil.lib.threading import cpu_count +from toil.options.common import SYS_MAX_SIZE, make_open_interval_action logger = logging.getLogger(__name__) @@ -84,7 +91,12 @@ def supportsWorkerCleanup(cls): physicalMemory = toil.physicalMemory() def __init__( - self, config: Config, maxCores: float, maxMemory: int, maxDisk: int, max_jobs: Optional[int] = None + self, + config: Config, + maxCores: float, + maxMemory: int, + maxDisk: int, + max_jobs: Optional[int] = None, ) -> None: self.config = config @@ -102,22 +114,38 @@ def __init__( # If we don't have up to the limit of the resource (and the resource # isn't the inlimited sentinel), warn. if maxCores > self.numCores: - if maxCores != SYS_MAX_SIZE and maxCores != float('inf'): + if maxCores != SYS_MAX_SIZE and maxCores != float("inf"): # We have an actually specified limit and not the default - logger.warning('Not enough cores! User limited to %i but we only have %i.', maxCores, self.numCores) + logger.warning( + "Not enough cores! User limited to %i but we only have %i.", + maxCores, + self.numCores, + ) maxCores = self.numCores if maxMemory > self.physicalMemory: - if maxMemory < SYS_MAX_SIZE: # todo: looks like humans2bytes converts SYS_MAX_SIZE to SYS_MAX_SIZE+1 + if ( + maxMemory < SYS_MAX_SIZE + ): # todo: looks like humans2bytes converts SYS_MAX_SIZE to SYS_MAX_SIZE+1 # We have an actually specified limit and not the default - logger.warning('Not enough memory! User limited to %i bytes but we only have %i bytes.', maxMemory, self.physicalMemory) + logger.warning( + "Not enough memory! User limited to %i bytes but we only have %i bytes.", + maxMemory, + self.physicalMemory, + ) maxMemory = self.physicalMemory - workdir = Toil.getLocalWorkflowDir(config.workflowID, config.workDir) # config.workDir may be None; this sets a real directory + workdir = Toil.getLocalWorkflowDir( + config.workflowID, config.workDir + ) # config.workDir may be None; this sets a real directory self.physicalDisk = toil.physicalDisk(workdir) if maxDisk > self.physicalDisk: if maxDisk < SYS_MAX_SIZE: # same as maxMemory logger.warning # We have an actually specified limit and not the default - logger.warning('Not enough disk space! User limited to %i bytes but we only have %i bytes.', maxDisk, self.physicalDisk) + logger.warning( + "Not enough disk space! User limited to %i bytes but we only have %i bytes.", + maxDisk, + self.physicalDisk, + ) maxDisk = self.physicalDisk super().__init__(config, maxCores, maxMemory, maxDisk) @@ -132,8 +160,10 @@ def __init__( if config.badWorker > 0 and config.debugWorker: # We can't throw SIGUSR1 at the worker because it is also going to # be the leader and/or test harness. - raise RuntimeError("Cannot use badWorker and debugWorker together; " - "worker would have to kill the leader") + raise RuntimeError( + "Cannot use badWorker and debugWorker together; " + "worker would have to kill the leader" + ) self.debugWorker = config.debugWorker @@ -143,7 +173,7 @@ def __init__( self.jobIndexLock = Lock() # A dictionary mapping batch system IDs of submitted jobs to the command line - self.jobs: Dict[int, JobDescription] = {} + self.jobs: dict[int, JobDescription] = {} # A queue of jobs waiting to be executed. Consumed by the daddy thread. self.inputQueue = Queue() @@ -152,15 +182,15 @@ def __init__( self.outputQueue = Queue() # A dictionary mapping batch system IDs of currently running jobs to their Info objects - self.runningJobs: Dict[int, Info] = {} + self.runningJobs: dict[int, Info] = {} # These next two are only used outside debug-worker mode # A dict mapping PIDs to Popen objects for running jobs. # Jobs that don't fork are executed one at a time in the main thread. - self.children: Dict[int, subprocess.Popen] = {} + self.children: dict[int, subprocess.Popen] = {} # A dict mapping child PIDs to the Job IDs they are supposed to be running. - self.childToJob: Dict[int, str] = {} + self.childToJob: dict[int, str] = {} # For accelerators, we need a collection of what each accelerator is, and an acquirable set of them. self.accelerator_identities = get_individual_local_accelerators() @@ -168,15 +198,15 @@ def __init__( # Put them all organized by resource type self.resource_sources = [ # A pool representing available job slots - ResourcePool(self.max_jobs, 'job slots'), + ResourcePool(self.max_jobs, "job slots"), # A pool representing available CPU in units of minCores - ResourcePool(int(self.maxCores / self.minCores), 'cores'), + ResourcePool(int(self.maxCores / self.minCores), "cores"), # A pool representing available memory in bytes - ResourcePool(self.maxMemory, 'memory'), + ResourcePool(self.maxMemory, "memory"), # A pool representing the available space in bytes - ResourcePool(self.maxDisk, 'disk'), + ResourcePool(self.maxDisk, "disk"), # And a set for acquiring individual accelerators - ResourceSet(set(range(len(self.accelerator_identities))), 'accelerators') + ResourceSet(set(range(len(self.accelerator_identities))), "accelerators"), ] # If we can't schedule something, we fill this in with a reason why @@ -192,11 +222,11 @@ def __init__( self.daddyException: Optional[Exception] = None if self.debugWorker: - logger.debug('Started batch system %s in worker debug mode.', id(self)) + logger.debug("Started batch system %s in worker debug mode.", id(self)) else: self.daddyThread = Thread(target=self.daddy, daemon=True) self.daddyThread.start() - logger.debug('Started batch system %s in normal mode.', id(self)) + logger.debug("Started batch system %s in normal mode.", id(self)) def daddy(self): """ @@ -214,7 +244,7 @@ def daddy(self): """ try: - logger.debug('Started daddy thread for batch system %s.', id(self)) + logger.debug("Started daddy thread for batch system %s.", id(self)) while not self.shuttingDown.is_set(): # Main loop @@ -224,13 +254,28 @@ def daddy(self): try: # Grab something from the input queue if available. args = self.inputQueue.get_nowait() - jobCommand, jobID, jobCores, jobMemory, jobDisk, job_accelerators, environment = args + ( + jobCommand, + jobID, + jobCores, + jobMemory, + jobDisk, + job_accelerators, + environment, + ) = args coreFractions = int(jobCores / self.minCores) # Try to start the child - result = self._startChild(jobCommand, jobID, - coreFractions, jobMemory, jobDisk, job_accelerators, environment) + result = self._startChild( + jobCommand, + jobID, + coreFractions, + jobMemory, + jobDisk, + job_accelerators, + environment, + ) if result is None: # We did not get the resources to run this job. @@ -241,12 +286,15 @@ def daddy(self): self.inputQueue.put(args) break elif result is not False: - #Result is a PID + # Result is a PID if self._outbox is not None: # Annotate the job with the PID generated. self._outbox.publish( - ExternalBatchIdMessage(jobID, str(result), self.__class__.__name__)) + ExternalBatchIdMessage( + jobID, str(result), self.__class__.__name__ + ) + ) # Otherwise False @@ -265,18 +313,28 @@ def daddy(self): # For now we just sleep and loop. time.sleep(0.01) - # When we get here, we are shutting down. - logger.debug('Daddy thread cleaning up %d remaining children for batch system %s...', len(self.children), id(self)) + logger.debug( + "Daddy thread cleaning up %d remaining children for batch system %s...", + len(self.children), + id(self), + ) self._stop_and_wait(self.children.values()) - logger.debug('Daddy thread for batch system %s finishing because no children should now exist', id(self)) + logger.debug( + "Daddy thread for batch system %s finishing because no children should now exist", + id(self), + ) # Then exit the thread. return except Exception as e: - logger.critical('Unhandled exception in daddy thread for batch system %s: %s', id(self), traceback.format_exc()) + logger.critical( + "Unhandled exception in daddy thread for batch system %s: %s", + id(self), + traceback.format_exc(), + ) # Pass the exception back to the main thread so it can stop the next person who calls into us. self.daddyException = e raise @@ -284,15 +342,17 @@ def daddy(self): def _checkOnDaddy(self): if self.daddyException is not None: # The daddy thread broke and we cannot do our job - logger.critical('Propagating unhandled exception in daddy thread to main thread') + logger.critical( + "Propagating unhandled exception in daddy thread to main thread" + ) exc = self.daddyException self.daddyException = None if isinstance(exc, Exception): raise exc else: - raise TypeError(f'Daddy thread failed with non-exception: {exc}') + raise TypeError(f"Daddy thread failed with non-exception: {exc}") - def _stop_now(self, popens: Sequence[subprocess.Popen]) -> List[int]: + def _stop_now(self, popens: Sequence[subprocess.Popen]) -> list[int]: """ Stop the given child processes and all their children. Does not reap them. @@ -322,7 +382,11 @@ def _stop_now(self, popens: Sequence[subprocess.Popen]) -> List[int]: # The child process really is in its own group, and not ours. # Kill the group, which hopefully hasn't been reused - logger.debug('Send shutdown kill to process group %s known to batch system %s', pgid, id(self)) + logger.debug( + "Send shutdown kill to process group %s known to batch system %s", + pgid, + id(self), + ) try: os.killpg(pgid, signal.SIGKILL) pgids.append(pgid) @@ -339,7 +403,9 @@ def _stop_now(self, popens: Sequence[subprocess.Popen]) -> List[int]: return pgids - def _stop_and_wait(self, popens: Sequence[subprocess.Popen], timeout: int = 5) -> None: + def _stop_and_wait( + self, popens: Sequence[subprocess.Popen], timeout: int = 5 + ) -> None: """ Stop the given child processes and all their children. Blocks until the processes are gone or timeout is passed. @@ -354,13 +420,17 @@ def _stop_and_wait(self, popens: Sequence[subprocess.Popen], timeout: int = 5) - for popen in popens: # Wait on all the children popen.wait() - logger.debug('Process %s known to batch system %s is stopped; it returned %s', - popen.pid, id(self), popen.returncode) + logger.debug( + "Process %s known to batch system %s is stopped; it returned %s", + popen.pid, + id(self), + popen.returncode, + ) # Make sure all child processes have received their kill signal self._wait_for_death(pgids, timeout) - def _wait_for_death(self, pgids: List[int], timeout: int = 5): + def _wait_for_death(self, pgids: list[int], timeout: int = 5): """ Wait for the process groups to be killed. Blocks until the processes are gone or timeout is passed. @@ -373,8 +443,11 @@ def _wait_for_death(self, pgids: List[int], timeout: int = 5): # process and its PGID may have been re-used. start = datetime.datetime.now() - while len(pgids) > 0 and (datetime.datetime.now() - start).total_seconds() < timeout: - new_pgids: List[int] = [] + while ( + len(pgids) > 0 + and (datetime.datetime.now() - start).total_seconds() < timeout + ): + new_pgids: list[int] = [] for pgid in pgids: try: # Send a kill to the group again, to see if anything in it @@ -399,9 +472,11 @@ def _wait_for_death(self, pgids: List[int], timeout: int = 5): if len(pgids) > 0: # If any processes are still alive, let user know that we may leave # behind dead but unreaped processes. - logger.warning('Processes were not reaped in groups: %s.', str(pgids)) - logger.warning('Make sure your jobs are cleaning up child processes appropriately to avoid zombie ' - 'processes possibly being left behind.') + logger.warning("Processes were not reaped in groups: %s.", str(pgids)) + logger.warning( + "Make sure your jobs are cleaning up child processes appropriately to avoid zombie " + "processes possibly being left behind." + ) def _pollForDoneChildrenIn(self, pid_to_popen): """ @@ -420,7 +495,7 @@ def _pollForDoneChildrenIn(self, pid_to_popen): ready = set() # Find the waitid function - waitid = getattr(os, 'waitid', None) + waitid = getattr(os, "waitid", None) if callable(waitid): # waitid exists (not Mac) @@ -439,7 +514,11 @@ def _pollForDoneChildrenIn(self, pid_to_popen): # instead of the weird C behavior of overwriting a field in # a pointed-to struct. siginfo = None - if siginfo is not None and siginfo.si_pid in pid_to_popen and siginfo.si_pid not in ready: + if ( + siginfo is not None + and siginfo.si_pid in pid_to_popen + and siginfo.si_pid not in ready + ): # Something new finished ready.add(siginfo.si_pid) else: @@ -454,7 +533,7 @@ def _pollForDoneChildrenIn(self, pid_to_popen): if popen.poll() is not None: # Process is done ready.add(pid) - logger.debug('Child %d has stopped', pid) + logger.debug("Child %d has stopped", pid) # Return all the done processes we found return ready @@ -473,19 +552,33 @@ def _runDebugJob(self, jobCommand, jobID, environment): if jobCommand.startswith("_toil_worker "): # We can actually run in this thread - jobName, jobStoreLocator, jobStoreID = jobCommand.split()[1:4] # Parse command + jobName, jobStoreLocator, jobStoreID = jobCommand.split()[ + 1:4 + ] # Parse command jobStore = Toil.resumeJobStore(jobStoreLocator) - statusCode = toil_worker.workerScript(jobStore, jobStore.config, jobName, jobStoreID, - redirect_output_to_log_file=not self.debugWorker) # Call the worker + statusCode = toil_worker.workerScript( + jobStore, + jobStore.config, + jobName, + jobStoreID, + redirect_output_to_log_file=not self.debugWorker, + ) # Call the worker else: # Run synchronously. If starting or running the command fails, let the exception stop us. - statusCode = subprocess.check_call(jobCommand, - shell=True, - env=dict(os.environ, **environment)) + statusCode = subprocess.check_call( + jobCommand, shell=True, env=dict(os.environ, **environment) + ) self.runningJobs.pop(jobID) if not info.killIntended: - self.outputQueue.put(UpdatedBatchJobInfo(jobID=jobID, exitStatus=statusCode, wallTime=time.time() - info.time, exitReason=None)) + self.outputQueue.put( + UpdatedBatchJobInfo( + jobID=jobID, + exitStatus=statusCode, + wallTime=time.time() - info.time, + exitReason=None, + ) + ) def getSchedulingStatusMessage(self): # Implement the abstractBatchSystem's scheduling status message API @@ -505,19 +598,25 @@ def check_resource_request(self, requirer: Requirer) -> None: super().check_resource_request(requirer) except InsufficientSystemResources as e: # Tack the scale onto the exception - e.details.append(f'Scale is set to {self.scale}.') + e.details.append(f"Scale is set to {self.scale}.") raise e def _check_accelerator_request(self, requirer: Requirer) -> None: - _, problem = self._identify_sufficient_accelerators(requirer.accelerators, set(range(len(self.accelerator_identities)))) + _, problem = self._identify_sufficient_accelerators( + requirer.accelerators, set(range(len(self.accelerator_identities))) + ) if problem is not None: # We can't get the accelerators - raise InsufficientSystemResources(requirer, 'accelerators', self.accelerator_identities, details=[ - f'The accelerator {problem} could not be provided.' - ]) - - - def _release_acquired_resources(self, resources: List[Union[int, Set[int]]]) -> None: + raise InsufficientSystemResources( + requirer, + "accelerators", + self.accelerator_identities, + details=[f"The accelerator {problem} could not be provided."], + ) + + def _release_acquired_resources( + self, resources: list[Union[int, set[int]]] + ) -> None: """ Release all resources acquired for a job. Assumes resources are in the order: core fractions, memory, disk, accelerators. @@ -526,11 +625,16 @@ def _release_acquired_resources(self, resources: List[Union[int, Set[int]]]) -> # What pools and sets do we want resources from for resource, request in zip(self.resource_sources, resources): - assert ((isinstance(resource, ResourcePool) and isinstance(request, int)) or - (isinstance(resource, ResourceSet) and isinstance(request, set))) + assert ( + isinstance(resource, ResourcePool) and isinstance(request, int) + ) or (isinstance(resource, ResourceSet) and isinstance(request, set)) resource.release(request) - def _identify_sufficient_accelerators(self, needed_accelerators: List[AcceleratorRequirement], available_accelerator_ids: Set[int]) -> Tuple[Optional[Set[int]], Optional[AcceleratorRequirement]]: + def _identify_sufficient_accelerators( + self, + needed_accelerators: list[AcceleratorRequirement], + available_accelerator_ids: set[int], + ) -> tuple[Optional[set[int]], Optional[AcceleratorRequirement]]: """ Given the accelerator requirements of a job, and the set of available accelerators out of our associated collection of accelerators, find a @@ -547,17 +651,17 @@ def _identify_sufficient_accelerators(self, needed_accelerators: List[Accelerato Ignores accelerator model constraints. """ - accelerators_needed: Set[int] = set() + accelerators_needed: set[int] = set() accelerators_still_available = set(available_accelerator_ids) for requirement in needed_accelerators: - for i in range(requirement['count']): + for i in range(requirement["count"]): # For each individual accelerator we need satisfied = False for candidate_index in accelerators_still_available: # Check all the ones we haven't grabbed yet # TODO: We'll re-check early ones against this requirement if it has a count of more than one. candidate = self.accelerator_identities[candidate_index] - if accelerator_satisfies(candidate, requirement, ignore=['model']): + if accelerator_satisfies(candidate, requirement, ignore=["model"]): # If this accelerator can satisfy one unit of this requirement. # We ignore model constraints because as a single # machine we can't really determine the models of @@ -577,7 +681,16 @@ def _identify_sufficient_accelerators(self, needed_accelerators: List[Accelerato # If we get here we satisfied everything return accelerators_needed, None - def _startChild(self, jobCommand, jobID, coreFractions, jobMemory, jobDisk, job_accelerators: List[AcceleratorRequirement], environment): + def _startChild( + self, + jobCommand, + jobID, + coreFractions, + jobMemory, + jobDisk, + job_accelerators: list[AcceleratorRequirement], + environment, + ): """ Start a child process for the given job. @@ -596,7 +709,12 @@ def _startChild(self, jobCommand, jobID, coreFractions, jobMemory, jobDisk, job_ # And what do we want from each resource in self.resource_sources? # We know they go job slot, cores, memory, disk, accelerators. - resource_requests: List[Union[int, Set[int]]] = [1, coreFractions, jobMemory, jobDisk] + resource_requests: list[Union[int, set[int]]] = [ + 1, + coreFractions, + jobMemory, + jobDisk, + ] # Keep a reference to the accelerators separately accelerators_needed = None @@ -604,31 +722,37 @@ def _startChild(self, jobCommand, jobID, coreFractions, jobMemory, jobDisk, job_ if job_accelerators: # Try and find some accelerators to use. # Start with all the accelerators that are free right now - accelerator_set : ResourceSet = self.resource_sources[-1] + accelerator_set: ResourceSet = self.resource_sources[-1] snapshot = accelerator_set.get_free_snapshot() # And build a plan of the ones we want - accelerators_needed, problem = self._identify_sufficient_accelerators(job_accelerators, snapshot) + accelerators_needed, problem = self._identify_sufficient_accelerators( + job_accelerators, snapshot + ) if accelerators_needed is not None: # Now we have a plan to get the accelerators we need. resource_requests.append(accelerators_needed) else: # We couldn't make a plan; the accelerators are busy assert problem is not None - logger.debug('Accelerators are busy: %s', problem) - self._setSchedulingStatusMessage('Not enough accelerators to run job %s' % jobID) + logger.debug("Accelerators are busy: %s", problem) + self._setSchedulingStatusMessage( + "Not enough accelerators to run job %s" % jobID + ) return None - acquired = [] for source, request in zip(self.resource_sources, resource_requests): # For each kind of resource we want, go get it - assert ((isinstance(source, ResourcePool) and isinstance(request, int)) or - (isinstance(source, ResourceSet) and isinstance(request, set))) + assert (isinstance(source, ResourcePool) and isinstance(request, int)) or ( + isinstance(source, ResourceSet) and isinstance(request, set) + ) if source.acquireNow(request): acquired.append(request) else: # We can't get everything - self._setSchedulingStatusMessage('Not enough {} to run job {}'.format(source.resource_type, jobID)) + self._setSchedulingStatusMessage( + f"Not enough {source.resource_type} to run job {jobID}" + ) self._release_acquired_resources(acquired) return None @@ -639,8 +763,12 @@ def _startChild(self, jobCommand, jobID, coreFractions, jobMemory, jobDisk, job_ # Communicate the accelerator resources, if any, to the child process # by modifying the environemnt - accelerators_acquired: Set[int] = accelerators_needed if accelerators_needed is not None else set() - child_environment.update(get_restrictive_environment_for_local_accelerators(accelerators_acquired)) + accelerators_acquired: set[int] = ( + accelerators_needed if accelerators_needed is not None else set() + ) + child_environment.update( + get_restrictive_environment_for_local_accelerators(accelerators_acquired) + ) # Actually run the job. # When it finishes we will release what it was using. @@ -656,18 +784,24 @@ def _startChild(self, jobCommand, jobID, coreFractions, jobMemory, jobDisk, job_ # process group ID will equal the PID of the process we # are starting. logger.debug("Attempting to run job command: %s", jobCommand) - popen = subprocess.Popen(jobCommand, - shell=True, - env=child_environment, - start_new_session=True) + popen = subprocess.Popen( + jobCommand, shell=True, env=child_environment, start_new_session=True + ) except Exception: # If the job can't start, make sure we release resources now self._release_acquired_resources(acquired) - logger.error('Could not start job %s: %s', jobID, traceback.format_exc()) + logger.error("Could not start job %s: %s", jobID, traceback.format_exc()) # Report as failed. - self.outputQueue.put(UpdatedBatchJobInfo(jobID=jobID, exitStatus=EXIT_STATUS_UNAVAILABLE_VALUE, wallTime=0, exitReason=None)) + self.outputQueue.put( + UpdatedBatchJobInfo( + jobID=jobID, + exitStatus=EXIT_STATUS_UNAVAILABLE_VALUE, + wallTime=0, + exitReason=None, + ) + ) # Complain it broke. return False @@ -680,7 +814,7 @@ def _startChild(self, jobCommand, jobID, coreFractions, jobMemory, jobDisk, job_ info = Info(startTime, popen, acquired, killIntended=False) self.runningJobs[jobID] = info - logger.debug('Launched job %s as child %d', jobID, popen.pid) + logger.debug("Launched job %s as child %d", jobID, popen.pid) # Report success starting the job # Note that if a PID were somehow 0 it would look like False @@ -704,13 +838,12 @@ def _handleChild(self, pid: int) -> None: # Get the job resources reserved by the job acquired = info.resources - # Clean up our records of the job. self.runningJobs.pop(jobID) self.childToJob.pop(pid) self.children.pop(pid) - if popen.returncode is None or not callable(getattr(os, 'waitid', None)): + if popen.returncode is None or not callable(getattr(os, "waitid", None)): # It isn't reaped yet, or we have to reap all children to see if thay're done. # Before we reap it (if possible), kill its PID as a PGID to make sure # it isn't leaving children behind. @@ -728,12 +861,22 @@ def _handleChild(self, pid: int) -> None: # See how the child did, and reap it. statusCode = popen.wait() if statusCode != 0 and not info.killIntended: - logger.error("Got exit code %i (indicating failure) " - "from job %s.", statusCode, self.jobs[jobID]) + logger.error( + "Got exit code %i (indicating failure) " "from job %s.", + statusCode, + self.jobs[jobID], + ) if not info.killIntended: # Report if the job failed and we didn't kill it. # If we killed it then it shouldn't show up in the queue. - self.outputQueue.put(UpdatedBatchJobInfo(jobID=jobID, exitStatus=statusCode, wallTime=time.time() - info.time, exitReason=None)) + self.outputQueue.put( + UpdatedBatchJobInfo( + jobID=jobID, + exitStatus=statusCode, + wallTime=time.time() - info.time, + exitReason=None, + ) + ) # Last attempt to make sure all processes in the group have received # their kill signals. @@ -742,22 +885,31 @@ def _handleChild(self, pid: int) -> None: # Free up the job's resources. self._release_acquired_resources(acquired) - logger.debug('Child %d for job %s succeeded', pid, jobID) + logger.debug("Child %d for job %s succeeded", pid, jobID) - def issueBatchJob(self, command: str, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int: + def issueBatchJob( + self, + command: str, + job_desc: JobDescription, + job_environment: Optional[dict[str, str]] = None, + ) -> int: """Adds the command and resources to a queue to be run.""" self._checkOnDaddy() # Apply scale in cores - scaled_desc = job_desc.scale('cores', self.scale) + scaled_desc = job_desc.scale("cores", self.scale) # Round cores up to multiples of minCores - scaled_desc.cores = max(math.ceil(scaled_desc.cores / self.minCores) * self.minCores, self.minCores) + scaled_desc.cores = max( + math.ceil(scaled_desc.cores / self.minCores) * self.minCores, self.minCores + ) # Don't do our own assertions about job size vs. our configured size. # The abstract batch system can handle it. self.check_resource_request(scaled_desc) - logger.debug(f"Issuing the command: {command} with {scaled_desc.requirements_string()}") + logger.debug( + f"Issuing the command: {command} with {scaled_desc.requirements_string()}" + ) with self.jobIndexLock: jobID = self.jobIndex self.jobIndex += 1 @@ -773,20 +925,29 @@ def issueBatchJob(self, command: str, job_desc: JobDescription, job_environment: self._runDebugJob(command, jobID, environment) else: # Queue the job for later - self.inputQueue.put((command, jobID, scaled_desc.cores, scaled_desc.memory, - scaled_desc.disk, scaled_desc.accelerators, environment)) + self.inputQueue.put( + ( + command, + jobID, + scaled_desc.cores, + scaled_desc.memory, + scaled_desc.disk, + scaled_desc.accelerators, + environment, + ) + ) return jobID - def killBatchJobs(self, jobIDs: List[int]) -> None: + def killBatchJobs(self, jobIDs: list[int]) -> None: """Kills jobs by ID.""" self._checkOnDaddy() - logger.debug(f'Killing jobs: {jobIDs}') + logger.debug(f"Killing jobs: {jobIDs}") # Collect the popen handles for the jobs we have to stop - popens: List[subprocess.Popen] = [] + popens: list[subprocess.Popen] = [] for jobID in jobIDs: if jobID in self.runningJobs: @@ -808,19 +969,21 @@ def killBatchJobs(self, jobIDs: List[int]) -> None: # Wait for the daddy thread to collect them. time.sleep(0.01) - def getIssuedBatchJobIDs(self) -> List[int]: + def getIssuedBatchJobIDs(self) -> list[int]: """Just returns all the jobs that have been run, but not yet returned as updated.""" self._checkOnDaddy() return list(self.jobs.keys()) - def getRunningBatchJobIDs(self) -> Dict[int, float]: + def getRunningBatchJobIDs(self) -> dict[int, float]: self._checkOnDaddy() now = time.time() - return {jobID: now - info.time for jobID, info in list(self.runningJobs.items())} + return { + jobID: now - info.time for jobID, info in list(self.runningJobs.items()) + } def shutdown(self) -> None: """Terminate cleanly and join daddy thread.""" @@ -847,11 +1010,17 @@ def getUpdatedBatchJob(self, maxWait: int) -> Optional[UpdatedBatchJobInfo]: @classmethod def add_options(cls, parser: Union[ArgumentParser, _ArgumentGroup]) -> None: - parser.add_argument("--scale", dest="scale", type=float, default=1, action=make_open_interval_action(0.0), - help="A scaling factor to change the value of all submitted tasks's submitted cores. " - "Used in the single_machine batch system. Useful for running workflows on " - "smaller machines than they were designed for, by setting a value less than 1. " - "(default: %(default)s)") + parser.add_argument( + "--scale", + dest="scale", + type=float, + default=1, + action=make_open_interval_action(0.0), + help="A scaling factor to change the value of all submitted tasks's submitted cores. " + "Used in the single_machine batch system. Useful for running workflows on " + "smaller machines than they were designed for, by setting a value less than 1. " + "(default: %(default)s)", + ) @classmethod def setOptions(cls, setOption: OptionSetter): @@ -866,6 +1035,7 @@ class Info: (or None), the tuple of (coreFractions, memory, disk) it is using (or None), and whether the job is supposed to be being killed. """ + # Can't use namedtuple here since killIntended needs to be mutable def __init__(self, startTime, popen, resources, killIntended): self.time = startTime diff --git a/src/toil/batchSystems/slurm.py b/src/toil/batchSystems/slurm.py index b516a90921..3ddddff609 100644 --- a/src/toil/batchSystems/slurm.py +++ b/src/toil/batchSystems/slurm.py @@ -17,16 +17,21 @@ import math import os import sys -from argparse import ArgumentParser, _ArgumentGroup, SUPPRESS +from argparse import SUPPRESS, ArgumentParser, _ArgumentGroup from shlex import quote -from typing import Dict, List, Optional, Set, Tuple, TypeVar, Union, NamedTuple - +from typing import NamedTuple, TypeVar + +from toil.batchSystems.abstractBatchSystem import ( + EXIT_STATUS_UNAVAILABLE_VALUE, + BatchJobExitReason, + InsufficientSystemResources, +) +from toil.batchSystems.abstractGridEngineBatchSystem import ( + AbstractGridEngineBatchSystem, +) +from toil.batchSystems.options import OptionSetter from toil.bus import get_job_kind from toil.common import Config -from toil.batchSystems.abstractBatchSystem import BatchJobExitReason, EXIT_STATUS_UNAVAILABLE_VALUE, InsufficientSystemResources -from toil.batchSystems.abstractGridEngineBatchSystem import \ - AbstractGridEngineBatchSystem -from toil.batchSystems.options import OptionSetter from toil.job import JobDescription, Requirer from toil.lib.conversions import strtobool from toil.lib.misc import CalledProcessErrorStderr, call_command @@ -43,7 +48,7 @@ # it, so Toil should wait for it. # # We map from each terminal state to the Toil-ontology exit reason. -TERMINAL_STATES: Dict[str, BatchJobExitReason] = { +TERMINAL_STATES: dict[str, BatchJobExitReason] = { "BOOT_FAIL": BatchJobExitReason.LOST, "CANCELLED": BatchJobExitReason.KILLED, "COMPLETED": BatchJobExitReason.FINISHED, @@ -54,12 +59,12 @@ "PREEMPTED": BatchJobExitReason.KILLED, "REVOKED": BatchJobExitReason.KILLED, "SPECIAL_EXIT": BatchJobExitReason.FAILED, - "TIMEOUT": BatchJobExitReason.KILLED + "TIMEOUT": BatchJobExitReason.KILLED, } # If a job is in one of these states, it might eventually move to a different # state. -NONTERMINAL_STATES: Set[str] = { +NONTERMINAL_STATES: set[str] = { "CONFIGURING", "COMPLETING", "PENDING", @@ -72,9 +77,10 @@ "SIGNALING", "STAGE_OUT", "STOPPED", - "SUSPENDED" + "SUSPENDED", } + def parse_slurm_time(slurm_time: str) -> int: """ Parse a Slurm-style time duration like 7-00:00:00 to a number of seconds. @@ -86,7 +92,7 @@ def parse_slurm_time(slurm_time: str) -> int: # For ease of calculating, we'll make sure all the delimeters are ':' # Then reverse the list so that we're always counting up from seconds -> minutes -> hours -> days total_seconds = 0 - elapsed_split: List[str] = slurm_time.replace('-', ':').split(':') + elapsed_split: list[str] = slurm_time.replace("-", ":").split(":") elapsed_split.reverse() seconds_per_unit = [1, 60, 3600, 86400] for index, multiplier in enumerate(seconds_per_unit): @@ -94,6 +100,7 @@ def parse_slurm_time(slurm_time: str) -> int: total_seconds += multiplier * int(elapsed_split[index]) return total_seconds + class SlurmBatchSystem(AbstractGridEngineBatchSystem): class PartitionInfo(NamedTuple): partition_name: str @@ -107,9 +114,10 @@ class PartitionSet: """ Set of available partitions detected on the slurm batch system """ - default_gpu_partition: Optional[SlurmBatchSystem.PartitionInfo] - all_partitions: List[SlurmBatchSystem.PartitionInfo] - gpu_partitions: Set[str] + + default_gpu_partition: SlurmBatchSystem.PartitionInfo | None + all_partitions: list[SlurmBatchSystem.PartitionInfo] + gpu_partitions: set[str] def __init__(self) -> None: self._get_partition_info() @@ -120,15 +128,17 @@ def _get_gpu_partitions(self) -> None: Get all available GPU partitions. Also get the default GPU partition. :return: None """ - gpu_partitions = [partition for partition in self.all_partitions if partition.gres] + gpu_partitions = [ + partition for partition in self.all_partitions if partition.gres + ] self.gpu_partitions = {p.partition_name for p in gpu_partitions} # Grab the lowest priority GPU partition # If no GPU partitions are available, then set the default to None self.default_gpu_partition = None if len(gpu_partitions) > 0: - self.default_gpu_partition = sorted(gpu_partitions, key=lambda x: x.priority)[0] - - + self.default_gpu_partition = sorted( + gpu_partitions, key=lambda x: x.priority + )[0] def _get_partition_info(self) -> None: """ @@ -136,10 +146,7 @@ def _get_partition_info(self) -> None: Then parse the output and store all available Slurm partitions :return: None """ - sinfo_command = ["sinfo", - "-a", - "-o", - "%P %G %l %p %c %m"] + sinfo_command = ["sinfo", "-a", "-o", "%P %G %l %p %c %m"] sinfo = call_command(sinfo_command) @@ -157,12 +164,25 @@ def _get_partition_info(self) -> None: # Parse priority to an int so we can sort on it partition_priority = int(priority) except ValueError: - logger.warning("Could not parse priority %s for partition %s, assuming high priority", partition_name, priority) + logger.warning( + "Could not parse priority %s for partition %s, assuming high priority", + partition_name, + priority, + ) partition_priority = sys.maxsize - parsed_partitions.append(SlurmBatchSystem.PartitionInfo(partition_name.rstrip("*"), gres != "(null)", partition_time, partition_priority, cpus, memory)) + parsed_partitions.append( + SlurmBatchSystem.PartitionInfo( + partition_name.rstrip("*"), + gres != "(null)", + partition_time, + partition_priority, + cpus, + memory, + ) + ) self.all_partitions = parsed_partitions - def get_partition(self, time_limit: Optional[float]) -> Optional[str]: + def get_partition(self, time_limit: float | None) -> str | None: """ Get the partition name to use for a job with the given time limit. """ @@ -173,43 +193,49 @@ def get_partition(self, time_limit: Optional[float]) -> Optional[str]: winning_partition = None for partition in self.all_partitions: - if partition.time_limit >= time_limit and (winning_partition is None or partition.time_limit < winning_partition.time_limit): + if partition.time_limit >= time_limit and ( + winning_partition is None + or partition.time_limit < winning_partition.time_limit + ): # If this partition can fit the job and is faster than the current winner, take it winning_partition = partition # TODO: Store partitions in a better indexed way if winning_partition is None and len(self.all_partitions) > 0: # We have partitions and none of them can fit this - raise RuntimeError("Could not find a Slurm partition that can fit a job that runs for {time_limit} seconds") + raise RuntimeError( + "Could not find a Slurm partition that can fit a job that runs for {time_limit} seconds" + ) if winning_partition is None: return None else: return winning_partition.partition_name - - - class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread): # Our boss is always the enclosing class - boss: "SlurmBatchSystem" + boss: SlurmBatchSystem - def getRunningJobIDs(self) -> Dict[int, int]: + def getRunningJobIDs(self) -> dict[int, int]: # Should return a dictionary of Job IDs and number of seconds times = {} with self.runningJobsLock: - currentjobs: Dict[str, int] = {str(self.batchJobIDs[x][0]): x for x in self.runningJobs} + currentjobs: dict[str, int] = { + str(self.batchJobIDs[x][0]): x for x in self.runningJobs + } # currentjobs is a dictionary that maps a slurm job id (string) to our own internal job id # squeue arguments: # -h for no header # --format to get jobid i, state %t and time days-hours:minutes:seconds - lines = call_command(['squeue', '-h', '--format', '%i %t %M'], quiet=True).split('\n') + lines = call_command( + ["squeue", "-h", "--format", "%i %t %M"], quiet=True + ).split("\n") for line in lines: values = line.split() if len(values) < 3: continue slurm_jobid, state, elapsed_time = values - if slurm_jobid in currentjobs and state == 'R': + if slurm_jobid in currentjobs and state == "R": try: seconds_running = parse_slurm_time(elapsed_time) except ValueError: @@ -220,21 +246,25 @@ def getRunningJobIDs(self) -> Dict[int, int]: return times def killJob(self, jobID: int) -> None: - call_command(['scancel', self.getBatchSystemID(jobID)]) - - def prepareSubmission(self, - cpu: int, - memory: int, - jobID: int, - command: str, - jobName: str, - job_environment: Optional[Dict[str, str]] = None, - gpus: Optional[int] = None) -> List[str]: + call_command(["scancel", self.getBatchSystemID(jobID)]) + + def prepareSubmission( + self, + cpu: int, + memory: int, + jobID: int, + command: str, + jobName: str, + job_environment: dict[str, str] | None = None, + gpus: int | None = None, + ) -> list[str]: # Make sure to use exec so we can get Slurm's signals in the Toil # worker instead of having an intervening Bash - return self.prepareSbatch(cpu, memory, jobID, jobName, job_environment, gpus) + [f'--wrap=exec {command}'] + return self.prepareSbatch( + cpu, memory, jobID, jobName, job_environment, gpus + ) + [f"--wrap=exec {command}"] - def submitJob(self, subLine: List[str]) -> int: + def submitJob(self, subLine: list[str]) -> int: try: # Slurm is not quite clever enough to follow the XDG spec on # its own. If the submission command sees e.g. XDG_RUNTIME_DIR @@ -250,7 +280,11 @@ def submitJob(self, subLine: List[str]) -> int: # This doesn't get us a trustworthy XDG session in Slurm, but # it does let us see the one Slurm tries to give us. no_session_environment = os.environ.copy() - session_names = [n for n in no_session_environment.keys() if n.startswith('XDG_') or n.startswith('DBUS_')] + session_names = [ + n + for n in no_session_environment.keys() + if n.startswith("XDG_") or n.startswith("DBUS_") + ] for name in session_names: del no_session_environment[name] @@ -263,23 +297,29 @@ def submitJob(self, subLine: List[str]) -> int: logger.error(f"sbatch command failed with error: {e}") raise e - def coalesce_job_exit_codes(self, batch_job_id_list: List[str]) -> List[Union[int, Tuple[int, Optional[BatchJobExitReason]], None]]: + def coalesce_job_exit_codes( + self, batch_job_id_list: list[str] + ) -> list[int | tuple[int, BatchJobExitReason | None] | None]: """ Collect all job exit codes in a single call. :param batch_job_id_list: list of Job ID strings, where each string has the form "[.]". :return: list of job exit codes or exit code, exit reason pairs associated with the list of job IDs. """ - logger.log(TRACE, "Getting exit codes for slurm jobs: %s", batch_job_id_list) + logger.log( + TRACE, "Getting exit codes for slurm jobs: %s", batch_job_id_list + ) # Convert batch_job_id_list to list of integer job IDs. - job_id_list = [int(id.split('.')[0]) for id in batch_job_id_list] + job_id_list = [int(id.split(".")[0]) for id in batch_job_id_list] status_dict = self._get_job_details(job_id_list) - exit_codes: List[Union[int, Tuple[int, Optional[BatchJobExitReason]], None]] = [] + exit_codes: list[int | tuple[int, BatchJobExitReason | None] | None] = [] for _, status in status_dict.items(): exit_codes.append(self._get_job_return_code(status)) return exit_codes - def getJobExitCode(self, batchJobID: str) -> Union[int, Tuple[int, Optional[BatchJobExitReason]], None]: + def getJobExitCode( + self, batchJobID: str + ) -> int | tuple[int, BatchJobExitReason | None] | None: """ Get job exit code for given batch job ID. :param batchJobID: string of the form "[.]". @@ -287,12 +327,14 @@ def getJobExitCode(self, batchJobID: str) -> Union[int, Tuple[int, Optional[Batc """ logger.log(TRACE, "Getting exit code for slurm job: %s", batchJobID) # Convert batchJobID to an integer job ID. - job_id = int(batchJobID.split('.')[0]) + job_id = int(batchJobID.split(".")[0]) status_dict = self._get_job_details([job_id]) status = status_dict[job_id] return self._get_job_return_code(status) - def _get_job_details(self, job_id_list: List[int]) -> Dict[int, Tuple[Optional[str], Optional[int]]]: + def _get_job_details( + self, job_id_list: list[int] + ) -> dict[int, tuple[str | None, int | None]]: """ Helper function for `getJobExitCode` and `coalesce_job_exit_codes`. Fetch job details from Slurm's accounting system or job control system. @@ -306,7 +348,9 @@ def _get_job_details(self, job_id_list: List[int]) -> Dict[int, Tuple[Optional[s status_dict = self._getJobDetailsFromScontrol(job_id_list) return status_dict - def _get_job_return_code(self, status: Tuple[Optional[str], Optional[int]]) -> Union[int, Tuple[int, Optional[BatchJobExitReason]], None]: + def _get_job_return_code( + self, status: tuple[str | None, int | None] + ) -> int | tuple[int, BatchJobExitReason | None] | None: """ Given a Slurm return code, status pair, summarize them into a Toil return code, exit reason pair. @@ -364,12 +408,17 @@ def _canonicalize_state(self, state: str) -> str: if " " in state_token: state_token = state.split(" ", 1)[0] - if state_token not in TERMINAL_STATES and state_token not in NONTERMINAL_STATES: + if ( + state_token not in TERMINAL_STATES + and state_token not in NONTERMINAL_STATES + ): raise RuntimeError("Toil job in unimplemented Slurm state " + state) return state_token - def _getJobDetailsFromSacct(self, job_id_list: List[int]) -> Dict[int, Tuple[Optional[str], Optional[int]]]: + def _getJobDetailsFromSacct( + self, job_id_list: list[int] + ) -> dict[int, tuple[str | None, int | None]]: """ Get SLURM job exit codes for the jobs in `job_id_list` by running `sacct`. :param job_id_list: list of integer batch job IDs. @@ -377,28 +426,35 @@ def _getJobDetailsFromSacct(self, job_id_list: List[int]) -> Dict[int, Tuple[Opt containing the job's state and exit code. """ job_ids = ",".join(str(id) for id in job_id_list) - args = ['sacct', - '-n', # no header - '-j', job_ids, # job - '--format', 'JobIDRaw,State,ExitCode', # specify output columns - '-P', # separate columns with pipes - '-S', '1970-01-01'] # override start time limit + args = [ + "sacct", + "-n", # no header + "-j", + job_ids, # job + "--format", + "JobIDRaw,State,ExitCode", # specify output columns + "-P", # separate columns with pipes + "-S", + "1970-01-01", + ] # override start time limit stdout = call_command(args, quiet=True) # Collect the job statuses in a dict; key is the job-id, value is a tuple containing # job state and exit status. Initialize dict before processing output of `sacct`. - job_statuses: Dict[int, Tuple[Optional[str], Optional[int]]] = {} + job_statuses: dict[int, tuple[str | None, int | None]] = {} for job_id in job_id_list: job_statuses[job_id] = (None, None) for line in stdout.splitlines(): - values = line.strip().split('|') + values = line.strip().split("|") if len(values) < 3: continue state: str job_id_raw, state, exitcode = values state = self._canonicalize_state(state) - logger.log(TRACE, "%s state of job %s is %s", args[0], job_id_raw, state) + logger.log( + TRACE, "%s state of job %s is %s", args[0], job_id_raw, state + ) # JobIDRaw is in the form JobID[.JobStep]; we're not interested in job steps. job_id_parts = job_id_raw.split(".") if len(job_id_parts) > 1: @@ -406,26 +462,32 @@ def _getJobDetailsFromSacct(self, job_id_list: List[int]) -> Dict[int, Tuple[Opt job_id = int(job_id_parts[0]) status: int signal: int - status, signal = (int(n) for n in exitcode.split(':')) + status, signal = (int(n) for n in exitcode.split(":")) if signal > 0: # A non-zero signal may indicate e.g. an out-of-memory killed job status = 128 + signal - logger.log(TRACE, "%s exit code of job %d is %s, return status %d", - args[0], job_id, exitcode, status) + logger.log( + TRACE, + "%s exit code of job %d is %s, return status %d", + args[0], + job_id, + exitcode, + status, + ) job_statuses[job_id] = state, status logger.log(TRACE, "%s returning job statuses: %s", args[0], job_statuses) return job_statuses - def _getJobDetailsFromScontrol(self, job_id_list: List[int]) -> Dict[int, Tuple[Optional[str], Optional[int]]]: + def _getJobDetailsFromScontrol( + self, job_id_list: list[int] + ) -> dict[int, tuple[str | None, int | None]]: """ Get SLURM job exit codes for the jobs in `job_id_list` by running `scontrol`. :param job_id_list: list of integer batch job IDs. :return: dict of job statuses, where key is the job-id, and value is a tuple containing the job's state and exit code. """ - args = ['scontrol', - 'show', - 'job'] + args = ["scontrol", "show", "job"] # `scontrol` can only return information about a single job, # or all the jobs it knows about. if len(job_id_list) == 1: @@ -436,14 +498,14 @@ def _getJobDetailsFromScontrol(self, job_id_list: List[int]) -> Dict[int, Tuple[ # Job records are separated by a blank line. job_records = None if isinstance(stdout, str): - job_records = stdout.strip().split('\n\n') + job_records = stdout.strip().split("\n\n") elif isinstance(stdout, bytes): - job_records = stdout.decode('utf-8').strip().split('\n\n') + job_records = stdout.decode("utf-8").strip().split("\n\n") # Collect the job statuses in a dict; key is the job-id, value is a tuple containing # job state and exit status. Initialize dict before processing output of `scontrol`. - job_statuses: Dict[int, Tuple[Optional[str], Optional[int]]] = {} - job_id: Optional[int] + job_statuses: dict[int, tuple[str | None, int | None]] = {} + job_id: int | None for job_id in job_id_list: job_statuses[job_id] = (None, None) @@ -453,7 +515,7 @@ def _getJobDetailsFromScontrol(self, job_id_list: List[int]) -> Dict[int, Tuple[ return job_statuses for record in job_records: - job: Dict[str, str] = {} + job: dict[str, str] = {} job_id = None for line in record.splitlines(): for item in line.split(): @@ -462,32 +524,40 @@ def _getJobDetailsFromScontrol(self, job_id_list: List[int]) -> Dict[int, Tuple[ # added to a dictionary. # Note: In some cases, the value itself may contain white-space. So, if we find # a key without a value, we consider that key part of the previous value. - bits = item.split('=', 1) + bits = item.split("=", 1) if len(bits) == 1: - job[key] += ' ' + bits[0] # type: ignore[has-type] # we depend on the previous iteration to populate key + job[key] += " " + bits[0] # type: ignore[has-type] # we depend on the previous iteration to populate key else: key = bits[0] job[key] = bits[1] # The first line of the record contains the JobId. Stop processing the remainder # of this record, if we're not interested in this job. - job_id = int(job['JobId']) + job_id = int(job["JobId"]) if job_id not in job_id_list: - logger.log(TRACE, "%s job %d is not in the list", args[0], job_id) + logger.log( + TRACE, "%s job %d is not in the list", args[0], job_id + ) break if job_id is None or job_id not in job_id_list: continue - state = job['JobState'] + state = job["JobState"] state = self._canonicalize_state(state) logger.log(TRACE, "%s state of job %s is %s", args[0], job_id, state) try: - exitcode = job['ExitCode'] + exitcode = job["ExitCode"] if exitcode is not None: - status, signal = (int(n) for n in exitcode.split(':')) + status, signal = (int(n) for n in exitcode.split(":")) if signal > 0: # A non-zero signal may indicate e.g. an out-of-memory killed job status = 128 + signal - logger.log(TRACE, "%s exit code of job %d is %s, return status %d", - args[0], job_id, exitcode, status) + logger.log( + TRACE, + "%s exit code of job %d is %s, return status %d", + args[0], + job_id, + exitcode, + status, + ) rc = status else: rc = None @@ -501,20 +571,21 @@ def _getJobDetailsFromScontrol(self, job_id_list: List[int]) -> Dict[int, Tuple[ ### Implementation-specific helper methods ### - def prepareSbatch(self, - cpu: int, - mem: int, - jobID: int, - jobName: str, - job_environment: Optional[Dict[str, str]], - gpus: Optional[int]) -> List[str]: - + def prepareSbatch( + self, + cpu: int, + mem: int, + jobID: int, + jobName: str, + job_environment: dict[str, str] | None, + gpus: int | None, + ) -> list[str]: """ Returns the sbatch command line to run to queue the job. """ # Start by naming the job - sbatch_line = ['sbatch', '-J', f'toil_job_{jobID}_{jobName}'] + sbatch_line = ["sbatch", "-J", f"toil_job_{jobID}_{jobName}"] # Make sure the job gets a signal before it disappears so that e.g. # container cleanup finally blocks can run. Ask for SIGINT so we @@ -542,11 +613,15 @@ def prepareSbatch(self, set_exports = "--export=ALL" if nativeConfig is not None: - logger.debug("Native SLURM options appended to sbatch: %s", nativeConfig) + logger.debug( + "Native SLURM options appended to sbatch: %s", nativeConfig + ) for arg in nativeConfig.split(): if arg.startswith("--mem") or arg.startswith("--cpus-per-task"): - raise ValueError(f"Some resource arguments are incompatible: {nativeConfig}") + raise ValueError( + f"Some resource arguments are incompatible: {nativeConfig}" + ) # repleace default behaviour by the one stated at TOIL_SLURM_ARGS if arg.startswith("--export"): set_exports = arg @@ -557,56 +632,72 @@ def prepareSbatch(self, for k, v in environment.items(): quoted_value = quote(os.environ[k] if v is None else v) - argList.append(f'{k}={quoted_value}') + argList.append(f"{k}={quoted_value}") - set_exports += ',' + ','.join(argList) + set_exports += "," + ",".join(argList) # add --export to the sbatch sbatch_line.append(set_exports) parallel_env: str = self.boss.config.slurm_pe # type: ignore[attr-defined] if cpu and cpu > 1 and parallel_env: - sbatch_line.append(f'--partition={parallel_env}') + sbatch_line.append(f"--partition={parallel_env}") if mem is not None and self.boss.config.slurm_allocate_mem: # type: ignore[attr-defined] # memory passed in is in bytes, but slurm expects megabytes - sbatch_line.append(f'--mem={math.ceil(mem / 2 ** 20)}') + sbatch_line.append(f"--mem={math.ceil(mem / 2 ** 20)}") if cpu is not None: - sbatch_line.append(f'--cpus-per-task={math.ceil(cpu)}') + sbatch_line.append(f"--cpus-per-task={math.ceil(cpu)}") time_limit: int = self.boss.config.slurm_time # type: ignore[attr-defined] if time_limit is not None: # Put all the seconds in the seconds slot - sbatch_line.append(f'--time=0:{time_limit}') + sbatch_line.append(f"--time=0:{time_limit}") if gpus: # This block will add a gpu supported partition only if no partition is supplied by the user - sbatch_line = sbatch_line[:1] + [f'--gres=gpu:{gpus}'] + sbatch_line[1:] + sbatch_line = sbatch_line[:1] + [f"--gres=gpu:{gpus}"] + sbatch_line[1:] if not any(option.startswith("--partition") for option in sbatch_line): # no partition specified, so specify one # try to get the name of the lowest priority gpu supported partition lowest_gpu_partition = self.boss.partitions.default_gpu_partition if lowest_gpu_partition is None: # no gpu partitions are available, raise an error - raise RuntimeError(f"The job {jobName} is requesting GPUs, but the Slurm cluster does not appear to have an accessible partition with GPUs") - if time_limit is not None and lowest_gpu_partition.time_limit < time_limit: + raise RuntimeError( + f"The job {jobName} is requesting GPUs, but the Slurm cluster does not appear to have an accessible partition with GPUs" + ) + if ( + time_limit is not None + and lowest_gpu_partition.time_limit < time_limit + ): # TODO: find the lowest-priority GPU partition that has at least each job's time limit! - logger.warning("Trying to submit a job that needs %s seconds to partition %s that has a limit of %s seconds", time_limit, lowest_gpu_partition.partition_name, lowest_gpu_partition.time_limit) - sbatch_line.append(f"--partition={lowest_gpu_partition.partition_name}") + logger.warning( + "Trying to submit a job that needs %s seconds to partition %s that has a limit of %s seconds", + time_limit, + lowest_gpu_partition.partition_name, + lowest_gpu_partition.time_limit, + ) + sbatch_line.append( + f"--partition={lowest_gpu_partition.partition_name}" + ) else: # there is a partition specified already, check if the partition has GPUs for i, option in enumerate(sbatch_line): if option.startswith("--partition"): # grab the partition name depending on if it's specified via an "=" or a space if "=" in option: - partition_name = option[len("--partition="):] + partition_name = option[len("--partition=") :] else: - partition_name = option[i+1] - available_gpu_partitions = self.boss.partitions.gpu_partitions + partition_name = option[i + 1] + available_gpu_partitions = ( + self.boss.partitions.gpu_partitions + ) if partition_name not in available_gpu_partitions: # the specified partition is not compatible, so warn the user that the job may not work - logger.warning(f"Job {jobName} needs {gpus} GPUs, but specified partition {partition_name} is incompatible. This job may not work." - f"Try specifying one of these partitions instead: {', '.join(available_gpu_partitions)}.") + logger.warning( + f"Job {jobName} needs {gpus} GPUs, but specified partition {partition_name} is incompatible. This job may not work." + f"Try specifying one of these partitions instead: {', '.join(available_gpu_partitions)}." + ) break if not any(option.startswith("--partition") for option in sbatch_line): @@ -616,20 +707,26 @@ def prepareSbatch(self, # Route to that partition sbatch_line.append(f"--partition={chosen_partition}") - - stdoutfile: str = self.boss.format_std_out_err_path(jobID, '%j', 'out') - stderrfile: str = self.boss.format_std_out_err_path(jobID, '%j', 'err') - sbatch_line.extend(['-o', stdoutfile, '-e', stderrfile]) + stdoutfile: str = self.boss.format_std_out_err_path(jobID, "%j", "out") + stderrfile: str = self.boss.format_std_out_err_path(jobID, "%j", "err") + sbatch_line.extend(["-o", stdoutfile, "-e", stderrfile]) return sbatch_line - def __init__(self, config: Config, maxCores: float, maxMemory: int, maxDisk: int) -> None: + def __init__( + self, config: Config, maxCores: float, maxMemory: int, maxDisk: int + ) -> None: super().__init__(config, maxCores, maxMemory, maxDisk) self.partitions = SlurmBatchSystem.PartitionSet() # Override issuing jobs so we can check if we need to use Slurm's magic # whole-node-memory feature. - def issueBatchJob(self, command: str, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int: + def issueBatchJob( + self, + command: str, + job_desc: JobDescription, + job_environment: dict[str, str] | None = None, + ) -> int: # Avoid submitting internal jobs to the batch queue, handle locally local_id = self.handleLocalJob(command, job_desc) if local_id is not None: @@ -648,20 +745,36 @@ def issueBatchJob(self, command: str, job_desc: JobDescription, job_environment: # Use the memory actually on the job, or the Toil default memory memory = job_desc.memory - self.newJobsQueue.put((job_id, job_desc.cores, memory, command, get_job_kind(job_desc.get_names()), - job_environment, gpus)) - logger.debug("Issued the job command: %s with job id: %s and job name %s", command, str(job_id), - get_job_kind(job_desc.get_names())) + self.newJobsQueue.put( + ( + job_id, + job_desc.cores, + memory, + command, + get_job_kind(job_desc.get_names()), + job_environment, + gpus, + ) + ) + logger.debug( + "Issued the job command: %s with job id: %s and job name %s", + command, + str(job_id), + get_job_kind(job_desc.get_names()), + ) return job_id def _check_accelerator_request(self, requirer: Requirer) -> None: for accelerator in requirer.accelerators: - if accelerator['kind'] != 'gpu': - raise InsufficientSystemResources(requirer, 'accelerators', details= - [ - f'The accelerator {accelerator} could not be provided' - 'The Toil Slurm batch system only supports gpu accelerators at the moment.' - ]) + if accelerator["kind"] != "gpu": + raise InsufficientSystemResources( + requirer, + "accelerators", + details=[ + f"The accelerator {accelerator} could not be provided" + "The Toil Slurm batch system only supports gpu accelerators at the moment." + ], + ) ### ### The interface for SLURM @@ -675,27 +788,65 @@ def _check_accelerator_request(self, requirer: Requirer) -> None: # implement getWaitDuration(). @classmethod - def add_options(cls, parser: Union[ArgumentParser, _ArgumentGroup]) -> None: - - parser.add_argument("--slurmAllocateMem", dest="slurm_allocate_mem", type=strtobool, default=True, env_var="TOIL_SLURM_ALLOCATE_MEM", - help="If False, do not use --mem. Used as a workaround for Slurm clusters that reject jobs " - "with memory allocations.") + def add_options(cls, parser: ArgumentParser | _ArgumentGroup) -> None: + + parser.add_argument( + "--slurmAllocateMem", + dest="slurm_allocate_mem", + type=strtobool, + default=True, + env_var="TOIL_SLURM_ALLOCATE_MEM", + help="If False, do not use --mem. Used as a workaround for Slurm clusters that reject jobs " + "with memory allocations.", + ) # Keep these deprcated options for backward compatibility - parser.add_argument("--dont_allocate_mem", action='store_false', dest="slurm_allocate_mem", help=SUPPRESS) - parser.add_argument("--allocate_mem", action='store_true', dest="slurm_allocate_mem", help=SUPPRESS) - - parser.add_argument("--slurmDefaultAllMem", dest="slurm_default_all_mem", type=strtobool, default=False, env_var="TOIL_SLURM_DEFAULT_ALL_MEM", - help="If True, assign Toil jobs without their own memory requirements all available " - "memory on a Slurm node (via Slurm --mem=0).") - parser.add_argument("--slurmTime", dest="slurm_time", type=parse_slurm_time, default=None, env_var="TOIL_SLURM_TIME", - help="Slurm job time limit, in [DD-]HH:MM:SS format.") - parser.add_argument("--slurmPE", dest="slurm_pe", default=None, env_var="TOIL_SLURM_PE", - help="Special partition to send Slurm jobs to if they ask for more than 1 CPU.") - parser.add_argument("--slurmArgs", dest="slurm_args", default="", env_var="TOIL_SLURM_ARGS", - help="Extra arguments to pass to Slurm.") + parser.add_argument( + "--dont_allocate_mem", + action="store_false", + dest="slurm_allocate_mem", + help=SUPPRESS, + ) + parser.add_argument( + "--allocate_mem", + action="store_true", + dest="slurm_allocate_mem", + help=SUPPRESS, + ) + + parser.add_argument( + "--slurmDefaultAllMem", + dest="slurm_default_all_mem", + type=strtobool, + default=False, + env_var="TOIL_SLURM_DEFAULT_ALL_MEM", + help="If True, assign Toil jobs without their own memory requirements all available " + "memory on a Slurm node (via Slurm --mem=0).", + ) + parser.add_argument( + "--slurmTime", + dest="slurm_time", + type=parse_slurm_time, + default=None, + env_var="TOIL_SLURM_TIME", + help="Slurm job time limit, in [DD-]HH:MM:SS format.", + ) + parser.add_argument( + "--slurmPE", + dest="slurm_pe", + default=None, + env_var="TOIL_SLURM_PE", + help="Special partition to send Slurm jobs to if they ask for more than 1 CPU.", + ) + parser.add_argument( + "--slurmArgs", + dest="slurm_args", + default="", + env_var="TOIL_SLURM_ARGS", + help="Extra arguments to pass to Slurm.", + ) + + OptionType = TypeVar("OptionType") - - OptionType = TypeVar('OptionType') @classmethod def setOptions(cls, setOption: OptionSetter) -> None: setOption("slurm_allocate_mem") @@ -703,4 +854,3 @@ def setOptions(cls, setOption: OptionSetter) -> None: setOption("slurm_time") setOption("slurm_pe") setOption("slurm_args") - diff --git a/src/toil/batchSystems/torque.py b/src/toil/batchSystems/torque.py index 308c7c2fbd..8ebc9c66fb 100644 --- a/src/toil/batchSystems/torque.py +++ b/src/toil/batchSystems/torque.py @@ -18,10 +18,12 @@ import tempfile from queue import Empty from shlex import quote -from typing import Dict, List, Optional +from typing import Optional -from toil.batchSystems.abstractGridEngineBatchSystem import (AbstractGridEngineBatchSystem, - UpdatedBatchJobInfo) +from toil.batchSystems.abstractGridEngineBatchSystem import ( + AbstractGridEngineBatchSystem, + UpdatedBatchJobInfo, +) from toil.lib.conversions import hms_duration_to_seconds from toil.lib.misc import CalledProcessErrorStderr, call_command @@ -41,18 +43,17 @@ def __init__( self._version = self._pbsVersion() def _pbsVersion(self): - """ Determines PBS/Torque version via pbsnodes - """ + """Determines PBS/Torque version via pbsnodes""" try: out = call_command(["pbsnodes", "--version"]) if "PBSPro" in out: - logger.debug("PBS Pro proprietary Torque version detected") - self._version = "pro" + logger.debug("PBS Pro proprietary Torque version detected") + self._version = "pro" else: - logger.debug("Torque OSS version detected") - self._version = "oss" + logger.debug("Torque OSS version detected") + self._version = "oss" except CalledProcessErrorStderr as e: - if e.returncode != 0: + if e.returncode != 0: logger.error("Could not determine PBS/Torque version") return self._version @@ -60,6 +61,7 @@ def _pbsVersion(self): """ Torque-specific AbstractGridEngineWorker methods """ + def getRunningJobIDs(self): times = {} with self.runningJobsLock: @@ -75,32 +77,33 @@ def getRunningJobIDs(self): # PBS plain qstat will return every running job on the system. jobids = sorted(list(currentjobs.keys())) if self._version == "pro": - stdout = call_command(['qstat', '-x'] + jobids) + stdout = call_command(["qstat", "-x"] + jobids) elif self._version == "oss": - stdout = call_command(['qstat'] + jobids) + stdout = call_command(["qstat"] + jobids) # qstat supports XML output which is more comprehensive, but PBSPro does not support it # so instead we stick with plain commandline qstat tabular outputs - for currline in stdout.split('\n'): + for currline in stdout.split("\n"): items = currline.strip().split() if items: jobid = items[0].strip().split(".")[0] if jobid in currentjobs: logger.debug("getRunningJobIDs job status for is: %s", items[4]) - if jobid in currentjobs and items[4] == 'R': + if jobid in currentjobs and items[4] == "R": walltime = items[3].strip() logger.debug( "getRunningJobIDs qstat reported walltime is: %s", walltime ) # normal qstat has a quirk with job time where it reports '0' # when initially running; this catches this case - if walltime == '0': + if walltime == "0": walltime = 0.0 elif not walltime: # Sometimes we don't get any data here. # See https://github.com/DataBiosphere/toil/issues/3715 logger.warning( - "Assuming 0 walltime due to missing field in qstat line: %s", currline + "Assuming 0 walltime due to missing field in qstat line: %s", + currline, ) walltime = 0.0 else: @@ -120,10 +123,12 @@ def getUpdatedBatchJob(self, maxWait): except Empty: logger.debug("getUpdatedBatchJob: Job queue is empty") else: - return UpdatedBatchJobInfo(jobID=jobID, exitStatus=retcode, wallTime=None, exitReason=None) + return UpdatedBatchJobInfo( + jobID=jobID, exitStatus=retcode, wallTime=None, exitReason=None + ) def killJob(self, jobID): - call_command(['qdel', self.getBatchSystemID(jobID)]) + call_command(["qdel", self.getBatchSystemID(jobID)]) def prepareSubmission( self, @@ -132,8 +137,9 @@ def prepareSubmission( jobID: int, command: str, jobName: str, - job_environment: Optional[Dict[str, str]] = None, - gpus: Optional[int] = None) -> List[str]: + job_environment: Optional[dict[str, str]] = None, + gpus: Optional[int] = None, + ) -> list[str]: return self.prepareQsub(cpu, memory, jobID, job_environment) + [ self.generateTorqueWrapper(command, jobID) ] @@ -143,21 +149,25 @@ def submitJob(self, subLine): def getJobExitCode(self, torqueJobID): if self._version == "pro": - args = ["qstat", "-x", "-f", str(torqueJobID).split('.')[0]] + args = ["qstat", "-x", "-f", str(torqueJobID).split(".")[0]] elif self._version == "oss": - args = ["qstat", "-f", str(torqueJobID).split('.')[0]] + args = ["qstat", "-f", str(torqueJobID).split(".")[0]] stdout = call_command(args) - for line in stdout.split('\n'): + for line in stdout.split("\n"): line = line.strip() # Case differences due to PBSPro vs OSS Torque qstat outputs - if line.startswith("failed") or line.startswith("FAILED") and int(line.split()[1]) == 1: + if ( + line.startswith("failed") + or line.startswith("FAILED") + and int(line.split()[1]) == 1 + ): return 1 if line.startswith("exit_status") or line.startswith("Exit_status"): status = line.split(" = ")[1] logger.debug("Exit Status: %s", status) return int(status) - if 'unknown job id' in line.lower(): + if "unknown job id" in line.lower(): # some clusters configure Torque to forget everything about just # finished jobs instantly, apparently for performance reasons logger.debug( @@ -176,8 +186,8 @@ def prepareQsub( cpu: int, mem: int, jobID: int, - job_environment: Optional[Dict[str, str]], - ) -> List[str]: + job_environment: Optional[dict[str, str]], + ) -> list[str]: # TODO: passing $PWD on command line not working for -d, resorting to # $PBS_O_WORKDIR but maybe should fix this here instead of in script? @@ -189,9 +199,13 @@ def prepareQsub( environment.update(job_environment) if environment: - qsubline.append('-v') - qsubline.append(','.join(k + '=' + quote(os.environ[k] if v is None else v) - for k, v in self.boss.environment.items())) + qsubline.append("-v") + qsubline.append( + ",".join( + k + "=" + quote(os.environ[k] if v is None else v) + for k, v in self.boss.environment.items() + ) + ) reqline = list() if self._version == "pro": @@ -208,7 +222,7 @@ def prepareQsub( reqline.append("nodes=1:ppn=" + str(int(math.ceil(cpu)))) # Other resource requirements can be passed through the environment (see man qsub) - reqlineEnv = os.getenv('TOIL_TORQUE_REQS') + reqlineEnv = os.getenv("TOIL_TORQUE_REQS") if reqlineEnv is not None: logger.debug( "Additional Torque resource requirements appended to qsub from " @@ -232,7 +246,7 @@ def prepareQsub( # All other qsub parameters can be passed through the environment (see man qsub). # No attempt is made to parse them out here and check that they do not conflict # with those that we already constructed above - arglineEnv = os.getenv('TOIL_TORQUE_ARGS') + arglineEnv = os.getenv("TOIL_TORQUE_ARGS") if arglineEnv is not None: logger.debug( "Native Torque options appended to qsub from TOIL_TORQUE_ARGS env. variable: %s", diff --git a/src/toil/bus.py b/src/toil/bus.py index 1e53349a3c..064eb84e07 100644 --- a/src/toil/bus.py +++ b/src/toil/bus.py @@ -67,32 +67,25 @@ import queue import tempfile import threading +from collections.abc import Iterator from dataclasses import dataclass -from typing import (IO, - Any, - Callable, - Dict, - Iterator, - List, - NamedTuple, - Optional, - Type, - TypeVar, - cast) +from typing import IO, Any, Callable, NamedTuple, Optional, TypeVar, cast from pubsub.core import Publisher from pubsub.core.listener import Listener from pubsub.core.topicobj import Topic from pubsub.core.topicutils import ALL_TOPICS -logger = logging.getLogger( __name__ ) +logger = logging.getLogger(__name__) # We define some ways to talk about jobs. + class Names(NamedTuple): """ Stores all the kinds of name a job can have. """ + # Name of the kind of job this is job_name: str # Name of this particular work unit @@ -104,6 +97,7 @@ class Names(NamedTuple): # Job store ID of the job for the work unit job_store_id: str + def get_job_kind(names: Names) -> str: """ Return an identifying string for the job. @@ -127,10 +121,12 @@ def get_job_kind(names: Names) -> str: # We define a bunch of named tuple message types. # These all need to be plain data: only hold ints, strings, etc. + class JobIssuedMessage(NamedTuple): """ Produced when a job is issued to run on the batch system. """ + # The kind of job issued, for statistics aggregation job_type: str # The job store ID of the job @@ -138,20 +134,24 @@ class JobIssuedMessage(NamedTuple): # The toil batch ID of the job toil_batch_id: int + class JobUpdatedMessage(NamedTuple): """ Produced when a job is "updated" and ready to have something happen to it. """ + # The job store ID of the job job_id: str # The error code/return code for the job, which is nonzero if something has # gone wrong, and 0 otherwise. result_status: int + class JobCompletedMessage(NamedTuple): """ Produced when a job is completed, whether successful or not. """ + # The kind of job issued, for statistics aggregation job_type: str # The job store ID of the job @@ -159,27 +159,33 @@ class JobCompletedMessage(NamedTuple): # Exit code for job_id exit_code: int + class JobFailedMessage(NamedTuple): """ Produced when a job is completely failed, and will not be retried again. """ + # The kind of job issued, for statistics aggregation job_type: str # The job store ID of the job job_id: str + class JobMissingMessage(NamedTuple): """ Produced when a job goes missing and should be in the batch system but isn't. """ + # The job store ID of the job job_id: str + class JobAnnotationMessage(NamedTuple): """ Produced when extra information (such as an AWS Batch job ID from the AWSBatchBatchSystem) is available that goes with a job. """ + # The job store ID of the job job_id: str # The name of the annotation @@ -187,50 +193,60 @@ class JobAnnotationMessage(NamedTuple): # The annotation data annotation_value: str + class ExternalBatchIdMessage(NamedTuple): """ Produced when using a batch system, links toil assigned batch ID to Batch system ID (Whatever's returned by local implementation, PID, batch ID, etc) """ - #Assigned toil batch job id + + # Assigned toil batch job id toil_batch_id: int - #Batch system scheduler identity + # Batch system scheduler identity external_batch_id: str - #Batch system name + # Batch system name batch_system: str + class QueueSizeMessage(NamedTuple): """ Produced to describe the size of the queue of jobs issued but not yet completed. Theoretically recoverable from other messages. """ + # The size of the queue queue_size: int + class ClusterSizeMessage(NamedTuple): """ Produced by the Toil-integrated autoscaler describe the number of instances of a certain type in a cluster. """ + # The instance type name, like t4g.medium instance_type: str # The number of instances of that type that the Toil autoscaler thinks # there are current_size: int + class ClusterDesiredSizeMessage(NamedTuple): """ Produced by the Toil-integrated autoscaler to describe the number of instances of a certain type that it thinks will be needed. """ + # The instance type name, like t4g.medium instance_type: str # The number of instances of that type that the Toil autoscaler wants there # to be desired_size: int + # Then we define a serialization format. + def message_to_bytes(message: NamedTuple) -> bytes: """ Convert a plain-old-data named tuple into a byte string. @@ -240,32 +256,39 @@ def message_to_bytes(message: NamedTuple) -> bytes: if isinstance(item, (int, float, bool)) or item is None: # This also handles e.g. values from an IntEnum, where the type extends int. # They might replace __str__() but we hope they use a compatible __format__() - parts.append(f"{item}".encode('utf-8')) + parts.append(f"{item}".encode()) elif isinstance(item, str): - parts.append(item.encode('unicode_escape')) + parts.append(item.encode("unicode_escape")) else: # We haven't implemented this type yet. - raise RuntimeError(f"Cannot store message argument of type {type(item)}: {item}") - return b'\t'.join(parts) + raise RuntimeError( + f"Cannot store message argument of type {type(item)}: {item}" + ) + return b"\t".join(parts) # TODO: Messages have to be named tuple types. -MessageType = TypeVar('MessageType') -def bytes_to_message(message_type: Type[MessageType], data: bytes) -> MessageType: +MessageType = TypeVar("MessageType") + + +def bytes_to_message(message_type: type[MessageType], data: bytes) -> MessageType: """ Convert bytes from message_to_bytes back to a message of the given type. """ - parts = data.split(b'\t') + parts = data.split(b"\t") # Get a mapping from field name to type in the named tuple. # We need to check a couple different fields because this moved in a recent # Python 3 release. - field_to_type: Optional[Dict[str, type]] = cast(Optional[Dict[str, type]], - getattr(message_type, '__annotations__', - getattr(message_type, '_field_types', None))) + field_to_type: Optional[dict[str, type]] = cast( + Optional[dict[str, type]], + getattr( + message_type, "__annotations__", getattr(message_type, "_field_types", None) + ), + ) if field_to_type is None: raise RuntimeError(f"Cannot get field types from {message_type}") - field_names: List[str] = getattr(message_type, '_fields') + field_names: list[str] = getattr(message_type, "_fields") if len(field_names) != len(parts): raise RuntimeError(f"Cannot parse {field_names} from {parts}") @@ -276,10 +299,10 @@ def bytes_to_message(message_type: Type[MessageType], data: bytes) -> MessageTyp for name, part in zip(field_names, parts): field_type = field_to_type[name] if field_type in [int, float, bool]: - typed_parts.append(field_type(part.decode('utf-8'))) + typed_parts.append(field_type(part.decode("utf-8"))) elif field_type == str: # Decode, accounting for escape sequences - typed_parts.append(part.decode('unicode_escape')) + typed_parts.append(part.decode("unicode_escape")) else: raise RuntimeError(f"Cannot read message argument of type {field_type}") @@ -287,8 +310,6 @@ def bytes_to_message(message_type: Type[MessageType], data: bytes) -> MessageTyp return message_type(*typed_parts) - - class MessageBus: """ Holds messages that should cause jobs to change their scheduling states. @@ -317,7 +338,7 @@ def _type_to_name(cls, message_type: type) -> str: characters, hierarchically dotted). """ - return '.'.join([message_type.__module__, message_type.__name__]) + return ".".join([message_type.__module__, message_type.__name__]) # All our messages are NamedTuples, but NamedTuples don't actually inherit # from NamedTupe, so MyPy complains if we require that here. @@ -360,13 +381,16 @@ def _deliver(self, message: Any) -> None: Runs only in the owning thread. Delivers a message to its listeners. """ topic = self._type_to_name(type(message)) - logger.debug('Notifying %s with message: %s', topic, message) + logger.debug("Notifying %s with message: %s", topic, message) self._pubsub.sendMessage(topic, message=message) # This next function takes callables that take things of the type that was passed in as a # runtime argument, which we can explain to MyPy using a TypeVar and Type[] - MessageType = TypeVar('MessageType', bound='NamedTuple') - def subscribe(self, message_type: Type[MessageType], handler: Callable[[MessageType], Any]) -> Listener: + MessageType = TypeVar("MessageType", bound="NamedTuple") + + def subscribe( + self, message_type: type[MessageType], handler: Callable[[MessageType], Any] + ) -> Listener: """ Register the given callable to be called when messages of the given type are sent. It will be called with messages sent after the subscription is created. @@ -374,7 +398,7 @@ def subscribe(self, message_type: Type[MessageType], handler: Callable[[MessageT """ topic = self._type_to_name(message_type) - logger.debug('Listening for message topic: %s', topic) + logger.debug("Listening for message topic: %s", topic) # Make sure to wrap the handler so we get the right argument name and # we can control lifetime. @@ -387,10 +411,10 @@ def handler_wraper(message: MessageBus.MessageType) -> None: # Hide the handler function in the pubsub listener to keep it alive. # If it goes out of scope the subscription expires, and the pubsub # system only uses weak references. - setattr(listener, 'handler_wrapper', handler_wraper) + setattr(listener, "handler_wrapper", handler_wraper) return listener - def connect(self, wanted_types: List[type]) -> 'MessageBusConnection': + def connect(self, wanted_types: list[type]) -> "MessageBusConnection": """ Get a connection object that serves as an inbox for messages of the given types. @@ -402,7 +426,7 @@ def connect(self, wanted_types: List[type]) -> 'MessageBusConnection': connection._set_bus_and_message_types(self, wanted_types) return connection - def outbox(self) -> 'MessageOutbox': + def outbox(self) -> "MessageOutbox": """ Get a connection object that only allows sending messages. """ @@ -420,24 +444,27 @@ def connect_output_file(self, file_path: str) -> Any: somewhere or delete it. """ - - stream = open(file_path, 'wb') + stream = open(file_path, "wb") # Type of the ** is the value type of the dictionary; key type is always string. - def handler(topic_object: Topic = Listener.AUTO_TOPIC, **message_data: NamedTuple) -> None: + def handler( + topic_object: Topic = Listener.AUTO_TOPIC, **message_data: NamedTuple + ) -> None: """ Log the message in the given message data, associated with the given topic. """ # There should always be a "message" - if len(message_data) != 1 or 'message' not in message_data: - raise RuntimeError("Cannot log the bus message. The message is either empty/malformed or there are too many messages provided.") - message = message_data['message'] + if len(message_data) != 1 or "message" not in message_data: + raise RuntimeError( + "Cannot log the bus message. The message is either empty/malformed or there are too many messages provided." + ) + message = message_data["message"] topic = topic_object.getName() - stream.write(topic.encode('utf-8')) - stream.write(b'\t') + stream.write(topic.encode("utf-8")) + stream.write(b"\t") stream.write(message_to_bytes(message)) - stream.write(b'\n') + stream.write(b"\n") stream.flush() listener, _ = self._pubsub.subscribe(handler, ALL_TOPICS) @@ -446,7 +473,6 @@ def handler(topic_object: Topic = Listener.AUTO_TOPIC, **message_data: NamedTupl # want the pypubsub Listener. return (handler, listener) - # TODO: If we annotate this as returning an Iterator[NamedTuple], MyPy # complains when we loop over it that the loop variable is a , # ifen in code protected by isinstance(). Using a typevar makes it complain @@ -456,7 +482,9 @@ def handler(topic_object: Topic = Listener.AUTO_TOPIC, **message_data: NamedTupl # union of the types passed in message_types, in a way that MyPy can # understand. @classmethod - def scan_bus_messages(cls, stream: IO[bytes], message_types: List[Type[NamedTuple]]) -> Iterator[Any]: + def scan_bus_messages( + cls, stream: IO[bytes], message_types: list[type[NamedTuple]] + ) -> Iterator[Any]: """ Get an iterator over all messages in the given log stream of the given types, in order. Discard any trailing partial messages. @@ -466,15 +494,15 @@ def scan_bus_messages(cls, stream: IO[bytes], message_types: List[Type[NamedTupl name_to_type = {cls._type_to_name(t): t for t in message_types} for line in stream: - logger.debug('Got message: %s', line) - if not line.endswith(b'\n'): + logger.debug("Got message: %s", line) + if not line.endswith(b"\n"): # Skip unterminated line continue # Drop the newline and split on first tab - parts = line[:-1].split(b'\t', 1) + parts = line[:-1].split(b"\t", 1) # Get the type of the message - message_type = name_to_type.get(parts[0].decode('utf-8')) + message_type = name_to_type.get(parts[0].decode("utf-8")) if message_type is None: # We aren't interested in this kind of message. continue @@ -485,6 +513,7 @@ def scan_bus_messages(cls, stream: IO[bytes], message_types: List[Type[NamedTupl # And produce it yield message + class MessageBusClient: """ Base class for clients (inboxes and outboxes) of a message bus. Handles @@ -507,6 +536,7 @@ def _set_bus(self, bus: MessageBus) -> None: """ self._bus = bus + class MessageInbox(MessageBusClient): """ A buffered connection to a message bus that lets us receive messages. @@ -522,16 +552,19 @@ def __init__(self) -> None: super().__init__() # This holds all the messages on the bus, organized by type. - self._messages_by_type: Dict[type, List[Any]] = {} + self._messages_by_type: dict[type, list[Any]] = {} # This holds listeners for all the types, when we connect to a bus - self._listeners_by_type: Dict[type, Listener] = {} + self._listeners_by_type: dict[type, Listener] = {} # We define a handler for messages def on_message(message: Any) -> None: self._messages_by_type[type(message)].append(message) + self._handler = on_message - def _set_bus_and_message_types(self, bus: MessageBus, wanted_types: List[type]) -> None: + def _set_bus_and_message_types( + self, bus: MessageBus, wanted_types: list[type] + ) -> None: """ Connect to the given bus and collect the given message types. @@ -576,8 +609,9 @@ def empty(self) -> bool: # This next function returns things of the type that was passed in as a # runtime argument, which we can explain to MyPy using a TypeVar and Type[] - MessageType = TypeVar('MessageType') - def for_each(self, message_type: Type[MessageType]) -> Iterator[MessageType]: + MessageType = TypeVar("MessageType") + + def for_each(self, message_type: type[MessageType]) -> Iterator[MessageType]: """ Loop over all messages currently pending of the given type. Each that is handled without raising an exception will be removed. @@ -607,7 +641,9 @@ def for_each(self, message_type: Type[MessageType]) -> Iterator[MessageType]: try: # Emit the message if not isinstance(message, message_type): - raise RuntimeError(f"Unacceptable message type {type(message)} in list for type {message_type}") + raise RuntimeError( + f"Unacceptable message type {type(message)} in list for type {message_type}" + ) yield message # If we get here it was handled without error. handled = True @@ -622,7 +658,10 @@ def for_each(self, message_type: Type[MessageType]) -> Iterator[MessageType]: # Dump anything remaining in our buffer back into the main buffer, # in the right order, and before the later messages. message_list.reverse() - self._messages_by_type[message_type] = message_list + self._messages_by_type[message_type] + self._messages_by_type[message_type] = ( + message_list + self._messages_by_type[message_type] + ) + class MessageOutbox(MessageBusClient): """ @@ -645,6 +684,7 @@ def publish(self, message: Any) -> None: raise RuntimeError("Cannot send message when not connected to a bus") self._bus.publish(message) + class MessageBusConnection(MessageInbox, MessageOutbox): """ A two-way connection to a message bus. Buffers incoming messages until you @@ -657,7 +697,9 @@ def __init__(self) -> None: """ super().__init__() - def _set_bus_and_message_types(self, bus: MessageBus, wanted_types: List[type]) -> None: + def _set_bus_and_message_types( + self, bus: MessageBus, wanted_types: list[type] + ) -> None: """ Connect to the given bus and collect the given message types. @@ -680,18 +722,21 @@ class JobStatus: job_store_id: str name: str exit_code: int - annotations: Dict[str, str] + annotations: dict[str, str] toil_batch_id: int external_batch_id: str batch_system: str def __repr__(self) -> str: - return json.dumps(self, default= lambda o: o.__dict__, indent=4) + return json.dumps(self, default=lambda o: o.__dict__, indent=4) def is_running(self) -> bool: - return self.exit_code < 0 and self.job_store_id != "" # if the exit code is -1 and the job id is specified, we assume the job is running + return ( + self.exit_code < 0 and self.job_store_id != "" + ) # if the exit code is -1 and the job id is specified, we assume the job is running -def replay_message_bus(path: str) -> Dict[str, JobStatus]: + +def replay_message_bus(path: str) -> dict[str, JobStatus]: """ Replay all the messages and work out what they mean for jobs. @@ -707,15 +752,26 @@ def replay_message_bus(path: str) -> Dict[str, JobStatus]: is running. """ - job_statuses: Dict[str, JobStatus] = collections.defaultdict(lambda: JobStatus('', '', -1, {}, -1, '', '')) + job_statuses: dict[str, JobStatus] = collections.defaultdict( + lambda: JobStatus("", "", -1, {}, -1, "", "") + ) batch_to_job_id = {} try: - with open(path, 'rb') as log_stream: + with open(path, "rb") as log_stream: # Read all the full, properly-terminated messages about job updates - for event in MessageBus.scan_bus_messages(log_stream, [JobUpdatedMessage, JobIssuedMessage, JobCompletedMessage, - JobFailedMessage, JobAnnotationMessage, ExternalBatchIdMessage]): + for event in MessageBus.scan_bus_messages( + log_stream, + [ + JobUpdatedMessage, + JobIssuedMessage, + JobCompletedMessage, + JobFailedMessage, + JobAnnotationMessage, + ExternalBatchIdMessage, + ], + ): # And for each of them - logger.debug('Got message from workflow: %s', event) + logger.debug("Got message from workflow: %s", event) if isinstance(event, JobUpdatedMessage): # Apply the latest return code from the job with this ID. @@ -736,16 +792,23 @@ def replay_message_bus(path: str) -> Dict[str, JobStatus]: job_statuses[event.job_id].exit_code = 1 elif isinstance(event, JobAnnotationMessage): # Remember the last value of any annotation that is set - job_statuses[event.job_id].annotations[event.annotation_name] = event.annotation_value + job_statuses[event.job_id].annotations[ + event.annotation_name + ] = event.annotation_value elif isinstance(event, ExternalBatchIdMessage): if event.toil_batch_id in batch_to_job_id: - job_statuses[batch_to_job_id[event.toil_batch_id]].external_batch_id = event.external_batch_id - job_statuses[batch_to_job_id[event.toil_batch_id]].batch_system = event.batch_system + job_statuses[ + batch_to_job_id[event.toil_batch_id] + ].external_batch_id = event.external_batch_id + job_statuses[ + batch_to_job_id[event.toil_batch_id] + ].batch_system = event.batch_system except FileNotFoundError: logger.warning("We were unable to access the file") return job_statuses + def gen_message_bus_path(tmpdir: Optional[str] = None) -> str: """ Return a file path in tmp to store the message bus at. @@ -758,4 +821,4 @@ def gen_message_bus_path(tmpdir: Optional[str] = None) -> str: fd, path = tempfile.mkstemp(dir=tmpdir) os.close(fd) return path - #TODO Might want to clean up the tmpfile at some point after running the workflow + # TODO Might want to clean up the tmpfile at some point after running the workflow diff --git a/src/toil/common.py b/src/toil/common.py index 4515cc47bb..81ea6395df 100644 --- a/src/toil/common.py +++ b/src/toil/common.py @@ -23,76 +23,68 @@ import time import uuid import warnings -from io import StringIO - -from ruamel.yaml import YAML -from ruamel.yaml.comments import CommentedMap -from configargparse import ArgParser, YAMLConfigFileParser -from argparse import (SUPPRESS, - ArgumentDefaultsHelpFormatter, - ArgumentParser, - Namespace, - _ArgumentGroup, Action, _StoreFalseAction, _StoreTrueAction, _AppendAction) +from argparse import ( + SUPPRESS, + ArgumentDefaultsHelpFormatter, + ArgumentParser, + Namespace, + _ArgumentGroup, + _StoreFalseAction, + _StoreTrueAction, +) from functools import lru_cache from types import TracebackType -from typing import (IO, - TYPE_CHECKING, - Any, - Callable, - ContextManager, - Dict, - List, - Optional, - Set, - Tuple, - Type, - TypeVar, - Union, - cast, - overload) -from urllib.parse import urlparse, unquote, quote +from typing import ( + IO, + TYPE_CHECKING, + Any, + Callable, + ContextManager, + Literal, + Optional, + TypeVar, + Union, + cast, + overload, +) +from urllib.parse import quote, unquote, urlparse import requests - -from toil.options.common import add_base_toil_options, JOBSTORE_HELP -from toil.options.cwl import add_cwl_options -from toil.options.runner import add_runner_options -from toil.options.wdl import add_wdl_options - -if sys.version_info >= (3, 8): - from typing import Literal -else: - from typing_extensions import Literal +from configargparse import ArgParser, YAMLConfigFileParser +from ruamel.yaml import YAML +from ruamel.yaml.comments import CommentedMap from toil import logProcessContext, lookupEnvVar from toil.batchSystems.options import set_batchsystem_options -from toil.bus import (ClusterDesiredSizeMessage, - ClusterSizeMessage, - JobCompletedMessage, - JobFailedMessage, - JobIssuedMessage, - JobMissingMessage, - MessageBus, - QueueSizeMessage, gen_message_bus_path) +from toil.bus import ( + ClusterDesiredSizeMessage, + ClusterSizeMessage, + JobCompletedMessage, + JobFailedMessage, + JobIssuedMessage, + JobMissingMessage, + MessageBus, + QueueSizeMessage, + gen_message_bus_path, +) from toil.fileStores import FileID from toil.lib.compatibility import deprecated -from toil.lib.io import try_path, AtomicFileCreate +from toil.lib.io import AtomicFileCreate, try_path from toil.lib.retry import retry from toil.lib.threading import ensure_filesystem_lockable -from toil.provisioners import (add_provisioner_options, - cluster_factory) +from toil.options.common import JOBSTORE_HELP, add_base_toil_options +from toil.options.cwl import add_cwl_options +from toil.options.runner import add_runner_options +from toil.options.wdl import add_wdl_options +from toil.provisioners import add_provisioner_options, cluster_factory from toil.realtimeLogger import RealtimeLogger -from toil.statsAndLogging import (add_logging_options, - set_logging_from_options) +from toil.statsAndLogging import add_logging_options, set_logging_from_options from toil.version import dockerRegistry, dockerTag, version if TYPE_CHECKING: from toil.batchSystems.abstractBatchSystem import AbstractBatchSystem from toil.batchSystems.options import OptionSetter - from toil.job import (AcceleratorRequirement, - Job, - JobDescription, - TemporaryID) + from toil.job import AcceleratorRequirement, Job, JobDescription, TemporaryID from toil.jobStores.abstractJobStore import AbstractJobStore from toil.provisioners.abstractProvisioner import AbstractProvisioner from toil.resource import ModuleDescriptor @@ -108,6 +100,7 @@ class Config: """Class to represent configuration operations for a toil workflow run.""" + logFile: Optional[str] logRotating: bool cleanWorkDir: str @@ -174,23 +167,22 @@ class Config: # Autoscaling options provisioner: Optional[str] - nodeTypes: List[Tuple[Set[str], Optional[float]]] - minNodes: List[int] - maxNodes: List[int] + nodeTypes: list[tuple[set[str], Optional[float]]] + minNodes: list[int] + maxNodes: list[int] targetTime: float betaInertia: float scaleInterval: int preemptibleCompensation: float nodeStorage: int - nodeStorageOverrides: List[str] + nodeStorageOverrides: list[str] metrics: bool assume_zero_overhead: bool # Parameters to limit service jobs, so preventing deadlock scheduling scenarios maxPreemptibleServiceJobs: int maxServiceJobs: int - deadlockWait: Union[ - float, int] + deadlockWait: Union[float, int] deadlockCheckInterval: Union[float, int] # Resource requirements @@ -201,7 +193,7 @@ class Config: # TODO: These names are generated programmatically in # Requirer._fetchRequirement so we can't use snake_case until we fix # that (and add compatibility getters/setters?) - defaultAccelerators: List['AcceleratorRequirement'] + defaultAccelerators: list["AcceleratorRequirement"] maxCores: int maxMemory: int maxDisk: int @@ -223,7 +215,7 @@ class Config: realTimeLogging: bool # Misc - environment: Dict[str, str] + environment: dict[str, str] disableChaining: bool disableJobStoreChecksumVerification: bool sseKey: Optional[str] @@ -283,8 +275,7 @@ def prepare_restart(self) -> None: def setOptions(self, options: Namespace) -> None: """Creates a config object from the options object.""" - def set_option(option_name: str, - old_names: Optional[List[str]] = None) -> None: + def set_option(option_name: str, old_names: Optional[list[str]] = None) -> None: """ Determine the correct value for the given option. @@ -307,15 +298,21 @@ def set_option(option_name: str, for old_name in old_names: # If the option is already set with the new name and not the old name # prioritize the new name over the old name and break - if option_value is not None and option_value != [] and option_value != {}: + if ( + option_value is not None + and option_value != [] + and option_value != {} + ): break # Try all the old names in case user code is setting them # in an options object. # This does assume that all deprecated options have a default value of None if getattr(options, old_name, None) is not None: - warnings.warn(f'Using deprecated option field {old_name} to ' - f'provide value for config field {option_name}', - DeprecationWarning) + warnings.warn( + f"Using deprecated option field {old_name} to " + f"provide value for config field {option_name}", + DeprecationWarning, + ) option_value = getattr(options, old_name) if option_value is not None or not hasattr(self, option_name): setattr(self, option_name, option_value) @@ -330,13 +327,14 @@ def set_option(option_name: str, set_option("stats") set_option("cleanWorkDir") set_option("clean") - set_option('clusterStats') + set_option("clusterStats") set_option("restart") # Batch system options set_option("batchSystem") - set_batchsystem_options(None, cast("OptionSetter", - set_option)) # None as that will make set_batchsystem_options iterate through all batch systems and set their corresponding values + set_batchsystem_options( + None, cast("OptionSetter", set_option) + ) # None as that will make set_batchsystem_options iterate through all batch systems and set their corresponding values # File store options set_option("symlinkImports", old_names=["linkImports"]) @@ -424,31 +422,39 @@ def set_option(option_name: str, # Apply overrides as highest priority # Override workDir with value of TOIL_WORKDIR_OVERRIDE if it exists - if os.getenv('TOIL_WORKDIR_OVERRIDE') is not None: - self.workDir = os.getenv('TOIL_WORKDIR_OVERRIDE') + if os.getenv("TOIL_WORKDIR_OVERRIDE") is not None: + self.workDir = os.getenv("TOIL_WORKDIR_OVERRIDE") # Override workDir with value of TOIL_WORKDIR_OVERRIDE if it exists - if os.getenv('TOIL_COORDINATION_DIR_OVERRIDE') is not None: - self.workDir = os.getenv('TOIL_COORDINATION_DIR_OVERRIDE') + if os.getenv("TOIL_COORDINATION_DIR_OVERRIDE") is not None: + self.workDir = os.getenv("TOIL_COORDINATION_DIR_OVERRIDE") self.check_configuration_consistency() def check_configuration_consistency(self) -> None: """Old checks that cannot be fit into an action class for argparse""" if self.writeLogs and self.writeLogsGzip: - raise ValueError("Cannot use both --writeLogs and --writeLogsGzip at the same time.") + raise ValueError( + "Cannot use both --writeLogs and --writeLogsGzip at the same time." + ) if self.writeLogsFromAllJobs and not self.writeLogs and not self.writeLogsGzip: - raise ValueError("To enable --writeLogsFromAllJobs, either --writeLogs or --writeLogsGzip must be set.") + raise ValueError( + "To enable --writeLogsFromAllJobs, either --writeLogs or --writeLogsGzip must be set." + ) for override in self.nodeStorageOverrides: tokens = override.split(":") if not any(tokens[0] in n[0] for n in self.nodeTypes): - raise ValueError("Instance type in --nodeStorageOverrides must be in --nodeTypes") + raise ValueError( + "Instance type in --nodeStorageOverrides must be in --nodeTypes" + ) if self.stats: if self.clean != "never" and self.clean is not None: - logger.warning("Contradicting options passed: Clean flag is set to %s " - "despite the stats flag requiring " - "the jobStore to be intact at the end of the run. " - "Setting clean to \'never\'." % self.clean) + logger.warning( + "Contradicting options passed: Clean flag is set to %s " + "despite the stats flag requiring " + "the jobStore to be intact at the end of the run. " + "Setting clean to 'never'." % self.clean + ) self.clean = "never" def __eq__(self, other: object) -> bool: @@ -468,7 +474,9 @@ def check_and_create_toil_home_dir() -> None: dir_path = try_path(TOIL_HOME_DIR) if dir_path is None: - raise RuntimeError(f"Cannot create or access Toil configuration directory {TOIL_HOME_DIR}") + raise RuntimeError( + f"Cannot create or access Toil configuration directory {TOIL_HOME_DIR}" + ) def check_and_create_default_config_file() -> None: @@ -526,9 +534,23 @@ def generate_config(filepath: str) -> None: # and --caching respectively # Skip StoreTrue and StoreFalse options that have opposite defaults as including it in the config would # override those defaults - deprecated_or_redundant_options = ("help", "config", "logCritical", "logDebug", "logError", "logInfo", "logOff", - "logWarning", "linkImports", "noLinkImports", "moveExports", "noMoveExports", - "enableCaching", "disableCaching", "version") + deprecated_or_redundant_options = ( + "help", + "config", + "logCritical", + "logDebug", + "logError", + "logInfo", + "logOff", + "logWarning", + "linkImports", + "noLinkImports", + "moveExports", + "noMoveExports", + "enableCaching", + "disableCaching", + "version", + ) def create_config_dict_from_parser(parser: ArgumentParser) -> CommentedMap: """ @@ -539,9 +561,12 @@ def create_config_dict_from_parser(parser: ArgumentParser) -> CommentedMap: :return: CommentedMap of what to put into the config file """ data = CommentedMap() # to preserve order - group_title_key: Dict[str, str] = dict() + group_title_key: dict[str, str] = dict() for action in parser._actions: - if any(s.replace("-", "") in deprecated_or_redundant_options for s in action.option_strings): + if any( + s.replace("-", "") in deprecated_or_redundant_options + for s in action.option_strings + ): continue # if action is StoreFalse and default is True then don't include if isinstance(action, _StoreFalseAction) and action.default is True: @@ -553,8 +578,11 @@ def create_config_dict_from_parser(parser: ArgumentParser) -> CommentedMap: if len(action.option_strings) == 0: continue - option_string = action.option_strings[0] if action.option_strings[0].find("--") != -1 else \ - action.option_strings[1] + option_string = ( + action.option_strings[0] + if action.option_strings[0].find("--") != -1 + else action.option_strings[1] + ) option = option_string[2:] default = action.default @@ -577,10 +605,12 @@ def create_config_dict_from_parser(parser: ArgumentParser) -> CommentedMap: add_base_toil_options(parser, jobstore_as_flag=True, cwl=False) toil_base_data = create_config_dict_from_parser(parser) - toil_base_data.yaml_set_start_comment("This is the configuration file for Toil. To set an option, uncomment an " - "existing option and set its value. The current values are the defaults. " - "If the default configuration file is outdated, it can be refreshed with " - "`toil config ~/.toil/default.yaml`.\n\nBASE TOIL OPTIONS\n") + toil_base_data.yaml_set_start_comment( + "This is the configuration file for Toil. To set an option, uncomment an " + "existing option and set its value. The current values are the defaults. " + "If the default configuration file is outdated, it can be refreshed with " + "`toil config ~/.toil/default.yaml`.\n\nBASE TOIL OPTIONS\n" + ) all_data.append(toil_base_data) parser = ArgParser(YAMLConfigFileParser()) @@ -612,40 +642,52 @@ def create_config_dict_from_parser(parser: ArgumentParser) -> CommentedMap: with AtomicFileCreate(filepath) as temp_path: with open(temp_path, "w") as f: f.write("config_version: 1.0\n") - yaml = YAML(typ='rt') + yaml = YAML(typ="rt") for data in all_data: data.pop("config_version", None) yaml.dump( data, f, - transform=lambda s: re.sub(r'^(.)', r'#\1', s, flags=re.MULTILINE), + transform=lambda s: re.sub(r"^(.)", r"#\1", s, flags=re.MULTILINE), ) def parser_with_common_options( - provisioner_options: bool = False, - jobstore_option: bool = True, - prog: Optional[str] = None, - default_log_level: Optional[int] = None + provisioner_options: bool = False, + jobstore_option: bool = True, + prog: Optional[str] = None, + default_log_level: Optional[int] = None, ) -> ArgParser: - parser = ArgParser(prog=prog or "Toil", formatter_class=ArgumentDefaultsHelpFormatter) + parser = ArgParser( + prog=prog or "Toil", formatter_class=ArgumentDefaultsHelpFormatter + ) if provisioner_options: add_provisioner_options(parser) if jobstore_option: - parser.add_argument('jobStore', type=str, help=JOBSTORE_HELP) + parser.add_argument("jobStore", type=str, help=JOBSTORE_HELP) # always add these add_logging_options(parser, default_log_level) - parser.add_argument("--version", action='version', version=version) - parser.add_argument("--tempDirRoot", dest="tempDirRoot", type=str, default=tempfile.gettempdir(), - help="Path to where temporary directory containing all temp files are created, " - "by default generates a fresh tmp dir with 'tempfile.gettempdir()'.") + parser.add_argument("--version", action="version", version=version) + parser.add_argument( + "--tempDirRoot", + dest="tempDirRoot", + type=str, + default=tempfile.gettempdir(), + help="Path to where temporary directory containing all temp files are created, " + "by default generates a fresh tmp dir with 'tempfile.gettempdir()'.", + ) return parser -def addOptions(parser: ArgumentParser, jobstore_as_flag: bool = False, cwl: bool = False, wdl: bool = False) -> None: +def addOptions( + parser: ArgumentParser, + jobstore_as_flag: bool = False, + cwl: bool = False, + wdl: bool = False, +) -> None: """ Add all Toil command line options to a parser. @@ -658,10 +700,13 @@ def addOptions(parser: ArgumentParser, jobstore_as_flag: bool = False, cwl: bool :param wdl: Whether WDL options are expected. If so, WDL options won't be suppressed. """ if cwl and wdl: - raise RuntimeError("CWL and WDL cannot both be true at the same time when adding options.") + raise RuntimeError( + "CWL and WDL cannot both be true at the same time when adding options." + ) if not (isinstance(parser, ArgumentParser) or isinstance(parser, _ArgumentGroup)): raise ValueError( - f"Unanticipated class: {parser.__class__}. Must be: argparse.ArgumentParser or ArgumentGroup.") + f"Unanticipated class: {parser.__class__}. Must be: argparse.ArgumentParser or ArgumentGroup." + ) if isinstance(parser, ArgParser): # in case the user passes in their own configargparse instance instead of calling getDefaultArgumentParser() @@ -671,10 +716,12 @@ def addOptions(parser: ArgumentParser, jobstore_as_flag: bool = False, cwl: bool else: # configargparse advertises itself as a drag and drop replacement, and running the normal argparse ArgumentParser # through this code still seems to work (with the exception of --config and environmental variables) - warnings.warn(f'Using deprecated library argparse for options parsing.' - f'This will not parse config files or use environment variables.' - f'Use configargparse instead or call Job.Runner.getDefaultArgumentParser()', - DeprecationWarning) + warnings.warn( + f"Using deprecated library argparse for options parsing." + f"This will not parse config files or use environment variables." + f"Use configargparse instead or call Job.Runner.getDefaultArgumentParser()", + DeprecationWarning, + ) check_and_create_default_config_file() # Check on the config file to make sure it is sensible @@ -683,16 +730,17 @@ def addOptions(parser: ArgumentParser, jobstore_as_flag: bool = False, cwl: bool # If we have an empty config file, someone has to manually delete # it before we will work again. raise RuntimeError( - f"Config file {DEFAULT_CONFIG_FILE} exists but is empty. Delete it! Stat says: {config_status}") + f"Config file {DEFAULT_CONFIG_FILE} exists but is empty. Delete it! Stat says: {config_status}" + ) try: - with open(DEFAULT_CONFIG_FILE, "r") as f: + with open(DEFAULT_CONFIG_FILE) as f: yaml = YAML(typ="safe") s = yaml.load(f) logger.debug("Initialized default configuration: %s", json.dumps(s)) except: # Something went wrong reading the default config, so dump its # contents to the log. - logger.info("Configuration file contents: %s", open(DEFAULT_CONFIG_FILE, 'r').read()) + logger.info("Configuration file contents: %s", open(DEFAULT_CONFIG_FILE).read()) raise # Add base toil options @@ -719,27 +767,41 @@ def check_arguments(typ: str) -> None: add_runner_options(check_parser) for action in check_parser._actions: action.default = SUPPRESS - other_options, _ = check_parser.parse_known_args(sys.argv[1:], ignore_help_args=True) + other_options, _ = check_parser.parse_known_args( + sys.argv[1:], ignore_help_args=True + ) if len(vars(other_options)) != 0: - raise parser.error(f"{'WDL' if typ == 'cwl' else 'CWL'} options are not allowed on the command line.") + raise parser.error( + f"{'WDL' if typ == 'cwl' else 'CWL'} options are not allowed on the command line." + ) # if cwl is set, format the namespace for cwl and check that wdl options are not set on the command line if cwl: parser.add_argument("cwltool", type=str, help="CWL file to run.") - parser.add_argument("cwljob", nargs="*", help="Input file or CWL options. If CWL workflow takes an input, " - "the name of the input can be used as an option. " - "For example: \"%(prog)s workflow.cwl --file1 file\". " - "If an input has the same name as a Toil option, pass '--' before it.") + parser.add_argument( + "cwljob", + nargs="*", + help="Input file or CWL options. If CWL workflow takes an input, " + "the name of the input can be used as an option. " + 'For example: "%(prog)s workflow.cwl --file1 file". ' + "If an input has the same name as a Toil option, pass '--' before it.", + ) check_arguments(typ="cwl") # if wdl is set, format the namespace for wdl and check that cwl options are not set on the command line if wdl: - parser.add_argument("wdl_uri", type=str, - help="WDL document URI") - parser.add_argument("inputs_uri", type=str, nargs='?', - help="WDL input JSON URI") - parser.add_argument("--input", "--inputs", "-i", dest="inputs_uri", type=str, - help="WDL input JSON URI") + parser.add_argument("wdl_uri", type=str, help="WDL document URI") + parser.add_argument( + "inputs_uri", type=str, nargs="?", help="WDL input JSON URI" + ) + parser.add_argument( + "--input", + "--inputs", + "-i", + dest="inputs_uri", + type=str, + help="WDL input JSON URI", + ) check_arguments(typ="wdl") @@ -762,15 +824,20 @@ def getNodeID() -> str: with open(idSourceFile) as inp: nodeID = inp.readline().strip() except OSError: - logger.warning(f"Exception when trying to read ID file {idSourceFile}. " - f"Will try next method to get node ID.", exc_info=True) + logger.warning( + f"Exception when trying to read ID file {idSourceFile}. " + f"Will try next method to get node ID.", + exc_info=True, + ) else: if len(nodeID.split()) == 1: logger.debug(f"Obtained node ID {nodeID} from file {idSourceFile}") break else: - logger.warning(f"Node ID {nodeID} from file {idSourceFile} contains spaces. " - f"Will try next method to get node ID.") + logger.warning( + f"Node ID {nodeID} from file {idSourceFile} contains spaces. " + f"Will try next method to get node ID." + ) else: nodeIDs = [] for i_call in range(2): @@ -784,18 +851,22 @@ def getNodeID() -> str: if nodeIDs[0] == nodeIDs[1]: nodeID = nodeIDs[0] else: - logger.warning(f"Different node IDs {nodeIDs} received from repeated calls to uuid.getnode(). " - f"You should use another method to generate node ID.") + logger.warning( + f"Different node IDs {nodeIDs} received from repeated calls to uuid.getnode(). " + f"You should use another method to generate node ID." + ) logger.debug(f"Obtained node ID {nodeID} from uuid.getnode()") if not nodeID: - logger.warning("Failed to generate stable node ID, returning empty string. If you see this message with a " - "work dir on a shared file system when using workers running on multiple nodes, you might " - "experience cryptic job failures") - if len(nodeID.replace('-', '')) < UUID_LENGTH: + logger.warning( + "Failed to generate stable node ID, returning empty string. If you see this message with a " + "work dir on a shared file system when using workers running on multiple nodes, you might " + "experience cryptic job failures" + ) + if len(nodeID.replace("-", "")) < UUID_LENGTH: # Some platforms (Mac) give us not enough actual hex characters. # Repeat them so the result is convertible to a uuid.UUID - nodeID = nodeID.replace('-', '') + nodeID = nodeID.replace("-", "") num_repeats = UUID_LENGTH // len(nodeID) + 1 nodeID = nodeID * num_repeats nodeID = nodeID[:UUID_LENGTH] @@ -808,6 +879,7 @@ class Toil(ContextManager["Toil"]): Specifically the batch system, job store, and its configuration. """ + config: Config _jobStore: "AbstractJobStore" _batchSystem: "AbstractBatchSystem" @@ -824,7 +896,7 @@ def __init__(self, options: Namespace) -> None: """ super().__init__() self.options = options - self._jobCache: Dict[Union[str, "TemporaryID"], "JobDescription"] = {} + self._jobCache: dict[Union[str, "TemporaryID"], "JobDescription"] = {} self._inContextManager: bool = False self._inRestart: bool = False @@ -867,10 +939,10 @@ def __enter__(self) -> "Toil": return self def __exit__( - self, - exc_type: Optional[Type[BaseException]], - exc_val: Optional[BaseException], - exc_tb: Optional[TracebackType], + self, + exc_type: Optional[type[BaseException]], + exc_val: Optional[BaseException], + exc_tb: Optional[TracebackType], ) -> Literal[False]: """ Clean up after a workflow invocation. @@ -878,24 +950,33 @@ def __exit__( Depending on the configuration, delete the job store. """ try: - if (exc_type is not None and self.config.clean == "onError" or - exc_type is None and self.config.clean == "onSuccess" or - self.config.clean == "always"): + if ( + exc_type is not None + and self.config.clean == "onError" + or exc_type is None + and self.config.clean == "onSuccess" + or self.config.clean == "always" + ): try: if self.config.restart and not self._inRestart: pass else: self._jobStore.destroy() - logger.info("Successfully deleted the job store: %s" % str(self._jobStore)) + logger.info( + "Successfully deleted the job store: %s" + % str(self._jobStore) + ) except: - logger.info("Failed to delete the job store: %s" % str(self._jobStore)) + logger.info( + "Failed to delete the job store: %s" % str(self._jobStore) + ) raise except Exception as e: if exc_type is None: raise else: - logger.exception('The following error was raised during clean up:') + logger.exception("The following error was raised during clean up:") self._inContextManager = False self._inRestart = False return False # let exceptions through @@ -927,8 +1008,10 @@ def start(self, rootJob: "Job") -> Any: self._jobStore.write_leader_node_id() if self.config.restart: - raise ToilRestartException('A Toil workflow can only be started once. Use ' - 'Toil.restart() to resume it.') + raise ToilRestartException( + "A Toil workflow can only be started once. Use " + "Toil.restart() to resume it." + ) self._batchSystem = self.createBatchSystem(self.config) self._setupAutoDeployment(rootJob.getUserScript()) @@ -941,7 +1024,7 @@ def start(self, rootJob: "Job") -> Any: # a shared file, where we can find and unpickle it at the end of the workflow. # Unpickling the promise will automatically substitute the promise for the actual # return value. - with self._jobStore.write_shared_file_stream('rootJobReturnValue') as fH: + with self._jobStore.write_shared_file_stream("rootJobReturnValue") as fH: rootJob.prepareForPromiseRegistration(self._jobStore) promise = rootJob.rv() pickle.dump(promise, fH, protocol=pickle.HIGHEST_PROTOCOL) @@ -969,15 +1052,18 @@ def restart(self) -> Any: self._jobStore.write_leader_node_id() if not self.config.restart: - raise ToilRestartException('A Toil workflow must be initiated with Toil.start(), ' - 'not restart().') + raise ToilRestartException( + "A Toil workflow must be initiated with Toil.start(), " "not restart()." + ) from toil.job import JobException + try: self._jobStore.load_root_job() except JobException: logger.warning( - 'Requested restart but the workflow has already been completed; allowing exports to rerun.') + "Requested restart but the workflow has already been completed; allowing exports to rerun." + ) return self._jobStore.get_root_job_return_value() self._batchSystem = self.createBatchSystem(self.config) @@ -996,12 +1082,14 @@ def _setProvisioner(self) -> None: if self.config.provisioner is None: self._provisioner = None else: - self._provisioner = cluster_factory(provisioner=self.config.provisioner, - clusterName=None, - zone=None, # read from instance meta-data - nodeStorage=self.config.nodeStorage, - nodeStorageOverrides=self.config.nodeStorageOverrides, - sseKey=self.config.sseKey) + self._provisioner = cluster_factory( + provisioner=self.config.provisioner, + clusterName=None, + zone=None, # read from instance meta-data + nodeStorage=self.config.nodeStorage, + nodeStorageOverrides=self.config.nodeStorageOverrides, + sseKey=self.config.sseKey, + ) self._provisioner.setAutoscaledNodeTypes(self.config.nodeTypes) @classmethod @@ -1014,27 +1102,30 @@ def getJobStore(cls, locator: str) -> "AbstractJobStore": :return: an instance of a concrete subclass of AbstractJobStore """ name, rest = cls.parseLocator(locator) - if name == 'file': + if name == "file": from toil.jobStores.fileJobStore import FileJobStore + return FileJobStore(rest) - elif name == 'aws': + elif name == "aws": from toil.jobStores.aws.jobStore import AWSJobStore + return AWSJobStore(rest) - elif name == 'google': + elif name == "google": from toil.jobStores.googleJobStore import GoogleJobStore + return GoogleJobStore(rest) else: raise RuntimeError("Unknown job store implementation '%s'" % name) @staticmethod - def parseLocator(locator: str) -> Tuple[str, str]: - if locator[0] in '/.' or ':' not in locator: - return 'file', locator + def parseLocator(locator: str) -> tuple[str, str]: + if locator[0] in "/." or ":" not in locator: + return "file", locator else: try: - name, rest = locator.split(':', 1) + name, rest = locator.split(":", 1) except ValueError: - raise RuntimeError('Invalid job store locator syntax.') + raise RuntimeError("Invalid job store locator syntax.") else: return name, rest @@ -1042,7 +1133,7 @@ def parseLocator(locator: str) -> Tuple[str, str]: def buildLocator(name: str, rest: str) -> str: if ":" in name: raise ValueError(f"Can't have a ':' in the name: '{name}'.") - return f'{name}:{rest}' + return f"{name}:{rest}" @classmethod def resumeJobStore(cls, locator: str) -> "AbstractJobStore": @@ -1059,30 +1150,39 @@ def createBatchSystem(config: Config) -> "AbstractBatchSystem": :return: an instance of a concrete subclass of AbstractBatchSystem """ - kwargs = dict(config=config, - maxCores=config.maxCores, - maxMemory=config.maxMemory, - maxDisk=config.maxDisk) + kwargs = dict( + config=config, + maxCores=config.maxCores, + maxMemory=config.maxMemory, + maxDisk=config.maxDisk, + ) from toil.batchSystems.registry import get_batch_system, get_batch_systems try: batch_system = get_batch_system(config.batchSystem) except KeyError: - raise RuntimeError(f'Unrecognized batch system: {config.batchSystem} ' - f'(choose from: {", ".join(get_batch_systems())})') + raise RuntimeError( + f"Unrecognized batch system: {config.batchSystem} " + f'(choose from: {", ".join(get_batch_systems())})' + ) if config.caching and not batch_system.supportsWorkerCleanup(): - raise RuntimeError(f'{config.batchSystem} currently does not support shared caching, because it ' - 'does not support cleaning up a worker after the last job finishes. Set ' - '--caching=false') - - logger.debug('Using the %s' % re.sub("([a-z])([A-Z])", r"\g<1> \g<2>", batch_system.__name__).lower()) + raise RuntimeError( + f"{config.batchSystem} currently does not support shared caching, because it " + "does not support cleaning up a worker after the last job finishes. Set " + "--caching=false" + ) + + logger.debug( + "Using the %s" + % re.sub("([a-z])([A-Z])", r"\g<1> \g<2>", batch_system.__name__).lower() + ) return batch_system(**kwargs) def _setupAutoDeployment( - self, userScript: Optional["ModuleDescriptor"] = None + self, userScript: Optional["ModuleDescriptor"] = None ) -> None: """ Determine the user script, save it to the job store and inject a reference to the saved copy into the batch system. @@ -1095,39 +1195,55 @@ def _setupAutoDeployment( if userScript is not None: # This branch is hit when a workflow is being started if userScript.belongsToToil: - logger.debug('User script %s belongs to Toil. No need to auto-deploy it.', userScript) + logger.debug( + "User script %s belongs to Toil. No need to auto-deploy it.", + userScript, + ) userScript = None else: - if (self._batchSystem.supportsAutoDeployment() and - not self.config.disableAutoDeployment): + if ( + self._batchSystem.supportsAutoDeployment() + and not self.config.disableAutoDeployment + ): # Note that by saving the ModuleDescriptor, and not the Resource we allow for # redeploying a potentially modified user script on workflow restarts. - with self._jobStore.write_shared_file_stream('userScript') as f: + with self._jobStore.write_shared_file_stream("userScript") as f: pickle.dump(userScript, f, protocol=pickle.HIGHEST_PROTOCOL) else: - from toil.batchSystems.singleMachine import \ - SingleMachineBatchSystem + from toil.batchSystems.singleMachine import SingleMachineBatchSystem + if not isinstance(self._batchSystem, SingleMachineBatchSystem): - logger.warning('Batch system does not support auto-deployment. The user script ' - '%s will have to be present at the same location on every worker.', userScript) + logger.warning( + "Batch system does not support auto-deployment. The user script " + "%s will have to be present at the same location on every worker.", + userScript, + ) userScript = None else: # This branch is hit on restarts - if self._batchSystem.supportsAutoDeployment() and not self.config.disableAutoDeployment: + if ( + self._batchSystem.supportsAutoDeployment() + and not self.config.disableAutoDeployment + ): # We could deploy a user script from toil.jobStores.abstractJobStore import NoSuchFileException + try: - with self._jobStore.read_shared_file_stream('userScript') as f: + with self._jobStore.read_shared_file_stream("userScript") as f: userScript = safeUnpickleFromStream(f) except NoSuchFileException: - logger.debug('User script neither set explicitly nor present in the job store.') + logger.debug( + "User script neither set explicitly nor present in the job store." + ) userScript = None if userScript is None: - logger.debug('No user script to auto-deploy.') + logger.debug("No user script to auto-deploy.") else: - logger.debug('Saving user script %s as a resource', userScript) + logger.debug("Saving user script %s as a resource", userScript) userScriptResource = userScript.saveAsResourceTo(self._jobStore) - logger.debug('Injecting user script %s into batch system.', userScriptResource) + logger.debug( + "Injecting user script %s into batch system.", userScriptResource + ) self._batchSystem.setUserScript(userScriptResource) def url_exists(self, src_uri: str) -> bool: @@ -1137,55 +1253,55 @@ def url_exists(self, src_uri: str) -> bool: # returns a file ID. Explain this to MyPy. @overload - def importFile(self, - srcUrl: str, - sharedFileName: str, - symlink: bool = True) -> None: - ... + def importFile( + self, srcUrl: str, sharedFileName: str, symlink: bool = True + ) -> None: ... @overload - def importFile(self, - srcUrl: str, - sharedFileName: None = None, - symlink: bool = True) -> FileID: - ... - - @deprecated(new_function_name='import_file') - def importFile(self, - srcUrl: str, - sharedFileName: Optional[str] = None, - symlink: bool = True) -> Optional[FileID]: + def importFile( + self, srcUrl: str, sharedFileName: None = None, symlink: bool = True + ) -> FileID: ... + + @deprecated(new_function_name="import_file") + def importFile( + self, srcUrl: str, sharedFileName: Optional[str] = None, symlink: bool = True + ) -> Optional[FileID]: return self.import_file(srcUrl, sharedFileName, symlink) @overload - def import_file(self, - src_uri: str, - shared_file_name: str, - symlink: bool = True, - check_existence: bool = True) -> None: - ... + def import_file( + self, + src_uri: str, + shared_file_name: str, + symlink: bool = True, + check_existence: bool = True, + ) -> None: ... @overload - def import_file(self, - src_uri: str, - shared_file_name: None = None, - symlink: bool = True, - check_existence: Literal[True] = True) -> FileID: - ... + def import_file( + self, + src_uri: str, + shared_file_name: None = None, + symlink: bool = True, + check_existence: Literal[True] = True + ) -> FileID: ... @overload - def import_file(self, - src_uri: str, - shared_file_name: None = None, - symlink: bool = True, - check_existence: bool = True) -> Optional[FileID]: - ... - - def import_file(self, - src_uri: str, - shared_file_name: Optional[str] = None, - symlink: bool = True, - check_existence: bool = True) -> Optional[FileID]: + def import_file( + self, + src_uri: str, + shared_file_name: None = None, + symlink: bool = True, + check_existence: bool = True + ) -> Optional[FileID]: ... + + def import_file( + self, + src_uri: str, + shared_file_name: Optional[str] = None, + symlink: bool = True, + check_existence: bool = True + ) -> Optional[FileID]: """ Import the file at the given URL into the job store. @@ -1201,7 +1317,9 @@ def import_file(self, self._assertContextManagerUsed() full_uri = self.normalize_uri(src_uri, check_existence=check_existence) try: - imported = self._jobStore.import_file(full_uri, shared_file_name=shared_file_name, symlink=symlink) + imported = self._jobStore.import_file( + full_uri, shared_file_name=shared_file_name, symlink=symlink + ) except FileNotFoundError: # TODO: I thought we refactored the different job store import # methods to not raise and instead return None, but that looks to @@ -1218,10 +1336,10 @@ def import_file(self, # We need to protect the caller from missing files. # We think a file was missing, and we got None becasuse of it. # We didn't get None instead because of usign a shared file name. - raise FileNotFoundError(f'Could not find file {src_uri}') + raise FileNotFoundError(f"Could not find file {src_uri}") return imported - @deprecated(new_function_name='export_file') + @deprecated(new_function_name="export_file") def exportFile(self, jobStoreFileID: FileID, dstUrl: str) -> None: return self.export_file(jobStoreFileID, dstUrl) @@ -1244,18 +1362,21 @@ def normalize_uri(uri: str, check_existence: bool = False) -> str: :param check_existence: If set, raise FileNotFoundError if a URI points to a local file that does not exist. """ - if urlparse(uri).scheme == 'file': - uri = unquote(urlparse(uri).path) # this should strip off the local file scheme; it will be added back + if urlparse(uri).scheme == "file": + uri = unquote( + urlparse(uri).path + ) # this should strip off the local file scheme; it will be added back # account for the scheme-less case, which should be coerced to a local absolute path - if urlparse(uri).scheme == '': + if urlparse(uri).scheme == "": abs_path = os.path.abspath(uri) if not os.path.exists(abs_path) and check_existence: raise FileNotFoundError( f'Could not find local file "{abs_path}" when importing "{uri}".\n' f'Make sure paths are relative to "{os.getcwd()}" or use absolute paths.\n' - f'If this is not a local file, please include the scheme (s3:/, gs:/, ftp://, etc.).') - return f'file://{quote(abs_path)}' + f"If this is not a local file, please include the scheme (s3:/, gs:/, ftp://, etc.)." + ) + return f"file://{quote(abs_path)}" return uri def _setBatchSystemEnvVars(self) -> None: @@ -1267,15 +1388,19 @@ def _setBatchSystemEnvVars(self) -> None: def _serialiseEnv(self) -> None: """Put the environment in a globally accessible pickle file.""" # Dump out the environment of this process in the environment pickle file. - with self._jobStore.write_shared_file_stream("environment.pickle") as fileHandle: + with self._jobStore.write_shared_file_stream( + "environment.pickle" + ) as fileHandle: pickle.dump(dict(os.environ), fileHandle, pickle.HIGHEST_PROTOCOL) logger.debug("Written the environment for the jobs to the environment file") def _cacheAllJobs(self) -> None: """Download all jobs in the current job store into self.jobCache.""" - logger.debug('Caching all jobs in job store') - self._jobCache = {jobDesc.jobStoreID: jobDesc for jobDesc in self._jobStore.jobs()} - logger.debug(f'{len(self._jobCache)} jobs downloaded.') + logger.debug("Caching all jobs in job store") + self._jobCache = { + jobDesc.jobStoreID: jobDesc for jobDesc in self._jobStore.jobs() + } + logger.debug(f"{len(self._jobCache)} jobs downloaded.") def _cacheJob(self, job: "JobDescription") -> None: """ @@ -1297,14 +1422,22 @@ def getToilWorkDir(configWorkDir: Optional[str] = None) -> str: :param configWorkDir: Value passed to the program using the --workDir flag :return: Path to the Toil work directory, constant across all machines """ - workDir = os.getenv('TOIL_WORKDIR_OVERRIDE') or configWorkDir or os.getenv( - 'TOIL_WORKDIR') or tempfile.gettempdir() + workDir = ( + os.getenv("TOIL_WORKDIR_OVERRIDE") + or configWorkDir + or os.getenv("TOIL_WORKDIR") + or tempfile.gettempdir() + ) if not os.path.exists(workDir): - raise RuntimeError(f'The directory specified by --workDir or TOIL_WORKDIR ({workDir}) does not exist.') + raise RuntimeError( + f"The directory specified by --workDir or TOIL_WORKDIR ({workDir}) does not exist." + ) return workDir @classmethod - def get_toil_coordination_dir(cls, config_work_dir: Optional[str], config_coordination_dir: Optional[str]) -> str: + def get_toil_coordination_dir( + cls, config_work_dir: Optional[str], config_coordination_dir: Optional[str] + ) -> str: """ Return a path to a writable directory, which will be in memory if convenient. Ought to be used for file locking and coordination. @@ -1326,32 +1459,43 @@ def get_toil_coordination_dir(cls, config_work_dir: Optional[str], config_coordi # succeeds. coordination_dir: Optional[str] = ( # First try an override env var - os.getenv('TOIL_COORDINATION_DIR_OVERRIDE') or - # Then the value from the config - config_coordination_dir or - # Then a normal env var - # TODO: why/how would this propagate when not using single machine? - os.getenv('TOIL_COORDINATION_DIR') or - # Then try a `toil` subdirectory of the XDG runtime directory - # (often /var/run/users/). But only if we are actually in a - # session that has the env var set. Otherwise it might belong to a - # different set of sessions and get cleaned up out from under us - # when that session ends. - # We don't think Slurm XDG sessions are trustworthy, depending on - # the cluster's PAM configuration, so don't use them. - ('XDG_RUNTIME_DIR' in os.environ and 'SLURM_JOBID' not in os.environ and try_path( - os.path.join(os.environ['XDG_RUNTIME_DIR'], 'toil'))) or - # Try under /run/lock. It might be a temp dir style sticky directory. - try_path('/run/lock') or - # Try all possible temp directories, falling back to the current working - # directory - tempfile.gettempdir() or - # Finally, fall back on the work dir and hope it's a legit filesystem. - cls.getToilWorkDir(config_work_dir) + os.getenv("TOIL_COORDINATION_DIR_OVERRIDE") + or + # Then the value from the config + config_coordination_dir + or + # Then a normal env var + # TODO: why/how would this propagate when not using single machine? + os.getenv("TOIL_COORDINATION_DIR") + or + # Then try a `toil` subdirectory of the XDG runtime directory + # (often /var/run/users/). But only if we are actually in a + # session that has the env var set. Otherwise it might belong to a + # different set of sessions and get cleaned up out from under us + # when that session ends. + # We don't think Slurm XDG sessions are trustworthy, depending on + # the cluster's PAM configuration, so don't use them. + ( + "XDG_RUNTIME_DIR" in os.environ + and "SLURM_JOBID" not in os.environ + and try_path(os.path.join(os.environ["XDG_RUNTIME_DIR"], "toil")) + ) + or + # Try under /run/lock. It might be a temp dir style sticky directory. + try_path("/run/lock") + or + # Try all possible temp directories, falling back to the current working + # directory + tempfile.gettempdir() + or + # Finally, fall back on the work dir and hope it's a legit filesystem. + cls.getToilWorkDir(config_work_dir) ) if coordination_dir is None: - raise RuntimeError("Could not determine a coordination directory by any method!") + raise RuntimeError( + "Could not determine a coordination directory by any method!" + ) return coordination_dir @@ -1365,11 +1509,13 @@ def get_workflow_path_component(workflow_id: str) -> str: :param workflow_id: The ID of the current Toil workflow. """ - return "toilwf-" + str(uuid.uuid5(uuid.UUID(getNodeID()), workflow_id)).replace('-', '') + return "toilwf-" + str(uuid.uuid5(uuid.UUID(getNodeID()), workflow_id)).replace( + "-", "" + ) @classmethod def getLocalWorkflowDir( - cls, workflowID: str, configWorkDir: Optional[str] = None + cls, workflowID: str, configWorkDir: Optional[str] = None ) -> str: """ Return the directory where worker directories and the cache will be located for this workflow on this machine. @@ -1382,7 +1528,9 @@ def getLocalWorkflowDir( # Create a directory unique to each host in case workDir is on a shared FS. # This prevents workers on different nodes from erasing each other's directories. - workflowDir: str = os.path.join(base, cls.get_workflow_path_component(workflowID)) + workflowDir: str = os.path.join( + base, cls.get_workflow_path_component(workflowID) + ) try: # Directory creation is atomic os.mkdir(workflowDir) @@ -1391,15 +1539,17 @@ def getLocalWorkflowDir( # The directory exists if a previous worker set it up. raise else: - logger.debug('Created the workflow directory for this machine at %s' % workflowDir) + logger.debug( + "Created the workflow directory for this machine at %s" % workflowDir + ) return workflowDir @classmethod def get_local_workflow_coordination_dir( - cls, - workflow_id: str, - config_work_dir: Optional[str], - config_coordination_dir: Optional[str] + cls, + workflow_id: str, + config_work_dir: Optional[str], + config_coordination_dir: Optional[str], ) -> str: """ Return the directory where coordination files should be located for @@ -1433,8 +1583,7 @@ def get_local_workflow_coordination_dir( # Don't let it out if it smells like an unacceptable filesystem for locks ensure_filesystem_lockable( - subdir, - hint="Use --coordinationDir to provide a different location." + subdir, hint="Use --coordinationDir to provide a different location." ) # Return it @@ -1448,23 +1597,31 @@ def _runMainLoop(self, rootJob: "JobDescription") -> Any: """ logProcessContext(self.config) - with RealtimeLogger(self._batchSystem, level=self.options.logLevel if self.options.realTimeLogging else 'INFO'): + with RealtimeLogger( + self._batchSystem, + level=self.options.logLevel if self.options.realTimeLogging else "INFO", + ): # FIXME: common should not import from leader from toil.leader import Leader - return Leader(config=self.config, - batchSystem=self._batchSystem, - provisioner=self._provisioner, - jobStore=self._jobStore, - rootJob=rootJob, - jobCache=self._jobCache).run() + + return Leader( + config=self.config, + batchSystem=self._batchSystem, + provisioner=self._provisioner, + jobStore=self._jobStore, + rootJob=rootJob, + jobCache=self._jobCache, + ).run() def _shutdownBatchSystem(self) -> None: """Shuts down current batch system if it has been created.""" startTime = time.time() - logger.debug('Shutting down batch system ...') + logger.debug("Shutting down batch system ...") self._batchSystem.shutdown() - logger.debug('... finished shutting down the batch system in %s seconds.' - % (time.time() - startTime)) + logger.debug( + "... finished shutting down the batch system in %s seconds." + % (time.time() - startTime) + ) def _assertContextManagerUsed(self) -> None: if not self._inContextManager: @@ -1479,27 +1636,33 @@ def __init__(self, message: str) -> None: class ToilContextManagerException(Exception): def __init__(self) -> None: super().__init__( - 'This method cannot be called outside the "with Toil(...)" context manager.') + 'This method cannot be called outside the "with Toil(...)" context manager.' + ) class ToilMetrics: - def __init__(self, bus: MessageBus, provisioner: Optional["AbstractProvisioner"] = None) -> None: + def __init__( + self, bus: MessageBus, provisioner: Optional["AbstractProvisioner"] = None + ) -> None: clusterName = "none" region = "us-west-2" if provisioner is not None: clusterName = str(provisioner.clusterName) if provisioner._zone is not None: - if provisioner.cloud == 'aws': + if provisioner.cloud == "aws": # lazy import to avoid AWS dependency if the aws extra is not installed from toil.lib.aws import zone_to_region + # Remove AZ name region = zone_to_region(provisioner._zone) else: region = provisioner._zone - registry = lookupEnvVar(name='docker registry', - envName='TOIL_DOCKER_REGISTRY', - defaultValue=dockerRegistry) + registry = lookupEnvVar( + name="docker registry", + envName="TOIL_DOCKER_REGISTRY", + defaultValue=dockerRegistry, + ) self.mtailImage = f"{registry}/toil-mtail:{dockerTag}" self.grafanaImage = f"{registry}/toil-grafana:{dockerTag}" @@ -1516,14 +1679,21 @@ def __init__(self, bus: MessageBus, provisioner: Optional["AbstractProvisioner"] try: self.mtailProc: Optional[subprocess.Popen[bytes]] = subprocess.Popen( - ["docker", "run", - "--rm", - "--interactive", - "--net=host", - "--name", "toil_mtail", - "-p", "3903:3903", - self.mtailImage], - stdin=subprocess.PIPE, stdout=subprocess.PIPE) + [ + "docker", + "run", + "--rm", + "--interactive", + "--net=host", + "--name", + "toil_mtail", + "-p", + "3903:3903", + self.mtailImage, + ], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + ) except subprocess.CalledProcessError: logger.warning("Couldn't start toil metrics server.") self.mtailProc = None @@ -1536,20 +1706,32 @@ def __init__(self, bus: MessageBus, provisioner: Optional["AbstractProvisioner"] if not provisioner: try: self.nodeExporterProc = subprocess.Popen( - ["docker", "run", - "--rm", - "--net=host", - "-p", "9100:9100", - "-v", "/proc:/host/proc", - "-v", "/sys:/host/sys", - "-v", "/:/rootfs", - "quay.io/prometheus/node-exporter:v1.3.1", - "-collector.procfs", "/host/proc", - "-collector.sysfs", "/host/sys", - "-collector.filesystem.ignored-mount-points", - "^/(sys|proc|dev|host|etc)($|/)"]) + [ + "docker", + "run", + "--rm", + "--net=host", + "-p", + "9100:9100", + "-v", + "/proc:/host/proc", + "-v", + "/sys:/host/sys", + "-v", + "/:/rootfs", + "quay.io/prometheus/node-exporter:v1.3.1", + "-collector.procfs", + "/host/proc", + "-collector.sysfs", + "/host/sys", + "-collector.filesystem.ignored-mount-points", + "^/(sys|proc|dev|host|etc)($|/)", + ] + ) except subprocess.CalledProcessError: - logger.warning("Couldn't start node exporter, won't get RAM and CPU usage for dashboard.") + logger.warning( + "Couldn't start node exporter, won't get RAM and CPU usage for dashboard." + ) except KeyboardInterrupt: if self.nodeExporterProc is not None: self.nodeExporterProc.terminate() @@ -1566,23 +1748,32 @@ def __init__(self, bus: MessageBus, provisioner: Optional["AbstractProvisioner"] JobMissingMessage: self.logMissingJob, JobIssuedMessage: self.logIssuedJob, JobFailedMessage: self.logFailedJob, - JobCompletedMessage: self.logCompletedJob + JobCompletedMessage: self.logCompletedJob, } # The only way to make this inteligible to MyPy is to wrap the dict in # a function that can cast. - MessageType = TypeVar('MessageType') + MessageType = TypeVar("MessageType") - def get_listener(message_type: Type[MessageType]) -> Callable[[MessageType], None]: + def get_listener( + message_type: type[MessageType], + ) -> Callable[[MessageType], None]: return cast(Callable[[MessageType], None], TARGETS[message_type]) # Then set up the listeners. - self._listeners = [bus.subscribe(message_type, get_listener(message_type)) for message_type in TARGETS.keys()] + self._listeners = [ + bus.subscribe(message_type, get_listener(message_type)) + for message_type in TARGETS.keys() + ] @staticmethod def _containerRunning(containerName: str) -> bool: try: - result = subprocess.check_output(["docker", "inspect", "-f", - "'{{.State.Running}}'", containerName]).decode('utf-8') == "true" + result = ( + subprocess.check_output( + ["docker", "inspect", "-f", "'{{.State.Running}}'", containerName] + ).decode("utf-8") + == "true" + ) except subprocess.CalledProcessError: result = False return result @@ -1594,24 +1785,38 @@ def startDashboard(self, clusterName: str, zone: str) -> None: subprocess.check_call(["docker", "rm", "-f", "toil_prometheus"]) except subprocess.CalledProcessError: pass - subprocess.check_call(["docker", "run", - "--name", "toil_prometheus", - "--net=host", - "-d", - "-p", "9090:9090", - self.prometheusImage, - clusterName, - zone]) + subprocess.check_call( + [ + "docker", + "run", + "--name", + "toil_prometheus", + "--net=host", + "-d", + "-p", + "9090:9090", + self.prometheusImage, + clusterName, + zone, + ] + ) if not self._containerRunning("toil_grafana"): try: subprocess.check_call(["docker", "rm", "-f", "toil_grafana"]) except subprocess.CalledProcessError: pass - subprocess.check_call(["docker", "run", - "--name", "toil_grafana", - "-d", "-p=3000:3000", - self.grafanaImage]) + subprocess.check_call( + [ + "docker", + "run", + "--name", + "toil_grafana", + "-d", + "-p=3000:3000", + self.grafanaImage, + ] + ) except subprocess.CalledProcessError: logger.warning("Could not start prometheus/grafana dashboard.") return @@ -1619,15 +1824,17 @@ def startDashboard(self, clusterName: str, zone: str) -> None: try: self.add_prometheus_data_source() except requests.exceptions.ConnectionError: - logger.debug("Could not add data source to Grafana dashboard - no metrics will be displayed.") + logger.debug( + "Could not add data source to Grafana dashboard - no metrics will be displayed." + ) @retry(errors=[requests.exceptions.ConnectionError]) def add_prometheus_data_source(self) -> None: requests.post( - 'http://localhost:3000/api/datasources', - auth=('admin', 'admin'), + "http://localhost:3000/api/datasources", + auth=("admin", "admin"), data='{"name":"DS_PROMETHEUS","type":"prometheus", "url":"http://localhost:9090", "access":"direct"}', - headers={'content-type': 'application/json', "access": "direct"} + headers={"content-type": "application/json", "access": "direct"}, ) def log(self, message: str) -> None: @@ -1638,14 +1845,10 @@ def log(self, message: str) -> None: # Note: The mtail configuration (dashboard/mtail/toil.mtail) depends on these messages # remaining intact - def logClusterSize( - self, m: ClusterSizeMessage - ) -> None: + def logClusterSize(self, m: ClusterSizeMessage) -> None: self.log("current_size '%s' %i" % (m.instance_type, m.current_size)) - def logClusterDesiredSize( - self, m: ClusterDesiredSizeMessage - ) -> None: + def logClusterDesiredSize(self, m: ClusterDesiredSizeMessage) -> None: self.log("desired_size '%s' %i" % (m.instance_type, m.desired_size)) def logQueueSize(self, m: QueueSizeMessage) -> None: @@ -1665,13 +1868,13 @@ def logCompletedJob(self, m: JobCompletedMessage) -> None: def shutdown(self) -> None: if self.mtailProc is not None: - logger.debug('Stopping mtail') + logger.debug("Stopping mtail") self.mtailProc.kill() - logger.debug('Stopped mtail') + logger.debug("Stopped mtail") if self.nodeExporterProc is not None: - logger.debug('Stopping node exporter') + logger.debug("Stopping node exporter") self.nodeExporterProc.kill() - logger.debug('Stopped node exporter') + logger.debug("Stopped node exporter") self._listeners = [] @@ -1679,7 +1882,7 @@ def cacheDirName(workflowID: str) -> str: """ :return: Name of the cache directory. """ - return f'cache-{workflowID}' + return f"cache-{workflowID}" def getDirSizeRecursively(dirPath: str) -> int: @@ -1705,8 +1908,16 @@ def getDirSizeRecursively(dirPath: str) -> int: dirPath = os.path.abspath(dirPath) try: - return int(subprocess.check_output(['du', '-s', dirPath], - env=dict(os.environ, BLOCKSIZE='512')).decode('utf-8').split()[0]) * 512 + return ( + int( + subprocess.check_output( + ["du", "-s", dirPath], env=dict(os.environ, BLOCKSIZE="512") + ) + .decode("utf-8") + .split()[0] + ) + * 512 + ) # The environment variable 'BLOCKSIZE'='512' is set instead of the much cleaner # --block-size=1 because Apple can't handle it. except (OSError, subprocess.CalledProcessError): @@ -1721,7 +1932,7 @@ def getDirSizeRecursively(dirPath: str) -> int: return total_size -def getFileSystemSize(dirPath: str) -> Tuple[int, int]: +def getFileSystemSize(dirPath: str) -> tuple[int, int]: """ Return the free space, and total size of the file system hosting `dirPath`. @@ -1729,7 +1940,7 @@ def getFileSystemSize(dirPath: str) -> Tuple[int, int]: :return: free space and total size of file system """ if not os.path.exists(dirPath): - raise RuntimeError(f'Could not find dir size for non-existent path: {dirPath}') + raise RuntimeError(f"Could not find dir size for non-existent path: {dirPath}") diskStats = os.statvfs(dirPath) freeSpace = diskStats.f_frsize * diskStats.f_bavail diskSize = diskStats.f_frsize * diskStats.f_blocks diff --git a/src/toil/cwl/__init__.py b/src/toil/cwl/__init__.py index 7d18702a00..56fad0438b 100644 --- a/src/toil/cwl/__init__.py +++ b/src/toil/cwl/__init__.py @@ -13,8 +13,8 @@ # limitations under the License. import sys from functools import lru_cache +from importlib.metadata import PackageNotFoundError, version -from importlib.metadata import version, PackageNotFoundError from toil.version import cwltool_version diff --git a/src/toil/cwl/cwltoil.py b/src/toil/cwl/cwltoil.py index 0e61eb476b..65c09ec710 100644 --- a/src/toil/cwl/cwltoil.py +++ b/src/toil/cwl/cwltoil.py @@ -1,4 +1,5 @@ """Implemented support for Common Workflow Language (CWL) for Toil.""" + # Copyright (C) 2015 Curoverse, Inc # Copyright (C) 2015-2021 Regents of the University of California # Copyright (C) 2019-2020 Seven Bridges @@ -33,24 +34,10 @@ import sys import textwrap import uuid +from collections.abc import Iterator, Mapping, MutableMapping, MutableSequence from tempfile import NamedTemporaryFile, TemporaryFile, gettempdir from threading import Thread -from typing import (IO, - Any, - Callable, - Dict, - Iterator, - List, - Mapping, - MutableMapping, - MutableSequence, - Optional, - TextIO, - Tuple, - Type, - TypeVar, - Union, - cast) +from typing import IO, Any, Callable, Literal, Optional, TextIO, TypeVar, Union, cast from urllib.parse import quote, unquote, urlparse, urlsplit import cwl_utils.errors @@ -70,31 +57,36 @@ from cwltool.mpi import MpiConfig from cwltool.mutation import MutationManager from cwltool.pathmapper import MapperEnt, PathMapper -from cwltool.process import (Process, - add_sizes, - compute_checksums, - fill_in_defaults, - shortname) +from cwltool.process import ( + Process, + add_sizes, + compute_checksums, + fill_in_defaults, + shortname, +) from cwltool.secrets import SecretStore -from cwltool.software_requirements import (DependenciesConfiguration, - get_container_from_software_requirements) -from cwltool.stdfsaccess import StdFsAccess, abspath -from cwltool.utils import (CWLObjectType, - CWLOutputType, - DirectoryType, - adjustDirObjs, - aslist, - downloadHttpFile, - get_listing, - normalizeFilesDirs, - visit_class) from cwltool.singularity import SingularityCommandLineJob +from cwltool.software_requirements import ( + DependenciesConfiguration, + get_container_from_software_requirements, +) +from cwltool.stdfsaccess import StdFsAccess, abspath +from cwltool.utils import ( + CWLObjectType, + CWLOutputType, + DirectoryType, + adjustDirObjs, + aslist, + downloadHttpFile, + get_listing, + normalizeFilesDirs, + visit_class, +) from ruamel.yaml.comments import CommentedMap, CommentedSeq from schema_salad.avro.schema import Names from schema_salad.exceptions import ValidationException from schema_salad.ref_resolver import file_uri, uri_file_path from schema_salad.sourceline import SourceLine -from typing_extensions import Literal from toil.batchSystems.abstractBatchSystem import InsufficientSystemResources from toil.batchSystems.registry import DEFAULT_BATCH_SYSTEM @@ -104,17 +96,24 @@ from toil.provisioners.clusterScaler import JobTooBigError check_cwltool_version() -from toil.cwl.utils import (CWL_UNSUPPORTED_REQUIREMENT_EXCEPTION, - CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE, - download_structure, - get_from_structure, - visit_cwl_class_and_reduce) +from toil.cwl.utils import ( + CWL_UNSUPPORTED_REQUIREMENT_EXCEPTION, + CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE, + download_structure, + get_from_structure, + visit_cwl_class_and_reduce, +) from toil.exceptions import FailedJobsException from toil.fileStores import FileID from toil.fileStores.abstractFileStore import AbstractFileStore from toil.job import AcceleratorRequirement, Job, Promise, Promised, unwrap -from toil.jobStores.abstractJobStore import (AbstractJobStore, NoSuchFileException, LocatorException, - InvalidImportExportUrlException, UnimplementedURLException) +from toil.jobStores.abstractJobStore import ( + AbstractJobStore, + InvalidImportExportUrlException, + LocatorException, + NoSuchFileException, + UnimplementedURLException, +) from toil.jobStores.fileJobStore import FileJobStore from toil.jobStores.utils import JobStoreUnavailableException, generate_locator from toil.lib.io import mkdtemp @@ -150,7 +149,7 @@ def cwltoil_was_removed() -> None: # output object to the correct key of the input object. -class UnresolvedDict(Dict[Any, Any]): +class UnresolvedDict(dict[Any, Any]): """Tag to indicate a dict contains promises that must be resolved.""" @@ -185,7 +184,7 @@ def filter_skip_null(name: str, value: Any) -> Any: return value -def _filter_skip_null(value: Any, err_flag: List[bool]) -> Any: +def _filter_skip_null(value: Any, err_flag: list[bool]) -> Any: """ Private implementation for recursively filtering out SkipNull objects from 'value'. @@ -234,7 +233,9 @@ def ensure_no_collisions( seen_names.add(wanted_name) -def try_prepull(cwl_tool_uri: str, runtime_context: cwltool.context.RuntimeContext, batchsystem: str) -> None: +def try_prepull( + cwl_tool_uri: str, runtime_context: cwltool.context.RuntimeContext, batchsystem: str +) -> None: """ Try to prepull all containers in a CWL workflow with Singularity or Docker. This will not prepull the default container specified on the command line. @@ -246,7 +247,15 @@ def try_prepull(cwl_tool_uri: str, runtime_context: cwltool.context.RuntimeConte if runtime_context.singularity: if "CWL_SINGULARITY_CACHE" in os.environ: logger.info("Prepulling the workflow's containers with Singularity...") - call_command(["cwl-docker-extract", "--singularity", "--dir", os.environ['CWL_SINGULARITY_CACHE'], cwl_tool_uri]) + call_command( + [ + "cwl-docker-extract", + "--singularity", + "--dir", + os.environ["CWL_SINGULARITY_CACHE"], + cwl_tool_uri, + ] + ) elif not runtime_context.user_space_docker_cmd and not runtime_context.podman: # For udocker and podman prefetching is unimplemented # This is docker @@ -266,8 +275,8 @@ class Conditional: def __init__( self, expression: Optional[str] = None, - outputs: Union[Dict[str, CWLOutputType], None] = None, - requirements: Optional[List[CWLObjectType]] = None, + outputs: Union[dict[str, CWLOutputType], None] = None, + requirements: Optional[list[CWLObjectType]] = None, container_engine: str = "docker", ): """ @@ -312,7 +321,7 @@ def is_false(self, job: CWLObjectType) -> bool: "'%s' evaluated to a non-boolean value" % self.expression ) - def skipped_outputs(self) -> Dict[str, SkipNull]: + def skipped_outputs(self) -> dict[str, SkipNull]: """Generate a dict of SkipNull objects corresponding to the output structure.""" outobj = {} @@ -332,14 +341,14 @@ def sn(n: Any) -> str: class ResolveSource: """Apply linkMerge and pickValue operators to values coming into a port.""" - promise_tuples: Union[List[Tuple[str, Promise]], Tuple[str, Promise]] + promise_tuples: Union[list[tuple[str, Promise]], tuple[str, Promise]] def __init__( self, name: str, - input: Dict[str, CWLObjectType], + input: dict[str, CWLObjectType], source_key: str, - promises: Dict[str, Job], + promises: dict[str, Job], ): """ Construct a container object. @@ -398,7 +407,7 @@ def resolve(self) -> Any: ) else: name, rv = self.promise_tuples - result = cast(Dict[str, Any], rv).get(name) + result = cast(dict[str, Any], rv).get(name) result = self.pick_value(result) result = filter_skip_null(self.name, result) @@ -406,7 +415,7 @@ def resolve(self) -> Any: def link_merge( self, values: CWLObjectType - ) -> Union[List[CWLOutputType], CWLOutputType]: + ) -> Union[list[CWLOutputType], CWLOutputType]: """ Apply linkMerge operator to `values` object. @@ -419,7 +428,7 @@ def link_merge( return values elif link_merge_type == "merge_flattened": - result: List[CWLOutputType] = [] + result: list[CWLOutputType] = [] for v in values: if isinstance(v, MutableSequence): result.extend(v) @@ -432,7 +441,7 @@ def link_merge( f"Unsupported linkMerge '{link_merge_type}' on {self.name}." ) - def pick_value(self, values: Union[List[Union[str, SkipNull]], Any]) -> Any: + def pick_value(self, values: Union[list[Union[str, SkipNull]], Any]) -> Any: """ Apply pickValue operator to `values` object. @@ -500,7 +509,7 @@ class StepValueFrom: """ def __init__( - self, expr: str, source: Any, req: List[CWLObjectType], container_engine: str + self, expr: str, source: Any, req: list[CWLObjectType], container_engine: str ): """ Instantiate an object to carry all know about this valueFrom expression. @@ -632,7 +641,7 @@ def resolve(self) -> Any: def resolve_dict_w_promises( dict_w_promises: Union[ - UnresolvedDict, CWLObjectType, Dict[str, Union[str, StepValueFrom]] + UnresolvedDict, CWLObjectType, dict[str, Union[str, StepValueFrom]] ], file_store: Optional[AbstractFileStore] = None, ) -> CWLObjectType: @@ -687,7 +696,7 @@ class ToilPathMapper(PathMapper): def __init__( self, - referenced_files: List[CWLObjectType], + referenced_files: list[CWLObjectType], basedir: str, stagedir: str, separateDirs: bool = True, @@ -925,7 +934,7 @@ def visit( # Keep recursing self.visitlisting( - cast(List[CWLObjectType], obj.get("listing", [])), + cast(list[CWLObjectType], obj.get("listing", [])), tgt, basedir, copy=copy, @@ -965,7 +974,9 @@ def visit( # URI for a local file it downloaded. if self.get_file: deref = self.get_file( - location, obj.get("streamable", False), self.streaming_allowed + location, + obj.get("streamable", False), + self.streaming_allowed, ) else: deref = ab @@ -1001,7 +1012,7 @@ def visit( # Handle all secondary files that need to be next to this one. self.visitlisting( - cast(List[CWLObjectType], obj.get("secondaryFiles", [])), + cast(list[CWLObjectType], obj.get("secondaryFiles", [])), stagedir, basedir, copy=copy, @@ -1027,17 +1038,27 @@ def run_jobs( ) -> None: """run_jobs from SingleJobExecutor, but not in a top level runtime context.""" runtime_context.toplevel = False - if isinstance(process, cwltool.command_line_tool.CommandLineTool) and isinstance(process.make_job_runner(runtime_context), SingularityCommandLineJob): + if isinstance( + process, cwltool.command_line_tool.CommandLineTool + ) and isinstance( + process.make_job_runner(runtime_context), SingularityCommandLineJob + ): # Set defaults for singularity cache environment variables, similar to what we do in wdltoil # Use the same place as the default singularity cache directory singularity_cache = os.path.join(os.path.expanduser("~"), ".singularity") - os.environ['SINGULARITY_CACHEDIR'] = os.environ.get("SINGULARITY_CACHEDIR", singularity_cache) + os.environ["SINGULARITY_CACHEDIR"] = os.environ.get( + "SINGULARITY_CACHEDIR", singularity_cache + ) # If singularity is detected, prepull the image to ensure locking - (docker_req, docker_is_req) = process.get_requirement(feature="DockerRequirement") - with global_mutex(os.environ['SINGULARITY_CACHEDIR'], 'toil_singularity_cache_mutex'): + (docker_req, docker_is_req) = process.get_requirement( + feature="DockerRequirement" + ) + with global_mutex( + os.environ["SINGULARITY_CACHEDIR"], "toil_singularity_cache_mutex" + ): SingularityCommandLineJob.get_image( - dockerRequirement=cast(Dict[str, str], docker_req), + dockerRequirement=cast(dict[str, str], docker_req), pull_image=runtime_context.pull_image, force_pull=runtime_context.force_docker_pull, tmp_outdir_prefix=runtime_context.tmp_outdir_prefix, @@ -1057,7 +1078,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: # Reserve a spot for the Toil job that ends up executing this tool. self._toil_job: Optional[Job] = None # Remember path mappers we have used so we can interrogate them later to find out what the job mapped. - self._path_mappers: List[cwltool.pathmapper.PathMapper] = [] + self._path_mappers: list[cwltool.pathmapper.PathMapper] = [] def connect_toil_job(self, job: Job) -> None: """ @@ -1069,7 +1090,7 @@ def connect_toil_job(self, job: Job) -> None: def make_path_mapper( self, - reffiles: List[Any], + reffiles: list[Any], stagedir: str, runtimeContext: cwltool.context.RuntimeContext, separateDirs: bool, @@ -1127,13 +1148,15 @@ def _initialworkdir( # Make a table of all the places we mapped files to when downloading the inputs. # We want to hint which host paths and container (if any) paths correspond - host_and_job_paths: List[Tuple[str, str]] = [] + host_and_job_paths: list[tuple[str, str]] = [] for pm in self._path_mappers: for _, mapper_entry in pm.items_exclude_children(): # We know that mapper_entry.target as seen by the task is # mapper_entry.resolved on the host. - host_and_job_paths.append((mapper_entry.resolved, mapper_entry.target)) + host_and_job_paths.append( + (mapper_entry.resolved, mapper_entry.target) + ) # Notice that we have downloaded our inputs. Explain which files # those are here and what the task will expect to call them. @@ -1165,7 +1188,7 @@ def toil_make_tool( # URI instead of raising an error right away, in case it is optional. MISSING_FILE = "missing://" -DirectoryContents = Dict[str, Union[str, "DirectoryContents"]] +DirectoryContents = dict[str, Union[str, "DirectoryContents"]] def check_directory_dict_invariants(contents: DirectoryContents) -> None: @@ -1187,7 +1210,7 @@ def check_directory_dict_invariants(contents: DirectoryContents) -> None: def decode_directory( dir_path: str, -) -> Tuple[DirectoryContents, Optional[str], str]: +) -> tuple[DirectoryContents, Optional[str], str]: """ Decode a directory from a "toildir:" path to a directory (or a file in it). @@ -1262,7 +1285,7 @@ def __init__( # they know what will happen. # Also maps files and directories from external URLs to downloaded # locations. - self.dir_to_download: Dict[str, str] = {} + self.dir_to_download: dict[str, str] = {} super().__init__(basedir) @@ -1385,14 +1408,16 @@ def download_to(url: str, dest: str) -> None: destination = super()._abs(destination) return destination - def glob(self, pattern: str) -> List[str]: + def glob(self, pattern: str) -> list[str]: parse = urlparse(pattern) if parse.scheme == "file": pattern = os.path.abspath(unquote(parse.path)) elif parse.scheme == "": pattern = os.path.abspath(pattern) else: - raise RuntimeError(f"Cannot efficiently support globbing on {parse.scheme} URIs") + raise RuntimeError( + f"Cannot efficiently support globbing on {parse.scheme} URIs" + ) # Actually do the glob return [schema_salad.ref_resolver.file_uri(f) for f in glob.glob(pattern)] @@ -1429,12 +1454,12 @@ def open(self, fn: str, mode: str) -> IO[Any]: else: # This should be supported by a job store. byte_stream = AbstractJobStore.open_url(fn) - if 'b' in mode: + if "b" in mode: # Pass stream along in binary return byte_stream else: # Wrap it in a text decoder - return io.TextIOWrapper(byte_stream, encoding='utf-8') + return io.TextIOWrapper(byte_stream, encoding="utf-8") def exists(self, path: str) -> bool: """Test for file existence.""" @@ -1541,7 +1566,7 @@ def isdir(self, fn: str) -> bool: logger.debug("AbstractJobStore said: %s", status) return status - def listdir(self, fn: str) -> List[str]: + def listdir(self, fn: str) -> list[str]: # This needs to return full URLs for everything in the directory. # URLs are not allowed to end in '/', even for subdirectories. logger.debug("ToilFsAccess listing %s", fn) @@ -1562,7 +1587,9 @@ def listdir(self, fn: str) -> List[str]: if got is None: raise RuntimeError(f"Cannot list nonexistent directory: {fn}") if isinstance(got, str): - raise RuntimeError(f"Cannot list file or dubdirectory of a file: {fn}") + raise RuntimeError( + f"Cannot list file or dubdirectory of a file: {fn}" + ) here = got # List all the things in here and make full URIs to them return [os.path.join(fn, k) for k in here.keys()] @@ -1572,7 +1599,7 @@ def listdir(self, fn: str) -> List[str]: for entry in AbstractJobStore.list_url(fn) ] - def join(self, path, *paths): # type: (str, *str) -> str + def join(self, path: str, *paths: str) -> str: # This falls back on os.path.join return super().join(path, *paths) @@ -1585,12 +1612,12 @@ def realpath(self, fn: str) -> str: def toil_get_file( file_store: AbstractFileStore, - index: Dict[str, str], - existing: Dict[str, str], + index: dict[str, str], + existing: dict[str, str], uri: str, streamable: bool = False, streaming_allowed: bool = True, - pipe_threads: Optional[List[Tuple[Thread, int]]] = None, + pipe_threads: Optional[list[tuple[Thread, int]]] = None, ) -> str: """ Set up the given file or directory from the Toil jobstore at a file URI @@ -1691,9 +1718,7 @@ def write_to_pipe( and streamable and not isinstance(file_store.jobStore, FileJobStore) ): - logger.debug( - "Streaming file %s", uri - ) + logger.debug("Streaming file %s", uri) src_path = file_store.getLocalTempFileName() os.mkfifo(src_path) th = ExceptionalThread( @@ -1715,30 +1740,29 @@ def write_to_pipe( if uri.startswith("toilfile:"): # Download from the file store file_store_id = FileID.unpack(uri[len("toilfile:") :]) - src_path = file_store.readGlobalFile( - file_store_id, symlink=True - ) + src_path = file_store.readGlobalFile(file_store_id, symlink=True) else: # Download from the URI via the job store. # Figure out where it goes. src_path = file_store.getLocalTempFileName() # Open that path exclusively to make sure we created it - with open(src_path, 'xb') as fh: + with open(src_path, "xb") as fh: # Download into the file - size, executable = AbstractJobStore.read_from_url(uri, fh) - if executable: - # Set the execute bit in the file's permissions - os.chmod(src_path, os.stat(src_path).st_mode | stat.S_IXUSR) + size, executable = AbstractJobStore.read_from_url(uri, fh) + if executable: + # Set the execute bit in the file's permissions + os.chmod(src_path, os.stat(src_path).st_mode | stat.S_IXUSR) index[src_path] = uri existing[uri] = src_path return schema_salad.ref_resolver.file_uri(src_path) + def write_file( writeFunc: Callable[[str], FileID], - index: Dict[str, str], - existing: Dict[str, str], + index: dict[str, str], + existing: dict[str, str], file_uri: str, ) -> str: """ @@ -1786,13 +1810,13 @@ def path_to_loc(obj: CWLObjectType) -> None: def import_files( import_function: Callable[[str], FileID], fs_access: StdFsAccess, - fileindex: Dict[str, str], - existing: Dict[str, str], + fileindex: dict[str, str], + existing: dict[str, str], cwl_object: Optional[CWLObjectType], mark_broken: bool = False, skip_remote: bool = False, bypass_file_store: bool = False, - log_level: int = logging.DEBUG + log_level: int = logging.DEBUG, ) -> None: """ Prepare all files and directories. @@ -1874,7 +1898,7 @@ def import_and_log(url: str) -> FileID: def visit_file_or_directory_down( rec: CWLObjectType, - ) -> Optional[List[CWLObjectType]]: + ) -> Optional[list[CWLObjectType]]: """ Visit each CWL File or Directory on the way down. @@ -1901,7 +1925,7 @@ def visit_file_or_directory_down( ensure_no_collisions(cast(DirectoryType, rec)) # Pull out the old listing, if any - old_listing = cast(Optional[List[CWLObjectType]], rec.get("listing", None)) + old_listing = cast(Optional[list[CWLObjectType]], rec.get("listing", None)) if not cast(str, rec["location"]).startswith("_:"): # This is a thing we can list and not just a literal, so we @@ -1923,8 +1947,8 @@ def visit_file_or_directory_down( def visit_file_or_directory_up( rec: CWLObjectType, - down_result: Optional[List[CWLObjectType]], - child_results: List[DirectoryContents], + down_result: Optional[list[CWLObjectType]], + child_results: list[DirectoryContents], ) -> DirectoryContents: """ For a CWL File or Directory, make sure it is uploaded and it has a @@ -1949,7 +1973,12 @@ def visit_file_or_directory_up( # Upload the file itself, which will adjust its location. upload_file( - import_and_log, fileindex, existing, rec, mark_broken=mark_broken, skip_remote=skip_remote + import_and_log, + fileindex, + existing, + rec, + mark_broken=mark_broken, + skip_remote=skip_remote, ) # Make a record for this file under its name @@ -2053,11 +2082,11 @@ def upload_directory( def upload_file( uploadfunc: Callable[[str], FileID], - fileindex: Dict[str, str], - existing: Dict[str, str], + fileindex: dict[str, str], + existing: dict[str, str], file_metadata: CWLObjectType, mark_broken: bool = False, - skip_remote: bool = False + skip_remote: bool = False, ) -> None: """ Update a file object so that the file will be accessible from another machine. @@ -2092,11 +2121,15 @@ def upload_file( logger.debug("File %s is missing", file_metadata) file_metadata["location"] = location = MISSING_FILE else: - raise cwl_utils.errors.WorkflowException("File is missing: %s" % file_metadata) + raise cwl_utils.errors.WorkflowException( + "File is missing: %s" % file_metadata + ) if location.startswith("file://") or not skip_remote: # This is a local file, or we also need to download and re-upload remote files - file_metadata["location"] = write_file(uploadfunc, fileindex, existing, location) + file_metadata["location"] = write_file( + uploadfunc, fileindex, existing, location + ) logger.debug("Sending file at: %s", file_metadata["location"]) @@ -2109,7 +2142,7 @@ def writeGlobalFileWrapper(file_store: AbstractFileStore, fileuri: str) -> FileI def remove_empty_listings(rec: CWLObjectType) -> None: if rec.get("class") != "Directory": - finddirs = [] # type: List[CWLObjectType] + finddirs: list[CWLObjectType] = [] visit_class(rec, ("Directory",), finddirs.append) for f in finddirs: remove_empty_listings(f) @@ -2129,7 +2162,7 @@ def __init__( cores: Union[float, None] = 1, memory: Union[int, str, None] = "1GiB", disk: Union[int, str, None] = "1MiB", - accelerators: Optional[List[AcceleratorRequirement]] = None, + accelerators: Optional[list[AcceleratorRequirement]] = None, preemptible: Optional[bool] = None, tool_id: Optional[str] = None, parent_name: Optional[str] = None, @@ -2204,10 +2237,10 @@ def run(self, file_store: AbstractFileStore) -> CWLObjectType: def toilStageFiles( toil: Toil, - cwljob: Union[CWLObjectType, List[CWLObjectType]], + cwljob: Union[CWLObjectType, list[CWLObjectType]], outdir: str, destBucket: Union[str, None] = None, - log_level: int = logging.DEBUG + log_level: int = logging.DEBUG, ) -> None: """ Copy input files out of the global file store and update location and path. @@ -2219,7 +2252,7 @@ def toilStageFiles( """ def _collectDirEntries( - obj: Union[CWLObjectType, List[CWLObjectType]] + obj: Union[CWLObjectType, list[CWLObjectType]] ) -> Iterator[CWLObjectType]: if isinstance(obj, dict): if obj.get("class") in ("File", "Directory"): @@ -2301,13 +2334,17 @@ def _realpath( # TODO: Use direct S3 to S3 copy on exports as well file_id_or_contents = ( "toilfile:" - + toil.import_file(file_id_or_contents, symlink=False).pack() + + toil.import_file( + file_id_or_contents, symlink=False + ).pack() ) if file_id_or_contents.startswith("toilfile:"): # This is something we can export # TODO: Do we need to urlencode the parts before sending them to S3? - dest_url = "/".join(s.strip("/") for s in [destBucket, baseName]) + dest_url = "/".join( + s.strip("/") for s in [destBucket, baseName] + ) logger.log(log_level, "Saving %s...", dest_url) toil.export_file( FileID.unpack(file_id_or_contents[len("toilfile:") :]), @@ -2456,7 +2493,7 @@ def __init__( resources={}, mutation_manager=runtime_context.mutation_manager, formatgraph=tool.formatgraph, - make_fs_access=cast(Type[StdFsAccess], runtime_context.make_fs_access), + make_fs_access=cast(type[StdFsAccess], runtime_context.make_fs_access), fs_access=runtime_context.make_fs_access(""), job_script_provider=runtime_context.job_script_provider, timeout=runtime_context.eval_timeout, @@ -2487,7 +2524,7 @@ def __init__( # We use a None requirement and the Toil default applies. memory = None - accelerators: Optional[List[AcceleratorRequirement]] = None + accelerators: Optional[list[AcceleratorRequirement]] = None if req.get("cudaDeviceCount", 0) > 0: # There's a CUDARequirement, which cwltool processed for us # TODO: How is cwltool deciding what value to use between min and max? @@ -2565,7 +2602,7 @@ def __init__( self.step_inputs = self.cwltool.tool["inputs"] self.workdir: str = runtime_context.workdir # type: ignore[attr-defined] - def required_env_vars(self, cwljob: Any) -> Iterator[Tuple[str, str]]: + def required_env_vars(self, cwljob: Any) -> Iterator[tuple[str, str]]: """Yield environment variables from EnvVarRequirement.""" if isinstance(cwljob, dict): if cwljob.get("class") == "EnvVarRequirement": @@ -2577,7 +2614,7 @@ def required_env_vars(self, cwljob: Any) -> Iterator[Tuple[str, str]]: for env_var in cwljob: yield from self.required_env_vars(env_var) - def populate_env_vars(self, cwljob: CWLObjectType) -> Dict[str, str]: + def populate_env_vars(self, cwljob: CWLObjectType) -> dict[str, str]: """ Prepare environment variables necessary at runtime for the job. @@ -2593,9 +2630,9 @@ def populate_env_vars(self, cwljob: CWLObjectType) -> Dict[str, str]: required_env_vars = {} # iterate over EnvVarRequirement env vars, if any for k, v in self.required_env_vars(cwljob): - required_env_vars[ - k - ] = v # will tell cwltool which env vars to take from the environment + required_env_vars[k] = ( + v # will tell cwltool which env vars to take from the environment + ) os.environ[k] = v # needs to actually be populated in the environment as well or # they're not used @@ -2605,7 +2642,7 @@ def populate_env_vars(self, cwljob: CWLObjectType) -> Dict[str, str]: # env var with the same name is found for req in self.cwltool.requirements: if req["class"] == "EnvVarRequirement": - envDefs = cast(List[Dict[str, str]], req["envDef"]) + envDefs = cast(list[dict[str, str]], req["envDef"]) for env_def in envDefs: env_name = env_def["envName"] if env_name in required_env_vars: @@ -2637,7 +2674,7 @@ def run(self, file_store: AbstractFileStore) -> Any: for inp_id in immobile_cwljob_dict.keys(): found = False for field in cast( - List[Dict[str, str]], self.cwltool.inputs_record_schema["fields"] + list[dict[str, str]], self.cwltool.inputs_record_schema["fields"] ): if field["name"] == inp_id: found = True @@ -2652,8 +2689,8 @@ def run(self, file_store: AbstractFileStore) -> Any: functools.partial(remove_empty_listings), ) - index: Dict[str, str] = {} - existing: Dict[str, str] = {} + index: dict[str, str] = {} + existing: dict[str, str] = {} # Prepare the run instructions for cwltool runtime_context = self.runtime_context.copy() @@ -2665,7 +2702,7 @@ def run(self, file_store: AbstractFileStore) -> Any: # will come and grab this function for fetching files from the Toil # file store. pipe_threads is used for keeping track of separate # threads launched to stream files around. - pipe_threads: List[Tuple[Thread, int]] = [] + pipe_threads: list[tuple[Thread, int]] = [] setattr( runtime_context, "toil_get_file", @@ -2699,7 +2736,7 @@ def run(self, file_store: AbstractFileStore) -> Any: # function and a path_mapper type or factory function. runtime_context.make_fs_access = cast( - Type[StdFsAccess], + type[StdFsAccess], functools.partial(ToilFsAccess, file_store=file_store), ) @@ -2712,9 +2749,13 @@ def run(self, file_store: AbstractFileStore) -> Any: # Collect standard output and standard error somewhere if they don't go to files. # We need to keep two FDs to these because cwltool will close what we give it. default_stdout = TemporaryFile() - runtime_context.default_stdout = os.fdopen(os.dup(default_stdout.fileno()), 'wb') + runtime_context.default_stdout = os.fdopen( + os.dup(default_stdout.fileno()), "wb" + ) default_stderr = TemporaryFile() - runtime_context.default_stderr = os.fdopen(os.dup(default_stderr.fileno()), 'wb') + runtime_context.default_stderr = os.fdopen( + os.dup(default_stderr.fileno()), "wb" + ) process_uuid = uuid.uuid4() # noqa F841 started_at = datetime.datetime.now() # noqa F841 @@ -2745,17 +2786,27 @@ def run(self, file_store: AbstractFileStore) -> Any: default_stdout.seek(0, os.SEEK_END) if default_stdout.tell() > 0: default_stdout.seek(0) - file_store.log_user_stream(self.description.unitName + '.stdout', default_stdout) + file_store.log_user_stream( + self.description.unitName + ".stdout", default_stdout + ) if status != "success": default_stdout.seek(0) - logger.error("Failed command standard output:\n%s", default_stdout.read().decode("utf-8", errors="replace")) + logger.error( + "Failed command standard output:\n%s", + default_stdout.read().decode("utf-8", errors="replace"), + ) default_stderr.seek(0, os.SEEK_END) if default_stderr.tell(): default_stderr.seek(0) - file_store.log_user_stream(self.description.unitName + '.stderr', default_stderr) + file_store.log_user_stream( + self.description.unitName + ".stderr", default_stderr + ) if status != "success": default_stderr.seek(0) - logger.error("Failed command standard error:\n%s", default_stderr.read().decode("utf-8", errors="replace")) + logger.error( + "Failed command standard error:\n%s", + default_stderr.read().decode("utf-8", errors="replace"), + ) if status != "success": raise cwl_utils.errors.WorkflowException(status) @@ -2802,13 +2853,14 @@ def get_container_engine(runtime_context: cwltool.context.RuntimeContext) -> str return "singularity" return "docker" + def makeRootJob( tool: Process, jobobj: CWLObjectType, runtime_context: cwltool.context.RuntimeContext, initialized_job_order: CWLObjectType, options: Namespace, - toil: Toil + toil: Toil, ) -> CWLNamedJob: """ Create the Toil root Job object for the CWL tool. Is the same as makeJob() except this also handles import logic. @@ -2823,8 +2875,15 @@ def makeRootJob( import_job = CWLImportJob(initialized_job_order, tool, runtime_context, options) return import_job else: - import_workflow_inputs(toil._jobStore, options, initialized_job_order=initialized_job_order, tool=tool) - rootJob, followOn = makeJob(tool, jobobj, runtime_context, None, None) # toplevel, no name needed + import_workflow_inputs( + toil._jobStore, + options, + initialized_job_order=initialized_job_order, + tool=tool, + ) + rootJob, followOn = makeJob( + tool, jobobj, runtime_context, None, None + ) # toplevel, no name needed rootJob.cwljob = initialized_job_order return rootJob @@ -2836,9 +2895,9 @@ def makeJob( parent_name: Optional[str], conditional: Union[Conditional, None], ) -> Union[ - Tuple["CWLWorkflow", ResolveIndirect], - Tuple[CWLJob, CWLJob], - Tuple[CWLJobWrapper, CWLJobWrapper], + tuple["CWLWorkflow", ResolveIndirect], + tuple[CWLJob, CWLJob], + tuple[CWLJobWrapper, CWLJobWrapper], ]: """ Create the correct Toil Job object for the CWL tool. @@ -2925,16 +2984,16 @@ def __init__( def flat_crossproduct_scatter( self, joborder: CWLObjectType, - scatter_keys: List[str], - outputs: List[Promised[CWLObjectType]], + scatter_keys: list[str], + outputs: list[Promised[CWLObjectType]], postScatterEval: Callable[[CWLObjectType], CWLObjectType], ) -> None: """Cartesian product of the inputs, then flattened.""" scatter_key = shortname(scatter_keys[0]) - for n in range(0, len(cast(List[CWLObjectType], joborder[scatter_key]))): + for n in range(0, len(cast(list[CWLObjectType], joborder[scatter_key]))): updated_joborder = copy.copy(joborder) updated_joborder[scatter_key] = cast( - List[CWLObjectType], joborder[scatter_key] + list[CWLObjectType], joborder[scatter_key] )[n] if len(scatter_keys) == 1: updated_joborder = postScatterEval(updated_joborder) @@ -2955,16 +3014,16 @@ def flat_crossproduct_scatter( def nested_crossproduct_scatter( self, joborder: CWLObjectType, - scatter_keys: List[str], + scatter_keys: list[str], postScatterEval: Callable[[CWLObjectType], CWLObjectType], - ) -> List[Promised[CWLObjectType]]: + ) -> list[Promised[CWLObjectType]]: """Cartesian product of the inputs.""" scatter_key = shortname(scatter_keys[0]) - outputs: List[Promised[CWLObjectType]] = [] - for n in range(0, len(cast(List[CWLObjectType], joborder[scatter_key]))): + outputs: list[Promised[CWLObjectType]] = [] + for n in range(0, len(cast(list[CWLObjectType], joborder[scatter_key]))): updated_joborder = copy.copy(joborder) updated_joborder[scatter_key] = cast( - List[CWLObjectType], joborder[scatter_key] + list[CWLObjectType], joborder[scatter_key] )[n] if len(scatter_keys) == 1: updated_joborder = postScatterEval(updated_joborder) @@ -2985,7 +3044,7 @@ def nested_crossproduct_scatter( ) return outputs - def run(self, file_store: AbstractFileStore) -> List[Promised[CWLObjectType]]: + def run(self, file_store: AbstractFileStore) -> list[Promised[CWLObjectType]]: """Generate the follow on scatter jobs.""" cwljob = resolve_dict_w_promises(self.cwljob, file_store) @@ -2997,7 +3056,7 @@ def run(self, file_store: AbstractFileStore) -> List[Promised[CWLObjectType]]: scatterMethod = self.step.tool.get("scatterMethod", None) if len(scatter) == 1: scatterMethod = "dotproduct" - outputs: List[Promised[CWLObjectType]] = [] + outputs: list[Promised[CWLObjectType]] = [] valueFrom = { shortname(i["id"]): i["valueFrom"] @@ -3029,11 +3088,11 @@ def valueFromFunc(k: str, v: Any) -> Any: if scatterMethod == "dotproduct": for i in range( - 0, len(cast(List[CWLObjectType], cwljob[shortname(scatter[0])])) + 0, len(cast(list[CWLObjectType], cwljob[shortname(scatter[0])])) ): copyjob = copy.copy(cwljob) for sc in [shortname(x) for x in scatter]: - copyjob[sc] = cast(List[CWLObjectType], cwljob[sc])[i] + copyjob[sc] = cast(list[CWLObjectType], cwljob[sc])[i] copyjob = postScatterEval(copyjob) subjob, follow_on = makeJob( tool=self.step.embedded_tool, @@ -3072,7 +3131,7 @@ class CWLGather(Job): def __init__( self, step: cwltool.workflow.WorkflowStep, - outputs: Promised[Union[CWLObjectType, List[CWLObjectType]]], + outputs: Promised[Union[CWLObjectType, list[CWLObjectType]]], ): """Collect our context for later gathering.""" super().__init__(cores=1, memory="1GiB", disk="1MiB", local=True) @@ -3081,24 +3140,24 @@ def __init__( @staticmethod def extract( - obj: Union[CWLObjectType, List[CWLObjectType]], k: str - ) -> Union[CWLOutputType, List[CWLObjectType]]: + obj: Union[CWLObjectType, list[CWLObjectType]], k: str + ) -> Union[CWLOutputType, list[CWLObjectType]]: """ Extract the given key from the obj. If the object is a list, extract it from all members of the list. """ if isinstance(obj, Mapping): - return cast(Union[CWLOutputType, List[CWLObjectType]], obj.get(k)) + return cast(Union[CWLOutputType, list[CWLObjectType]], obj.get(k)) elif isinstance(obj, MutableSequence): - cp: List[CWLObjectType] = [] + cp: list[CWLObjectType] = [] for item in obj: cp.append(cast(CWLObjectType, CWLGather.extract(item, k))) return cp else: - return cast(List[CWLObjectType], []) + return cast(list[CWLObjectType], []) - def run(self, file_store: AbstractFileStore) -> Dict[str, Any]: + def run(self, file_store: AbstractFileStore) -> dict[str, Any]: """Gather all the outputs of the scatter.""" outobj = {} @@ -3109,8 +3168,8 @@ def sn(n: Union[Mapping[str, Any], str]) -> str: return shortname(n) # TODO: MyPy can't understand that this is the type we should get by unwrapping the promise - outputs: Union[CWLObjectType, List[CWLObjectType]] = cast( - Union[CWLObjectType, List[CWLObjectType]], unwrap(self.outputs) + outputs: Union[CWLObjectType, list[CWLObjectType]] = cast( + Union[CWLObjectType, list[CWLObjectType]], unwrap(self.outputs) ) for k in [sn(i) for i in self.step.tool["out"]]: outobj[k] = self.extract(outputs, k) @@ -3192,7 +3251,7 @@ def __init__( def run( self, file_store: AbstractFileStore - ) -> Union[UnresolvedDict, Dict[str, SkipNull]]: + ) -> Union[UnresolvedDict, dict[str, SkipNull]]: """ Convert a CWL Workflow graph into a Toil job graph. @@ -3213,7 +3272,7 @@ def run( # that may be used as a "source" for a step input workflow output # parameter # to: the job that will produce that value. - promises: Dict[str, Job] = {} + promises: dict[str, Job] = {} parent_name = shortname(self.cwlwf.tool["id"]) @@ -3242,7 +3301,7 @@ def run( stepinputs_fufilled = False if stepinputs_fufilled: logger.debug("Ready to make job for workflow step %s", step_id) - jobobj: Dict[ + jobobj: dict[ str, Union[ResolveSource, DefaultWithSource, StepValueFrom] ] = {} @@ -3380,7 +3439,13 @@ class CWLSetupJob(CWLNamedJob): """ Job to take a CWL tool and job order with all files imported and makes a CWLWorkflow as a child to run it. """ - def __init__(self, initialized_job_order: Promised[CWLObjectType], tool: Promised[Process], runtime_context: cwltool.context.RuntimeContext): + + def __init__( + self, + initialized_job_order: Promised[CWLObjectType], + tool: Promised[Process], + runtime_context: cwltool.context.RuntimeContext, + ): super().__init__() self.initialized_job_order = initialized_job_order self.tool = tool @@ -3392,7 +3457,9 @@ def run(self, file_store: AbstractFileStore) -> Any: """ initialized_job_order = unwrap(self.initialized_job_order) tool = unwrap(self.tool) - root_job, _ = makeJob(tool, initialized_job_order, self.runtime_context, None, None) + root_job, _ = makeJob( + tool, initialized_job_order, self.runtime_context, None, None + ) self.addChild(root_job) root_job.cwljob = initialized_job_order @@ -3406,7 +3473,14 @@ class CWLImportJob(CWLNamedJob): This class is only used when runImportsOnWorkers is enabled. """ - def __init__(self, initialized_job_order: CWLObjectType, tool: Process, runtime_context: cwltool.context.RuntimeContext, options: Namespace): + + def __init__( + self, + initialized_job_order: CWLObjectType, + tool: Process, + runtime_context: cwltool.context.RuntimeContext, + options: Namespace, + ): super().__init__(local=False, disk=options.import_workers_disk) self.initialized_job_order = initialized_job_order self.tool = tool @@ -3418,15 +3492,24 @@ def run(self, file_store: AbstractFileStore) -> Any: Import the workflow inputs and then create and run the workflow. :return: Promise of workflow outputs """ - import_workflow_inputs(file_store.jobStore, self.options, self.initialized_job_order, self.tool) - setup_job = CWLSetupJob(self.initialized_job_order, self.tool, self.runtime_context) + import_workflow_inputs( + file_store.jobStore, self.options, self.initialized_job_order, self.tool + ) + setup_job = CWLSetupJob( + self.initialized_job_order, self.tool, self.runtime_context + ) self.addChild(setup_job) return setup_job.rv() -def import_workflow_inputs(jobstore: AbstractJobStore, options: Namespace, initialized_job_order: CWLObjectType, tool: Process) -> None: - fileindex: Dict[str, str] = {} - existing: Dict[str, str] = {} +def import_workflow_inputs( + jobstore: AbstractJobStore, + options: Namespace, + initialized_job_order: CWLObjectType, + tool: Process, +) -> None: + fileindex: dict[str, str] = {} + existing: dict[str, str] = {} # Define something we can call to import a file and get its file # ID. # We cast this because import_file is overloaded depending on if we @@ -3517,7 +3600,7 @@ def rm_unprocessed_secondary_files(job_params: Any) -> None: def filtered_secondary_files( unfiltered_secondary_files: CWLObjectType, -) -> List[CWLObjectType]: +) -> list[CWLObjectType]: """ Remove unprocessed secondary files. @@ -3538,28 +3621,33 @@ def filtered_secondary_files( intermediate_secondary_files = [] final_secondary_files = [] # remove secondary files still containing interpolated strings - for sf in cast(List[CWLObjectType], unfiltered_secondary_files["secondaryFiles"]): + for sf in cast(list[CWLObjectType], unfiltered_secondary_files["secondaryFiles"]): sf_bn = cast(str, sf.get("basename", "")) sf_loc = cast(str, sf.get("location", "")) if ("$(" not in sf_bn) and ("${" not in sf_bn): if ("$(" not in sf_loc) and ("${" not in sf_loc): intermediate_secondary_files.append(sf) else: - logger.debug("Secondary file %s is dropped because it has an uninterpolated location", sf) + logger.debug( + "Secondary file %s is dropped because it has an uninterpolated location", + sf, + ) else: - logger.debug("Secondary file %s is dropped because it has an uninterpolated basename", sf) + logger.debug( + "Secondary file %s is dropped because it has an uninterpolated basename", + sf, + ) # remove secondary files that are not present in the filestore or pointing # to existant things on disk for sf in intermediate_secondary_files: sf_loc = cast(str, sf.get("location", "")) - if ( - sf_loc != MISSING_FILE - or sf.get("class", "") == "Directory" - ): + if sf_loc != MISSING_FILE or sf.get("class", "") == "Directory": # Pass imported files, and all Directories final_secondary_files.append(sf) else: - logger.debug("Secondary file %s is dropped because it is known to be missing", sf) + logger.debug( + "Secondary file %s is dropped because it is known to be missing", sf + ) return final_secondary_files @@ -3664,8 +3752,6 @@ def determine_load_listing( class NoAvailableJobStoreException(Exception): """Indicates that no job store name is available.""" - pass - def generate_default_job_store( batch_system_name: Optional[str], @@ -3748,7 +3834,8 @@ def generate_default_job_store( ] ) -def get_options(args: List[str]) -> Namespace: + +def get_options(args: list[str]) -> Namespace: """ Parse given args and properly add non-Toil arguments into the cwljob of the Namespace. :param args: List of args from command line @@ -3763,7 +3850,7 @@ def get_options(args: List[str]) -> Namespace: return options -def main(args: Optional[List[str]] = None, stdout: TextIO = sys.stdout) -> int: +def main(args: Optional[list[str]] = None, stdout: TextIO = sys.stdout) -> int: """Run the main loop for toil-cwl-runner.""" # Remove cwltool logger's stream handler so it uses Toil's cwllogger.removeHandler(defaultStreamHandler) @@ -3775,12 +3862,16 @@ def main(args: Optional[List[str]] = None, stdout: TextIO = sys.stdout) -> int: # Take care of incompatible arguments related to file imports if options.run_imports_on_workers is True and options.import_workers_disk is None: - logger.error("Commandline arguments --runImportsOnWorkers and --importWorkersDisk must both be set to run file imports on workers.") + logger.error( + "Commandline arguments --runImportsOnWorkers and --importWorkersDisk must both be set to run file imports on workers." + ) return 1 # Do cwltool setup cwltool.main.setup_schema(args=options, custom_schema_callback=None) - tmpdir_prefix = options.tmpdir_prefix = options.tmpdir_prefix or DEFAULT_TMPDIR_PREFIX + tmpdir_prefix = options.tmpdir_prefix = ( + options.tmpdir_prefix or DEFAULT_TMPDIR_PREFIX + ) # We need a workdir for the CWL runtime contexts. if tmpdir_prefix != DEFAULT_TMPDIR_PREFIX: @@ -3928,16 +4019,20 @@ def main(args: Optional[List[str]] = None, stdout: TextIO = sys.stdout) -> int: # See https://github.com/common-workflow-language/cwl-utils/issues/309 try_prepull(uri, runtime_context, toil.config.batchSystem) else: - logger.debug("Not prepulling containers as cwltool extensions are not supported.") + logger.debug( + "Not prepulling containers as cwltool extensions are not supported." + ) options.tool_help = None options.debug = options.logLevel == "DEBUG" - job_order_object, options.basedir, jobloader = cwltool.main.load_job_order( - options, - sys.stdin, - loading_context.fetcher_constructor, - loading_context.overrides_list, - tool_file_uri, + job_order_object, options.basedir, jobloader = ( + cwltool.main.load_job_order( + options, + sys.stdin, + loading_context.fetcher_constructor, + loading_context.overrides_list, + tool_file_uri, + ) ) if options.overrides: loading_context.overrides_list.extend( @@ -4011,9 +4106,9 @@ def main(args: Optional[List[str]] = None, stdout: TextIO = sys.stdout) -> int: shortname(inp["id"]) in initialized_job_order and inp["type"] == "File" ): - cast(CWLObjectType, initialized_job_order[shortname(inp["id"])])[ - "streamable" - ] = inp.get("streamable", False) + cast( + CWLObjectType, initialized_job_order[shortname(inp["id"])] + )["streamable"] = inp.get("streamable", False) # TODO also for nested types that contain streamable Files runtime_context.use_container = not options.no_container @@ -4060,7 +4155,7 @@ def main(args: Optional[List[str]] = None, stdout: TextIO = sys.stdout) -> int: runtime_context=runtime_context, initialized_job_order=initialized_job_order, options=options, - toil=toil + toil=toil, ) except CWL_UNSUPPORTED_REQUIREMENT_EXCEPTION as err: logging.error(err) @@ -4081,7 +4176,7 @@ def main(args: Optional[List[str]] = None, stdout: TextIO = sys.stdout) -> int: outobj, outdir, destBucket=options.destBucket, - log_level=logging.INFO + log_level=logging.INFO, ) logger.info("Stored workflow outputs") @@ -4144,8 +4239,13 @@ def remove_at_id(doc: Any) -> None: else: logging.error(err) return 1 - except (InsufficientSystemResources, LocatorException, InvalidImportExportUrlException, UnimplementedURLException, - JobTooBigError) as err: + except ( + InsufficientSystemResources, + LocatorException, + InvalidImportExportUrlException, + UnimplementedURLException, + JobTooBigError, + ) as err: logging.error(err) return 1 diff --git a/src/toil/cwl/utils.py b/src/toil/cwl/utils.py index c2c5d45aae..beac3e1c49 100644 --- a/src/toil/cwl/utils.py +++ b/src/toil/cwl/utils.py @@ -16,21 +16,11 @@ import logging import os -from pathlib import PurePosixPath import posixpath import stat -from typing import ( - Any, - Callable, - Dict, - Iterable, - List, - MutableMapping, - MutableSequence, - Type, - TypeVar, - Union, -) +from collections.abc import Iterable, MutableMapping, MutableSequence +from pathlib import PurePosixPath +from typing import Any, Callable, TypeVar, Union from toil.fileStores import FileID from toil.fileStores.abstractFileStore import AbstractFileStore @@ -55,7 +45,7 @@ class CWLUnsupportedException(Exception): import cwltool.errors CWL_UNSUPPORTED_REQUIREMENT_EXCEPTION: Union[ - Type[cwltool.errors.UnsupportedRequirement], Type[CWLUnsupportedException] + type[cwltool.errors.UnsupportedRequirement], type[CWLUnsupportedException] ] = cwltool.errors.UnsupportedRequirement except ImportError: CWL_UNSUPPORTED_REQUIREMENT_EXCEPTION = CWLUnsupportedException @@ -92,8 +82,8 @@ def visit_cwl_class_and_reduce( rec: Any, classes: Iterable[str], op_down: Callable[[Any], DownReturnType], - op_up: Callable[[Any, DownReturnType, List[UpReturnType]], UpReturnType], -) -> List[UpReturnType]: + op_up: Callable[[Any, DownReturnType, list[UpReturnType]], UpReturnType], +) -> list[UpReturnType]: """ Apply the given operations to all CWL objects with the given named CWL class. @@ -130,9 +120,12 @@ def visit_cwl_class_and_reduce( return results -DirectoryStructure = Dict[str, Union[str, "DirectoryStructure"]] +DirectoryStructure = dict[str, Union[str, "DirectoryStructure"]] -def get_from_structure(dir_dict: DirectoryStructure, path: str) -> Union[str, DirectoryStructure, None]: + +def get_from_structure( + dir_dict: DirectoryStructure, path: str +) -> Union[str, DirectoryStructure, None]: """ Given a relative path, follow it in the given directory structure. @@ -144,7 +137,7 @@ def get_from_structure(dir_dict: DirectoryStructure, path: str) -> Union[str, Di parts = PurePosixPath(posixpath.normpath(path)).parts if len(parts) == 0: return dir_dict - if parts[0] in ('..', '/'): + if parts[0] in ("..", "/"): raise RuntimeError(f"Path {path} not resolvable in virtual directory") found: Union[str, DirectoryStructure] = dir_dict for part in parts: @@ -161,8 +154,8 @@ def get_from_structure(dir_dict: DirectoryStructure, path: str) -> Union[str, Di def download_structure( file_store: AbstractFileStore, - index: Dict[str, str], - existing: Dict[str, str], + index: dict[str, str], + existing: dict[str, str], dir_dict: DirectoryStructure, into_dir: str, ) -> None: @@ -215,7 +208,9 @@ def download_structure( ) else: # We need to download from some other kind of URL. - size, executable = AbstractJobStore.read_from_url(value, open(dest_path, 'wb')) + size, executable = AbstractJobStore.read_from_url( + value, open(dest_path, "wb") + ) if executable: # Make the written file executable os.chmod(dest_path, os.stat(dest_path).st_mode | stat.S_IXUSR) diff --git a/src/toil/deferred.py b/src/toil/deferred.py index 5f8f93b5db..838d0842d9 100644 --- a/src/toil/deferred.py +++ b/src/toil/deferred.py @@ -20,16 +20,17 @@ import dill - from toil.lib.io import robust_rmtree +from toil.lib.threading import safe_lock, safe_unlock_and_close from toil.realtimeLogger import RealtimeLogger from toil.resource import ModuleDescriptor -from toil.lib.threading import safe_lock, safe_unlock_and_close logger = logging.getLogger(__name__) -class DeferredFunction(namedtuple('DeferredFunction', 'function args kwargs name module')): +class DeferredFunction( + namedtuple("DeferredFunction", "function args kwargs name module") +): """ >>> from collections import defaultdict >>> df = DeferredFunction.create(defaultdict, None, {'x':1}, y=2) @@ -38,6 +39,7 @@ class DeferredFunction(namedtuple('DeferredFunction', 'function args kwargs name >>> df.invoke() == defaultdict(None, x=1, y=2) True """ + @classmethod def create(cls, function, *args, **kwargs): """ @@ -52,21 +54,25 @@ def create(cls, function, *args, **kwargs): # concurrently running jobs when the cache state is loaded from disk. By implication we # should serialize as early as possible. We need to serialize the function as well as its # arguments. - return cls(*list(map(dill.dumps, (function, args, kwargs))), - name=function.__name__, - module=ModuleDescriptor.forModule(function.__module__).globalize()) + return cls( + *list(map(dill.dumps, (function, args, kwargs))), + name=function.__name__, + module=ModuleDescriptor.forModule(function.__module__).globalize(), + ) def invoke(self): """ Invoke the captured function with the captured arguments. """ - logger.debug('Running deferred function %s.', self) + logger.debug("Running deferred function %s.", self) self.module.makeLoadable() - function, args, kwargs = list(map(dill.loads, (self.function, self.args, self.kwargs))) + function, args, kwargs = list( + map(dill.loads, (self.function, self.args, self.kwargs)) + ) return function(*args, **kwargs) def __str__(self): - return f'{self.__class__.__name__}({self.name}, ...)' + return f"{self.__class__.__name__}({self.name}, ...)" __repr__ = __str__ @@ -95,13 +101,13 @@ class DeferredFunctionManager: """ # Define what directory the state directory should actaully be, under the base - STATE_DIR_STEM = 'deferred' + STATE_DIR_STEM = "deferred" # Have a prefix to distinguish our deferred functions from e.g. NFS # "silly rename" files, or other garbage that people put in our # directory - PREFIX = 'func' + PREFIX = "func" # And a suffix to distinguish in-progress from completed files - WIP_SUFFIX = '.tmp' + WIP_SUFFIX = ".tmp" def __init__(self, stateDirBase: str) -> None: """ @@ -124,9 +130,9 @@ def __init__(self, stateDirBase: str) -> None: # We need to get a state file, locked by us and not somebody scanning for abandoned state files. # So we suffix not-yet-ready ones with our suffix - self.stateFD, self.stateFileName = tempfile.mkstemp(dir=self.stateDir, - prefix=self.PREFIX, - suffix=self.WIP_SUFFIX) + self.stateFD, self.stateFileName = tempfile.mkstemp( + dir=self.stateDir, prefix=self.PREFIX, suffix=self.WIP_SUFFIX + ) # Lock the state file. The lock will automatically go away if our process does. try: @@ -134,19 +140,21 @@ def __init__(self, stateDirBase: str) -> None: except OSError as e: if e.errno in (errno.EACCES, errno.EAGAIN): # Someone else locked it even though they should not have. - raise RuntimeError(f"Could not lock deferred function state file {self.stateFileName}") from e + raise RuntimeError( + f"Could not lock deferred function state file {self.stateFileName}" + ) from e else: # Something else went wrong raise # Rename it to remove the suffix - os.rename(self.stateFileName, self.stateFileName[:-len(self.WIP_SUFFIX)]) - self.stateFileName = self.stateFileName[:-len(self.WIP_SUFFIX)] + os.rename(self.stateFileName, self.stateFileName[: -len(self.WIP_SUFFIX)]) + self.stateFileName = self.stateFileName[: -len(self.WIP_SUFFIX)] # Get a Python file object for the file, which we will use to actually use it. # Problem: we can't be readable and writable at the same time. So we need two file objects. - self.stateFileOut = open(self.stateFileName, 'wb') - self.stateFileIn = open(self.stateFileName, 'rb') + self.stateFileOut = open(self.stateFileName, "wb") + self.stateFileIn = open(self.stateFileName, "rb") logger.debug("Opened with own state file %s" % self.stateFileName) @@ -180,6 +188,7 @@ def open(self): self._runOrphanedDeferredFunctions() try: + def defer(deferredFunction): # Just serialize deferred functions one after the other. # If serializing later ones fails, eariler ones will still be intact. @@ -219,7 +228,6 @@ def cleanupWorker(cls, stateDirBase: str) -> None: logger.exception(err) # we tried, lets move on - def _runDeferredFunction(self, deferredFunction): """ Run a deferred function (either our own or someone else's). @@ -231,9 +239,15 @@ def _runDeferredFunction(self, deferredFunction): deferredFunction.invoke() except Exception as err: # Report this in real time, if enabled. Otherwise the only place it ends up is the worker log. - RealtimeLogger.error("Failed to run deferred function %s: %s", repr(deferredFunction), str(err)) + RealtimeLogger.error( + "Failed to run deferred function %s: %s", + repr(deferredFunction), + str(err), + ) except: - RealtimeLogger.error("Failed to run deferred function %s", repr(deferredFunction)) + RealtimeLogger.error( + "Failed to run deferred function %s", repr(deferredFunction) + ) def _runAllDeferredFunctions(self, fileObj): """ @@ -338,7 +352,7 @@ def _runOrphanedDeferredFunctions(self): foundFiles = True # Actually run all the stored deferred functions - fileObj = open(fullFilename, 'rb') + fileObj = open(fullFilename, "rb") self._runAllDeferredFunctions(fileObj) states_handled += 1 @@ -352,4 +366,7 @@ def _runOrphanedDeferredFunctions(self): # Unlock it safe_unlock_and_close(fd) - logger.debug("Ran orphaned deferred functions from %d abandoned state files", states_handled) + logger.debug( + "Ran orphaned deferred functions from %d abandoned state files", + states_handled, + ) diff --git a/src/toil/exceptions.py b/src/toil/exceptions.py index 4d3f4adb86..d083134ee8 100644 --- a/src/toil/exceptions.py +++ b/src/toil/exceptions.py @@ -1,7 +1,7 @@ """Neutral place for exceptions, to break import cycles.""" import logging -from typing import TYPE_CHECKING, List +from typing import TYPE_CHECKING from toil.statsAndLogging import StatsAndLogging @@ -16,7 +16,7 @@ class FailedJobsException(Exception): def __init__( self, job_store: "AbstractJobStore", - failed_jobs: List["JobDescription"], + failed_jobs: list["JobDescription"], exit_code: int = 1, ): """ @@ -36,7 +36,9 @@ def __init__( for job_desc in failed_jobs: if job_desc.logJobStoreFileID: with job_desc.getLogFileHandle(job_store) as f: - self.msg += "\n" + StatsAndLogging.formatLogStream(f, f'Log from job "{job_desc}"') + self.msg += "\n" + StatsAndLogging.formatLogStream( + f, f'Log from job "{job_desc}"' + ) # catch failures to prepare more complex details and only return the basics except Exception: logger.exception("Exception when compiling information about failed jobs") diff --git a/src/toil/fileStores/__init__.py b/src/toil/fileStores/__init__.py index 4dded0c21b..b2f57bd496 100644 --- a/src/toil/fileStores/__init__.py +++ b/src/toil/fileStores/__init__.py @@ -28,7 +28,7 @@ class FileID(str): the job store if unavailable in the ID. """ - def __new__(cls, fileStoreID: str, *args: Any) -> 'FileID': + def __new__(cls, fileStoreID: str, *args: Any) -> "FileID": return super().__new__(cls, fileStoreID) def __init__(self, fileStoreID: str, size: int, executable: bool = False) -> None: @@ -43,18 +43,18 @@ def pack(self) -> str: return f'{self.size}:{"1" if self.executable else "0"}:{self}' @classmethod - def forPath(cls, fileStoreID: str, filePath: str) -> 'FileID': + def forPath(cls, fileStoreID: str, filePath: str) -> "FileID": executable = os.stat(filePath).st_mode & stat.S_IXUSR != 0 return cls(fileStoreID, os.stat(filePath).st_size, executable) @classmethod - def unpack(cls, packedFileStoreID: str) -> 'FileID': + def unpack(cls, packedFileStoreID: str) -> "FileID": """Unpack the result of pack() into a FileID object.""" # Only separate twice in case the FileID itself has colons in it - vals = packedFileStoreID.split(':', 2) + vals = packedFileStoreID.split(":", 2) # Break up the packed value size = int(vals[0]) - executable = (vals[1] == "1") + executable = vals[1] == "1" value = vals[2] # Create the FileID return cls(value, size, executable) diff --git a/src/toil/fileStores/abstractFileStore.py b/src/toil/fileStores/abstractFileStore.py index 263e74a3be..6242dcefb0 100644 --- a/src/toil/fileStores/abstractFileStore.py +++ b/src/toil/fileStores/abstractFileStore.py @@ -14,32 +14,28 @@ import logging import os from abc import ABC, abstractmethod +from collections.abc import Generator, Iterator from contextlib import contextmanager from tempfile import mkstemp from threading import Event, Semaphore -from typing import (IO, - TYPE_CHECKING, - Any, - Callable, - ContextManager, - Dict, - Generator, - Iterator, - List, - Literal, - Optional, - Set, - Tuple, - Type, - Union, - cast, - overload) +from typing import ( + IO, + TYPE_CHECKING, + Any, + Callable, + ContextManager, + Literal, + Optional, + Union, + cast, + overload, +) import dill from toil.common import Toil, cacheDirName, getDirSizeRecursively from toil.fileStores import FileID -from toil.job import Job, JobDescription, DebugStoppingPointReached +from toil.job import DebugStoppingPointReached, Job, JobDescription from toil.jobStores.abstractJobStore import AbstractJobStore from toil.lib.compatibility import deprecated from toil.lib.conversions import bytes2human @@ -76,9 +72,10 @@ class AbstractFileStore(ABC): Also responsible for committing completed jobs back to the job store with an update operation, and allowing that commit operation to be waited for. """ + # Variables used for syncing reads/writes _pendingFileWritesLock = Semaphore() - _pendingFileWrites: Set[str] = set() + _pendingFileWrites: set[str] = set() _terminateEvent = Event() # Used to signify crashes in threads def __init__( @@ -111,20 +108,26 @@ def __init__( # This gets replaced with a subdirectory of itself on open() self.localTempDir: str = os.path.abspath(file_store_dir) assert self.jobStore.config.workflowID is not None - self.workflow_dir: str = Toil.getLocalWorkflowDir(self.jobStore.config.workflowID, self.jobStore.config.workDir) - self.coordination_dir: str =Toil.get_local_workflow_coordination_dir(self.jobStore.config.workflowID, self.jobStore.config.workDir, self.jobStore.config.coordination_dir) + self.workflow_dir: str = Toil.getLocalWorkflowDir( + self.jobStore.config.workflowID, self.jobStore.config.workDir + ) + self.coordination_dir: str = Toil.get_local_workflow_coordination_dir( + self.jobStore.config.workflowID, + self.jobStore.config.workDir, + self.jobStore.config.coordination_dir, + ) self.jobName: str = str(self.jobDesc) self.waitForPreviousCommit = waitForPreviousCommit - self.logging_messages: List[Dict[str, Union[int, str]]] = [] - self.logging_user_streams: List[dict[str, str]] = [] + self.logging_messages: list[dict[str, Union[int, str]]] = [] + self.logging_user_streams: list[dict[str, str]] = [] # Records file IDs of files deleted during the current job. Doesn't get # committed back until the job is completely successful, because if the # job is re-run it will need to be able to re-delete these files. # This is a set of str objects, not FileIDs. - self.filesToDelete: Set[str] = set() + self.filesToDelete: set[str] = set() # Holds records of file ID, or file ID and local path, for reporting # the accessed files of failed jobs. - self._accessLog: List[Tuple[str, ...]] = [] + self._accessLog: list[tuple[str, ...]] = [] # Holds total bytes of observed disk usage for the last job run under open() self._job_disk_used: Optional[int] = None @@ -141,13 +144,17 @@ def createFileStore( from toil.fileStores.cachingFileStore import CachingFileStore from toil.fileStores.nonCachingFileStore import NonCachingFileStore - fileStoreCls: Union[Type["CachingFileStore"], Type["NonCachingFileStore"]] = ( + fileStoreCls: Union[type["CachingFileStore"], type["NonCachingFileStore"]] = ( CachingFileStore if caching else NonCachingFileStore ) return fileStoreCls(jobStore, jobDesc, file_store_dir, waitForPreviousCommit) @staticmethod - def shutdownFileStore(workflowID: str, config_work_dir: Optional[str], config_coordination_dir: Optional[str]) -> None: + def shutdownFileStore( + workflowID: str, + config_work_dir: Optional[str], + config_coordination_dir: Optional[str], + ) -> None: """ Carry out any necessary filestore-specific cleanup. @@ -167,7 +174,9 @@ def shutdownFileStore(workflowID: str, config_work_dir: Optional[str], config_co from toil.fileStores.nonCachingFileStore import NonCachingFileStore workflowDir = Toil.getLocalWorkflowDir(workflowID, config_work_dir) - coordination_dir = Toil.get_local_workflow_coordination_dir(workflowID, config_work_dir, config_coordination_dir) + coordination_dir = Toil.get_local_workflow_coordination_dir( + workflowID, config_work_dir, config_coordination_dir + ) cacheDir = os.path.join(workflowDir, cacheDirName(workflowID)) if os.path.exists(cacheDir): # The presence of the cacheDir suggests this was a cached run. We don't need @@ -208,12 +217,16 @@ def open(self, job: Job) -> Generator[None, None, None]: percent: float = 0.0 if job_requested_disk and job_requested_disk > 0: percent = float(self._job_disk_used) / job_requested_disk * 100 - disk_usage: str = (f"Job {self.jobName} used {percent:.2f}% disk ({bytes2human(self._job_disk_used)}B [{self._job_disk_used}B] used, " - f"{bytes2human(job_requested_disk)}B [{job_requested_disk}B] requested).") + disk_usage: str = ( + f"Job {self.jobName} used {percent:.2f}% disk ({bytes2human(self._job_disk_used)}B [{self._job_disk_used}B] used, " + f"{bytes2human(job_requested_disk)}B [{job_requested_disk}B] requested)." + ) if self._job_disk_used > job_requested_disk: - self.log_to_leader("Job used more disk than requested. For CWL, consider increasing the outdirMin " - f"requirement, otherwise, consider increasing the disk requirement. {disk_usage}", - level=logging.WARNING) + self.log_to_leader( + "Job used more disk than requested. For CWL, consider increasing the outdirMin " + f"requirement, otherwise, consider increasing the disk requirement. {disk_usage}", + level=logging.WARNING, + ) else: self.log_to_leader(disk_usage, level=logging.DEBUG) @@ -226,7 +239,6 @@ def get_disk_usage(self) -> Optional[int]: """ return self._job_disk_used - # Functions related to temp files and directories def getLocalTempDir(self) -> str: """ @@ -241,7 +253,9 @@ def getLocalTempDir(self) -> str: """ return os.path.abspath(mkdtemp(dir=self.localTempDir)) - def getLocalTempFile(self, suffix: Optional[str] = None, prefix: Optional[str] = None) -> str: + def getLocalTempFile( + self, suffix: Optional[str] = None, prefix: Optional[str] = None + ) -> str: """ Get a new local temporary file that will persist for the duration of the job. @@ -258,12 +272,14 @@ def getLocalTempFile(self, suffix: Optional[str] = None, prefix: Optional[str] = handle, tmpFile = mkstemp( suffix=".tmp" if suffix is None else suffix, prefix="tmp" if prefix is None else prefix, - dir=self.localTempDir + dir=self.localTempDir, ) os.close(handle) return os.path.abspath(tmpFile) - def getLocalTempFileName(self, suffix: Optional[str] = None, prefix: Optional[str] = None) -> str: + def getLocalTempFileName( + self, suffix: Optional[str] = None, prefix: Optional[str] = None + ) -> str: """ Get a valid name for a new local file. Don't actually create a file at the path. @@ -317,7 +333,7 @@ def writeGlobalFileStream( basename: Optional[str] = None, encoding: Optional[str] = None, errors: Optional[str] = None, - ) -> Iterator[Tuple[WriteWatchingStream, FileID]]: + ) -> Iterator[tuple[WriteWatchingStream, FileID]]: """ Similar to writeGlobalFile, but allows the writing of a stream to the job store. The yielded file handle does not need to and should not be closed explicitly. @@ -357,11 +373,14 @@ def writeGlobalFileStream( def handle(numBytes: int) -> None: # No scope problem here, because we don't assign to a fileID local fileID.size += numBytes + wrappedStream.onWrite(handle) yield wrappedStream, fileID - def _dumpAccessLogs(self, job_type: str = "Failed", log_level: int = logging.WARNING) -> None: + def _dumpAccessLogs( + self, job_type: str = "Failed", log_level: int = logging.WARNING + ) -> None: """ Log a report of the files accessed. @@ -370,7 +389,7 @@ def _dumpAccessLogs(self, job_type: str = "Failed", log_level: int = logging.WAR :param job_type: Adjective to describe the job in the report. """ if len(self._accessLog) > 0: - logger.log(log_level, '%s job accessed files:', job_type) + logger.log(log_level, "%s job accessed files:", job_type) for item in self._accessLog: # For each access record @@ -379,14 +398,29 @@ def _dumpAccessLogs(self, job_type: str = "Failed", log_level: int = logging.WAR file_id, dest_path = item if os.path.exists(dest_path): if os.path.islink(dest_path): - logger.log(log_level, 'Symlinked file \'%s\' to path \'%s\'', file_id, dest_path) + logger.log( + log_level, + "Symlinked file '%s' to path '%s'", + file_id, + dest_path, + ) else: - logger.log(log_level, 'Downloaded file \'%s\' to path \'%s\'', file_id, dest_path) + logger.log( + log_level, + "Downloaded file '%s' to path '%s'", + file_id, + dest_path, + ) else: - logger.log(log_level, 'Downloaded file \'%s\' to path \'%s\' (gone!)', file_id, dest_path) + logger.log( + log_level, + "Downloaded file '%s' to path '%s' (gone!)", + file_id, + dest_path, + ) else: # Otherwise dump without the name - logger.log(log_level, 'Streamed file \'%s\'', *item) + logger.log(log_level, "Streamed file '%s'", *item) def logAccess( self, fileStoreID: Union[FileID, str], destination: Union[str, None] = None @@ -453,14 +487,12 @@ def readGlobalFileStream( fileStoreID: str, encoding: Literal[None] = None, errors: Optional[str] = None, - ) -> ContextManager[IO[bytes]]: - ... + ) -> ContextManager[IO[bytes]]: ... @overload def readGlobalFileStream( self, fileStoreID: str, encoding: str, errors: Optional[str] = None - ) -> ContextManager[IO[str]]: - ... + ) -> ContextManager[IO[str]]: ... @abstractmethod def readGlobalFileStream( @@ -504,7 +536,7 @@ def getGlobalFileSize(self, fileStoreID: Union[FileID, str]) -> int: :return: File's size in bytes, as stored in the job store """ # First try and see if the size is still attached - size = getattr(fileStoreID, 'size', None) + size = getattr(fileStoreID, "size", None) if size is None: # It fell off @@ -557,7 +589,7 @@ def import_file( ) -> Optional[FileID]: return self.jobStore.import_file(src_uri, shared_file_name=shared_file_name) - @deprecated(new_function_name='export_file') + @deprecated(new_function_name="export_file") def exportFile(self, jobStoreFileID: FileID, dstUrl: str) -> None: return self.export_file(jobStoreFileID, dstUrl) @@ -586,7 +618,7 @@ def _resolveAbsoluteLocalPath(self, filePath: str) -> str: class _StateFile: """Read and write dill-ed state dictionaries from/to a file into a namespace.""" - def __init__(self, stateDict: Dict[str, Any]): + def __init__(self, stateDict: dict[str, Any]): assert isinstance(stateDict, dict) self.__dict__.update(stateDict) @@ -614,7 +646,7 @@ def _load(cls, fileName: str) -> Any: """ # Read the value from the cache state file then initialize and instance of # _CacheState with it. - with open(fileName, 'rb') as fH: + with open(fileName, "rb") as fH: infoDict = dill.load(fH) return cls(infoDict) @@ -624,14 +656,14 @@ def write(self, fileName: str) -> None: :param fileName: Path to the state file. """ - with open(fileName + '.tmp', 'wb') as fH: + with open(fileName + ".tmp", "wb") as fH: # Based on answer by user "Mark" at: # http://stackoverflow.com/questions/2709800/how-to-pickle-yourself # We can't pickle nested classes. So we have to pickle the variables # of the class. # If we ever change this, we need to ensure it doesn't break FileID dill.dump(self.__dict__, fH) - os.rename(fileName + '.tmp', fileName) + os.rename(fileName + ".tmp", fileName) # Functions related to logging def log_to_leader(self, text: str, level: int = logging.INFO) -> None: @@ -645,8 +677,7 @@ def log_to_leader(self, text: str, level: int = logging.INFO) -> None: logger.log(level=level, msg=("LOG-TO-MASTER: " + text)) self.logging_messages.append(dict(text=text, level=level)) - - @deprecated(new_function_name='export_file') + @deprecated(new_function_name="export_file") def logToMaster(self, text: str, level: int = logging.INFO) -> None: self.log_to_leader(text, level) @@ -664,7 +695,7 @@ def log_user_stream(self, name: str, stream: IO[bytes]) -> None: """ # Read the whole stream into memory - steam_data = stream.read().decode('utf-8', errors='replace') + steam_data = stream.read().decode("utf-8", errors="replace") # And remember it for the worker to fish out self.logging_user_streams.append(dict(name=name, text=steam_data)) diff --git a/src/toil/fileStores/cachingFileStore.py b/src/toil/fileStores/cachingFileStore.py index 9e5189eaf7..595fe4583c 100644 --- a/src/toil/fileStores/cachingFileStore.py +++ b/src/toil/fileStores/cachingFileStore.py @@ -22,15 +22,10 @@ import stat import threading import time +from collections.abc import Generator, Iterator, Sequence from contextlib import contextmanager from tempfile import mkstemp -from typing import (Any, - Callable, - Generator, - Iterator, - Optional, - Sequence, - Tuple) +from typing import Any, Callable, Optional from toil.common import cacheDirName, getFileSystemSize from toil.fileStores import FileID @@ -38,11 +33,13 @@ from toil.job import Job, JobDescription from toil.jobStores.abstractJobStore import AbstractJobStore from toil.lib.compatibility import deprecated -from toil.lib.io import (atomic_copy, - atomic_copyobj, - make_public_dir, - mkdtemp, - robust_rmtree) +from toil.lib.io import ( + atomic_copy, + atomic_copyobj, + make_public_dir, + mkdtemp, + robust_rmtree, +) from toil.lib.retry import ErrorCondition, retry from toil.lib.threading import get_process_name, process_name_exists @@ -66,9 +63,12 @@ class CacheUnbalancedError(CacheError): """ Raised if file store can't free enough space for caching """ - message = 'Unable unable to free enough space for caching. This error frequently arises due ' \ - 'to jobs using more disk than they have requested. Turn on debug logging to see ' \ - 'more information leading up to this error through cache usage logs.' + + message = ( + "Unable unable to free enough space for caching. This error frequently arises due " + "to jobs using more disk than they have requested. Turn on debug logging to see " + "more information leading up to this error through cache usage logs." + ) def __init__(self): super().__init__(self.message) @@ -87,9 +87,11 @@ class IllegalDeletionCacheError(CacheError): """ def __init__(self, deletedFile): - message = 'Cache tracked file (%s) has been deleted or moved by user ' \ - ' without updating cache database. Use deleteLocalFile to ' \ - 'delete such files.' % deletedFile + message = ( + "Cache tracked file (%s) has been deleted or moved by user " + " without updating cache database. Use deleteLocalFile to " + "delete such files." % deletedFile + ) super().__init__(message) @@ -208,13 +210,15 @@ def __init__( # Variables related to caching # Decide where the cache directory will be. We put it in the local # workflow directory. - self.localCacheDir = os.path.join(self.workflow_dir, cacheDirName(self.jobStore.config.workflowID)) + self.localCacheDir = os.path.join( + self.workflow_dir, cacheDirName(self.jobStore.config.workflowID) + ) # Since each worker has it's own unique CachingFileStore instance, and only one Job can run # at a time on a worker, we can track some stuff about the running job in ourselves. self.jobName: str = str(self.jobDesc) self.jobID = self.jobDesc.jobStoreID - logger.debug('Starting job (%s) with ID (%s).', self.jobName, self.jobID) + logger.debug("Starting job (%s) with ID (%s).", self.jobName, self.jobID) # When the job actually starts, we will fill this in with the job's disk requirement. self.jobDiskBytes: Optional[float] = None @@ -230,7 +234,9 @@ def __init__( # the workflow left one behind without cleaning up properly; we need to # be able to tell that from showing up on a machine where a cache has # already been created. - self.dbPath = os.path.join(self.coordination_dir, f'cache-{self.workflowAttemptNumber}.db') + self.dbPath = os.path.join( + self.coordination_dir, f"cache-{self.workflowAttemptNumber}.db" + ) # Database connections are provided by magic properties self.con and # self.cur that always have the right object for the current thread to @@ -254,7 +260,14 @@ def __init__( # Initialize the space accounting properties freeSpace, _ = getFileSystemSize(self.localCacheDir) - self._write([('INSERT OR IGNORE INTO properties VALUES (?, ?)', ('maxSpace', freeSpace))]) + self._write( + [ + ( + "INSERT OR IGNORE INTO properties VALUES (?, ?)", + ("maxSpace", freeSpace), + ) + ] + ) # Space used by caching and by jobs is accounted with queries @@ -284,10 +297,12 @@ def con(self) -> sqlite3.Connection: """ Get the database connection to be used for the current thread. """ - if not hasattr(self._thread_local, 'con'): + if not hasattr(self._thread_local, "con"): # Connect to the database for this thread. # TODO: We assume the connection closes when the thread goes away and can no longer use it. - self._thread_local.con = sqlite3.connect(self.dbPath, timeout=SQLITE_TIMEOUT_SECS) + self._thread_local.con = sqlite3.connect( + self.dbPath, timeout=SQLITE_TIMEOUT_SECS + ) return self._thread_local.con @property @@ -295,18 +310,20 @@ def cur(self) -> sqlite3.Cursor: """ Get the main cursor to be used for the current thread. """ - if not hasattr(self._thread_local, 'cur'): + if not hasattr(self._thread_local, "cur"): # If we don't already have a main cursor for the thread, make one. self._thread_local.cur = self.con.cursor() return self._thread_local.cur @staticmethod - @retry(infinite_retries=True, - errors=[ - ErrorCondition( - error=sqlite3.OperationalError, - error_message_must_include='is locked') - ]) + @retry( + infinite_retries=True, + errors=[ + ErrorCondition( + error=sqlite3.OperationalError, error_message_must_include="is locked" + ) + ], + ) def _static_write(con, cur, operations): """ Write to the caching database, using the given connection. @@ -340,7 +357,7 @@ def _static_write(con, cur, operations): # Do it cur.execute(command, args) except Exception as e: - logging.error('Error talking to caching database: %s', str(e)) + logging.error("Error talking to caching database: %s", str(e)) # Try to make sure we don't somehow leave anything part-done if a # middle operation somehow fails. @@ -360,13 +377,17 @@ def _static_write(con, cur, operations): return cur.rowcount @staticmethod - @retry(infinite_retries=True, - errors=[ - ErrorCondition( - error=sqlite3.OperationalError, - error_message_must_include='is locked') - ]) - def _static_read(cur: sqlite3.Cursor, query: str, args: Optional[Sequence[Any]] = ()) -> Iterator[Any]: + @retry( + infinite_retries=True, + errors=[ + ErrorCondition( + error=sqlite3.OperationalError, error_message_must_include="is locked" + ) + ], + ) + def _static_read( + cur: sqlite3.Cursor, query: str, args: Optional[Sequence[Any]] = () + ) -> Iterator[Any]: """ Read from the database. @@ -419,7 +440,11 @@ def _ensureTables(cls, con): # Get a cursor cur = con.cursor() - cls._static_write(con, cur, [""" + cls._static_write( + con, + cur, + [ + """ CREATE TABLE IF NOT EXISTS files ( id TEXT NOT NULL PRIMARY KEY, path TEXT UNIQUE NOT NULL, @@ -427,7 +452,8 @@ def _ensureTables(cls, con): state TEXT NOT NULL, owner TEXT ) - """, """ + """, + """ CREATE TABLE IF NOT EXISTS refs ( path TEXT NOT NULL, file_id TEXT NOT NULL, @@ -435,19 +461,23 @@ def _ensureTables(cls, con): state TEXT NOT NULL, PRIMARY KEY (path, file_id) ) - """, """ + """, + """ CREATE TABLE IF NOT EXISTS jobs ( id TEXT NOT NULL PRIMARY KEY, tempdir TEXT NOT NULL, disk INT NOT NULL, worker TEXT ) - """, """ + """, + """ CREATE TABLE IF NOT EXISTS properties ( name TEXT NOT NULL PRIMARY KEY, value INT NOT NULL ) - """]) + """, + ], + ) # Caching-specific API @@ -458,10 +488,12 @@ def getCacheLimit(self): If no limit is available, raises an error. """ - for row in self.cur.execute('SELECT value FROM properties WHERE name = ?', ('maxSpace',)): + for row in self.cur.execute( + "SELECT value FROM properties WHERE name = ?", ("maxSpace",) + ): return row[0] - raise RuntimeError('Unable to retrieve cache limit') + raise RuntimeError("Unable to retrieve cache limit") def getCacheUsed(self): """ @@ -474,10 +506,10 @@ def getCacheUsed(self): if self.cachingIsFree(): return 0 - for row in self._read('SELECT TOTAL(size) FROM files'): + for row in self._read("SELECT TOTAL(size) FROM files"): return row[0] - raise RuntimeError('Unable to retrieve cache usage') + raise RuntimeError("Unable to retrieve cache usage") def getCacheExtraJobSpace(self): """ @@ -492,15 +524,17 @@ def getCacheExtraJobSpace(self): """ # Total up the sizes of all the reads of files and subtract it from the total disk reservation of all jobs - for row in self._read(""" + for row in self._read( + """ SELECT ( (SELECT TOTAL(disk) FROM jobs) - (SELECT TOTAL(files.size) FROM refs INNER JOIN files ON refs.file_id = files.id WHERE refs.state == 'immutable') ) as result - """): + """ + ): return row[0] - raise RuntimeError('Unable to retrieve extra job space') + raise RuntimeError("Unable to retrieve extra job space") def getCacheAvailable(self): """ @@ -519,33 +553,38 @@ def getCacheAvailable(self): # Do a little report first for row in self._read("SELECT value FROM properties WHERE name = 'maxSpace'"): - logger.debug('Max space: %d', row[0]) + logger.debug("Max space: %d", row[0]) for row in self._read("SELECT TOTAL(size) FROM files"): - logger.debug('Total file size: %d', row[0]) + logger.debug("Total file size: %d", row[0]) for row in self._read("SELECT TOTAL(disk) FROM jobs"): - logger.debug('Total job disk requirement size: %d', row[0]) - for row in self._read("SELECT TOTAL(files.size) FROM refs INNER JOIN files ON refs.file_id = files.id WHERE refs.state = 'immutable'"): - logger.debug('Total immutable reference size: %d', row[0]) + logger.debug("Total job disk requirement size: %d", row[0]) + for row in self._read( + "SELECT TOTAL(files.size) FROM refs INNER JOIN files ON refs.file_id = files.id WHERE refs.state = 'immutable'" + ): + logger.debug("Total immutable reference size: %d", row[0]) if self.cachingIsFree(): # If caching is free, we just say that all the space is always available. - for row in self._read("SELECT value FROM properties WHERE name = 'maxSpace'"): + for row in self._read( + "SELECT value FROM properties WHERE name = 'maxSpace'" + ): return row[0] - raise RuntimeError('Unable to retrieve available cache space') + raise RuntimeError("Unable to retrieve available cache space") - - for row in self._read(""" + for row in self._read( + """ SELECT ( (SELECT value FROM properties WHERE name = 'maxSpace') - (SELECT TOTAL(size) FROM files) - ((SELECT TOTAL(disk) FROM jobs) - (SELECT TOTAL(files.size) FROM refs INNER JOIN files ON refs.file_id = files.id WHERE refs.state = 'immutable')) ) as result - """): + """ + ): return row[0] - raise RuntimeError('Unable to retrieve available cache space') + raise RuntimeError("Unable to retrieve available cache space") def getSpaceUsableForJobs(self): """ @@ -555,15 +594,17 @@ def getSpaceUsableForJobs(self): If not retrievable, raises an error. """ - for row in self._read(""" + for row in self._read( + """ SELECT ( (SELECT value FROM properties WHERE name = 'maxSpace') - (SELECT TOTAL(disk) FROM jobs) ) as result - """): + """ + ): return row[0] - raise RuntimeError('Unable to retrieve usabel space for jobs') + raise RuntimeError("Unable to retrieve usabel space for jobs") def getCacheUnusedJobRequirement(self): """ @@ -575,28 +616,36 @@ def getCacheUnusedJobRequirement(self): If no value is available, raises an error. """ - logger.debug('Get unused space for job %s', self.jobID) - - for row in self._read('SELECT * FROM files'): - logger.debug('File record: %s', str(row)) + logger.debug("Get unused space for job %s", self.jobID) - for row in self._read('SELECT * FROM refs'): - logger.debug('Ref record: %s', str(row)) + for row in self._read("SELECT * FROM files"): + logger.debug("File record: %s", str(row)) + for row in self._read("SELECT * FROM refs"): + logger.debug("Ref record: %s", str(row)) - for row in self._read('SELECT TOTAL(files.size) FROM refs INNER JOIN files ON refs.file_id = files.id WHERE refs.job_id = ? AND refs.state != ?', - (self.jobID, 'mutable')): + for row in self._read( + "SELECT TOTAL(files.size) FROM refs INNER JOIN files ON refs.file_id = files.id WHERE refs.job_id = ? AND refs.state != ?", + (self.jobID, "mutable"), + ): # Sum up all the sizes of our referenced files, then subtract that from how much we came in with return self.jobDiskBytes - row[0] - raise RuntimeError('Unable to retrieve unused job requirement space') + raise RuntimeError("Unable to retrieve unused job requirement space") def adjustCacheLimit(self, newTotalBytes): """ Adjust the total cache size limit to the given number of bytes. """ - self._write([('UPDATE properties SET value = ? WHERE name = ?', (newTotalBytes, 'maxSpace'))]) + self._write( + [ + ( + "UPDATE properties SET value = ? WHERE name = ?", + (newTotalBytes, "maxSpace"), + ) + ] + ) def fileIsCached(self, fileID): """ @@ -607,8 +656,10 @@ def fileIsCached(self, fileID): file you need to do it in a transaction. """ - for row in self._read('SELECT COUNT(*) FROM files WHERE id = ? AND (state = ? OR state = ? OR state = ?)', - (fileID, 'cached', 'uploadable', 'uploading')): + for row in self._read( + "SELECT COUNT(*) FROM files WHERE id = ? AND (state = ? OR state = ? OR state = ?)", + (fileID, "cached", "uploadable", "uploading"), + ): return row[0] > 0 return False @@ -620,7 +671,7 @@ def getFileReaderCount(self, fileID): Counts mutable references too. """ - for row in self._read('SELECT COUNT(*) FROM refs WHERE file_id = ?', (fileID,)): + for row in self._read("SELECT COUNT(*) FROM refs WHERE file_id = ?", (fileID,)): return row[0] return 0 @@ -633,11 +684,14 @@ def cachingIsFree(self): configurations, most notably the FileJobStore. """ - for row in self._read('SELECT value FROM properties WHERE name = ?', ('freeCaching',)): + for row in self._read( + "SELECT value FROM properties WHERE name = ?", ("freeCaching",) + ): return row[0] == 1 # Otherwise we need to set it from toil.jobStores.fileJobStore import FileJobStore + if isinstance(self.jobStore, FileJobStore) and not self.forceNonFreeCaching: # Caching may be free since we are using a file job store. @@ -646,7 +700,7 @@ def cachingIsFree(self): # Read it out to a generated name. destDir = mkdtemp(dir=self.localCacheDir) - cachedFile = os.path.join(destDir, 'sniffLinkCount') + cachedFile = os.path.join(destDir, "sniffLinkCount") self.jobStore.read_file(emptyID, cachedFile, symlink=False) # Check the link count @@ -666,7 +720,9 @@ def cachingIsFree(self): free = 0 # Save to the database if we're the first to work this out - self._write([('INSERT OR IGNORE INTO properties VALUES (?, ?)', ('freeCaching', free))]) + self._write( + [("INSERT OR IGNORE INTO properties VALUES (?, ?)", ("freeCaching", free))] + ) # Return true if we said caching was free return free == 1 @@ -683,7 +739,7 @@ def _getNewCachingPath(self, fileStoreID): # Hash the file ID hasher = hashlib.sha1() - hasher.update(fileStoreID.encode('utf-8')) + hasher.update(fileStoreID.encode("utf-8")) # Get a unique temp file name, including the file ID's hash to make # sure we can never collide even though we are going to remove the @@ -707,17 +763,19 @@ def _stealWorkFromTheDead(self): # Get a list of all file owner processes on this node. # Exclude NULL because it comes out as 0 and we can't look for PID 0. owners = [] - for row in self._read('SELECT DISTINCT owner FROM files WHERE owner IS NOT NULL'): + for row in self._read( + "SELECT DISTINCT owner FROM files WHERE owner IS NOT NULL" + ): owners.append(row[0]) # Work out which of them have died. deadOwners = [] for owner in owners: if not process_name_exists(self.coordination_dir, owner): - logger.debug('Owner %s is dead', owner) + logger.debug("Owner %s is dead", owner) deadOwners.append(owner) else: - logger.debug('Owner %s is alive', owner) + logger.debug("Owner %s is alive", owner) for owner in deadOwners: # Try and adopt all the files that any dead owner had @@ -736,14 +794,28 @@ def _stealWorkFromTheDead(self): # # TODO: if we ever let other PIDs be responsible for writing our # files asynchronously, this will need to change. - self._write([('UPDATE files SET owner = ?, state = ? WHERE owner = ? AND state = ?', - (me, 'deleting', owner, 'deleting')), - ('UPDATE files SET owner = ?, state = ? WHERE owner = ? AND state = ?', - (me, 'deleting', owner, 'downloading')), - ('UPDATE files SET owner = NULL, state = ? WHERE owner = ? AND (state = ? OR state = ?)', - ('cached', owner, 'uploadable', 'uploading'))]) + self._write( + [ + ( + "UPDATE files SET owner = ?, state = ? WHERE owner = ? AND state = ?", + (me, "deleting", owner, "deleting"), + ), + ( + "UPDATE files SET owner = ?, state = ? WHERE owner = ? AND state = ?", + (me, "deleting", owner, "downloading"), + ), + ( + "UPDATE files SET owner = NULL, state = ? WHERE owner = ? AND (state = ? OR state = ?)", + ("cached", owner, "uploadable", "uploading"), + ), + ] + ) - logger.debug('Tried to adopt file operations from dead worker %s to ourselves as %s', owner, me) + logger.debug( + "Tried to adopt file operations from dead worker %s to ourselves as %s", + owner, + me, + ) def _executePendingDeletions(self): """ @@ -757,16 +829,19 @@ def _executePendingDeletions(self): # Remember the file IDs we are deleting deletedFiles = [] - for row in self._read('SELECT id, path FROM files WHERE owner = ? AND state = ?', (me, 'deleting')): + for row in self._read( + "SELECT id, path FROM files WHERE owner = ? AND state = ?", + (me, "deleting"), + ): # Grab everything we are supposed to delete and delete it fileID = row[0] filePath = row[1] try: os.unlink(filePath) - logger.debug('Successfully deleted: %s', filePath) + logger.debug("Successfully deleted: %s", filePath) except OSError: # Probably already deleted - logger.debug('File already gone: %s', filePath) + logger.debug("File already gone: %s", filePath) # Still need to mark it as deleted # Whether we deleted the file or just found out that it is gone, we @@ -777,8 +852,15 @@ def _executePendingDeletions(self): for fileID in deletedFiles: # Drop all the files. They should have stayed in deleting state. We move them from there to not present at all. # Also drop their references, if they had any from dead downloaders. - self._write([('DELETE FROM files WHERE id = ? AND state = ?', (fileID, 'deleting')), - ('DELETE FROM refs WHERE file_id = ?', (fileID,))]) + self._write( + [ + ( + "DELETE FROM files WHERE id = ? AND state = ?", + (fileID, "deleting"), + ), + ("DELETE FROM refs WHERE file_id = ?", (fileID,)), + ] + ) return len(deletedFiles) @@ -798,7 +880,11 @@ def _executePendingUploads(self): # Try and find a file we might want to upload fileID = None filePath = None - for row in self._static_read(self.cur, 'SELECT id, path FROM files WHERE state = ? AND owner = ? LIMIT 1', ('uploadable', me)): + for row in self._static_read( + self.cur, + "SELECT id, path FROM files WHERE state = ? AND owner = ? LIMIT 1", + ("uploadable", me), + ): fileID = row[0] filePath = row[1] @@ -807,30 +893,57 @@ def _executePendingUploads(self): break # We need to set it to uploading in a way that we can detect that *we* won the update race instead of anyone else. - rowCount = self._static_write(self.con, self.cur, [('UPDATE files SET state = ? WHERE id = ? AND state = ?', ('uploading', fileID, 'uploadable'))]) + rowCount = self._static_write( + self.con, + self.cur, + [ + ( + "UPDATE files SET state = ? WHERE id = ? AND state = ?", + ("uploading", fileID, "uploadable"), + ) + ], + ) if rowCount != 1: # We didn't manage to update it. Someone else (a running job if # we are a committing thread, or visa versa) must have grabbed # it. - logger.debug('Lost race to upload %s', fileID) + logger.debug("Lost race to upload %s", fileID) # Try again to see if there is something else to grab. continue # Upload the file - logger.debug('Actually executing upload for file %s', fileID) + logger.debug("Actually executing upload for file %s", fileID) try: self.jobStore.update_file(fileID, filePath) except: # We need to set the state back to 'uploadable' in case of any failures to ensure # we can retry properly. - self._static_write(self.con, self.cur, [('UPDATE files SET state = ? WHERE id = ? AND state = ?', ('uploadable', fileID, 'uploading'))]) + self._static_write( + self.con, + self.cur, + [ + ( + "UPDATE files SET state = ? WHERE id = ? AND state = ?", + ("uploadable", fileID, "uploading"), + ) + ], + ) raise # Count it for the total uploaded files value we need to return uploadedCount += 1 # Remember that we uploaded it in the database - self._static_write(self.con, self.cur, [('UPDATE files SET state = ?, owner = NULL WHERE id = ?', ('cached', fileID))]) + self._static_write( + self.con, + self.cur, + [ + ( + "UPDATE files SET state = ?, owner = NULL WHERE id = ?", + ("cached", fileID), + ) + ], + ) return uploadedCount @@ -854,7 +967,14 @@ def _allocateSpaceForJob(self, newJobReqs): # But we won't actually let the job run and use any of this space until # the cache has been successfully cleared out. with self.as_process() as me: - self._write([('INSERT INTO jobs VALUES (?, ?, ?, ?)', (self.jobID, self.localTempDir, newJobReqs, me))]) + self._write( + [ + ( + "INSERT INTO jobs VALUES (?, ?, ?, ?)", + (self.jobID, self.localTempDir, newJobReqs, me), + ) + ] + ) # Now we need to make sure that we can fit all currently cached files, # and the parts of the total job requirements not currently spent on @@ -862,7 +982,7 @@ def _allocateSpaceForJob(self, newJobReqs): available = self.getCacheAvailable() - logger.debug('Available space with job: %d bytes', available) + logger.debug("Available space with job: %d bytes", available) if available >= 0: # We're fine on disk space @@ -886,10 +1006,14 @@ def _removeJob(cls, con, cur, jobID): """ # Get the job's temp dir - for row in cls._static_read(cur, 'SELECT tempdir FROM jobs WHERE id = ?', (jobID,)): + for row in cls._static_read( + cur, "SELECT tempdir FROM jobs WHERE id = ?", (jobID,) + ): jobTemp = row[0] - for row in cls._static_read(cur, 'SELECT path FROM refs WHERE job_id = ?', (jobID,)): + for row in cls._static_read( + cur, "SELECT path FROM refs WHERE job_id = ?", (jobID,) + ): try: # Delete all the reference files. os.unlink(row[0]) @@ -897,7 +1021,7 @@ def _removeJob(cls, con, cur, jobID): # May not exist pass # And their database entries - cls._static_write(con, cur, [('DELETE FROM refs WHERE job_id = ?', (jobID,))]) + cls._static_write(con, cur, [("DELETE FROM refs WHERE job_id = ?", (jobID,))]) try: # Delete the job's temp directory to the extent that we can. @@ -906,7 +1030,7 @@ def _removeJob(cls, con, cur, jobID): pass # Strike the job from the database - cls._static_write(con, cur, [('DELETE FROM jobs WHERE id = ?', (jobID,))]) + cls._static_write(con, cur, [("DELETE FROM jobs WHERE id = ?", (jobID,))]) def _deallocateSpaceForJob(self): """ @@ -937,12 +1061,12 @@ def _tryToFreeUpSpace(self): if self._executePendingDeletions() > 0: # We actually had something to delete, which we deleted. # Maybe there is space now - logger.debug('Successfully executed pending deletions to free space') + logger.debug("Successfully executed pending deletions to free space") return True if self._executePendingUploads() > 0: # We had something to upload. Maybe it can be evicted now. - logger.debug('Successfully executed pending uploads to free space') + logger.debug("Successfully executed pending uploads to free space") return True # Otherwise, not enough files could be found in deleting state to solve our problem. @@ -952,37 +1076,45 @@ def _tryToFreeUpSpace(self): # soon as we hit the cache limit. # Find something that has no non-mutable references and is not already being deleted. - self._read(""" + self._read( + """ SELECT files.id FROM files WHERE files.state = 'cached' AND NOT EXISTS ( SELECT NULL FROM refs WHERE refs.file_id = files.id AND refs.state != 'mutable' ) LIMIT 1 - """) + """ + ) row = self.cur.fetchone() if row is None: # Nothing can be evicted by us. # Someone else might be in the process of evicting something that will free up space for us too. # Or someone mught be uploading something and we have to wait for them to finish before it can be deleted. - logger.debug('Could not find anything to evict! Cannot free up space!') + logger.debug("Could not find anything to evict! Cannot free up space!") return False # Otherwise we found an eviction candidate. fileID = row[0] # Try and grab it for deletion, subject to the condition that nothing has started reading it - self._write([(""" + self._write( + [ + ( + """ UPDATE files SET owner = ?, state = ? WHERE id = ? AND state = ? AND owner IS NULL AND NOT EXISTS ( SELECT NULL FROM refs WHERE refs.file_id = files.id AND refs.state != 'mutable' ) """, - (me, 'deleting', fileID, 'cached'))]) + (me, "deleting", fileID, "cached"), + ) + ] + ) - logger.debug('Evicting file %s', fileID) + logger.debug("Evicting file %s", fileID) # Whether we actually got it or not, try deleting everything we have to delete if self._executePendingDeletions() > 0: # We deleted something - logger.debug('Successfully executed pending deletions to free space') + logger.debug("Successfully executed pending deletions to free space") return True def _freeUpSpace(self): @@ -999,7 +1131,10 @@ def _freeUpSpace(self): while availableSpace < 0: # While there isn't enough space for the thing we want - logger.debug('Cache is full (%d bytes free). Trying to free up space!', availableSpace) + logger.debug( + "Cache is full (%d bytes free). Trying to free up space!", + availableSpace, + ) # Free up space. See if we made any progress progress = self._tryToFreeUpSpace() availableSpace = self.getCacheAvailable() @@ -1011,19 +1146,23 @@ def _freeUpSpace(self): # See if we've been oversubscribed. jobSpace = self.getSpaceUsableForJobs() if jobSpace < 0: - logger.critical('Jobs on this machine have oversubscribed our total available space (%d bytes)!', jobSpace) + logger.critical( + "Jobs on this machine have oversubscribed our total available space (%d bytes)!", + jobSpace, + ) raise CacheUnbalancedError else: patience -= 1 if patience <= 0: - logger.critical('Waited implausibly long for active uploads and deletes.') + logger.critical( + "Waited implausibly long for active uploads and deletes." + ) raise CacheUnbalancedError else: # Wait a bit and come back time.sleep(2) - logger.debug('Cache has %d bytes free.', availableSpace) - + logger.debug("Cache has %d bytes free.", availableSpace) # Normal AbstractFileStore API @@ -1044,8 +1183,13 @@ def open(self, job: Job) -> Generator[None, None, None]: # have filled the cache or not. self.jobDiskBytes = job.disk - logger.debug('Actually running job (%s) with ID (%s) which wants %d of our %d bytes.', - self.jobName, self.jobID, self.jobDiskBytes, self.getCacheLimit()) + logger.debug( + "Actually running job (%s) with ID (%s) which wants %d of our %d bytes.", + self.jobName, + self.jobID, + self.jobDiskBytes, + self.getCacheLimit(), + ) # Register the current job as taking this much space, and evict files # from the cache to make room before letting the job run. @@ -1079,7 +1223,9 @@ def writeGlobalFile(self, localFileName, cleanup=False, executable=False): # Create an empty file to get an ID. # Make sure to pass along the file basename. # TODO: this empty file could leak if we die now... - fileID = self.jobStore.get_empty_file_store_id(creatorID, cleanup, os.path.basename(localFileName)) + fileID = self.jobStore.get_empty_file_store_id( + creatorID, cleanup, os.path.basename(localFileName) + ) # Work out who we are with self.as_process() as me: @@ -1088,10 +1234,22 @@ def writeGlobalFile(self, localFileName, cleanup=False, executable=False): # Create a file in uploadable state and a reference, in the same transaction. # Say the reference is an immutable reference - self._write([('INSERT INTO files VALUES (?, ?, ?, ?, ?)', (fileID, cachePath, fileSize, 'uploadable', me)), - ('INSERT INTO refs VALUES (?, ?, ?, ?)', (absLocalFileName, fileID, creatorID, 'immutable'))]) + self._write( + [ + ( + "INSERT INTO files VALUES (?, ?, ?, ?, ?)", + (fileID, cachePath, fileSize, "uploadable", me), + ), + ( + "INSERT INTO refs VALUES (?, ?, ?, ?)", + (absLocalFileName, fileID, creatorID, "immutable"), + ), + ] + ) - if absLocalFileName.startswith(self.localTempDir) and not os.path.islink(absLocalFileName): + if absLocalFileName.startswith(self.localTempDir) and not os.path.islink( + absLocalFileName + ): # We should link into the cache, because the upload is coming from our local temp dir (and not via a symlink in there) try: # Try and hardlink the file into the cache. @@ -1102,8 +1260,14 @@ def writeGlobalFile(self, localFileName, cleanup=False, executable=False): linkedToCache = True - logger.debug('Hardlinked file %s into cache at %s; deferring write to job store', localFileName, cachePath) - assert not os.path.islink(cachePath), "Symlink %s has invaded cache!" % cachePath + logger.debug( + "Hardlinked file %s into cache at %s; deferring write to job store", + localFileName, + cachePath, + ) + assert not os.path.islink(cachePath), ( + "Symlink %s has invaded cache!" % cachePath + ) # Don't do the upload now. Let it be deferred until later (when the job is committing). except OSError: @@ -1117,7 +1281,6 @@ def writeGlobalFile(self, localFileName, cleanup=False, executable=False): # files to vanish from our cache. linkedToCache = False - if not linkedToCache: # If we can't do the link into the cache and upload from there, we # have to just upload right away. We can't guarantee sufficient @@ -1126,27 +1289,40 @@ def writeGlobalFile(self, localFileName, cleanup=False, executable=False): # Change the reference to 'mutable', which it will be. # And drop the file altogether. - self._write([('UPDATE refs SET state = ? WHERE path = ? AND file_id = ?', ('mutable', absLocalFileName, fileID)), - ('DELETE FROM files WHERE id = ?', (fileID,))]) + self._write( + [ + ( + "UPDATE refs SET state = ? WHERE path = ? AND file_id = ?", + ("mutable", absLocalFileName, fileID), + ), + ("DELETE FROM files WHERE id = ?", (fileID,)), + ] + ) # Save the file to the job store right now - logger.debug('Actually executing upload immediately for file %s', fileID) + logger.debug( + "Actually executing upload immediately for file %s", fileID + ) self.jobStore.update_file(fileID, absLocalFileName) # Ship out the completed FileID object with its real size. return FileID.forPath(fileID, absLocalFileName) - def readGlobalFile(self, fileStoreID, userPath=None, cache=True, mutable=False, symlink=False): + def readGlobalFile( + self, fileStoreID, userPath=None, cache=True, mutable=False, symlink=False + ): if str(fileStoreID) in self.filesToDelete: # File has already been deleted - raise FileNotFoundError(f'Attempted to read deleted file: {fileStoreID}') + raise FileNotFoundError(f"Attempted to read deleted file: {fileStoreID}") if userPath is not None: # Validate the destination we got localFilePath = self._resolveAbsoluteLocalPath(userPath) if os.path.exists(localFilePath): - raise RuntimeError(' File %s ' % localFilePath + ' exists. Cannot Overwrite.') + raise RuntimeError( + " File %s " % localFilePath + " exists. Cannot Overwrite." + ) else: # Make our own destination localFilePath = self.getLocalTempFileName() @@ -1158,22 +1334,29 @@ def readGlobalFile(self, fileStoreID, userPath=None, cache=True, mutable=False, # We want to use the cache if mutable: - finalPath = self._readGlobalFileMutablyWithCache(fileStoreID, localFilePath, readerID) + finalPath = self._readGlobalFileMutablyWithCache( + fileStoreID, localFilePath, readerID + ) else: - finalPath = self._readGlobalFileWithCache(fileStoreID, localFilePath, symlink, readerID) + finalPath = self._readGlobalFileWithCache( + fileStoreID, localFilePath, symlink, readerID + ) else: # We do not want to use the cache - finalPath = self._readGlobalFileWithoutCache(fileStoreID, localFilePath, mutable, symlink, readerID) + finalPath = self._readGlobalFileWithoutCache( + fileStoreID, localFilePath, mutable, symlink, readerID + ) - if getattr(fileStoreID, 'executable', False): + if getattr(fileStoreID, "executable", False): os.chmod(finalPath, os.stat(finalPath).st_mode | stat.S_IXUSR) # Record access in case the job crashes and we have to log it self.logAccess(fileStoreID, finalPath) return finalPath - - def _readGlobalFileWithoutCache(self, fileStoreID, localFilePath, mutable, symlink, readerID): + def _readGlobalFileWithoutCache( + self, fileStoreID, localFilePath, mutable, symlink, readerID + ): """ Read a file without putting it into the cache. @@ -1191,7 +1374,9 @@ def _readGlobalFileWithoutCache(self, fileStoreID, localFilePath, mutable, symli # read a file that is 'uploadable' or 'uploading' and hasn't hit # the backing job store yet. - with self._with_copying_reference_to_upload(fileStoreID, readerID, localFilePath) as ref_path: + with self._with_copying_reference_to_upload( + fileStoreID, readerID, localFilePath + ) as ref_path: if ref_path is not None: # We got a copying reference, so the file is being uploaded and # must be read from the cache for consistency. And it will @@ -1205,11 +1390,16 @@ def _readGlobalFileWithoutCache(self, fileStoreID, localFilePath, mutable, symli # Find where the file is cached cachedPath = None - for row in self._read('SELECT path FROM files WHERE id = ?', (fileStoreID,)): + for row in self._read( + "SELECT path FROM files WHERE id = ?", (fileStoreID,) + ): cachedPath = row[0] if cachedPath is None: - raise RuntimeError('File %s went away while we had a reference to it!' % fileStoreID) + raise RuntimeError( + "File %s went away while we had a reference to it!" + % fileStoreID + ) if self.forceDownloadDelay is not None: # Wait around to simulate a big file for testing @@ -1218,8 +1408,14 @@ def _readGlobalFileWithoutCache(self, fileStoreID, localFilePath, mutable, symli atomic_copy(cachedPath, ref_path) # Change the reference to mutable so it sticks around - self._write([('UPDATE refs SET state = ? WHERE path = ? and file_id = ?', - ('mutable', ref_path, fileStoreID))]) + self._write( + [ + ( + "UPDATE refs SET state = ? WHERE path = ? and file_id = ?", + ("mutable", ref_path, fileStoreID), + ) + ] + ) else: # File is not being uploaded currently. @@ -1229,8 +1425,14 @@ def _readGlobalFileWithoutCache(self, fileStoreID, localFilePath, mutable, symli # Create a 'mutable' reference (even if we end up with a link) # so we can see this file in deleteLocalFile. - self._write([('INSERT INTO refs VALUES (?, ?, ?, ?)', - (localFilePath, fileStoreID, readerID, 'mutable'))]) + self._write( + [ + ( + "INSERT INTO refs VALUES (?, ?, ?, ?)", + (localFilePath, fileStoreID, readerID, "mutable"), + ) + ] + ) if self.forceDownloadDelay is not None: # Wait around to simulate a big file for testing @@ -1290,15 +1492,32 @@ def _readGlobalFileMutablyWithCache(self, fileStoreID, localFilePath, readerID): # Start a loop until we can do one of these while True: # Try and create a downloading entry if no entry exists - logger.debug('Trying to make file record for id %s', fileStoreID) - self._write([('INSERT OR IGNORE INTO files VALUES (?, ?, ?, ?, ?)', - (fileStoreID, cachedPath, self.getGlobalFileSize(fileStoreID), 'downloading', me))]) + logger.debug("Trying to make file record for id %s", fileStoreID) + self._write( + [ + ( + "INSERT OR IGNORE INTO files VALUES (?, ?, ?, ?, ?)", + ( + fileStoreID, + cachedPath, + self.getGlobalFileSize(fileStoreID), + "downloading", + me, + ), + ) + ] + ) # See if we won the race - self._read('SELECT COUNT(*) FROM files WHERE id = ? AND state = ? AND owner = ?', (fileStoreID, 'downloading', me)) + self._read( + "SELECT COUNT(*) FROM files WHERE id = ? AND state = ? AND owner = ?", + (fileStoreID, "downloading", me), + ) if self.cur.fetchone()[0] > 0: # We are responsible for downloading the file - logger.debug('We are now responsible for downloading file %s', fileStoreID) + logger.debug( + "We are now responsible for downloading file %s", fileStoreID + ) # Make sure we have space for this download. self._freeUpSpace() @@ -1313,37 +1532,65 @@ def _readGlobalFileMutablyWithCache(self, fileStoreID, localFilePath, readerID): # two readers, one cached copy, and space for two copies total. # Make the copying reference - self._write([('INSERT INTO refs VALUES (?, ?, ?, ?)', - (localFilePath, fileStoreID, readerID, 'copying'))]) + self._write( + [ + ( + "INSERT INTO refs VALUES (?, ?, ?, ?)", + (localFilePath, fileStoreID, readerID, "copying"), + ) + ] + ) # Fulfill it with a full copy or by giving away the cached copy - self._fulfillCopyingReference(fileStoreID, cachedPath, localFilePath) + self._fulfillCopyingReference( + fileStoreID, cachedPath, localFilePath + ) # Now we're done return localFilePath else: - logger.debug('Someone else is already responsible for file %s', fileStoreID) + logger.debug( + "Someone else is already responsible for file %s", fileStoreID + ) # A record already existed for this file. # Try and create an immutable or copying reference to an entry that # is in 'cached' or 'uploadable' or 'uploading' state. # It might be uploading because *we* are supposed to be uploading it. - logger.debug('Trying to make reference to file %s', fileStoreID) - self._write([('INSERT INTO refs SELECT ?, id, ?, ? FROM files WHERE id = ? AND (state = ? OR state = ? OR state = ?)', - (localFilePath, readerID, 'copying', fileStoreID, 'cached', 'uploadable', 'uploading'))]) + logger.debug("Trying to make reference to file %s", fileStoreID) + self._write( + [ + ( + "INSERT INTO refs SELECT ?, id, ?, ? FROM files WHERE id = ? AND (state = ? OR state = ? OR state = ?)", + ( + localFilePath, + readerID, + "copying", + fileStoreID, + "cached", + "uploadable", + "uploading", + ), + ) + ] + ) # See if we got it - self._read('SELECT COUNT(*) FROM refs WHERE path = ? and file_id = ?', (localFilePath, fileStoreID)) + self._read( + "SELECT COUNT(*) FROM refs WHERE path = ? and file_id = ?", + (localFilePath, fileStoreID), + ) if self.cur.fetchone()[0] > 0: # The file is cached and we can copy or link it - logger.debug('Obtained reference to file %s', fileStoreID) + logger.debug("Obtained reference to file %s", fileStoreID) # Get the path it is actually at in the cache, instead of where we wanted to put it - for row in self._read('SELECT path FROM files WHERE id = ?', (fileStoreID,)): + for row in self._read( + "SELECT path FROM files WHERE id = ?", (fileStoreID,) + ): cachedPath = row[0] - while self.getCacheAvailable() < 0: # Since we now have a copying reference, see if we have used too much space. # If so, try to free up some space by deleting or uploading, but @@ -1356,15 +1603,23 @@ def _readGlobalFileMutablyWithCache(self, fileStoreID, localFilePath, readerID): # See if we have no other references and we can give away the file. # Change it to downloading owned by us if we can grab it. - self._write([(""" + self._write( + [ + ( + """ UPDATE files SET files.owner = ?, files.state = ? WHERE files.id = ? AND files.state = ? AND files.owner IS NULL AND NOT EXISTS ( SELECT NULL FROM refs WHERE refs.file_id = files.id AND refs.state != 'mutable' ) """, - (me, 'downloading', fileStoreID, 'cached'))]) - - if self._giveAwayDownloadingFile(fileStoreID, cachedPath, localFilePath): + (me, "downloading", fileStoreID, "cached"), + ) + ] + ) + + if self._giveAwayDownloadingFile( + fileStoreID, cachedPath, localFilePath + ): # We got ownership of the file and managed to give it away. return localFilePath @@ -1385,14 +1640,23 @@ def _readGlobalFileMutablyWithCache(self, fileStoreID, localFilePath, readerID): atomic_copy(cachedPath, localFilePath) # Change the reference to mutable - self._write([('UPDATE refs SET state = ? WHERE path = ? AND file_id = ?', ('mutable', localFilePath, fileStoreID))]) + self._write( + [ + ( + "UPDATE refs SET state = ? WHERE path = ? AND file_id = ?", + ("mutable", localFilePath, fileStoreID), + ) + ] + ) # Now we're done return localFilePath else: # We didn't get a reference. Maybe it is still downloading. - logger.debug('Could not obtain reference to file %s', fileStoreID) + logger.debug( + "Could not obtain reference to file %s", fileStoreID + ) # Loop around again and see if either we can download it or we can get a reference to it. @@ -1432,8 +1696,14 @@ def _fulfillCopyingReference(self, fileStoreID, cachedPath, localFilePath): # Expose this file as cached so other people can copy off of it too. # Change state from downloading to cached - self._write([('UPDATE files SET state = ?, owner = NULL WHERE id = ?', - ('cached', fileStoreID))]) + self._write( + [ + ( + "UPDATE files SET state = ?, owner = NULL WHERE id = ?", + ("cached", fileStoreID), + ) + ] + ) if self.forceDownloadDelay is not None: # Wait around to simulate a big file for testing @@ -1443,12 +1713,18 @@ def _fulfillCopyingReference(self, fileStoreID, cachedPath, localFilePath): atomic_copy(cachedPath, localFilePath) # Change our reference to mutable - self._write([('UPDATE refs SET state = ? WHERE path = ? AND file_id = ?', ('mutable', localFilePath, fileStoreID))]) + self._write( + [ + ( + "UPDATE refs SET state = ? WHERE path = ? AND file_id = ?", + ("mutable", localFilePath, fileStoreID), + ) + ] + ) # Now we're done return - def _giveAwayDownloadingFile(self, fileStoreID, cachedPath, localFilePath): """ Move a downloaded file in 'downloading' state, owned by us, from the cache to a user-specified destination path. @@ -1468,8 +1744,10 @@ def _giveAwayDownloadingFile(self, fileStoreID, cachedPath, localFilePath): with self.as_process() as me: # See if we actually own this file and can giove it away - self._read('SELECT COUNT(*) FROM files WHERE id = ? AND state = ? AND owner = ?', - (fileStoreID, 'downloading', me)) + self._read( + "SELECT COUNT(*) FROM files WHERE id = ? AND state = ? AND owner = ?", + (fileStoreID, "downloading", me), + ) if self.cur.fetchone()[0] > 0: # Now we have exclusive control of the cached copy of the file, so we can give it away. @@ -1478,8 +1756,15 @@ def _giveAwayDownloadingFile(self, fileStoreID, cachedPath, localFilePath): # We are giving it away shutil.move(cachedPath, localFilePath) # Record that. - self._write([('UPDATE refs SET state = ? WHERE path = ? AND file_id = ?', ('mutable', localFilePath, fileStoreID)), - ('DELETE FROM files WHERE id = ?', (fileStoreID,))]) + self._write( + [ + ( + "UPDATE refs SET state = ? WHERE path = ? AND file_id = ?", + ("mutable", localFilePath, fileStoreID), + ), + ("DELETE FROM files WHERE id = ?", (fileStoreID,)), + ] + ) # Now we're done return True @@ -1504,7 +1789,9 @@ def _createLinkFromCache(self, cachedPath, localFilePath, symlink=True): :rtype: bool """ - assert os.path.exists(cachedPath), "Cannot create link to missing cache file %s" % cachedPath + assert os.path.exists(cachedPath), ( + "Cannot create link to missing cache file %s" % cachedPath + ) try: # Try and make the hard link. @@ -1546,17 +1833,46 @@ def _readGlobalFileWithCache(self, fileStoreID, localFilePath, symlink, readerID # Try and create a downloading entry if no entry exists. # Make sure to create a reference at the same time if it succeeds, to bill it against our job's space. # Don't create the mutable reference yet because we might not necessarily be able to clear that space. - logger.debug('Trying to make file downloading file record and reference for id %s', fileStoreID) - self._write([('INSERT OR IGNORE INTO files VALUES (?, ?, ?, ?, ?)', - (fileStoreID, cachedPath, self.getGlobalFileSize(fileStoreID), 'downloading', me)), - ('INSERT INTO refs SELECT ?, id, ?, ? FROM files WHERE id = ? AND state = ? AND owner = ?', - (localFilePath, readerID, 'immutable', fileStoreID, 'downloading', me))]) + logger.debug( + "Trying to make file downloading file record and reference for id %s", + fileStoreID, + ) + self._write( + [ + ( + "INSERT OR IGNORE INTO files VALUES (?, ?, ?, ?, ?)", + ( + fileStoreID, + cachedPath, + self.getGlobalFileSize(fileStoreID), + "downloading", + me, + ), + ), + ( + "INSERT INTO refs SELECT ?, id, ?, ? FROM files WHERE id = ? AND state = ? AND owner = ?", + ( + localFilePath, + readerID, + "immutable", + fileStoreID, + "downloading", + me, + ), + ), + ] + ) # See if we won the race - self._read('SELECT COUNT(*) FROM files WHERE id = ? AND state = ? AND owner = ?', (fileStoreID, 'downloading', me)) + self._read( + "SELECT COUNT(*) FROM files WHERE id = ? AND state = ? AND owner = ?", + (fileStoreID, "downloading", me), + ) if self.cur.fetchone()[0] > 0: # We are responsible for downloading the file (and we have the reference) - logger.debug('We are now responsible for downloading file %s', fileStoreID) + logger.debug( + "We are now responsible for downloading file %s", fileStoreID + ) # Make sure we have space for this download. self._freeUpSpace() @@ -1570,8 +1886,14 @@ def _readGlobalFileWithCache(self, fileStoreID, localFilePath, symlink, readerID # We made the link! # Change file state from downloading to cached so other people can use it - self._write([('UPDATE files SET state = ?, owner = NULL WHERE id = ?', - ('cached', fileStoreID))]) + self._write( + [ + ( + "UPDATE files SET state = ?, owner = NULL WHERE id = ?", + ("cached", fileStoreID), + ) + ] + ) # Now we're done! return localFilePath @@ -1579,36 +1901,69 @@ def _readGlobalFileWithCache(self, fileStoreID, localFilePath, symlink, readerID # We could not make a link. We need to make a copy. # Change the reference to copying. - self._write([('UPDATE refs SET state = ? WHERE path = ? AND file_id = ?', ('copying', localFilePath, fileStoreID))]) + self._write( + [ + ( + "UPDATE refs SET state = ? WHERE path = ? AND file_id = ?", + ("copying", localFilePath, fileStoreID), + ) + ] + ) # Fulfill it with a full copy or by giving away the cached copy - self._fulfillCopyingReference(fileStoreID, cachedPath, localFilePath) + self._fulfillCopyingReference( + fileStoreID, cachedPath, localFilePath + ) # Now we're done return localFilePath else: - logger.debug('We already have an entry in the cache database for file %s', fileStoreID) + logger.debug( + "We already have an entry in the cache database for file %s", + fileStoreID, + ) # A record already existed for this file. # Try and create an immutable reference to an entry that # is in 'cached' or 'uploadable' or 'uploading' state. # It might be uploading because *we* are supposed to be uploading it. - logger.debug('Trying to make reference to file %s', fileStoreID) - self._write([('INSERT INTO refs SELECT ?, id, ?, ? FROM files WHERE id = ? AND (state = ? OR state = ? OR state = ?)', - (localFilePath, readerID, 'immutable', fileStoreID, 'cached', 'uploadable', 'uploading'))]) + logger.debug("Trying to make reference to file %s", fileStoreID) + self._write( + [ + ( + "INSERT INTO refs SELECT ?, id, ?, ? FROM files WHERE id = ? AND (state = ? OR state = ? OR state = ?)", + ( + localFilePath, + readerID, + "immutable", + fileStoreID, + "cached", + "uploadable", + "uploading", + ), + ) + ] + ) # See if we got it - self._read('SELECT COUNT(*) FROM refs WHERE path = ? and file_id = ?', (localFilePath, fileStoreID)) + self._read( + "SELECT COUNT(*) FROM refs WHERE path = ? and file_id = ?", + (localFilePath, fileStoreID), + ) if self.cur.fetchone()[0] > 0: # The file is cached and we can copy or link it - logger.debug('Obtained reference to file %s', fileStoreID) + logger.debug("Obtained reference to file %s", fileStoreID) # Get the path it is actually at in the cache, instead of where we wanted to put it - for row in self._read('SELECT path FROM files WHERE id = ?', (fileStoreID,)): + for row in self._read( + "SELECT path FROM files WHERE id = ?", (fileStoreID,) + ): cachedPath = row[0] - if self._createLinkFromCache(cachedPath, localFilePath, symlink): + if self._createLinkFromCache( + cachedPath, localFilePath, symlink + ): # We managed to make the link return localFilePath else: @@ -1620,11 +1975,22 @@ def _readGlobalFileWithCache(self, fileStoreID, localFilePath, symlink, readerID # we already have code for that for mutable downloads, # so just clear the reference and download mutably. - self._write([('DELETE FROM refs WHERE path = ? AND file_id = ?', (localFilePath, fileStoreID))]) - - return self._readGlobalFileMutablyWithCache(fileStoreID, localFilePath, readerID) + self._write( + [ + ( + "DELETE FROM refs WHERE path = ? AND file_id = ?", + (localFilePath, fileStoreID), + ) + ] + ) + + return self._readGlobalFileMutablyWithCache( + fileStoreID, localFilePath, readerID + ) else: - logger.debug('Could not obtain reference to file %s', fileStoreID) + logger.debug( + "Could not obtain reference to file %s", fileStoreID + ) # If we didn't get a download or a reference, adopt and do work from dead workers and loop again. # We may have to wait for someone else's download or delete to @@ -1640,7 +2006,12 @@ def _readGlobalFileWithCache(self, fileStoreID, localFilePath, symlink, readerID time.sleep(self.contentionBackoff) @contextmanager - def _with_copying_reference_to_upload(self, file_store_id: FileID, reader_id: str, local_file_path: Optional[str] = None) -> Generator: + def _with_copying_reference_to_upload( + self, + file_store_id: FileID, + reader_id: str, + local_file_path: Optional[str] = None, + ) -> Generator: """ Get a context manager that gives you either the local file path for a copyuing reference to the given file, or None if that file is not in an @@ -1662,12 +2033,28 @@ def _with_copying_reference_to_upload(self, file_store_id: FileID, reader_id: st local_file_path = self.getLocalTempFileName() # Try and make a 'copying' reference to such a file - self._write([('INSERT INTO refs SELECT ?, id, ?, ? FROM files WHERE id = ? AND (state = ? OR state = ?)', - (local_file_path, reader_id, 'copying', file_store_id, 'uploadable', 'uploading'))]) + self._write( + [ + ( + "INSERT INTO refs SELECT ?, id, ?, ? FROM files WHERE id = ? AND (state = ? OR state = ?)", + ( + local_file_path, + reader_id, + "copying", + file_store_id, + "uploadable", + "uploading", + ), + ) + ] + ) # See if we got it have_reference = False - for row in self._read('SELECT COUNT(*) FROM refs WHERE path = ? and file_id = ?', (local_file_path, file_store_id)): + for row in self._read( + "SELECT COUNT(*) FROM refs WHERE path = ? and file_id = ?", + (local_file_path, file_store_id), + ): have_reference = row[0] > 0 if have_reference: @@ -1676,8 +2063,14 @@ def _with_copying_reference_to_upload(self, file_store_id: FileID, reader_id: st yield local_file_path finally: # Clean up the reference if it is unmodified - self._write([('DELETE FROM refs WHERE path = ? AND file_id = ? AND state = ?', - (local_file_path, file_store_id, 'copying'))]) + self._write( + [ + ( + "DELETE FROM refs WHERE path = ? AND file_id = ? AND state = ?", + (local_file_path, file_store_id, "copying"), + ) + ] + ) else: # No reference was obtained. yield None @@ -1686,11 +2079,13 @@ def _with_copying_reference_to_upload(self, file_store_id: FileID, reader_id: st def readGlobalFileStream(self, fileStoreID, encoding=None, errors=None): if str(fileStoreID) in self.filesToDelete: # File has already been deleted - raise FileNotFoundError(f'Attempted to read deleted file: {fileStoreID}') + raise FileNotFoundError(f"Attempted to read deleted file: {fileStoreID}") self.logAccess(fileStoreID) - with self._with_copying_reference_to_upload(fileStoreID, self.jobDesc.jobStoreID) as ref_path: + with self._with_copying_reference_to_upload( + fileStoreID, self.jobDesc.jobStoreID + ) as ref_path: # Try and grab a reference to the file if it is being uploaded. if ref_path is not None: # We have an update in the cache that isn't written back yet. @@ -1699,11 +2094,16 @@ def readGlobalFileStream(self, fileStoreID, encoding=None, errors=None): # The ref file is not actually copied to; find the actual file # in the cache cached_path = None - for row in self._read('SELECT path FROM files WHERE id = ?', (fileStoreID,)): + for row in self._read( + "SELECT path FROM files WHERE id = ?", (fileStoreID,) + ): cached_path = row[0] if cached_path is None: - raise RuntimeError('File %s went away while we had a reference to it!' % fileStoreID) + raise RuntimeError( + "File %s went away while we had a reference to it!" + % fileStoreID + ) with open(cached_path, encoding=encoding, errors=errors) as result: # Pass along the results of the open context manager on the @@ -1714,7 +2114,9 @@ def readGlobalFileStream(self, fileStoreID, encoding=None, errors=None): else: # No local update, so we can stream from the job store # TODO: Maybe stream from cache even when not required for consistency? - with self.jobStore.read_file_stream(fileStoreID, encoding=encoding, errors=errors) as result: + with self.jobStore.read_file_stream( + fileStoreID, encoding=encoding, errors=errors + ) as result: yield result def deleteLocalFile(self, fileStoreID): @@ -1727,7 +2129,10 @@ def deleteLocalFile(self, fileStoreID): # missing ref file, we will raise an error about it and stop deleting # things. missingFile = None - for row in self._read('SELECT path FROM refs WHERE file_id = ? AND job_id = ?', (fileStoreID, jobID)): + for row in self._read( + "SELECT path FROM refs WHERE file_id = ? AND job_id = ?", + (fileStoreID, jobID), + ): # Delete all the files that are references to this cached file (even mutable copies) path = row[0] @@ -1748,12 +2153,22 @@ def deleteLocalFile(self, fileStoreID): if len(deleted) == 0 and not missingFile: # We have to tell the user if they tried to delete 0 local copies. # But if we found a missing local copy, go on to report that instead. - raise OSError(errno.ENOENT, f"Attempting to delete local copies of a file with none: {fileStoreID}") + raise OSError( + errno.ENOENT, + f"Attempting to delete local copies of a file with none: {fileStoreID}", + ) for path in deleted: # Drop the references - self._write([('DELETE FROM refs WHERE file_id = ? AND job_id = ? AND path = ?', (fileStoreID, jobID, path))]) - logger.debug('Deleted local file %s for global file %s', path, fileStoreID) + self._write( + [ + ( + "DELETE FROM refs WHERE file_id = ? AND job_id = ? AND path = ?", + (fileStoreID, jobID, path), + ) + ] + ) + logger.debug("Deleted local file %s for global file %s", path, fileStoreID) # Now space has been revoked from the cache because that job needs its space back. # That might result in stuff having to be evicted. @@ -1781,13 +2196,25 @@ def deleteGlobalFile(self, fileStoreID): with self.as_process() as me: # Make sure nobody else has references to it - for row in self._read('SELECT job_id FROM refs WHERE file_id = ? AND state != ?', (fileStoreID, 'mutable')): - raise RuntimeError(f'Deleted file ID {fileStoreID} which is still in use by job {row[0]}') + for row in self._read( + "SELECT job_id FROM refs WHERE file_id = ? AND state != ?", + (fileStoreID, "mutable"), + ): + raise RuntimeError( + f"Deleted file ID {fileStoreID} which is still in use by job {row[0]}" + ) # TODO: should we just let other jobs and the cache keep the file until # it gets evicted, and only delete at the back end? # Pop the file into deleting state owned by us if it exists - self._write([('UPDATE files SET state = ?, owner = ? WHERE id = ?', ('deleting', me, fileStoreID))]) + self._write( + [ + ( + "UPDATE files SET state = ?, owner = ? WHERE id = ?", + ("deleting", me, fileStoreID), + ) + ] + ) # Finish the delete if the file is present self._executePendingDeletions() @@ -1795,10 +2222,13 @@ def deleteGlobalFile(self, fileStoreID): # Add the file to the list of files to be deleted from the job store # once the run method completes. self.filesToDelete.add(str(fileStoreID)) - self.log_to_leader('Added file with ID \'%s\' to the list of files to be' % fileStoreID + - ' globally deleted.', level=logging.DEBUG) + self.log_to_leader( + "Added file with ID '%s' to the list of files to be" % fileStoreID + + " globally deleted.", + level=logging.DEBUG, + ) - @deprecated(new_function_name='export_file') + @deprecated(new_function_name="export_file") def exportFile(self, jobStoreFileID: FileID, dstUrl: str) -> None: return self.export_file(jobStoreFileID, dstUrl) @@ -1829,7 +2259,10 @@ def waitForCommit(self) -> bool: # thread. It can do some destructor work after it finishes its real # work. - if self.commitThread is not None and self.commitThread is not threading.current_thread(): + if ( + self.commitThread is not None + and self.commitThread is not threading.current_thread() + ): self.commitThread.join() return True @@ -1856,17 +2289,23 @@ def startCommit(self, jobState=False): # might be necessary for later jobs to see earlier jobs' deleted # before they are committed? - logger.debug('Starting commit of %s forked from %s', state_to_commit, self.jobDesc) + logger.debug( + "Starting commit of %s forked from %s", state_to_commit, self.jobDesc + ) # Make sure the deep copy isn't summoning ghosts of old job # versions. It must be as new or newer at this point. self.jobDesc.assert_is_not_newer_than(state_to_commit) # Bump the original's version since saving will do that too and we # don't want duplicate versions. - self.jobDesc.reserve_versions(1 if len(state_to_commit.filesToDelete) == 0 else 2) + self.jobDesc.reserve_versions( + 1 if len(state_to_commit.filesToDelete) == 0 else 2 + ) # Start the commit thread - self.commitThread = threading.Thread(target=self.startCommitThread, args=(state_to_commit,)) + self.commitThread = threading.Thread( + target=self.startCommitThread, args=(state_to_commit,) + ) self.commitThread.start() def startCommitThread(self, state_to_commit: Optional[JobDescription]): @@ -1879,7 +2318,7 @@ def startCommitThread(self, state_to_commit: Optional[JobDescription]): self.waitForPreviousCommit() try: - logger.debug('Committing file uploads asynchronously') + logger.debug("Committing file uploads asynchronously") # Finish all uploads self._executePendingUploads() @@ -1889,7 +2328,10 @@ def startCommitThread(self, state_to_commit: Optional[JobDescription]): if state_to_commit is not None: # Do all the things that make this job not redoable - logger.debug('Committing file deletes and job state changes asynchronously from %s', state_to_commit) + logger.debug( + "Committing file deletes and job state changes asynchronously from %s", + state_to_commit, + ) # Complete the job self.jobStore.update_job(state_to_commit) @@ -1905,10 +2347,8 @@ def startCommitThread(self, state_to_commit: Optional[JobDescription]): self._terminateEvent.set() raise - - @classmethod - def shutdown(cls, shutdown_info: Tuple[str, str]) -> None: + def shutdown(cls, shutdown_info: tuple[str, str]) -> None: """ :param shutdown_info: Tuple of the coordination directory (where the cache database is) and the cache directory (where the cached data is). @@ -1935,7 +2375,7 @@ def shutdown(cls, shutdown_info: Tuple[str, str]) -> None: # So we just go and find the cache-n.db with the largest n value, # and use that. dbFilename = None - dbAttempt = float('-inf') + dbAttempt = float("-inf") # We also need to remember all the plausible database files and # journals @@ -1943,12 +2383,15 @@ def shutdown(cls, shutdown_info: Tuple[str, str]) -> None: for dbCandidate in os.listdir(coordination_dir): # For each thing in the coordination directory, see if it starts like a database file. - match = re.match('^cache-([0-9]+).db.*', dbCandidate) + match = re.match("^cache-([0-9]+).db.*", dbCandidate) if match: # This is caching-related. all_db_files.append(dbCandidate) attempt_number = int(match.group(1)) - if attempt_number > dbAttempt and dbCandidate == f"cache-{attempt_number}.db": + if ( + attempt_number > dbAttempt + and dbCandidate == f"cache-{attempt_number}.db" + ): # This is a main database, and the newest we have seen. dbFilename = dbCandidate dbAttempt = attempt_number @@ -1956,7 +2399,9 @@ def shutdown(cls, shutdown_info: Tuple[str, str]) -> None: if dbFilename is not None: # We found a caching database - logger.debug('Connecting to latest caching database %s for cleanup', dbFilename) + logger.debug( + "Connecting to latest caching database %s for cleanup", dbFilename + ) dbPath = os.path.join(coordination_dir, dbFilename) @@ -1980,7 +2425,7 @@ def shutdown(cls, shutdown_info: Tuple[str, str]) -> None: con.close() else: - logger.debug('No caching database found in %s', dir_) + logger.debug("No caching database found in %s", dir_) # Whether or not we found a database, we need to clean up the cache # directory. Delete everything cached. @@ -2017,7 +2462,9 @@ def _removeDeadJobs(cls, coordination_dir, con): # Get all the dead worker PIDs workers = [] - for row in cls._static_read(cur, 'SELECT DISTINCT worker FROM jobs WHERE worker IS NOT NULL'): + for row in cls._static_read( + cur, "SELECT DISTINCT worker FROM jobs WHERE worker IS NOT NULL" + ): workers.append(row[0]) # Work out which of them are not currently running. @@ -2030,14 +2477,18 @@ def _removeDeadJobs(cls, coordination_dir, con): # Now we know which workers are dead. # Clear them off of the jobs they had. for deadWorker in deadWorkers: - cls._static_write(con, cur, [('UPDATE jobs SET worker = NULL WHERE worker = ?', (deadWorker,))]) + cls._static_write( + con, + cur, + [("UPDATE jobs SET worker = NULL WHERE worker = ?", (deadWorker,))], + ) if len(deadWorkers) > 0: - logger.debug('Reaped %d dead workers', len(deadWorkers)) + logger.debug("Reaped %d dead workers", len(deadWorkers)) while True: # Find an unowned job. # Don't take all of them; other people could come along and want to help us with the other jobs. - cls._static_read(cur, 'SELECT id FROM jobs WHERE worker IS NULL LIMIT 1') + cls._static_read(cur, "SELECT id FROM jobs WHERE worker IS NULL LIMIT 1") row = cur.fetchone() if row is None: # We cleaned up all the jobs @@ -2046,10 +2497,23 @@ def _removeDeadJobs(cls, coordination_dir, con): jobID = row[0] # Try to own this job - cls._static_write(con, cur, [('UPDATE jobs SET worker = ? WHERE id = ? AND worker IS NULL', (me, jobID))]) + cls._static_write( + con, + cur, + [ + ( + "UPDATE jobs SET worker = ? WHERE id = ? AND worker IS NULL", + (me, jobID), + ) + ], + ) # See if we won the race - cls._static_read(cur, 'SELECT id, tempdir FROM jobs WHERE id = ? AND worker = ?', (jobID, me)) + cls._static_read( + cur, + "SELECT id, tempdir FROM jobs WHERE id = ? AND worker = ?", + (jobID, me), + ) row = cur.fetchone() if row is None: # We didn't win the race. Try another one. @@ -2058,6 +2522,6 @@ def _removeDeadJobs(cls, coordination_dir, con): # If we did win, delete the job and its files and temp dir cls._removeJob(con, cur, jobID) - logger.debug('Cleaned up orphaned job %s', jobID) + logger.debug("Cleaned up orphaned job %s", jobID) # Now we have cleaned up all the jobs that belonged to dead workers that were dead when we entered this function. diff --git a/src/toil/fileStores/nonCachingFileStore.py b/src/toil/fileStores/nonCachingFileStore.py index 4d86d65cde..31dd513805 100644 --- a/src/toil/fileStores/nonCachingFileStore.py +++ b/src/toil/fileStores/nonCachingFileStore.py @@ -16,21 +16,20 @@ import os import tempfile from collections import defaultdict +from collections.abc import Generator, Iterator from contextlib import contextmanager -from typing import (IO, - Any, - Callable, - ContextManager, - DefaultDict, - Dict, - Generator, - Iterator, - List, - Literal, - Optional, - Union, - cast, - overload) +from typing import ( + IO, + Any, + Callable, + ContextManager, + DefaultDict, + Literal, + Optional, + Union, + cast, + overload, +) import dill @@ -42,7 +41,12 @@ from toil.lib.compatibility import deprecated from toil.lib.io import make_public_dir, robust_rmtree from toil.lib.retry import ErrorCondition, retry -from toil.lib.threading import get_process_name, process_name_exists, safe_lock, safe_unlock_and_close +from toil.lib.threading import ( + get_process_name, + process_name_exists, + safe_lock, + safe_unlock_and_close, +) logger: logging.Logger = logging.getLogger(__name__) @@ -58,7 +62,7 @@ def __init__( super().__init__(jobStore, jobDesc, file_store_dir, waitForPreviousCommit) # This will be defined in the `open` method. self.jobStateFile: Optional[str] = None - self.localFileMap: DefaultDict[str, List[str]] = defaultdict(list) + self.localFileMap: DefaultDict[str, list[str]] = defaultdict(list) self.check_for_state_corruption() @@ -77,10 +81,10 @@ def check_for_coordination_corruption(coordination_dir: Optional[str]) -> None: if coordination_dir and not os.path.exists(coordination_dir): raise RuntimeError( - f'The Toil coordination directory at {coordination_dir} ' - f'was removed while the workflow was running! Please provide a ' - f'TOIL_COORDINATION_DIR or --coordinationDir at a location that ' - f'is safe from automated cleanup during the workflow run.' + f"The Toil coordination directory at {coordination_dir} " + f"was removed while the workflow was running! Please provide a " + f"TOIL_COORDINATION_DIR or --coordinationDir at a location that " + f"is safe from automated cleanup during the workflow run." ) def check_for_state_corruption(self) -> None: @@ -92,22 +96,26 @@ def check_for_state_corruption(self) -> None: if self.jobStateFile and not os.path.exists(self.jobStateFile): raise RuntimeError( - f'The job state file {self.jobStateFile} ' - f'was removed while the workflow was running! Please provide a ' - f'TOIL_COORDINATION_DIR or --coordinationDir at a location that ' - f'is safe from automated cleanup during the workflow run.' + f"The job state file {self.jobStateFile} " + f"was removed while the workflow was running! Please provide a " + f"TOIL_COORDINATION_DIR or --coordinationDir at a location that " + f"is safe from automated cleanup during the workflow run." ) @contextmanager def open(self, job: Job) -> Generator[None, None, None]: startingDir = os.getcwd() - self.localTempDir: str = make_public_dir(self.localTempDir, suggested_name="job") + self.localTempDir: str = make_public_dir( + self.localTempDir, suggested_name="job" + ) self._removeDeadJobs(self.coordination_dir) self.jobStateFile = self._createJobStateFile() self.check_for_state_corruption() freeSpace, diskSize = getFileSystemSize(self.localTempDir) if freeSpace <= 0.1 * diskSize: - logger.warning(f'Starting job {self.jobName} with less than 10%% of disk space remaining.') + logger.warning( + f"Starting job {self.jobName} with less than 10%% of disk space remaining." + ) try: os.chdir(self.localTempDir) with super().open(job): @@ -119,10 +127,12 @@ def open(self, job: Job) -> Generator[None, None, None]: try: os.remove(self.jobStateFile) except FileNotFoundError: - logger.exception('Job state file %s has gone missing unexpectedly; some cleanup for failed jobs may be getting skipped!', self.jobStateFile) - pass + logger.exception( + "Job state file %s has gone missing unexpectedly; some cleanup for failed jobs may be getting skipped!", + self.jobStateFile, + ) - def writeGlobalFile(self, localFileName: str, cleanup: bool=False) -> FileID: + def writeGlobalFile(self, localFileName: str, cleanup: bool = False) -> FileID: absLocalFileName = self._resolveAbsoluteLocalPath(localFileName) creatorID = str(self.jobDesc.jobStoreID) fileStoreID = self.jobStore.write_file(absLocalFileName, creatorID, cleanup) @@ -132,12 +142,20 @@ def writeGlobalFile(self, localFileName: str, cleanup: bool=False) -> FileID: self.localFileMap[fileStoreID].append(absLocalFileName) return FileID.forPath(fileStoreID, absLocalFileName) - def readGlobalFile(self, fileStoreID: str, userPath: Optional[str] = None, cache: bool=True, mutable: bool=False, - symlink: bool=False) -> str: + def readGlobalFile( + self, + fileStoreID: str, + userPath: Optional[str] = None, + cache: bool = True, + mutable: bool = False, + symlink: bool = False, + ) -> str: if userPath is not None: localFilePath = self._resolveAbsoluteLocalPath(userPath) if os.path.exists(localFilePath): - raise RuntimeError(' File %s ' % localFilePath + ' exists. Cannot Overwrite.') + raise RuntimeError( + " File %s " % localFilePath + " exists. Cannot Overwrite." + ) else: localFilePath = self.getLocalTempFileName() @@ -152,25 +170,30 @@ def readGlobalFileStream( fileStoreID: str, encoding: Literal[None] = None, errors: Optional[str] = None, - ) -> ContextManager[IO[bytes]]: - ... + ) -> ContextManager[IO[bytes]]: ... @overload def readGlobalFileStream( self, fileStoreID: str, encoding: str, errors: Optional[str] = None - ) -> ContextManager[IO[str]]: - ... + ) -> ContextManager[IO[str]]: ... # TODO: This seems to hit https://github.com/python/mypy/issues/11373 # But that is supposedly fixed. - @contextmanager # type: ignore - def readGlobalFileStream(self, fileStoreID: str, encoding: Optional[str] = None, errors: Optional[str] = None) -> Iterator[Union[IO[bytes], IO[str]]]: - with self.jobStore.read_file_stream(fileStoreID, encoding=encoding, errors=errors) as f: + @contextmanager # type: ignore + def readGlobalFileStream( + self, + fileStoreID: str, + encoding: Optional[str] = None, + errors: Optional[str] = None, + ) -> Iterator[Union[IO[bytes], IO[str]]]: + with self.jobStore.read_file_stream( + fileStoreID, encoding=encoding, errors=errors + ) as f: self.logAccess(fileStoreID) yield f - @deprecated(new_function_name='export_file') + @deprecated(new_function_name="export_file") def exportFile(self, jobStoreFileID: FileID, dstUrl: str) -> None: return self.export_file(jobStoreFileID, dstUrl) @@ -181,7 +204,9 @@ def deleteLocalFile(self, fileStoreID: str) -> None: try: localFilePaths = self.localFileMap.pop(fileStoreID) except KeyError: - raise OSError(errno.ENOENT, "Attempting to delete local copies of a file with none") + raise OSError( + errno.ENOENT, "Attempting to delete local copies of a file with none" + ) else: for localFilePath in localFilePaths: os.remove(localFilePath) @@ -232,7 +257,6 @@ def startCommit(self, jobState: bool = False) -> None: self._terminateEvent.set() raise - def __del__(self) -> None: """ Cleanup function that is run when destroying the class instance. Nothing to do since there @@ -240,7 +264,9 @@ def __del__(self) -> None: """ @classmethod - def _removeDeadJobs(cls, coordination_dir: str, batchSystemShutdown: bool=False) -> None: + def _removeDeadJobs( + cls, coordination_dir: str, batchSystemShutdown: bool = False + ) -> None: """ Look at the state of all jobs registered in the individual job state files, and handle them (clean up the disk) @@ -253,13 +279,13 @@ def _removeDeadJobs(cls, coordination_dir: str, batchSystemShutdown: bool=False) cls.check_for_coordination_corruption(coordination_dir) for jobState in cls._getAllJobStates(coordination_dir): - if not process_name_exists(coordination_dir, jobState['jobProcessName']): + if not process_name_exists(coordination_dir, jobState["jobProcessName"]): # We need to have a race to pick someone to clean up. try: # Open the directory. # We can't open a directory for write, only for read. - dirFD = os.open(jobState['jobDir'], os.O_RDONLY) + dirFD = os.open(jobState["jobDir"], os.O_RDONLY) except FileNotFoundError: # The cleanup has happened and we can't contest for it continue @@ -276,8 +302,11 @@ def _removeDeadJobs(cls, coordination_dir: str, batchSystemShutdown: bool=False) # has it locked. So loop around again. else: # We got it - logger.warning('Detected that job (%s) prematurely terminated. Fixing the ' - 'state of the job on disk.', jobState['jobName']) + logger.warning( + "Detected that job (%s) prematurely terminated. Fixing the " + "state of the job on disk.", + jobState["jobName"], + ) try: if not batchSystemShutdown: @@ -285,12 +314,12 @@ def _removeDeadJobs(cls, coordination_dir: str, batchSystemShutdown: bool=False) # Delete the old work directory if it still exists. Do this only during # the life of the program and dont' do it during the batch system # cleanup. Leave that to the batch system cleanup code. - robust_rmtree(jobState['jobDir']) + robust_rmtree(jobState["jobDir"]) finally: safe_unlock_and_close(dirFD) @classmethod - def _getAllJobStates(cls, coordination_dir: str) -> Iterator[Dict[str, str]]: + def _getAllJobStates(cls, coordination_dir: str) -> Iterator[dict[str, str]]: """ Generator function that deserializes and yields the job state for every job on the node, one at a time. @@ -307,7 +336,7 @@ def _getAllJobStates(cls, coordination_dir: str) -> Iterator[Dict[str, str]]: # So we need to work in bytes. for entry in os.scandir(os.fsencode(coordination_dir)): # For each job state file in the coordination directory - if entry.name.endswith(b'.jobState'): + if entry.name.endswith(b".jobState"): # This is the state of a job jobStateFiles.append(os.fsdecode(entry.path)) @@ -320,7 +349,7 @@ def _getAllJobStates(cls, coordination_dir: str) -> Iterator[Dict[str, str]]: # job finished & deleted its jobState file since the jobState files were discovered continue elif e.errno == 5: - # This is a OSError: [Errno 5] Input/output error (jobStatefile seems to disappear + # This is a OSError: [Errno 5] Input/output error (jobStatefile seems to disappear # on network file system sometimes) continue else: @@ -328,16 +357,16 @@ def _getAllJobStates(cls, coordination_dir: str) -> Iterator[Dict[str, str]]: @staticmethod # Retry on any OSError except FileNotFoundError, which we throw immediately - @retry(errors=[ - OSError, - ErrorCondition( - error=FileNotFoundError, - retry_on_this_condition=False - )]) - def _readJobState(jobStateFileName: str) -> Dict[str, str]: - with open(jobStateFileName, 'rb') as fH: + @retry( + errors=[ + OSError, + ErrorCondition(error=FileNotFoundError, retry_on_this_condition=False), + ] + ) + def _readJobState(jobStateFileName: str) -> dict[str, str]: + with open(jobStateFileName, "rb") as fH: state = dill.load(fH) - return cast(Dict[str, str], state) + return cast(dict[str, str], state) def _createJobStateFile(self) -> str: """ @@ -350,20 +379,26 @@ def _createJobStateFile(self) -> str: :rtype: str """ self.check_for_state_corruption() - jobState = {'jobProcessName': get_process_name(self.coordination_dir), - 'jobName': self.jobName, - 'jobDir': self.localTempDir} + jobState = { + "jobProcessName": get_process_name(self.coordination_dir), + "jobName": self.jobName, + "jobDir": self.localTempDir, + } try: - (fd, jobStateFile) = tempfile.mkstemp(suffix='.jobState.tmp', dir=self.coordination_dir) + (fd, jobStateFile) = tempfile.mkstemp( + suffix=".jobState.tmp", dir=self.coordination_dir + ) except Exception as e: - raise RuntimeError("Could not make state file in " + self.coordination_dir) from e - with open(fd, 'wb') as fH: + raise RuntimeError( + "Could not make state file in " + self.coordination_dir + ) from e + with open(fd, "wb") as fH: # Write data dill.dump(jobState, fH) # Drop suffix - jobStateFile = jobStateFile[:-len('.tmp')] + jobStateFile = jobStateFile[: -len(".tmp")] # Put in place - os.rename(jobStateFile + '.tmp', jobStateFile) + os.rename(jobStateFile + ".tmp", jobStateFile) return jobStateFile @classmethod diff --git a/src/toil/job.py b/src/toil/job.py index b6132bde6f..2a7e56c6b1 100644 --- a/src/toil/job.py +++ b/src/toil/job.py @@ -25,48 +25,36 @@ import uuid from abc import ABCMeta, abstractmethod from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser, Namespace +from collections.abc import Iterator, Mapping, Sequence from contextlib import contextmanager from io import BytesIO -from typing import (TYPE_CHECKING, - Any, - Callable, - Dict, - Iterator, - List, - Mapping, - NamedTuple, - Optional, - Sequence, - Set, - Tuple, - TypeVar, - Union, - cast, - overload) - -from configargparse import ArgParser - -from toil.bus import Names -from toil.lib.compatibility import deprecated - -if sys.version_info >= (3, 8): - from typing import TypedDict -else: - from typing_extensions import TypedDict +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Literal, + NamedTuple, + Optional, + TypedDict, + TypeVar, + Union, + cast, + overload, +) import dill -# TODO: When this gets into the standard library, get it from there and drop -# typing-extensions dependency on Pythons that are new enough. -from typing_extensions import NotRequired +from configargparse import ArgParser -if sys.version_info >= (3, 8): - from typing import Literal +if sys.version_info < (3, 11): + from typing_extensions import NotRequired else: - from typing_extensions import Literal + from typing import NotRequired +from toil.bus import Names from toil.common import Config, Toil, addOptions, safeUnpickleFromStream from toil.deferred import DeferredFunction from toil.fileStores import FileID +from toil.lib.compatibility import deprecated from toil.lib.conversions import bytes2human, human2bytes from toil.lib.expando import Expando from toil.lib.resources import ResourceMonitor @@ -122,24 +110,28 @@ def __init__(self, predecessor: "Job", successor: "Job") -> None: f'The given job: "{predecessor.description}" is already a predecessor of job: "{successor.description}".' ) + class DebugStoppingPointReached(BaseException): """ Raised when a job reaches a point at which it has been instructed to stop for debugging. """ - pass + class FilesDownloadedStoppingPointReached(DebugStoppingPointReached): """ Raised when a job stops because it was asked to download its files, and the files are downloaded. """ - def __init__(self, message, host_and_job_paths: Optional[List[Tuple[str, str]]] = None): + def __init__( + self, message, host_and_job_paths: Optional[list[tuple[str, str]]] = None + ): super().__init__(message) # Save the host and user-code-visible paths of files, in case we're # using a container and they are different. self.host_and_job_paths = host_and_job_paths + class TemporaryID: """ Placeholder for a unregistered job ID used by a JobDescription. @@ -161,7 +153,7 @@ def __str__(self) -> str: return self.__repr__() def __repr__(self) -> str: - return f'TemporaryID({self._value})' + return f"TemporaryID({self._value})" def __hash__(self) -> int: return hash(self._value) @@ -172,6 +164,7 @@ def __eq__(self, other: Any) -> bool: def __ne__(self, other: Any) -> bool: return not isinstance(other, TemporaryID) or self._value != other._value + class AcceleratorRequirement(TypedDict): """Requirement for one or more computational accelerators, like a GPU or FPGA.""" @@ -210,7 +203,10 @@ class AcceleratorRequirement(TypedDict): # TODO: support requesting any GPU with X amount of vram -def parse_accelerator(spec: Union[int, str, Dict[str, Union[str, int]]]) -> AcceleratorRequirement: + +def parse_accelerator( + spec: Union[int, str, dict[str, Union[str, int]]] +) -> AcceleratorRequirement: """ Parse an AcceleratorRequirement specified by user code. @@ -247,16 +243,16 @@ def parse_accelerator(spec: Union[int, str, Dict[str, Union[str, int]]]) -> Acce :raises ValueError: if it gets something it can't parse :raises TypeError: if it gets something it can't parse because it's the wrong type. """ - KINDS = {'gpu'} - BRANDS = {'nvidia', 'amd'} - APIS = {'cuda', 'rocm', 'opencl'} + KINDS = {"gpu"} + BRANDS = {"nvidia", "amd"} + APIS = {"cuda", "rocm", "opencl"} - parsed: AcceleratorRequirement = {'count': 1, 'kind': 'gpu'} + parsed: AcceleratorRequirement = {"count": 1, "kind": "gpu"} if isinstance(spec, int): - parsed['count'] = spec + parsed["count"] = spec elif isinstance(spec, str): - parts = spec.split(':') + parts = spec.split(":") if len(parts) > 2: raise ValueError("Could not parse AcceleratorRequirement: " + spec) @@ -265,7 +261,7 @@ def parse_accelerator(spec: Union[int, str, Dict[str, Union[str, int]]]) -> Acce try: # If they have : and then a count, or just a count, handle that. - parsed['count'] = int(possible_count) + parsed["count"] = int(possible_count) if len(parts) > 1: # Then we take whatever was before the colon as text possible_description = parts[0] @@ -275,73 +271,97 @@ def parse_accelerator(spec: Union[int, str, Dict[str, Union[str, int]]]) -> Acce # It doesn't end with a number if len(parts) == 2: # We should have a number though. - raise ValueError("Could not parse AcceleratorRequirement count in: " + spec) + raise ValueError( + "Could not parse AcceleratorRequirement count in: " + spec + ) else: # Must be just the description possible_description = possible_count # Determine if we have a kind, brand, API, or (by default) model if possible_description in KINDS: - parsed['kind'] = possible_description + parsed["kind"] = possible_description elif possible_description in BRANDS: - parsed['brand'] = possible_description + parsed["brand"] = possible_description elif possible_description in APIS: - parsed['api'] = possible_description + parsed["api"] = possible_description else: if possible_description is not None: - parsed['model'] = possible_description + parsed["model"] = possible_description elif isinstance(spec, dict): # It's a dict, so merge with the defaults. parsed.update(spec) # TODO: make sure they didn't misspell keys or something else: - raise TypeError(f"Cannot parse value of type {type(spec)} as an AcceleratorRequirement") + raise TypeError( + f"Cannot parse value of type {type(spec)} as an AcceleratorRequirement" + ) - if parsed['kind'] == 'gpu': + if parsed["kind"] == "gpu": # Use some smarts about what current GPUs are like to elaborate the # description. - if 'brand' not in parsed and 'model' in parsed: + if "brand" not in parsed and "model" in parsed: # Try to guess the brand from the model for brand in BRANDS: - if parsed['model'].startswith(brand): + if parsed["model"].startswith(brand): # The model often starts with the brand - parsed['brand'] = brand + parsed["brand"] = brand break - if 'brand' not in parsed and 'api' in parsed: + if "brand" not in parsed and "api" in parsed: # Try to guess the brand from the API - if parsed['api'] == 'cuda': + if parsed["api"] == "cuda": # Only nvidia makes cuda cards - parsed['brand'] = 'nvidia' - elif parsed['api'] == 'rocm': + parsed["brand"] = "nvidia" + elif parsed["api"] == "rocm": # Only amd makes rocm cards - parsed['brand'] = 'amd' + parsed["brand"] = "amd" return parsed -def accelerator_satisfies(candidate: AcceleratorRequirement, requirement: AcceleratorRequirement, ignore: List[str] = []) -> bool: + +def accelerator_satisfies( + candidate: AcceleratorRequirement, + requirement: AcceleratorRequirement, + ignore: list[str] = [], +) -> bool: """ Test if candidate partially satisfies the given requirement. :returns: True if the given candidate at least partially satisfies the given requirement (i.e. check all fields other than count). """ - for key in ['kind', 'brand', 'api', 'model']: + for key in ["kind", "brand", "api", "model"]: if key in ignore: # Skip this aspect. continue if key in requirement: if key not in candidate: - logger.debug('Candidate %s does not satisfy requirement %s because it does not have a %s', candidate, requirement, key) + logger.debug( + "Candidate %s does not satisfy requirement %s because it does not have a %s", + candidate, + requirement, + key, + ) return False if candidate[key] != requirement[key]: - logger.debug('Candidate %s does not satisfy requirement %s because it does not have the correct %s', candidate, requirement, key) + logger.debug( + "Candidate %s does not satisfy requirement %s because it does not have the correct %s", + candidate, + requirement, + key, + ) return False # If all these match or are more specific than required, we match! return True -def accelerators_fully_satisfy(candidates: Optional[List[AcceleratorRequirement]], requirement: AcceleratorRequirement, ignore: List[str] = []) -> bool: + +def accelerators_fully_satisfy( + candidates: Optional[list[AcceleratorRequirement]], + requirement: AcceleratorRequirement, + ignore: list[str] = [], +) -> bool: """ Determine if a set of accelerators satisfy a requirement. @@ -352,21 +372,22 @@ def accelerators_fully_satisfy(candidates: Optional[List[AcceleratorRequirement] together (i.e. check all fields including count). """ - count_remaining = requirement['count'] + count_remaining = requirement["count"] if candidates: for candidate in candidates: if accelerator_satisfies(candidate, requirement, ignore=ignore): - if candidate['count'] > count_remaining: + if candidate["count"] > count_remaining: # We found all the matching accelerators we need count_remaining = 0 break else: - count_remaining -= candidate['count'] + count_remaining -= candidate["count"] # If we have no count left we are fully satisfied return count_remaining == 0 + class RequirementsDict(TypedDict): """ Typed storage for requirements for a job. @@ -377,22 +398,35 @@ class RequirementsDict(TypedDict): cores: NotRequired[Union[int, float]] memory: NotRequired[int] disk: NotRequired[int] - accelerators: NotRequired[List[AcceleratorRequirement]] + accelerators: NotRequired[list[AcceleratorRequirement]] preemptible: NotRequired[bool] + # These must be all the key names in RequirementsDict REQUIREMENT_NAMES = ["disk", "memory", "cores", "accelerators", "preemptible"] # This is the supertype of all value types in RequirementsDict -ParsedRequirement = Union[int, float, bool, List[AcceleratorRequirement]] +ParsedRequirement = Union[int, float, bool, list[AcceleratorRequirement]] # We define some types for things we can parse into different kind of requirements ParseableIndivisibleResource = Union[str, int] ParseableDivisibleResource = Union[str, int, float] ParseableFlag = Union[str, int, bool] -ParseableAcceleratorRequirement = Union[str, int, Mapping[str, Any], AcceleratorRequirement, Sequence[Union[str, int, Mapping[str, Any], AcceleratorRequirement]]] +ParseableAcceleratorRequirement = Union[ + str, + int, + Mapping[str, Any], + AcceleratorRequirement, + Sequence[Union[str, int, Mapping[str, Any], AcceleratorRequirement]], +] + +ParseableRequirement = Union[ + ParseableIndivisibleResource, + ParseableDivisibleResource, + ParseableFlag, + ParseableAcceleratorRequirement, +] -ParseableRequirement = Union[ParseableIndivisibleResource, ParseableDivisibleResource, ParseableFlag, ParseableAcceleratorRequirement] class Requirer: """ @@ -403,9 +437,7 @@ class Requirer: _requirementOverrides: RequirementsDict - def __init__( - self, requirements: Mapping[str, ParseableRequirement] - ) -> None: + def __init__(self, requirements: Mapping[str, ParseableRequirement]) -> None: """ Parse and save the given requirements. @@ -446,12 +478,11 @@ def assignConfig(self, config: Config) -> None: raise RuntimeError(f"Config assigned multiple times to {self}") self._config = config - - def __getstate__(self) -> Dict[str, Any]: + def __getstate__(self) -> dict[str, Any]: """Return the dict to use as the instance's __dict__ when pickling.""" # We want to exclude the config from pickling. state = self.__dict__.copy() - state['_config'] = None + state["_config"] = None return state def __copy__(self) -> "Requirer": @@ -492,37 +523,29 @@ def __deepcopy__(self, memo: Any) -> "Requirer": @overload @staticmethod def _parseResource( - name: Union[Literal["memory"], Literal["disks"]], value: ParseableIndivisibleResource - ) -> int: - ... + name: Union[Literal["memory"], Literal["disks"]], + value: ParseableIndivisibleResource, + ) -> int: ... @overload @staticmethod def _parseResource( name: Literal["cores"], value: ParseableDivisibleResource - ) -> Union[int, float]: - ... + ) -> Union[int, float]: ... @overload @staticmethod def _parseResource( name: Literal["accelerators"], value: ParseableAcceleratorRequirement - ) -> List[AcceleratorRequirement]: - ... + ) -> list[AcceleratorRequirement]: ... @overload @staticmethod - def _parseResource( - name: str, value: ParseableRequirement - ) -> ParsedRequirement: - ... + def _parseResource(name: str, value: ParseableRequirement) -> ParsedRequirement: ... @overload @staticmethod - def _parseResource( - name: str, value: None - ) -> None: - ... + def _parseResource(name: str, value: None) -> None: ... @staticmethod def _parseResource( @@ -559,43 +582,53 @@ def _parseResource( # Anything can be None. return value - if name in ('memory', 'disk', 'cores'): + if name in ("memory", "disk", "cores"): # These should be numbers that accept things like "5G". if isinstance(value, (str, bytes)): value = human2bytes(value) if isinstance(value, int): return value - elif isinstance(value, float) and name == 'cores': + elif isinstance(value, float) and name == "cores": # But only cores can be fractional. return value else: - raise TypeError(f"The '{name}' requirement does not accept values that are of type {type(value)}") - elif name == 'preemptible': + raise TypeError( + f"The '{name}' requirement does not accept values that are of type {type(value)}" + ) + elif name == "preemptible": if isinstance(value, str): if value.lower() == "true": return True elif value.lower() == "false": return False else: - raise ValueError(f"The '{name}' requirement, as a string, must be 'true' or 'false' but is {value}") + raise ValueError( + f"The '{name}' requirement, as a string, must be 'true' or 'false' but is {value}" + ) elif isinstance(value, int): if value == 1: return True if value == 0: return False else: - raise ValueError(f"The '{name}' requirement, as an int, must be 1 or 0 but is {value}") + raise ValueError( + f"The '{name}' requirement, as an int, must be 1 or 0 but is {value}" + ) elif isinstance(value, bool): return value else: - raise TypeError(f"The '{name}' requirement does not accept values that are of type {type(value)}") - elif name == 'accelerators': + raise TypeError( + f"The '{name}' requirement does not accept values that are of type {type(value)}" + ) + elif name == "accelerators": # The type checking for this is delegated to the # AcceleratorRequirement class. if isinstance(value, list): - return [parse_accelerator(v) for v in value] #accelerators={'kind': 'gpu', 'brand': 'nvidia', 'count': 2} + return [ + parse_accelerator(v) for v in value + ] # accelerators={'kind': 'gpu', 'brand': 'nvidia', 'count': 2} else: - return [parse_accelerator(value)] #accelerators=1 + return [parse_accelerator(value)] # accelerators=1 else: # Anything else we just pass along without opinons return cast(ParsedRequirement, value) @@ -618,7 +651,10 @@ def _fetchRequirement(self, requirement: str) -> Optional[ParsedRequirement]: ) return value elif self._config is not None: - values = [getattr(self._config, 'default_' + requirement, None), getattr(self._config, 'default' + requirement.capitalize(), None)] + values = [ + getattr(self._config, "default_" + requirement, None), + getattr(self._config, "default" + requirement.capitalize(), None), + ] value = values[0] if values[0] is not None else values[1] if value is None: raise AttributeError( @@ -679,10 +715,13 @@ def preemptable(self, val: ParseableFlag) -> None: self._requirementOverrides["preemptible"] = Requirer._parseResource( "preemptible", val ) + @property - def accelerators(self) -> List[AcceleratorRequirement]: + def accelerators(self) -> list[AcceleratorRequirement]: """Any accelerators, such as GPUs, that are needed.""" - return cast(List[AcceleratorRequirement], self._fetchRequirement("accelerators")) + return cast( + list[AcceleratorRequirement], self._fetchRequirement("accelerators") + ) @accelerators.setter def accelerators(self, val: ParseableAcceleratorRequirement) -> None: @@ -705,7 +744,7 @@ def scale(self, requirement: str, factor: float) -> "Requirer": if isinstance(original_value, (int, float)): # This is something we actually can scale up and down new_value = original_value * factor - if requirement in ('memory', 'disk'): + if requirement in ("memory", "disk"): # Must round to an int new_value = math.ceil(new_value) setattr(scaled, requirement, new_value) @@ -723,29 +762,32 @@ def requirements_string(self) -> str: if isinstance(v, (int, float)) and v > 1000: # Make large numbers readable v = bytes2human(v) - parts.append(f'{k}: {v}') + parts.append(f"{k}: {v}") if len(parts) == 0: - parts = ['no requirements'] - return ', '.join(parts) + parts = ["no requirements"] + return ", ".join(parts) + class JobBodyReference(NamedTuple): """ Reference from a job description to its body. """ + file_store_id: str """File ID (or special shared file name for the root job) of the job's body.""" - module_string: str + module_string: str """Stringified description of the module needed to load the body.""" + class JobDescription(Requirer): """ Stores all the information that the Toil Leader ever needs to know about a Job. - + This includes: * Resource requirements. * Which jobs are children or follow-ons or predecessors of this job. * A reference to the Job object in the job store. - + Can be obtained from an actual (i.e. executable) Job object, and can be used to obtain the Job object from the JobStore. @@ -760,9 +802,9 @@ def __init__( requirements: Mapping[str, Union[int, str, bool]], jobName: str, unitName: Optional[str] = "", - displayName: Optional[str] = "", + displayName: Optional[str] = "", local: Optional[bool] = None, - files: Optional[Set[FileID]] = None + files: Optional[set[FileID]] = None, ) -> None: """ Create a new JobDescription. @@ -796,10 +838,11 @@ def __init__( # Save names, making sure they are strings and not e.g. bytes or None. def makeString(x: Union[str, bytes, None]) -> str: if isinstance(x, bytes): - return x.decode('utf-8', errors='replace') + return x.decode("utf-8", errors="replace") if x is None: return "" return x + self.jobName = makeString(jobName) self.unitName = makeString(unitName) self.displayName = makeString(displayName) @@ -846,7 +889,7 @@ def makeString(x: Union[str, bytes, None]) -> str: # chained-in job with its original ID, and also this job's ID with its # original names, or is empty if no chaining has happened. # The first job in the chain comes first in the list. - self._merged_job_names: List[Names] = [] + self._merged_job_names: list[Names] = [] # The number of direct predecessors of the job. Needs to be stored at # the JobDescription to support dynamically-created jobs with multiple @@ -869,17 +912,17 @@ def makeString(x: Union[str, bytes, None]) -> str: # The IDs of all child jobs of the described job. # Children which are done must be removed with filterSuccessors. - self.childIDs: Set[str] = set() + self.childIDs: set[str] = set() # The IDs of all follow-on jobs of the described job. # Follow-ons which are done must be removed with filterSuccessors. - self.followOnIDs: Set[str] = set() + self.followOnIDs: set[str] = set() # We keep our own children and follow-ons in a list of successor # phases, along with any successors adopted from jobs we have chained # from. When we finish our own children and follow-ons, we may have to # go back and finish successors for those jobs. - self.successor_phases: List[Set[str]] = [self.followOnIDs, self.childIDs] + self.successor_phases: list[set[str]] = [self.followOnIDs, self.childIDs] # Dict from ServiceHostJob ID to list of child ServiceHostJobs that start after it. # All services must have an entry, if only to an empty list. @@ -904,9 +947,15 @@ def get_names(self) -> Names: """ Get the names and ID of this job as a named tuple. """ - return Names(self.jobName, self.unitName, self.displayName, self.displayName, str(self.jobStoreID)) + return Names( + self.jobName, + self.unitName, + self.displayName, + self.displayName, + str(self.jobStoreID), + ) - def get_chain(self) -> List[Names]: + def get_chain(self) -> list[Names]: """ Get all the jobs that executed in this job's chain, in order. @@ -921,7 +970,7 @@ def get_chain(self) -> List[Names]: else: return list(self._merged_job_names) - def serviceHostIDsInBatches(self) -> Iterator[List[str]]: + def serviceHostIDsInBatches(self) -> Iterator[list[str]]: """ Find all batches of service host job IDs that can be started at the same time. @@ -962,10 +1011,9 @@ def allSuccessors(self) -> Iterator[str]: """ for phase in self.successor_phases: - for successor in phase: - yield successor + yield from phase - def successors_by_phase(self) -> Iterator[Tuple[int, str]]: + def successors_by_phase(self) -> Iterator[tuple[int, str]]: """ Get an iterator over all child/follow-on/chained inherited successor job IDs, along with their phase number on the stack. @@ -1010,7 +1058,7 @@ def detach_body(self) -> None: """ self._body = None - def get_body(self) -> Tuple[str, ModuleDescriptor]: + def get_body(self) -> tuple[str, ModuleDescriptor]: """ Get the information needed to load the job body. @@ -1023,9 +1071,11 @@ def get_body(self) -> Tuple[str, ModuleDescriptor]: if not self.has_body(): raise RuntimeError(f"Cannot load the body of a job {self} without one") - return self._body.file_store_id, ModuleDescriptor.fromCommand(self._body.module_string) + return self._body.file_store_id, ModuleDescriptor.fromCommand( + self._body.module_string + ) - def nextSuccessors(self) -> Optional[Set[str]]: + def nextSuccessors(self) -> Optional[set[str]]: """ Return the collection of job IDs for the successors of this job that are ready to run. @@ -1108,7 +1158,9 @@ def is_subtree_done(self) -> bool: :returns: True if the job appears to be done, and all related child, follow-on, and service jobs appear to be finished and removed. """ - return not self.has_body() and next(self.successorsAndServiceHosts(), None) is None + return ( + not self.has_body() and next(self.successorsAndServiceHosts(), None) is None + ) def replace(self, other: "JobDescription") -> None: """ @@ -1127,11 +1179,15 @@ def replace(self, other: "JobDescription") -> None: # TODO: We can't join the job graphs with Job._jobGraphsJoined, is that a problem? # Take all the successors other than this one - old_phases = [{i for i in p if i != self.jobStoreID} for p in other.successor_phases] + old_phases = [ + {i for i in p if i != self.jobStoreID} for p in other.successor_phases + ] # And drop empty phases old_phases = [p for p in old_phases if len(p) > 0] # And put in front of our existing phases - logger.debug('%s is adopting successor phases from %s of: %s', self, other, old_phases) + logger.debug( + "%s is adopting successor phases from %s of: %s", self, other, old_phases + ) self.successor_phases = old_phases + self.successor_phases # When deleting, we need to delete the files for our old ID, and also @@ -1155,9 +1211,13 @@ def replace(self, other: "JobDescription") -> None: self.jobStoreID = other.jobStoreID if len(other.filesToDelete) > 0: - raise RuntimeError("Trying to take on the ID of a job that is in the process of being committed!") + raise RuntimeError( + "Trying to take on the ID of a job that is in the process of being committed!" + ) if len(self.filesToDelete) > 0: - raise RuntimeError("Trying to take on the ID of anothe job while in the process of being committed!") + raise RuntimeError( + "Trying to take on the ID of anothe job while in the process of being committed!" + ) self._job_version = other._job_version self._job_version_writer = os.getpid() @@ -1167,7 +1227,9 @@ def assert_is_not_newer_than(self, other: "JobDescription") -> None: Make sure this JobDescription is not newer than a prospective new version of the JobDescription. """ if other._job_version < self._job_version: - raise RuntimeError(f"Cannot replace {self} from PID {self._job_version_writer} with older version {other} from PID {other._job_version_writer}") + raise RuntimeError( + f"Cannot replace {self} from PID {self._job_version_writer} with older version {other} from PID {other._job_version_writer}" + ) def is_updated_by(self, other: "JobDescription") -> bool: """ @@ -1184,7 +1246,7 @@ def is_updated_by(self, other: "JobDescription") -> bool: other._job_version_writer, self.jobStoreID, self, - self._job_version_writer + self._job_version_writer, ) return False @@ -1196,7 +1258,7 @@ def is_updated_by(self, other: "JobDescription") -> bool: other, other._job_version_writer, self, - self._job_version_writer + self._job_version_writer, ) return False @@ -1236,7 +1298,7 @@ def hasServiceHostJob(self, serviceID) -> bool: """Test if the ServiceHostJob is a service of the described job.""" return serviceID in self.serviceTree - def renameReferences(self, renames: Dict[TemporaryID, str]) -> None: + def renameReferences(self, renames: dict[TemporaryID, str]) -> None: """ Apply the given dict of ID renames to all references to jobs. @@ -1252,8 +1314,12 @@ def renameReferences(self, renames: Dict[TemporaryID, str]) -> None: # Replace each renamed item one at a time to preserve set identity phase.remove(item) phase.add(renames[item]) - self.serviceTree = {renames.get(parent, parent): [renames.get(child, child) for child in children] - for parent, children in self.serviceTree.items()} + self.serviceTree = { + renames.get(parent, parent): [ + renames.get(child, child) for child in children + ] + for parent, children in self.serviceTree.items() + } def addPredecessor(self) -> None: """Notify the JobDescription that a predecessor has been added to its Job.""" @@ -1271,7 +1337,11 @@ def onRegistration(self, jobStore: "AbstractJobStore") -> None: :param jobStore: The job store we are being placed into """ - def setupJobAfterFailure(self, exit_status: Optional[int] = None, exit_reason: Optional["BatchJobExitReason"] = None) -> None: + def setupJobAfterFailure( + self, + exit_status: Optional[int] = None, + exit_reason: Optional["BatchJobExitReason"] = None, + ) -> None: """ Configure job after a failure. @@ -1294,30 +1364,49 @@ def setupJobAfterFailure(self, exit_status: Optional[int] = None, exit_reason: O if self._config is None: raise RuntimeError("The job's config is not assigned.") - if self._config.enableUnlimitedPreemptibleRetries and exit_reason == BatchJobExitReason.LOST: - logger.info("*Not* reducing try count (%s) of job %s with ID %s", - self.remainingTryCount, self, self.jobStoreID) + if ( + self._config.enableUnlimitedPreemptibleRetries + and exit_reason == BatchJobExitReason.LOST + ): + logger.info( + "*Not* reducing try count (%s) of job %s with ID %s", + self.remainingTryCount, + self, + self.jobStoreID, + ) else: self.remainingTryCount = max(0, self.remainingTryCount - 1) - logger.warning("Due to failure we are reducing the remaining try count of job %s with ID %s to %s", - self, self.jobStoreID, self.remainingTryCount) + logger.warning( + "Due to failure we are reducing the remaining try count of job %s with ID %s to %s", + self, + self.jobStoreID, + self.remainingTryCount, + ) # Set the default memory to be at least as large as the default, in # case this was a malloc failure (we do this because of the combined # batch system) if exit_reason == BatchJobExitReason.MEMLIMIT and self._config.doubleMem: self.memory = self.memory * 2 - logger.warning("We have doubled the memory of the failed job %s to %s bytes due to doubleMem flag", - self, self.memory) + logger.warning( + "We have doubled the memory of the failed job %s to %s bytes due to doubleMem flag", + self, + self.memory, + ) if self.memory < self._config.defaultMemory: self.memory = self._config.defaultMemory - logger.warning("We have increased the default memory of the failed job %s to %s bytes", - self, self.memory) + logger.warning( + "We have increased the default memory of the failed job %s to %s bytes", + self, + self.memory, + ) if self.disk < self._config.defaultDisk: self.disk = self._config.defaultDisk - logger.warning("We have increased the disk of the failed job %s to the default of %s bytes", - self, self.disk) - + logger.warning( + "We have increased the disk of the failed job %s to the default of %s bytes", + self, + self.disk, + ) def getLogFileHandle(self, jobStore): """ @@ -1367,12 +1456,12 @@ def __str__(self) -> str: """Produce a useful logging string identifying this job.""" printedName = "'" + self.jobName + "'" if self.unitName: - printedName += ' ' + self.unitName + printedName += " " + self.unitName if self.jobStoreID is not None: - printedName += ' ' + str(self.jobStoreID) + printedName += " " + str(self.jobStoreID) - printedName += ' v' + str(self._job_version) + printedName += " v" + str(self._job_version) return printedName @@ -1381,7 +1470,7 @@ def __str__(self) -> str: # a time, keyed by jobStoreID. def __repr__(self): - return f'{self.__class__.__name__}( **{self.__dict__!r} )' + return f"{self.__class__.__name__}( **{self.__dict__!r} )" def reserve_versions(self, count: int) -> None: """ @@ -1401,6 +1490,7 @@ def pre_update_hook(self) -> None: self._job_version_writer = os.getpid() logger.debug("New job version: %s", self) + class ServiceJobDescription(JobDescription): """A description of a job that hosts a service.""" @@ -1471,7 +1561,7 @@ def restore_checkpoint(self) -> None: raise RuntimeError(f"Cannot restore an empty checkpoint for a job {self}") self._body = self.checkpoint - def restartCheckpoint(self, jobStore: "AbstractJobStore") -> List[str]: + def restartCheckpoint(self, jobStore: "AbstractJobStore") -> list[str]: """ Restart a checkpoint after the total failure of jobs in its subtree. @@ -1482,24 +1572,30 @@ def restartCheckpoint(self, jobStore: "AbstractJobStore") -> List[str]: Returns a list with the IDs of any successors deleted. """ if self.checkpoint is None: - raise RuntimeError("Cannot restart a checkpoint job. The checkpoint was never set.") + raise RuntimeError( + "Cannot restart a checkpoint job. The checkpoint was never set." + ) successorsDeleted = [] all_successors = list(self.allSuccessors()) if len(all_successors) > 0 or self.serviceTree or self.has_body(): if self.has_body(): if self._body != self.checkpoint: - raise RuntimeError("The stored body reference and checkpoint are not the same.") + raise RuntimeError( + "The stored body reference and checkpoint are not the same." + ) logger.debug("Checkpoint job already has body set to run") else: self.restore_checkpoint() - jobStore.update_job(self) # Update immediately to ensure that checkpoint + jobStore.update_job(self) # Update immediately to ensure that checkpoint # is made before deleting any remaining successors if len(all_successors) > 0 or self.serviceTree: # If the subtree of successors is not complete restart everything - logger.debug("Checkpoint job has unfinished successor jobs, deleting successors: %s, services: %s " % - (all_successors, self.serviceTree.keys())) + logger.debug( + "Checkpoint job has unfinished successor jobs, deleting successors: %s, services: %s " + % (all_successors, self.serviceTree.keys()) + ) # Delete everything on the stack, as these represent successors to clean # up as we restart the queue @@ -1512,9 +1608,13 @@ def recursiveDelete(jobDesc): logger.debug("Job %s has already been deleted", otherJobID) if jobDesc.jobStoreID != self.jobStoreID: # Delete everything under us except us. - logger.debug("Checkpoint is deleting old successor job: %s", jobDesc.jobStoreID) + logger.debug( + "Checkpoint is deleting old successor job: %s", + jobDesc.jobStoreID, + ) jobStore.delete_job(jobDesc.jobStoreID) successorsDeleted.append(jobDesc.jobStoreID) + recursiveDelete(self) # Cut links to the jobs we deleted. @@ -1543,7 +1643,7 @@ def __init__( displayName: Optional[str] = "", descriptionClass: Optional[type] = None, local: Optional[bool] = None, - files: Optional[Set[FileID]] = None + files: Optional[set[FileID]] = None, ) -> None: """ Job initializer. @@ -1580,14 +1680,20 @@ def __init__( jobName = self.__class__.__name__ displayName = displayName if displayName else jobName - #Some workflows use preemptable instead of preemptible + # Some workflows use preemptable instead of preemptible if preemptable and not preemptible: - logger.warning("Preemptable as a keyword has been deprecated, please use preemptible.") + logger.warning( + "Preemptable as a keyword has been deprecated, please use preemptible." + ) preemptible = preemptable # Build a requirements dict for the description - requirements = {'memory': memory, 'cores': cores, 'disk': disk, - 'accelerators': accelerators, - 'preemptible': preemptible} + requirements = { + "memory": memory, + "cores": cores, + "disk": disk, + "accelerators": accelerators, + "preemptible": preemptible, + } if descriptionClass is None: if checkpoint: # Actually describe as a checkpoint job @@ -1604,7 +1710,7 @@ def __init__( unitName=unitName, displayName=displayName, local=local, - files=files + files=files, ) # Private class variables needed to actually execute a job, in the worker. @@ -1627,7 +1733,9 @@ def __init__( # Note that self.__module__ is not necessarily this module, i.e. job.py. It is the module # defining the class self is an instance of, which may be a subclass of Job that may be # defined in a different module. - self.userModule: ModuleDescriptor = ModuleDescriptor.forModule(self.__module__).globalize() + self.userModule: ModuleDescriptor = ModuleDescriptor.forModule( + self.__module__ + ).globalize() # Maps index paths into composite return values to lists of IDs of files containing # promised values for those return value items. An index path is a tuple of indices that # traverses a nested data structure of lists, dicts, tuples or any other type supporting @@ -1640,7 +1748,7 @@ def __init__( self._tempDir = None # Holds flags set by set_debug_flag() - self._debug_flags: Set[str] = set() + self._debug_flags: set[str] = set() def __str__(self): """ @@ -1650,7 +1758,7 @@ def __str__(self): if self.description is None: return repr(self) else: - return 'Job(' + str(self.description) + ')' + return "Job(" + str(self.description) + ")" def check_initialized(self) -> None: """ @@ -1662,8 +1770,10 @@ def check_initialized(self) -> None: If __init__() has not been called, raise an error. """ if not hasattr(self, "_description"): - raise ValueError(f"Job instance of type {type(self)} has not been initialized. super().__init__() may not " - f"have been called.") + raise ValueError( + f"Job instance of type {type(self)} has not been initialized. super().__init__() may not " + f"have been called." + ) @property def jobStoreID(self) -> Union[str, TemporaryID]: @@ -1683,33 +1793,37 @@ def description(self) -> JobDescription: def disk(self) -> int: """The maximum number of bytes of disk the job will require to run.""" return self.description.disk + @disk.setter def disk(self, val): - self.description.disk = val + self.description.disk = val @property def memory(self): """The maximum number of bytes of memory the job will require to run.""" return self.description.memory + @memory.setter def memory(self, val): - self.description.memory = val + self.description.memory = val @property def cores(self) -> Union[int, float]: """The number of CPU cores required.""" return self.description.cores + @cores.setter def cores(self, val): - self.description.cores = val + self.description.cores = val @property - def accelerators(self) -> List[AcceleratorRequirement]: + def accelerators(self) -> list[AcceleratorRequirement]: """Any accelerators, such as GPUs, that are needed.""" return self.description.accelerators + @accelerators.setter - def accelerators(self, val: List[ParseableAcceleratorRequirement]) -> None: - self.description.accelerators = val + def accelerators(self, val: list[ParseableAcceleratorRequirement]) -> None: + self.description.accelerators = val @property def preemptible(self) -> bool: @@ -1719,9 +1833,10 @@ def preemptible(self) -> bool: @deprecated(new_function_name="preemptible") def preemptable(self): return self.description.preemptible + @preemptible.setter def preemptible(self, val): - self.description.preemptible = val + self.description.preemptible = val @property def checkpoint(self) -> bool: @@ -1729,11 +1844,11 @@ def checkpoint(self) -> bool: return isinstance(self._description, CheckpointJobDescription) @property - def files_to_use(self) -> Set[FileID]: + def files_to_use(self) -> set[FileID]: return self.description.files_to_use @files_to_use.setter - def files_to_use(self, val: Set[FileID]): + def files_to_use(self, val: set[FileID]): self.description.files_to_use = val def add_to_files_to_use(self, val: FileID): @@ -1855,7 +1970,7 @@ def addFollowOn(self, followOnJob: "Job") -> "Job": return followOnJob - def hasPredecessor(self, job: 'Job') -> bool: + def hasPredecessor(self, job: "Job") -> bool: """Check if a given job is already a predecessor of this job.""" return job in self._directPredecessors @@ -1917,7 +2032,9 @@ def addService( def hasService(self, service: "Job.Service") -> bool: """Return True if the given Service is a service of this job, and False otherwise.""" - return service.hostID is None or self._description.hasServiceHostJob(service.hostID) + return service.hostID is None or self._description.hasServiceHostJob( + service.hostID + ) # Convenience functions for creating jobs @@ -1965,7 +2082,9 @@ def addChildJobFn(self, fn: Callable, *args, **kwargs) -> "FunctionWrappingJob": :return: The new child job that wraps fn. """ if PromisedRequirement.convertPromises(kwargs): - return self.addChild(PromisedRequirementJobFunctionWrappingJob.create(fn, *args, **kwargs)) + return self.addChild( + PromisedRequirementJobFunctionWrappingJob.create(fn, *args, **kwargs) + ) else: return self.addChild(JobFunctionWrappingJob(fn, *args, **kwargs)) @@ -1981,7 +2100,9 @@ def addFollowOnJobFn(self, fn: Callable, *args, **kwargs) -> "FunctionWrappingJo :return: The new follow-on job that wraps fn. """ if PromisedRequirement.convertPromises(kwargs): - return self.addFollowOn(PromisedRequirementJobFunctionWrappingJob.create(fn, *args, **kwargs)) + return self.addFollowOn( + PromisedRequirementJobFunctionWrappingJob.create(fn, *args, **kwargs) + ) else: return self.addFollowOn(JobFunctionWrappingJob(fn, *args, **kwargs)) @@ -2083,8 +2204,12 @@ def registerPromise(self, path): raise JobPromiseConstraintError(self) # TODO: can we guarantee self.jobStoreID is populated and so pass that here? with self._promiseJobStore.write_file_stream() as (fileHandle, jobStoreFileID): - promise = UnfulfilledPromiseSentinel(str(self.description), jobStoreFileID, False) - logger.debug('Issuing promise %s for result of %s', jobStoreFileID, self.description) + promise = UnfulfilledPromiseSentinel( + str(self.description), jobStoreFileID, False + ) + logger.debug( + "Issuing promise %s for result of %s", jobStoreFileID, self.description + ) pickle.dump(promise, fileHandle, pickle.HIGHEST_PROTOCOL) self._rvs[path].append(jobStoreFileID) return self._promiseJobStore.config.jobStore, jobStoreFileID @@ -2134,7 +2259,7 @@ def checkJobGraphForDeadlocks(self): self.checkJobGraphAcylic() self.checkNewCheckpointsAreLeafVertices() - def getRootJobs(self) -> Set['Job']: + def getRootJobs(self) -> set["Job"]: """ Return the set of root job objects that contain this job. @@ -2166,8 +2291,9 @@ def checkJobGraphConnected(self): """ rootJobs = self.getRootJobs() if len(rootJobs) != 1: - raise JobGraphDeadlockException("Graph does not contain exactly one" - " root job: %s" % rootJobs) + raise JobGraphDeadlockException( + "Graph does not contain exactly one" " root job: %s" % rootJobs + ) def checkJobGraphAcylic(self): """ @@ -2187,15 +2313,15 @@ def checkJobGraphAcylic(self): Only deals with jobs created here, rather than loaded from the job store. """ - #Get the root jobs + # Get the root jobs roots = self.getRootJobs() if len(roots) == 0: raise JobGraphDeadlockException("Graph contains no root jobs due to cycles") - #Get implied edges + # Get implied edges extraEdges = self._getImpliedEdges(roots) - #Check for directed cycles in the augmented graph + # Check for directed cycles in the augmented graph visited = set() for root in roots: root._checkJobGraphAcylicDFS([], visited, extraEdges) @@ -2205,17 +2331,23 @@ def _checkJobGraphAcylicDFS(self, stack, visited, extraEdges): if self not in visited: visited.add(self) stack.append(self) - for successor in [self._registry[jID] for jID in self.description.allSuccessors() if jID in self._registry] + extraEdges[self]: + for successor in [ + self._registry[jID] + for jID in self.description.allSuccessors() + if jID in self._registry + ] + extraEdges[self]: # Grab all the successors in the current registry (i.e. added form this node) and look at them. successor._checkJobGraphAcylicDFS(stack, visited, extraEdges) if stack.pop() != self: raise RuntimeError("The stack ordering/elements was changed.") if self in stack: stack.append(self) - raise JobGraphDeadlockException("A cycle of job dependencies has been detected '%s'" % stack) + raise JobGraphDeadlockException( + "A cycle of job dependencies has been detected '%s'" % stack + ) @staticmethod - def _getImpliedEdges(roots) -> Dict["Job", List["Job"]]: + def _getImpliedEdges(roots) -> dict["Job", list["Job"]]: """ Gets the set of implied edges (between children and follow-ons of a common job). @@ -2225,17 +2357,17 @@ def _getImpliedEdges(roots) -> Dict["Job", List["Job"]]: :returns: dict from Job object to list of Job objects that must be done before it can start. """ - #Get nodes (Job objects) in job graph + # Get nodes (Job objects) in job graph nodes = set() for root in roots: root._collectAllSuccessors(nodes) ##For each follow-on edge calculate the extra implied edges - #Adjacency list of implied edges, i.e. map of jobs to lists of jobs - #connected by an implied edge + # Adjacency list of implied edges, i.e. map of jobs to lists of jobs + # connected by an implied edge extraEdges = {n: [] for n in nodes} for job in nodes: - # Get all the nonempty successor phases + # Get all the nonempty successor phases phases = [p for p in job.description.successor_phases if len(p) > 0] for depth in range(1, len(phases)): # Add edges from all jobs in the earlier/upper subtrees to all @@ -2255,7 +2387,11 @@ def _getImpliedEdges(roots) -> Dict["Job", List["Job"]]: for inUpper in reacheable: # Add extra edges to the roots of all the lower subtrees # But skip anything in the lower subtree not in the current _registry (i.e. not created hear) - extraEdges[inUpper] += [job._registry[lowerID] for lowerID in lower if lowerID in job._registry] + extraEdges[inUpper] += [ + job._registry[lowerID] + for lowerID in lower + if lowerID in job._registry + ] return extraEdges @@ -2275,17 +2411,21 @@ def checkNewCheckpointsAreLeafVertices(self) -> None: :raises toil.job.JobGraphDeadlockException: if there exists a job being added to the graph for which \ checkpoint=True and which is not a leaf. """ - roots = self.getRootJobs() # Roots jobs of component, these are preexisting jobs in the graph + roots = ( + self.getRootJobs() + ) # Roots jobs of component, these are preexisting jobs in the graph # All jobs in the component of the job graph containing self jobs = set() - list(map(lambda x : x._collectAllSuccessors(jobs), roots)) + list(map(lambda x: x._collectAllSuccessors(jobs), roots)) # Check for each job for which checkpoint is true that it is a cut vertex or leaf for y in [x for x in jobs if x.checkpoint]: - if y not in roots: # The roots are the prexisting jobs + if y not in roots: # The roots are the prexisting jobs if not Job._isLeafVertex(y): - raise JobGraphDeadlockException("New checkpoint job %s is not a leaf in the job graph" % y) + raise JobGraphDeadlockException( + "New checkpoint job %s is not a leaf in the job graph" % y + ) #################################################### # Deferred function system @@ -2314,7 +2454,9 @@ def defer(self, function, *args, **kwargs) -> None: :param dict kwargs: The keyword arguments to the function """ if self._defer is None: - raise Exception('A deferred function may only be registered with a job while that job is running.') + raise Exception( + "A deferred function may only be registered with a job while that job is running." + ) self._defer(DeferredFunction.create(function, *args, **kwargs)) #################################################### @@ -2323,7 +2465,7 @@ def defer(self, function, *args, **kwargs) -> None: # and defining a service (Job.Service) #################################################### - class Runner(): + class Runner: """Used to setup and run Toil workflow.""" @staticmethod @@ -2339,7 +2481,9 @@ def getDefaultArgumentParser(jobstore_as_flag: bool = False) -> ArgumentParser: return parser @staticmethod - def getDefaultOptions(jobStore: Optional[str] = None, jobstore_as_flag: bool = False) -> Namespace: + def getDefaultOptions( + jobStore: Optional[str] = None, jobstore_as_flag: bool = False + ) -> Namespace: """ Get default options for a toil workflow. @@ -2350,9 +2494,13 @@ def getDefaultOptions(jobStore: Optional[str] = None, jobstore_as_flag: bool = F """ # setting jobstore_as_flag to True allows the user to declare the jobstore in the config file instead if not jobstore_as_flag and jobStore is None: - raise RuntimeError("The jobstore argument cannot be missing if the jobstore_as_flag argument is set " - "to False!") - parser = Job.Runner.getDefaultArgumentParser(jobstore_as_flag=jobstore_as_flag) + raise RuntimeError( + "The jobstore argument cannot be missing if the jobstore_as_flag argument is set " + "to False!" + ) + parser = Job.Runner.getDefaultArgumentParser( + jobstore_as_flag=jobstore_as_flag + ) arguments = [] if jobstore_as_flag and jobStore is not None: arguments = ["--jobstore", jobStore] @@ -2361,7 +2509,10 @@ def getDefaultOptions(jobStore: Optional[str] = None, jobstore_as_flag: bool = F return parser.parse_args(args=arguments) @staticmethod - def addToilOptions(parser: Union["OptionParser", ArgumentParser], jobstore_as_flag: bool = False) -> None: + def addToilOptions( + parser: Union["OptionParser", ArgumentParser], + jobstore_as_flag: bool = False, + ) -> None: """ Adds the default toil options to an :mod:`optparse` or :mod:`argparse` parser object. @@ -2401,19 +2552,29 @@ class Service(Requirer, metaclass=ABCMeta): Is not executed as a job; runs within a ServiceHostJob. """ - def __init__(self, memory=None, cores=None, disk=None, accelerators=None, preemptible=None, unitName=None): + def __init__( + self, + memory=None, + cores=None, + disk=None, + accelerators=None, + preemptible=None, + unitName=None, + ): """ Memory, core and disk requirements are specified identically to as in \ :func:`toil.job.Job.__init__`. """ # Save the requirements in ourselves so they are visible on `self` to user code. - super().__init__({ - 'memory': memory, - 'cores': cores, - 'disk': disk, - 'accelerators': accelerators, - 'preemptible': preemptible - }) + super().__init__( + { + "memory": memory, + "cores": cores, + "disk": disk, + "accelerators": accelerators, + "preemptible": preemptible, + } + ) # And the unit name self.unitName = unitName @@ -2491,15 +2652,19 @@ def _unpickle(cls, userModule, fileHandle, requireInstanceOf=None): def filter_main(module_name, class_name): try: - if module_name == '__main__': + if module_name == "__main__": return getattr(userModule, class_name) else: return getattr(importlib.import_module(module_name), class_name) except: - if module_name == '__main__': - logger.debug('Failed getting %s from module %s.', class_name, userModule) + if module_name == "__main__": + logger.debug( + "Failed getting %s from module %s.", class_name, userModule + ) else: - logger.debug('Failed getting %s from module %s.', class_name, module_name) + logger.debug( + "Failed getting %s from module %s.", class_name, module_name + ) raise class FilteredUnpickler(pickle.Unpickler): @@ -2509,7 +2674,9 @@ def find_class(self, module, name): unpickler = FilteredUnpickler(fileHandle) runnable = unpickler.load() - if requireInstanceOf is not None and not isinstance(runnable, requireInstanceOf): + if requireInstanceOf is not None and not isinstance( + runnable, requireInstanceOf + ): raise RuntimeError(f"Did not find a {requireInstanceOf} when expected") return runnable @@ -2542,15 +2709,28 @@ def _fulfillPromises(self, returnValues, jobStore): # File may be gone if the job is a service being re-run and the accessing job is # already complete. if jobStore.file_exists(promiseFileStoreID): - logger.debug("Resolve promise %s from %s with a %s", promiseFileStoreID, self, type(promisedValue)) + logger.debug( + "Resolve promise %s from %s with a %s", + promiseFileStoreID, + self, + type(promisedValue), + ) with jobStore.update_file_stream(promiseFileStoreID) as fileHandle: try: - pickle.dump(promisedValue, fileHandle, pickle.HIGHEST_PROTOCOL) + pickle.dump( + promisedValue, fileHandle, pickle.HIGHEST_PROTOCOL + ) except AttributeError: - logger.exception("Could not pickle promise result %s", promisedValue) + logger.exception( + "Could not pickle promise result %s", promisedValue + ) raise else: - logger.debug("Do not resolve promise %s from %s because it is no longer needed", promiseFileStoreID, self) + logger.debug( + "Do not resolve promise %s from %s because it is no longer needed", + promiseFileStoreID, + self, + ) # Functions associated with Job.checkJobGraphAcyclic to establish that the job graph does not # contain any cycles of dependencies: @@ -2575,7 +2755,7 @@ def _collectAllSuccessors(self, visited): # We added this successor locally todo.append(self._registry[successorID]) - def getTopologicalOrderingOfJobs(self) -> List["Job"]: + def getTopologicalOrderingOfJobs(self) -> list["Job"]: """ :returns: a list of jobs such that for all pairs of indices i, j for which i < j, \ the job at index i can be run before the job at index j. @@ -2597,8 +2777,8 @@ def getTopologicalOrderingOfJobs(self) -> List["Job"]: job = todo[-1] todo.pop() - #Do not add the job to the ordering until all its predecessors have been - #added to the ordering + # Do not add the job to the ordering until all its predecessors have been + # added to the ordering outstandingPredecessor = False for predJob in job._directPredecessors: if predJob.jobStoreID not in visited: @@ -2623,7 +2803,7 @@ def getTopologicalOrderingOfJobs(self) -> List["Job"]: # Storing Jobs into the JobStore #################################################### - def _register(self, jobStore) -> List[Tuple[TemporaryID, str]]: + def _register(self, jobStore) -> list[tuple[TemporaryID, str]]: """ If this job lacks a JobStore-assigned ID, assign this job an ID. Must be called for each job before it is saved to the JobStore for the first time. @@ -2652,7 +2832,7 @@ def _register(self, jobStore) -> List[Tuple[TemporaryID, str]]: # We already have an ID. No assignment or reference rewrite necessary. return [] - def _renameReferences(self, renames: Dict[TemporaryID, str]) -> None: + def _renameReferences(self, renames: dict[TemporaryID, str]) -> None: """ Apply the given dict of ID renames to all references to other jobs. @@ -2688,8 +2868,8 @@ def saveBody(self, jobStore: "AbstractJobStore") -> None: # Clear out old Cactus compatibility fields that don't need to be # preserved and shouldn't be serialized. - if hasattr(self, '_services'): - delattr(self, '_services') + if hasattr(self, "_services"): + delattr(self, "_services") # Remember fields we will overwrite description = self._description @@ -2707,7 +2887,9 @@ def saveBody(self, jobStore: "AbstractJobStore") -> None: self._directPredecessors = set() # Save the body of the job - with jobStore.write_file_stream(description.jobStoreID, cleanup=True) as (fileHandle, fileStoreID): + with jobStore.write_file_stream( + description.jobStoreID, cleanup=True + ) as (fileHandle, fileStoreID): pickle.dump(self, fileHandle, pickle.HIGHEST_PROTOCOL) finally: # Restore important fields (before handling errors) @@ -2733,7 +2915,12 @@ def saveBody(self, jobStore: "AbstractJobStore") -> None: # Connect the body of the job to the JobDescription self._description.attach_body(fileStoreID, userScript) - def _saveJobGraph(self, jobStore: "AbstractJobStore", saveSelf: bool = False, returnValues: bool = None): + def _saveJobGraph( + self, + jobStore: "AbstractJobStore", + saveSelf: bool = False, + returnValues: bool = None, + ): """ Save job data and new JobDescriptions to the given job store for this job and all descending jobs, including services. @@ -2784,7 +2971,12 @@ def _saveJobGraph(self, jobStore: "AbstractJobStore", saveSelf: bool = False, re # Set up to save last job first, so promises flow the right way ordering.reverse() - logger.debug("Saving graph of %d jobs, %d non-service, %d new", len(allJobs), len(ordering), len(fakeToReal)) + logger.debug( + "Saving graph of %d jobs, %d non-service, %d new", + len(allJobs), + len(ordering), + len(fakeToReal), + ) # Make sure we're the root if ordering[-1] != self: @@ -2797,15 +2989,15 @@ def _saveJobGraph(self, jobStore: "AbstractJobStore", saveSelf: bool = False, re if not isinstance(j, ServiceHostJob) and j.jobStoreID not in ordered_ids: raise RuntimeError(f"{j} not found in ordering {ordering}") - - if not saveSelf: # Fulfil promises for return values (even if value is None) self._fulfillPromises(returnValues, jobStore) for job in ordering: logger.debug("Processing job %s", job.description) - for serviceBatch in reversed(list(job.description.serviceHostIDsInBatches())): + for serviceBatch in reversed( + list(job.description.serviceHostIDsInBatches()) + ): # For each batch of service host jobs in reverse order they start for serviceID in serviceBatch: logger.debug("Processing service %s", serviceID) @@ -2843,7 +3035,8 @@ def saveAsRootJob(self, jobStore: "AbstractJobStore") -> JobDescription: # All other job vertices in the graph are checked by checkNewCheckpointsAreLeafVertices if self.checkpoint and not Job._isLeafVertex(self): raise JobGraphDeadlockException( - 'New checkpoint job %s is not a leaf in the job graph' % self) + "New checkpoint job %s is not a leaf in the job graph" % self + ) # Save the root job and all descendants and services self._saveJobGraph(jobStore, saveSelf=True) @@ -2869,19 +3062,19 @@ def loadJob( :param job_description: the JobDescription of the job to retrieve. :returns: The job referenced by the JobDescription. """ - + file_store_id, user_module_descriptor = job_description.get_body() - logger.debug('Loading user module %s.', user_module_descriptor) + logger.debug("Loading user module %s.", user_module_descriptor) user_module = cls._loadUserModule(user_module_descriptor) - #Loads context manager using file stream + # Loads context manager using file stream if file_store_id == "firstJob": # This one is actually a shared file name and not a file ID. manager = job_store.read_shared_file_stream(file_store_id) else: manager = job_store.read_file_stream(file_store_id) - #Open and unpickle + # Open and unpickle with manager as file_handle: job = cls._unpickle(user_module, file_handle, requireInstanceOf=Job) @@ -2893,7 +3086,6 @@ def loadJob( return job - def _run(self, jobGraph=None, fileStore=None, **kwargs): """ Function which worker calls to ultimately invoke @@ -2958,7 +3150,9 @@ def _executor(self, stats, fileStore): os.chdir(baseDir) # Finish up the stats if stats is not None: - totalCpuTime, totalMemoryUsage = ResourceMonitor.get_total_cpu_time_and_memory_usage() + totalCpuTime, totalMemoryUsage = ( + ResourceMonitor.get_total_cpu_time_and_memory_usage() + ) stats.jobs.append( Expando( time=str(time.time() - startTime), @@ -2966,7 +3160,7 @@ def _executor(self, stats, fileStore): class_name=self._jobName(), memory=str(totalMemoryUsage), requested_cores=str(self.cores), - disk=str(fileStore.get_disk_usage()) + disk=str(fileStore.get_disk_usage()), ) ) @@ -3011,13 +3205,12 @@ def _runner( self._defer = None self._fileStore = None - # Serialize the new Jobs defined by the run method to the jobStore self._saveJobGraph(jobStore, saveSelf=False, returnValues=returnValues) # Clear out the body, because the job is done. self.description.detach_body() - + # That and the new child/follow-on relationships will need to be # recorded later by an update() of the JobDescription. @@ -3040,7 +3233,9 @@ def has_debug_flag(self, flag: str) -> bool: return flag in self._debug_flags - def files_downloaded_hook(self, host_and_job_paths: Optional[List[Tuple[str, str]]] = None) -> None: + def files_downloaded_hook( + self, host_and_job_paths: Optional[list[tuple[str, str]]] = None + ) -> None: """ Function that subclasses can call when they have downloaded their input files. @@ -3055,7 +3250,10 @@ def files_downloaded_hook(self, host_and_job_paths: Optional[List[Tuple[str, str # Stop the worker! logger.info("Job has downloaded its files. Stopping.") # Send off the path mapping for the debugging wrapper. - raise FilesDownloadedStoppingPointReached("Files downloaded", host_and_job_paths=host_and_job_paths) + raise FilesDownloadedStoppingPointReached( + "Files downloaded", host_and_job_paths=host_and_job_paths + ) + class JobException(Exception): """General job exception.""" @@ -3069,6 +3267,7 @@ class JobGraphDeadlockException(JobException): An exception raised in the event that a workflow contains an unresolvable \ dependency, such as a cycle. See :func:`toil.job.Job.checkJobGraphForDeadlocks`. """ + def __init__(self, string): super().__init__(string) @@ -3077,6 +3276,7 @@ class FunctionWrappingJob(Job): """ Job used to wrap a function. In its `run` method the wrapped function is called. """ + def __init__(self, userFunction, *args, **kwargs): """ :param callable userFunction: The function to wrap. It will be called with ``*args`` and @@ -3096,7 +3296,9 @@ def __init__(self, userFunction, *args, **kwargs): if argSpec.defaults is None: argDict = {} else: - argDict = dict(list(zip(argSpec.args[-len(argSpec.defaults):], argSpec.defaults))) + argDict = dict( + list(zip(argSpec.args[-len(argSpec.defaults) :], argSpec.defaults)) + ) def resolve(key, default=None, dehumanize=False): try: @@ -3114,36 +3316,48 @@ def resolve(key, default=None, dehumanize=False): value = human2bytes(value) return value - super().__init__(memory=resolve('memory', dehumanize=True), - cores=resolve('cores', dehumanize=True), - disk=resolve('disk', dehumanize=True), - accelerators=resolve('accelerators'), - preemptible=resolve('preemptible'), - checkpoint=resolve('checkpoint', default=False), - unitName=resolve('name', default=None)) + super().__init__( + memory=resolve("memory", dehumanize=True), + cores=resolve("cores", dehumanize=True), + disk=resolve("disk", dehumanize=True), + accelerators=resolve("accelerators"), + preemptible=resolve("preemptible"), + checkpoint=resolve("checkpoint", default=False), + unitName=resolve("name", default=None), + ) - self.userFunctionModule = ModuleDescriptor.forModule(userFunction.__module__).globalize() + self.userFunctionModule = ModuleDescriptor.forModule( + userFunction.__module__ + ).globalize() self.userFunctionName = str(userFunction.__name__) self.description.jobName = self.userFunctionName self._args = args self._kwargs = kwargs def _getUserFunction(self): - logger.debug('Loading user function %s from module %s.', - self.userFunctionName, - self.userFunctionModule) + logger.debug( + "Loading user function %s from module %s.", + self.userFunctionName, + self.userFunctionModule, + ) userFunctionModule = self._loadUserModule(self.userFunctionModule) return getattr(userFunctionModule, self.userFunctionName) - def run(self,fileStore): - userFunction = self._getUserFunction( ) + def run(self, fileStore): + userFunction = self._getUserFunction() return userFunction(*self._args, **self._kwargs) def getUserScript(self): return self.userFunctionModule def _jobName(self): - return ".".join((self.__class__.__name__, self.userFunctionModule.name, self.userFunctionName)) + return ".".join( + ( + self.__class__.__name__, + self.userFunctionModule.name, + self.userFunctionName, + ) + ) class JobFunctionWrappingJob(FunctionWrappingJob): @@ -3189,10 +3403,20 @@ class PromisedRequirementFunctionWrappingJob(FunctionWrappingJob): Spawns child function using parent function parameters and fulfilled promised resource requirements. """ + def __init__(self, userFunction, *args, **kwargs): self._promisedKwargs = kwargs.copy() # Replace resource requirements in intermediate job with small values. - kwargs.update(dict(disk='1M', memory='32M', cores=0.1, accelerators=[], preemptible=True, preemptable=True)) + kwargs.update( + dict( + disk="1M", + memory="32M", + cores=0.1, + accelerators=[], + preemptible=True, + preemptable=True, + ) + ) super().__init__(userFunction, *args, **kwargs) @classmethod @@ -3217,7 +3441,9 @@ def evaluatePromisedRequirements(self): for requirement in REQUIREMENT_NAMES: try: if isinstance(self._promisedKwargs[requirement], PromisedRequirement): - self._promisedKwargs[requirement] = self._promisedKwargs[requirement].getValue() + self._promisedKwargs[requirement] = self._promisedKwargs[ + requirement + ].getValue() except KeyError: pass @@ -3231,7 +3457,9 @@ class PromisedRequirementJobFunctionWrappingJob(PromisedRequirementFunctionWrapp def run(self, fileStore): self.evaluatePromisedRequirements() userFunction = self._getUserFunction() - return self.addChildJobFn(userFunction, *self._args, **self._promisedKwargs).rv() + return self.addChildJobFn( + userFunction, *self._args, **self._promisedKwargs + ).rv() class EncapsulatedJob(Job): @@ -3258,6 +3486,7 @@ class EncapsulatedJob(Job): is the return value of the root job, e.g. A().encapsulate().rv() and A().rv() will resolve to the same value after A or A.encapsulate() has been run. """ + def __init__(self, job, unitName=None): """ :param toil.job.Job job: the job to encapsulate. @@ -3277,7 +3506,12 @@ def __init__(self, job, unitName=None): Job.addChild(self, job) # Use small resource requirements for dummy Job instance. # But not too small, or the job won't have enough resources to safely start up Toil. - self.encapsulatedFollowOn = Job(disk='100M', memory='512M', cores=0.1, unitName=None if unitName is None else unitName + '-followOn') + self.encapsulatedFollowOn = Job( + disk="100M", + memory="512M", + cores=0.1, + unitName=None if unitName is None else unitName + "-followOn", + ) Job.addFollowOn(self, self.encapsulatedFollowOn) else: # Unpickling on the worker, to be run as a no-op. @@ -3289,17 +3523,25 @@ def __init__(self, job, unitName=None): def addChild(self, childJob): if self.encapsulatedFollowOn is None: - raise RuntimeError("Children cannot be added to EncapsulatedJob while it is running") + raise RuntimeError( + "Children cannot be added to EncapsulatedJob while it is running" + ) return Job.addChild(self.encapsulatedFollowOn, childJob) def addService(self, service, parentService=None): if self.encapsulatedFollowOn is None: - raise RuntimeError("Services cannot be added to EncapsulatedJob while it is running") - return Job.addService(self.encapsulatedFollowOn, service, parentService=parentService) + raise RuntimeError( + "Services cannot be added to EncapsulatedJob while it is running" + ) + return Job.addService( + self.encapsulatedFollowOn, service, parentService=parentService + ) def addFollowOn(self, followOnJob): if self.encapsulatedFollowOn is None: - raise RuntimeError("Follow-ons cannot be added to EncapsulatedJob while it is running") + raise RuntimeError( + "Follow-ons cannot be added to EncapsulatedJob while it is running" + ) return Job.addFollowOn(self.encapsulatedFollowOn, followOnJob) def rv(self, *path) -> "Promise": @@ -3342,6 +3584,7 @@ class ServiceHostJob(Job): """ Job that runs a service. Used internally by Toil. Users should subclass Service instead of using this. """ + def __init__(self, service): """ This constructor should not be called by a user. @@ -3352,12 +3595,17 @@ def __init__(self, service): # Make sure the service hasn't been given a host already. if service.hostID is not None: - raise RuntimeError("Cannot set the host. The service has already been given a host.") + raise RuntimeError( + "Cannot set the host. The service has already been given a host." + ) # Make ourselves with name info from the Service and a # ServiceJobDescription that has the service control flags. - super().__init__(**service.requirements, - unitName=service.unitName, descriptionClass=ServiceJobDescription) + super().__init__( + **service.requirements, + unitName=service.unitName, + descriptionClass=ServiceJobDescription, + ) # Make sure the service knows it has a host now service.hostID = self.jobStoreID @@ -3395,13 +3643,19 @@ def _renameReferences(self, renames): # stuff onto us. def addChild(self, child): - raise RuntimeError("Service host jobs cannot have children, follow-ons, or services") + raise RuntimeError( + "Service host jobs cannot have children, follow-ons, or services" + ) def addFollowOn(self, followOn): - raise RuntimeError("Service host jobs cannot have children, follow-ons, or services") + raise RuntimeError( + "Service host jobs cannot have children, follow-ons, or services" + ) def addService(self, service, parentService=None): - raise RuntimeError("Service host jobs cannot have children, follow-ons, or services") + raise RuntimeError( + "Service host jobs cannot have children, follow-ons, or services" + ) def saveBody(self, jobStore): """ @@ -3410,7 +3664,9 @@ def saveBody(self, jobStore): # Save unpickled service service = self.service # Serialize service - self.pickledService = pickle.dumps(self.service, protocol=pickle.HIGHEST_PROTOCOL) + self.pickledService = pickle.dumps( + self.service, protocol=pickle.HIGHEST_PROTOCOL + ) # Clear real service until we have the module to load it back self.service = None # Save body as normal @@ -3421,24 +3677,30 @@ def saveBody(self, jobStore): def run(self, fileStore): # Unpickle the service - logger.debug('Loading service module %s.', self.serviceModule) + logger.debug("Loading service module %s.", self.serviceModule) userModule = self._loadUserModule(self.serviceModule) - service = self._unpickle(userModule, BytesIO(self.pickledService), requireInstanceOf=Job.Service) + service = self._unpickle( + userModule, BytesIO(self.pickledService), requireInstanceOf=Job.Service + ) self.pickledService = None # Make sure it has the config, since it wasn't load()-ed via the JobStore service.assignConfig(fileStore.jobStore.config) - #Start the service + # Start the service startCredentials = service.start(self) try: - #The start credentials must be communicated to processes connecting to - #the service, to do this while the run method is running we - #cheat and set the return value promise within the run method + # The start credentials must be communicated to processes connecting to + # the service, to do this while the run method is running we + # cheat and set the return value promise within the run method self._fulfillPromises(startCredentials, fileStore.jobStore) - self._rvs = {} # Set this to avoid the return values being updated after the - #run method has completed! + self._rvs = ( + {} + ) # Set this to avoid the return values being updated after the + # run method has completed! - #Now flag that the service is running jobs can connect to it - logger.debug("Removing the start jobStoreID to indicate that establishment of the service") + # Now flag that the service is running jobs can connect to it + logger.debug( + "Removing the start jobStoreID to indicate that establishment of the service" + ) if self.description.startJobStoreID is None: raise RuntimeError("No start jobStoreID to remove.") if fileStore.jobStore.file_exists(self.description.startJobStoreID): @@ -3446,23 +3708,33 @@ def run(self, fileStore): if fileStore.jobStore.file_exists(self.description.startJobStoreID): raise RuntimeError("The start jobStoreID is not a file.") - #Now block until we are told to stop, which is indicated by the removal - #of a file + # Now block until we are told to stop, which is indicated by the removal + # of a file if self.description.terminateJobStoreID is None: raise RuntimeError("No terminate jobStoreID to use.") while True: # Check for the terminate signal - if not fileStore.jobStore.file_exists(self.description.terminateJobStoreID): - logger.debug("Detected that the terminate jobStoreID has been removed so exiting") - if not fileStore.jobStore.file_exists(self.description.errorJobStoreID): - raise RuntimeError("Detected the error jobStoreID has been removed so exiting with an error") + if not fileStore.jobStore.file_exists( + self.description.terminateJobStoreID + ): + logger.debug( + "Detected that the terminate jobStoreID has been removed so exiting" + ) + if not fileStore.jobStore.file_exists( + self.description.errorJobStoreID + ): + raise RuntimeError( + "Detected the error jobStoreID has been removed so exiting with an error" + ) break # Check the service's status and exit if failed or complete try: if not service.check(): - logger.debug("The service has finished okay, but we have not been told to terminate. " - "Waiting for leader to tell us to come back.") + logger.debug( + "The service has finished okay, but we have not been told to terminate. " + "Waiting for leader to tell us to come back." + ) # TODO: Adjust leader so that it keys on something # other than the services finishing (assumed to be # after the children) to know when to run follow-on @@ -3473,7 +3745,9 @@ def run(self, fileStore): logger.debug("Detected abnormal termination of the service") raise - time.sleep(fileStore.jobStore.config.servicePollingInterval) #Avoid excessive polling + time.sleep( + fileStore.jobStore.config.servicePollingInterval + ) # Avoid excessive polling logger.debug("Service is done") finally: @@ -3544,7 +3818,9 @@ def __reduce__(self): def __new__(cls, *args) -> "Promise": """Instantiate this Promise.""" if len(args) != 2: - raise RuntimeError("Cannot instantiate promise. Invalid number of arguments given (Expected 2).") + raise RuntimeError( + "Cannot instantiate promise. Invalid number of arguments given (Expected 2)." + ) if isinstance(args[0], Job): # Regular instantiation when promise is created, before it is being pickled return super().__new__(cls) @@ -3565,6 +3841,7 @@ def _resolve(cls, jobStoreLocator, jobStoreFileID): value = safeUnpickleFromStream(fileHandle) return value + # Machinery for type-safe-ish Toil Python workflows. # # TODO: Until we make Promise generic on the promised type, and work out how to @@ -3572,12 +3849,13 @@ def _resolve(cls, jobStoreLocator, jobStoreFileID): # method returns, this won't actually be type-safe, because any Promise will be # a Promised[] for any type. -T = TypeVar('T') +T = TypeVar("T") # We have type shorthand for a promised value. # Uses a generic type alias, so you can have a Promised[T]. See . Promised = Union[Promise, T] + def unwrap(p: Promised[T]) -> T: """ Function for ensuring you actually have a promised value, and not just a promise. @@ -3586,9 +3864,10 @@ def unwrap(p: Promised[T]) -> T: The "unwrap" terminology is borrowed from Rust. """ if isinstance(p, Promise): - raise TypeError(f'Attempted to unwrap a value that is still a Promise: {p}') + raise TypeError(f"Attempted to unwrap a value that is still a Promise: {p}") return p + def unwrap_all(p: Sequence[Promised[T]]) -> Sequence[T]: """ Function for ensuring you actually have a collection of promised values, @@ -3598,9 +3877,12 @@ def unwrap_all(p: Sequence[Promised[T]]) -> Sequence[T]: """ for i, item in enumerate(p): if isinstance(item, Promise): - raise TypeError(f'Attempted to unwrap a value at index {i} that is still a Promise: {item}') + raise TypeError( + f"Attempted to unwrap a value at index {i} that is still a Promise: {item}" + ) return p + class PromisedRequirement: """ Class for dynamically allocating job function resource requirements. @@ -3627,13 +3909,15 @@ def __init__(self, valueOrCallable, *args): :param args: variable length argument list :type args: int or .Promise """ - if hasattr(valueOrCallable, '__call__'): + if hasattr(valueOrCallable, "__call__"): if len(args) == 0: - raise RuntimeError('Need parameters for PromisedRequirement function.') + raise RuntimeError("Need parameters for PromisedRequirement function.") func = valueOrCallable else: if len(args) != 0: - raise RuntimeError('Define a PromisedRequirement function to handle multiple arguments.') + raise RuntimeError( + "Define a PromisedRequirement function to handle multiple arguments." + ) func = lambda x: x args = [valueOrCallable] @@ -3646,7 +3930,7 @@ def getValue(self): return func(*self._args) @staticmethod - def convertPromises(kwargs: Dict[str, Any]) -> bool: + def convertPromises(kwargs: dict[str, Any]) -> bool: """ Return True if reserved resource keyword is a Promise or PromisedRequirement instance. @@ -3675,15 +3959,15 @@ def __init__(self, fulfillingJobName: str, file_id: str, unpickled: Any) -> None self.file_id = file_id @staticmethod - def __setstate__(stateDict: Dict[str, Any]) -> None: + def __setstate__(stateDict: dict[str, Any]) -> None: """ Only called when unpickling. This won't be unpickled unless the promise wasn't resolved, so we throw an exception. """ - jobName = stateDict['fulfillingJobName'] - file_id = stateDict['file_id'] + jobName = stateDict["fulfillingJobName"] + file_id = stateDict["file_id"] raise RuntimeError( f"This job was passed promise {file_id} that wasn't yet resolved when it " f"ran. The job {jobName} that fulfills this promise hasn't yet " diff --git a/src/toil/jobStores/abstractJobStore.py b/src/toil/jobStores/abstractJobStore.py index d909248ed2..d0ecca100f 100644 --- a/src/toil/jobStores/abstractJobStore.py +++ b/src/toil/jobStores/abstractJobStore.py @@ -16,33 +16,23 @@ import pickle import re import shutil -import sys from abc import ABC, ABCMeta, abstractmethod +from collections.abc import Iterator, ValuesView from contextlib import closing, contextmanager from datetime import timedelta from http.client import BadStatusLine -from typing import (IO, - TYPE_CHECKING, - Any, - Callable, - ContextManager, - Dict, - Iterable, - Iterator, - List, - Optional, - Set, - Tuple, - Union, - ValuesView, - cast, - overload) - -if sys.version_info >= (3, 8): - from typing import Literal -else: - from typing_extensions import Literal - +from typing import ( + IO, + TYPE_CHECKING, + Any, + Callable, + ContextManager, + Literal, + Optional, + Union, + cast, + overload, +) from urllib.error import HTTPError from urllib.parse import ParseResult, urlparse from urllib.request import urlopen @@ -50,10 +40,12 @@ from toil.common import Config, getNodeID, safeUnpickleFromStream from toil.fileStores import FileID -from toil.job import (CheckpointJobDescription, - JobDescription, - JobException, - ServiceJobDescription) +from toil.job import ( + CheckpointJobDescription, + JobDescription, + JobException, + ServiceJobDescription, +) from toil.lib.compatibility import deprecated from toil.lib.io import WriteWatchingStream from toil.lib.memoize import memoize @@ -67,18 +59,22 @@ try: from botocore.exceptions import ProxyConnectionError except ImportError: + class ProxyConnectionError(BaseException): # type: ignore """Dummy class.""" + class LocatorException(Exception): """ Base exception class for all locator exceptions. For example, job store/aws bucket exceptions where they already exist """ - def __init__(self, error_msg: str, locator: str, prefix: Optional[str]=None): + + def __init__(self, error_msg: str, locator: str, prefix: Optional[str] = None): full_locator = locator if prefix is None else f"{prefix}:{locator}" super().__init__(error_msg % full_locator) + class InvalidImportExportUrlException(Exception): def __init__(self, url: ParseResult) -> None: """ @@ -86,6 +82,7 @@ def __init__(self, url: ParseResult) -> None: """ super().__init__("The URL '%s' is invalid." % url.geturl()) + class UnimplementedURLException(RuntimeError): def __init__(self, url: ParseResult, operation: str) -> None: """ @@ -102,8 +99,10 @@ def __init__(self, url: ParseResult, operation: str) -> None: f"with the appropriate extras." ) + class NoSuchJobException(Exception): """Indicates that the specified job does not exist.""" + def __init__(self, jobStoreID: FileID): """ :param str jobStoreID: the jobStoreID that was mistakenly assumed to exist @@ -113,17 +112,21 @@ def __init__(self, jobStoreID: FileID): class ConcurrentFileModificationException(Exception): """Indicates that the file was attempted to be modified by multiple processes at once.""" + def __init__(self, jobStoreFileID: FileID): """ :param jobStoreFileID: the ID of the file that was modified by multiple workers or processes concurrently """ - super().__init__('Concurrent update to file %s detected.' % jobStoreFileID) + super().__init__("Concurrent update to file %s detected." % jobStoreFileID) class NoSuchFileException(Exception): """Indicates that the specified file does not exist.""" - def __init__(self, jobStoreFileID: FileID, customName: Optional[str] = None, *extra: Any): + + def __init__( + self, jobStoreFileID: FileID, customName: Optional[str] = None, *extra: Any + ): """ :param jobStoreFileID: the ID of the file that was mistakenly assumed to exist :param customName: optionally, an alternate name for the nonexistent file @@ -146,22 +149,31 @@ def __init__(self, jobStoreFileID: FileID, customName: Optional[str] = None, *ex class NoSuchJobStoreException(LocatorException): """Indicates that the specified job store does not exist.""" + def __init__(self, locator: str, prefix: str): """ :param str locator: The location of the job store """ - super().__init__("The job store '%s' does not exist, so there is nothing to restart.", locator, prefix) + super().__init__( + "The job store '%s' does not exist, so there is nothing to restart.", + locator, + prefix, + ) class JobStoreExistsException(LocatorException): """Indicates that the specified job store already exists.""" + def __init__(self, locator: str, prefix: str): """ :param str locator: The location of the job store """ super().__init__( "The job store '%s' already exists. Use --restart to resume the workflow, or remove " - "the job store with 'toil clean' to start the workflow from scratch.", locator, prefix) + "the job store with 'toil clean' to start the workflow from scratch.", + locator, + prefix, + ) class AbstractJobStore(ABC): @@ -213,7 +225,7 @@ def initialize(self, config: Config) -> None: self.__config = config self.write_config() - @deprecated(new_function_name='write_config') + @deprecated(new_function_name="write_config") def writeConfig(self) -> None: return self.write_config() @@ -222,7 +234,9 @@ def write_config(self) -> None: Persists the value of the :attr:`AbstractJobStore.config` attribute to the job store, so that it can be retrieved later by other instances of this class. """ - with self.write_shared_file_stream('config.pickle', encrypted=False) as fileHandle: + with self.write_shared_file_stream( + "config.pickle", encrypted=False + ) as fileHandle: pickle.dump(self.__config, fileHandle, pickle.HIGHEST_PROTOCOL) def resume(self) -> None: @@ -232,7 +246,7 @@ def resume(self) -> None: :raises NoSuchJobStoreException: if the physical storage for this job store doesn't exist """ - with self.read_shared_file_stream('config.pickle') as fileHandle: + with self.read_shared_file_stream("config.pickle") as fileHandle: config = safeUnpickleFromStream(fileHandle) assert config.workflowID is not None self.__config = config @@ -250,9 +264,9 @@ def locator(self) -> str: """ return self.__locator - rootJobStoreIDFileName = 'rootJobStoreID' + rootJobStoreIDFileName = "rootJobStoreID" - @deprecated(new_function_name='set_root_job') + @deprecated(new_function_name="set_root_job") def setRootJob(self, rootJobStoreID: FileID) -> None: """Set the root job of the workflow backed by this job store.""" return self.set_root_job(rootJobStoreID) @@ -264,9 +278,9 @@ def set_root_job(self, job_id: FileID) -> None: :param job_id: The ID of the job to set as root """ with self.write_shared_file_stream(self.rootJobStoreIDFileName) as f: - f.write(job_id.encode('utf-8')) + f.write(job_id.encode("utf-8")) - @deprecated(new_function_name='load_root_job') + @deprecated(new_function_name="load_root_job") def loadRootJob(self) -> JobDescription: return self.load_root_job() @@ -281,16 +295,18 @@ def load_root_job(self) -> JobDescription: """ try: with self.read_shared_file_stream(self.rootJobStoreIDFileName) as f: - rootJobStoreID = f.read().decode('utf-8') + rootJobStoreID = f.read().decode("utf-8") except NoSuchFileException: - raise JobException('No job has been set as the root in this job store') + raise JobException("No job has been set as the root in this job store") if not self.job_exists(rootJobStoreID): - raise JobException("The root job '%s' doesn't exist. Either the Toil workflow " - "is finished or has never been started" % rootJobStoreID) + raise JobException( + "The root job '%s' doesn't exist. Either the Toil workflow " + "is finished or has never been started" % rootJobStoreID + ) return self.load_job(rootJobStoreID) # FIXME: This is only used in tests, why do we have it? - @deprecated(new_function_name='create_root_job') + @deprecated(new_function_name="create_root_job") def createRootJob(self, desc: JobDescription) -> JobDescription: return self.create_root_job(desc) @@ -307,7 +323,7 @@ def create_root_job(self, job_description: JobDescription) -> JobDescription: self.set_root_job(job_description.jobStoreID) return job_description - @deprecated(new_function_name='get_root_job_return_value') + @deprecated(new_function_name="get_root_job_return_value") def getRootJobReturnValue(self) -> Any: return self.get_root_job_return_value() @@ -318,12 +334,12 @@ def get_root_job_return_value(self) -> Any: Raises an exception if the root job hasn't fulfilled its promise yet. """ # Parse out the return value from the root job - with self.read_shared_file_stream('rootJobReturnValue') as fH: + with self.read_shared_file_stream("rootJobReturnValue") as fH: return safeUnpickleFromStream(fH) @staticmethod @memoize - def _get_job_store_classes() -> List['AbstractJobStore']: + def _get_job_store_classes() -> list["AbstractJobStore"]: """ A list of concrete AbstractJobStore implementations whose dependencies are installed. @@ -333,23 +349,30 @@ def _get_job_store_classes() -> List['AbstractJobStore']: "toil.jobStores.fileJobStore.FileJobStore", "toil.jobStores.googleJobStore.GoogleJobStore", "toil.jobStores.aws.jobStore.AWSJobStore", - "toil.jobStores.abstractJobStore.JobStoreSupport") + "toil.jobStores.abstractJobStore.JobStoreSupport", + ) jobStoreClasses = [] for className in jobStoreClassNames: - moduleName, className = className.rsplit('.', 1) + moduleName, className = className.rsplit(".", 1) from importlib import import_module + try: module = import_module(moduleName) except (ImportError, ProxyConnectionError): - logger.debug("Unable to import '%s' as is expected if the corresponding extra was " - "omitted at installation time.", moduleName) + logger.debug( + "Unable to import '%s' as is expected if the corresponding extra was " + "omitted at installation time.", + moduleName, + ) else: jobStoreClass = getattr(module, className) jobStoreClasses.append(jobStoreClass) return jobStoreClasses @classmethod - def _findJobStoreForUrl(cls, url: ParseResult, export: bool = False) -> 'AbstractJobStore': + def _findJobStoreForUrl( + cls, url: ParseResult, export: bool = False + ) -> "AbstractJobStore": """ Returns the AbstractJobStore subclass that supports the given URL. @@ -368,46 +391,58 @@ def _findJobStoreForUrl(cls, url: ParseResult, export: bool = False) -> 'Abstrac # returns a file ID. Explain this to MyPy. @overload - def importFile(self, - srcUrl: str, - sharedFileName: str, - hardlink: bool = False, - symlink: bool = True) -> None: ... + def importFile( + self, + srcUrl: str, + sharedFileName: str, + hardlink: bool = False, + symlink: bool = True, + ) -> None: ... @overload - def importFile(self, - srcUrl: str, - sharedFileName: None = None, - hardlink: bool = False, - symlink: bool = True) -> FileID: ... - - @deprecated(new_function_name='import_file') - def importFile(self, - srcUrl: str, - sharedFileName: Optional[str] = None, - hardlink: bool = False, - symlink: bool = True) -> Optional[FileID]: + def importFile( + self, + srcUrl: str, + sharedFileName: None = None, + hardlink: bool = False, + symlink: bool = True, + ) -> FileID: ... + + @deprecated(new_function_name="import_file") + def importFile( + self, + srcUrl: str, + sharedFileName: Optional[str] = None, + hardlink: bool = False, + symlink: bool = True, + ) -> Optional[FileID]: return self.import_file(srcUrl, sharedFileName, hardlink, symlink) @overload - def import_file(self, - src_uri: str, - shared_file_name: str, - hardlink: bool = False, - symlink: bool = True) -> None: ... + def import_file( + self, + src_uri: str, + shared_file_name: str, + hardlink: bool = False, + symlink: bool = True, + ) -> None: ... @overload - def import_file(self, - src_uri: str, - shared_file_name: None = None, - hardlink: bool = False, - symlink: bool = True) -> FileID: ... - - def import_file(self, - src_uri: str, - shared_file_name: Optional[str] = None, - hardlink: bool = False, - symlink: bool = True) -> Optional[FileID]: + def import_file( + self, + src_uri: str, + shared_file_name: None = None, + hardlink: bool = False, + symlink: bool = True, + ) -> FileID: ... + + def import_file( + self, + src_uri: str, + shared_file_name: Optional[str] = None, + hardlink: bool = False, + symlink: bool = True, + ) -> Optional[FileID]: """ Imports the file at the given URL into job store. The ID of the newly imported file is returned. If the name of a shared file name is provided, the file will be imported as @@ -446,18 +481,22 @@ def import_file(self, parseResult = urlparse(src_uri) otherCls = self._findJobStoreForUrl(parseResult) logger.info("Importing input %s...", src_uri) - return self._import_file(otherCls, - parseResult, - shared_file_name=shared_file_name, - hardlink=hardlink, - symlink=symlink) - - def _import_file(self, - otherCls: 'AbstractJobStore', - uri: ParseResult, - shared_file_name: Optional[str] = None, - hardlink: bool = False, - symlink: bool = True) -> Optional[FileID]: + return self._import_file( + otherCls, + parseResult, + shared_file_name=shared_file_name, + hardlink=hardlink, + symlink=symlink, + ) + + def _import_file( + self, + otherCls: "AbstractJobStore", + uri: ParseResult, + shared_file_name: Optional[str] = None, + hardlink: bool = False, + symlink: bool = True, + ) -> Optional[FileID]: """ Import the file at the given URL using the given job store class to retrieve that file. See also :meth:`.importFile`. This method applies a generic approach to importing: it @@ -487,7 +526,7 @@ def _import_file(self, otherCls._read_from_url(uri, writable) return None - @deprecated(new_function_name='export_file') + @deprecated(new_function_name="export_file") def exportFile(self, jobStoreFileID: FileID, dstUrl: str) -> None: return self.export_file(jobStoreFileID, dstUrl) @@ -514,7 +553,9 @@ def export_file(self, file_id: FileID, dst_uri: str) -> None: otherCls = self._findJobStoreForUrl(parseResult, export=True) self._export_file(otherCls, file_id, parseResult) - def _export_file(self, otherCls: 'AbstractJobStore', jobStoreFileID: FileID, url: ParseResult) -> None: + def _export_file( + self, otherCls: "AbstractJobStore", jobStoreFileID: FileID, url: ParseResult + ) -> None: """ Refer to exportFile docstring for information about this method. @@ -529,7 +570,9 @@ def _export_file(self, otherCls: 'AbstractJobStore', jobStoreFileID: FileID, url """ self._default_export_file(otherCls, jobStoreFileID, url) - def _default_export_file(self, otherCls: 'AbstractJobStore', jobStoreFileID: FileID, url: ParseResult) -> None: + def _default_export_file( + self, otherCls: "AbstractJobStore", jobStoreFileID: FileID, url: ParseResult + ) -> None: """ Refer to exportFile docstring for information about this method. @@ -544,7 +587,7 @@ def _default_export_file(self, otherCls: 'AbstractJobStore', jobStoreFileID: Fil """ executable = False with self.read_file_stream(jobStoreFileID) as readable: - if getattr(jobStoreFileID, 'executable', False): + if getattr(jobStoreFileID, "executable", False): executable = jobStoreFileID.executable otherCls._write_to_url(readable, url, executable) @@ -585,7 +628,7 @@ def get_is_directory(cls, src_uri: str) -> bool: return otherCls._get_is_directory(parseResult) @classmethod - def list_url(cls, src_uri: str) -> List[str]: + def list_url(cls, src_uri: str) -> list[str]: """ List the directory at the given URL. Returned path components can be joined with '/' onto the passed URL to form new URLs. Those that end in @@ -610,7 +653,7 @@ def list_url(cls, src_uri: str) -> List[str]: return otherCls._list_url(parseResult) @classmethod - def read_from_url(cls, src_uri: str, writable: IO[bytes]) -> Tuple[int, bool]: + def read_from_url(cls, src_uri: str, writable: IO[bytes]) -> tuple[int, bool]: """ Read the given URL and write its content into the given writable stream. @@ -670,7 +713,7 @@ def _get_is_directory(cls, url: ParseResult) -> bool: @classmethod @abstractmethod - def _read_from_url(cls, url: ParseResult, writable: IO[bytes]) -> Tuple[int, bool]: + def _read_from_url(cls, url: ParseResult, writable: IO[bytes]) -> tuple[int, bool]: """ Reads the contents of the object at the specified location and writes it to the given writable stream. @@ -690,7 +733,7 @@ def _read_from_url(cls, url: ParseResult, writable: IO[bytes]) -> Tuple[int, boo @classmethod @abstractmethod - def _list_url(cls, url: ParseResult) -> List[str]: + def _list_url(cls, url: ParseResult) -> list[str]: """ List the contents of the given URL, which may or may not end in '/' @@ -722,7 +765,12 @@ def _open_url(cls, url: ParseResult) -> IO[bytes]: @classmethod @abstractmethod - def _write_to_url(cls, readable: Union[IO[bytes], IO[str]], url: ParseResult, executable: bool = False) -> None: + def _write_to_url( + cls, + readable: Union[IO[bytes], IO[str]], + url: ParseResult, + executable: bool = False, + ) -> None: """ Reads the contents of the given readable stream and writes it to the object at the specified location. Raises FileNotFoundError if the URL doesn't exist.. @@ -769,11 +817,11 @@ def destroy(self) -> None: """ raise NotImplementedError() - @deprecated(new_function_name='get_env') - def getEnv(self) -> Dict[str, str]: + @deprecated(new_function_name="get_env") + def getEnv(self) -> dict[str, str]: return self.get_env() - def get_env(self) -> Dict[str, str]: + def get_env(self) -> dict[str, str]: """ Returns a dictionary of environment variables that this job store requires to be set in order to function properly on a worker. @@ -784,7 +832,7 @@ def get_env(self) -> Dict[str, str]: # Cleanup functions def clean( - self, jobCache: Optional[Dict[Union[str, "TemporaryID"], JobDescription]] = None + self, jobCache: Optional[dict[Union[str, "TemporaryID"], JobDescription]] = None ) -> JobDescription: """ Function to cleanup the state of a job store after a restart. @@ -812,7 +860,9 @@ def getJobDescription(jobId: str) -> JobDescription: return self.load_job(jobId) def haveJob(jobId: str) -> bool: - assert len(jobId) > 1, f"Job ID {jobId} too short; is a string being used as a list?" + assert ( + len(jobId) > 1 + ), f"Job ID {jobId} too short; is a string being used as a list?" if jobCache is not None: if jobId in jobCache: return True @@ -832,13 +882,15 @@ def updateJobDescription(jobDescription: JobDescription) -> None: jobCache[str(jobDescription.jobStoreID)] = jobDescription self.update_job(jobDescription) - def getJobDescriptions() -> Union[ValuesView[JobDescription], Iterator[JobDescription]]: + def getJobDescriptions() -> ( + Union[ValuesView[JobDescription], Iterator[JobDescription]] + ): if jobCache is not None: return jobCache.values() else: return self.jobs() - def get_jobs_reachable_from_root() -> Set[str]: + def get_jobs_reachable_from_root() -> set[str]: """ Traverse the job graph from the root job and return a flattened set of all active jobstore IDs. @@ -848,8 +900,7 @@ def get_jobs_reachable_from_root() -> Set[str]: # Iterate from the root JobDescription and collate all jobs # that are reachable from it. root_job_description = self.load_root_job() - reachable_from_root: Set[str] = set() - + reachable_from_root: set[str] = set() for merged_in in root_job_description.get_chain(): # Add the job itself and any other jobs that chained with it. @@ -861,7 +912,6 @@ def get_jobs_reachable_from_root() -> Set[str]: if haveJob(service_job_store_id): reachable_from_root.add(service_job_store_id) - # Unprocessed means it might have successor jobs we need to add. unprocessed_job_descriptions = [root_job_description] @@ -874,15 +924,24 @@ def get_jobs_reachable_from_root() -> Set[str]: # exploring them, since we took their successors. reachable_from_root.add(merged_in.job_store_id) for successor_job_store_id in job_description.allSuccessors(): - if successor_job_store_id not in reachable_from_root and haveJob(successor_job_store_id): - successor_job_description = getJobDescription(successor_job_store_id) + if ( + successor_job_store_id not in reachable_from_root + and haveJob(successor_job_store_id) + ): + successor_job_description = getJobDescription( + successor_job_store_id + ) # Add all of the successor's linked service jobs as well. - for service_job_store_id in successor_job_description.services: + for ( + service_job_store_id + ) in successor_job_description.services: if haveJob(service_job_store_id): reachable_from_root.add(service_job_store_id) - new_job_descriptions_to_process.append(successor_job_description) + new_job_descriptions_to_process.append( + successor_job_description + ) unprocessed_job_descriptions = new_job_descriptions_to_process logger.debug(f"{len(reachable_from_root)} jobs reachable from root.") @@ -892,22 +951,32 @@ def get_jobs_reachable_from_root() -> Set[str]: # Cleanup jobs that are not reachable from the root, and therefore orphaned # TODO: Avoid reiterating reachable_from_root (which may be very large) - unreachable = [x for x in getJobDescriptions() if x.jobStoreID not in reachable_from_root] + unreachable = [ + x for x in getJobDescriptions() if x.jobStoreID not in reachable_from_root + ] for jobDescription in unreachable: # clean up any associated files before deletion for fileID in jobDescription.filesToDelete: # Delete any files that should already be deleted - logger.warning(f"Deleting file '{fileID}'. It is marked for deletion but has not yet been removed.") + logger.warning( + f"Deleting file '{fileID}'. It is marked for deletion but has not yet been removed." + ) self.delete_file(fileID) # Delete the job from us and the cache deleteJob(str(jobDescription.jobStoreID)) - jobDescriptionsReachableFromRoot = {id: getJobDescription(id) for id in reachable_from_root} + jobDescriptionsReachableFromRoot = { + id: getJobDescription(id) for id in reachable_from_root + } # Clean up any checkpoint jobs -- delete any successors it # may have launched, and restore the job to a pristine state jobsDeletedByCheckpoints = set() - for jobDescription in [desc for desc in jobDescriptionsReachableFromRoot.values() if isinstance(desc, CheckpointJobDescription)]: + for jobDescription in [ + desc + for desc in jobDescriptionsReachableFromRoot.values() + if isinstance(desc, CheckpointJobDescription) + ]: if jobDescription.jobStoreID in jobsDeletedByCheckpoints: # This is a checkpoint that was nested within an # earlier checkpoint, so it and all its successors are @@ -933,8 +1002,10 @@ def get_jobs_reachable_from_root() -> Set[str]: if len(jobDescription.filesToDelete) != 0: # Delete any files that should already be deleted for fileID in jobDescription.filesToDelete: - logger.critical("Removing file in job store: %s that was " - "marked for deletion but not previously removed" % fileID) + logger.critical( + "Removing file in job store: %s that was " + "marked for deletion but not previously removed" % fileID + ) self.delete_file(fileID) jobDescription.filesToDelete = [] changed[0] = True @@ -947,6 +1018,7 @@ def get_jobs_reachable_from_root() -> Set[str]: def stackSizeFn() -> int: return len(list(jobDescription.allSuccessors())) + startStackSize = stackSizeFn() # Remove deleted jobs jobDescription.filterSuccessors(haveJob) @@ -972,16 +1044,25 @@ def subFlagFile(jobStoreID: str, jobStoreFileID: str, flag: int) -> str: assert isinstance(serviceJobDescription, ServiceJobDescription) if flag == 1: - logger.debug("Recreating a start service flag for job: %s, flag: %s", - jobStoreID, newFlag) + logger.debug( + "Recreating a start service flag for job: %s, flag: %s", + jobStoreID, + newFlag, + ) serviceJobDescription.startJobStoreID = newFlag elif flag == 2: - logger.debug("Recreating a terminate service flag for job: %s, flag: %s", - jobStoreID, newFlag) + logger.debug( + "Recreating a terminate service flag for job: %s, flag: %s", + jobStoreID, + newFlag, + ) serviceJobDescription.terminateJobStoreID = newFlag else: - logger.debug("Recreating a error service flag for job: %s, flag: %s", - jobStoreID, newFlag) + logger.debug( + "Recreating a error service flag for job: %s, flag: %s", + jobStoreID, + newFlag, + ) assert flag == 3 serviceJobDescription.errorJobStoreID = newFlag @@ -994,6 +1075,7 @@ def subFlagFile(jobStoreID: str, jobStoreFileID: str, flag: int) -> str: def servicesSizeFn() -> int: return len(jobDescription.services) + startServicesSize = servicesSizeFn() def replaceFlagsIfNeeded(serviceJobDescription: JobDescription) -> None: @@ -1054,12 +1136,14 @@ def replaceFlagsIfNeeded(serviceJobDescription: JobDescription) -> None: # Remove any crufty stats/logging files from the previous run logger.debug("Discarding old statistics and logs...") + # We have to manually discard the stream to avoid getting # stuck on a blocking write from the job store. def discardStream(stream: Union[IO[bytes], IO[str]]) -> None: """Read the stream 4K at a time until EOF, discarding all input.""" while len(stream.read(4096)) != 0: pass + self.read_logs(discardStream) logger.debug("Job store is clean") @@ -1071,7 +1155,7 @@ def discardStream(stream: Union[IO[bytes], IO[str]]) -> None: # existence of jobs ########################################## - @deprecated(new_function_name='assign_job_id') + @deprecated(new_function_name="assign_job_id") def assignID(self, jobDescription: JobDescription) -> None: return self.assign_job_id(jobDescription) @@ -1095,7 +1179,7 @@ def batch(self) -> Iterator[None]: """ yield - @deprecated(new_function_name='create_job') + @deprecated(new_function_name="create_job") def create(self, jobDescription: JobDescription) -> JobDescription: return self.create_job(jobDescription) @@ -1111,7 +1195,7 @@ def create_job(self, job_description: JobDescription) -> JobDescription: """ raise NotImplementedError() - @deprecated(new_function_name='job_exists') + @deprecated(new_function_name="job_exists") def exists(self, jobStoreID: str) -> bool: return self.job_exists(jobStoreID) @@ -1127,7 +1211,7 @@ def job_exists(self, job_id: str) -> bool: # One year should be sufficient to finish any pipeline ;-) publicUrlExpiration = timedelta(days=365) - @deprecated(new_function_name='get_public_url') + @deprecated(new_function_name="get_public_url") def getPublicUrl(self, fileName: str) -> str: return self.get_public_url(fileName) @@ -1146,7 +1230,7 @@ def get_public_url(self, file_name: str) -> str: """ raise NotImplementedError() - @deprecated(new_function_name='get_shared_public_url') + @deprecated(new_function_name="get_shared_public_url") def getSharedPublicUrl(self, sharedFileName: str) -> str: return self.get_shared_public_url(sharedFileName) @@ -1168,7 +1252,7 @@ def get_shared_public_url(self, shared_file_name: str) -> str: """ raise NotImplementedError() - @deprecated(new_function_name='load_job') + @deprecated(new_function_name="load_job") def load(self, jobStoreID: str) -> JobDescription: return self.load_job(jobStoreID) @@ -1188,7 +1272,7 @@ def load_job(self, job_id: str) -> JobDescription: """ raise NotImplementedError() - @deprecated(new_function_name='update_job') + @deprecated(new_function_name="update_job") def update(self, jobDescription: JobDescription) -> None: return self.update_job(jobDescription) @@ -1203,7 +1287,7 @@ def update_job(self, job_description: JobDescription) -> None: """ raise NotImplementedError() - @deprecated(new_function_name='delete_job') + @deprecated(new_function_name="delete_job") def delete(self, jobStoreID: str) -> None: return self.delete_job(jobStoreID) @@ -1240,12 +1324,19 @@ def jobs(self) -> Iterator[JobDescription]: # associated with a given job. ########################################## - @deprecated(new_function_name='write_file') - def writeFile(self, localFilePath: str, jobStoreID: Optional[str] = None, cleanup: bool = False) -> str: + @deprecated(new_function_name="write_file") + def writeFile( + self, + localFilePath: str, + jobStoreID: Optional[str] = None, + cleanup: bool = False, + ) -> str: return self.write_file(localFilePath, jobStoreID, cleanup) @abstractmethod - def write_file(self, local_path: str, job_id: Optional[str] = None, cleanup: bool = False) -> str: + def write_file( + self, local_path: str, job_id: Optional[str] = None, cleanup: bool = False + ) -> str: """ Takes a file (as a path) and places it in this job store. Returns an ID that can be used to retrieve the file at a later time. The file is written in a atomic manner. It will @@ -1276,19 +1367,27 @@ def write_file(self, local_path: str, job_id: Optional[str] = None, cleanup: boo """ raise NotImplementedError() - @deprecated(new_function_name='write_file_stream') - def writeFileStream(self, jobStoreID: Optional[str] = None, cleanup: bool = False, basename: Optional[str] = None, - encoding: Optional[str] = None, errors: Optional[str] = None) -> ContextManager[Tuple[IO[bytes], str]]: + @deprecated(new_function_name="write_file_stream") + def writeFileStream( + self, + jobStoreID: Optional[str] = None, + cleanup: bool = False, + basename: Optional[str] = None, + encoding: Optional[str] = None, + errors: Optional[str] = None, + ) -> ContextManager[tuple[IO[bytes], str]]: return self.write_file_stream(jobStoreID, cleanup, basename, encoding, errors) @abstractmethod @contextmanager - def write_file_stream(self, - job_id: Optional[str] = None, - cleanup: bool = False, - basename: Optional[str] = None, - encoding: Optional[str] = None, - errors: Optional[str] = None) -> Iterator[Tuple[IO[bytes], str]]: + def write_file_stream( + self, + job_id: Optional[str] = None, + cleanup: bool = False, + basename: Optional[str] = None, + encoding: Optional[str] = None, + errors: Optional[str] = None, + ) -> Iterator[tuple[IO[bytes], str]]: """ Similar to writeFile, but returns a context manager yielding a tuple of 1) a file handle which can be written to and 2) the ID of the resulting @@ -1327,18 +1426,22 @@ def write_file_stream(self, """ raise NotImplementedError() - @deprecated(new_function_name='get_empty_file_store_id') - def getEmptyFileStoreID(self, - jobStoreID: Optional[str] = None, - cleanup: bool = False, - basename: Optional[str] = None) -> str: + @deprecated(new_function_name="get_empty_file_store_id") + def getEmptyFileStoreID( + self, + jobStoreID: Optional[str] = None, + cleanup: bool = False, + basename: Optional[str] = None, + ) -> str: return self.get_empty_file_store_id(jobStoreID, cleanup, basename) @abstractmethod - def get_empty_file_store_id(self, - job_id: Optional[str] = None, - cleanup: bool = False, - basename: Optional[str] = None) -> str: + def get_empty_file_store_id( + self, + job_id: Optional[str] = None, + cleanup: bool = False, + basename: Optional[str] = None, + ) -> str: """ Creates an empty file in the job store and returns its ID. Call to fileExists(getEmptyFileStoreID(jobStoreID)) will return True. @@ -1360,8 +1463,10 @@ def get_empty_file_store_id(self, """ raise NotImplementedError() - @deprecated(new_function_name='read_file') - def readFile(self, jobStoreFileID: str, localFilePath: str, symlink: bool = False) -> None: + @deprecated(new_function_name="read_file") + def readFile( + self, jobStoreFileID: str, localFilePath: str, symlink: bool = False + ) -> None: return self.read_file(jobStoreFileID, localFilePath, symlink) @abstractmethod @@ -1389,7 +1494,7 @@ def read_file(self, file_id: str, local_path: str, symlink: bool = False) -> Non """ raise NotImplementedError() - @deprecated(new_function_name='read_file_stream') + @deprecated(new_function_name="read_file_stream") def readFileStream( self, jobStoreFileID: str, @@ -1404,14 +1509,12 @@ def read_file_stream( file_id: Union[FileID, str], encoding: Literal[None] = None, errors: Optional[str] = None, - ) -> ContextManager[IO[bytes]]: - ... + ) -> ContextManager[IO[bytes]]: ... @overload def read_file_stream( self, file_id: Union[FileID, str], encoding: str, errors: Optional[str] = None - ) -> ContextManager[IO[str]]: - ... + ) -> ContextManager[IO[str]]: ... @abstractmethod def read_file_stream( @@ -1437,7 +1540,7 @@ def read_file_stream( """ raise NotImplementedError() - @deprecated(new_function_name='delete_file') + @deprecated(new_function_name="delete_file") def deleteFile(self, jobStoreFileID: str) -> None: return self.delete_file(jobStoreFileID) @@ -1451,7 +1554,7 @@ def delete_file(self, file_id: str) -> None: """ raise NotImplementedError() - @deprecated(new_function_name='file_exists') + @deprecated(new_function_name="file_exists") def fileExists(self, jobStoreFileID: str) -> bool: """Determine whether a file exists in this job store.""" return self.file_exists(jobStoreFileID) @@ -1465,7 +1568,7 @@ def file_exists(self, file_id: str) -> bool: """ raise NotImplementedError() - @deprecated(new_function_name='get_file_size') + @deprecated(new_function_name="get_file_size") def getFileSize(self, jobStoreFileID: str) -> int: """Get the size of the given file in bytes.""" return self.get_file_size(jobStoreFileID) @@ -1485,7 +1588,7 @@ def get_file_size(self, file_id: str) -> int: """ raise NotImplementedError() - @deprecated(new_function_name='update_file') + @deprecated(new_function_name="update_file") def updateFile(self, jobStoreFileID: str, localFilePath: str) -> None: """Replaces the existing version of a file in the job store.""" return self.update_file(jobStoreFileID, localFilePath) @@ -1506,19 +1609,20 @@ def update_file(self, file_id: str, local_path: str) -> None: """ raise NotImplementedError() - @deprecated(new_function_name='update_file_stream') - def updateFileStream(self, - jobStoreFileID: str, - encoding: Optional[str] = None, - errors: Optional[str] = None) -> ContextManager[IO[Any]]: + @deprecated(new_function_name="update_file_stream") + def updateFileStream( + self, + jobStoreFileID: str, + encoding: Optional[str] = None, + errors: Optional[str] = None, + ) -> ContextManager[IO[Any]]: return self.update_file_stream(jobStoreFileID, encoding, errors) @abstractmethod @contextmanager - def update_file_stream(self, - file_id: str, - encoding: Optional[str] = None, - errors: Optional[str] = None) -> Iterator[IO[Any]]: + def update_file_stream( + self, file_id: str, encoding: Optional[str] = None, errors: Optional[str] = None + ) -> Iterator[IO[Any]]: """ Replaces the existing version of a file in the job store. Similar to writeFile, but returns a context manager yielding a file handle which can be written to. The @@ -1544,20 +1648,29 @@ def update_file_stream(self, # with specific jobs. ########################################## - sharedFileNameRegex = re.compile(r'^[a-zA-Z0-9._-]+$') + sharedFileNameRegex = re.compile(r"^[a-zA-Z0-9._-]+$") - @deprecated(new_function_name='write_shared_file_stream') - def writeSharedFileStream(self, sharedFileName: str, isProtected: Optional[bool] = None, encoding: Optional[str] = None, - errors: Optional[str] = None) -> ContextManager[IO[bytes]]: - return self.write_shared_file_stream(sharedFileName, isProtected, encoding, errors) + @deprecated(new_function_name="write_shared_file_stream") + def writeSharedFileStream( + self, + sharedFileName: str, + isProtected: Optional[bool] = None, + encoding: Optional[str] = None, + errors: Optional[str] = None, + ) -> ContextManager[IO[bytes]]: + return self.write_shared_file_stream( + sharedFileName, isProtected, encoding, errors + ) @abstractmethod @contextmanager - def write_shared_file_stream(self, - shared_file_name: str, - encrypted: Optional[bool] = None, - encoding: Optional[str] = None, - errors: Optional[str] = None) -> Iterator[IO[bytes]]: + def write_shared_file_stream( + self, + shared_file_name: str, + encrypted: Optional[bool] = None, + encoding: Optional[str] = None, + errors: Optional[str] = None, + ) -> Iterator[IO[bytes]]: """ Returns a context manager yielding a writable file handle to the global file referenced by the given name. File will be created in an atomic manner. @@ -1582,19 +1695,23 @@ def write_shared_file_stream(self, """ raise NotImplementedError() - @deprecated(new_function_name='read_shared_file_stream') - def readSharedFileStream(self, - sharedFileName: str, - encoding: Optional[str] = None, - errors: Optional[str] = None) -> ContextManager[IO[bytes]]: + @deprecated(new_function_name="read_shared_file_stream") + def readSharedFileStream( + self, + sharedFileName: str, + encoding: Optional[str] = None, + errors: Optional[str] = None, + ) -> ContextManager[IO[bytes]]: return self.read_shared_file_stream(sharedFileName, encoding, errors) @abstractmethod @contextmanager - def read_shared_file_stream(self, - shared_file_name: str, - encoding: Optional[str] = None, - errors: Optional[str] = None) -> Iterator[IO[bytes]]: + def read_shared_file_stream( + self, + shared_file_name: str, + encoding: Optional[str] = None, + errors: Optional[str] = None, + ) -> Iterator[IO[bytes]]: """ Returns a context manager yielding a readable file handle to the global file referenced by the given name. @@ -1613,7 +1730,7 @@ def read_shared_file_stream(self, """ raise NotImplementedError() - @deprecated(new_function_name='write_logs') + @deprecated(new_function_name="write_logs") def writeStatsAndLogging(self, statsAndLoggingString: str) -> None: return self.write_logs(statsAndLoggingString) @@ -1629,8 +1746,10 @@ def write_logs(self, msg: str) -> None: """ raise NotImplementedError() - @deprecated(new_function_name='read_logs') - def readStatsAndLogging(self, callback: Callable[..., Any], readAll: bool = False) -> int: + @deprecated(new_function_name="read_logs") + def readStatsAndLogging( + self, callback: Callable[..., Any], readAll: bool = False + ) -> int: return self.read_logs(callback, readAll) @abstractmethod @@ -1665,8 +1784,8 @@ def write_leader_pid(self) -> None: this method. Other methods will rely on always having the most current pid available. So far there is no reason to store any old pids. """ - with self.write_shared_file_stream('pid.log') as f: - f.write(str(os.getpid()).encode('utf-8')) + with self.write_shared_file_stream("pid.log") as f: + f.write(str(os.getpid()).encode("utf-8")) def read_leader_pid(self) -> int: """ @@ -1674,7 +1793,7 @@ def read_leader_pid(self) -> int: :raise NoSuchFileException: If the PID file doesn't exist. """ - with self.read_shared_file_stream('pid.log') as f: + with self.read_shared_file_stream("pid.log") as f: return int(f.read().strip()) def write_leader_node_id(self) -> None: @@ -1683,7 +1802,7 @@ def write_leader_node_id(self) -> None: by the leader. """ with self.write_shared_file_stream("leader_node_id.log") as f: - f.write(getNodeID().encode('utf-8')) + f.write(getNodeID().encode("utf-8")) def read_leader_node_id(self) -> str: """ @@ -1692,7 +1811,7 @@ def read_leader_node_id(self) -> str: :raise NoSuchFileException: If the node ID file doesn't exist. """ with self.read_shared_file_stream("leader_node_id.log") as f: - return f.read().decode('utf-8').strip() + return f.read().decode("utf-8").strip() def write_kill_flag(self, kill: bool = False) -> None: """ @@ -1705,7 +1824,7 @@ def write_kill_flag(self, kill: bool = False) -> None: workers are expected to be cleaned up by the leader. """ with self.write_shared_file_stream("_toil_kill_flag") as f: - f.write(("YES" if kill else "NO").encode('utf-8')) + f.write(("YES" if kill else "NO").encode("utf-8")) def read_kill_flag(self) -> bool: """ @@ -1746,6 +1865,7 @@ def _requireValidSharedFileName(cls, sharedFileName: str) -> None: if not cls._validateSharedFileName(sharedFileName): raise ValueError("Not a valid shared file name: '%s'." % sharedFileName) + class JobStoreSupport(AbstractJobStore, metaclass=ABCMeta): """ A mostly fake JobStore to access URLs not really associated with real job @@ -1754,7 +1874,7 @@ class JobStoreSupport(AbstractJobStore, metaclass=ABCMeta): @classmethod def _supports_url(cls, url: ParseResult, export: bool = False) -> bool: - return url.scheme.lower() in ('http', 'https', 'ftp') and not export + return url.scheme.lower() in ("http", "https", "ftp") and not export @classmethod def _url_exists(cls, url: ParseResult) -> bool: @@ -1775,17 +1895,17 @@ def _url_exists(cls, url: ParseResult) -> bool: ] ) def _get_size(cls, url: ParseResult) -> Optional[int]: - if url.scheme.lower() == 'ftp': + if url.scheme.lower() == "ftp": return None with closing(urlopen(url.geturl())) as readable: # just read the header for content length - size = readable.info().get('content-length') + size = readable.info().get("content-length") return int(size) if size is not None else None @classmethod def _read_from_url( cls, url: ParseResult, writable: Union[IO[bytes], IO[str]] - ) -> Tuple[int, bool]: + ) -> tuple[int, bool]: # We can't actually retry after we start writing. # TODO: Implement retry with byte range requests with cls._open_url(url) as readable: @@ -1794,8 +1914,10 @@ def _read_from_url( # nested function can modify it without creating its own # local with the same name. size = [0] + def count(l: int) -> None: size[0] += l + counter = WriteWatchingStream(writable) counter.onWrite(count) @@ -1835,6 +1957,6 @@ def _get_is_directory(cls, url: ParseResult) -> bool: return False @classmethod - def _list_url(cls, url: ParseResult) -> List[str]: + def _list_url(cls, url: ParseResult) -> list[str]: # TODO: Implement HTTP index parsing and FTP directory listing raise NotImplementedError("HTTP and FTP URLs cannot yet be listed") diff --git a/src/toil/jobStores/aws/jobStore.py b/src/toil/jobStores/aws/jobStore.py index 74958a2d77..e1f33e7511 100644 --- a/src/toil/jobStores/aws/jobStore.py +++ b/src/toil/jobStores/aws/jobStore.py @@ -21,19 +21,10 @@ import stat import time import uuid +from collections.abc import Generator from contextlib import contextmanager from io import BytesIO -from typing import ( - IO, - TYPE_CHECKING, - Dict, - Generator, - List, - Optional, - Tuple, - Union, - cast, -) +from typing import IO, TYPE_CHECKING, Optional, Union, cast from urllib.parse import ParseResult, parse_qs, urlencode, urlsplit, urlunsplit from botocore.exceptions import ClientError @@ -98,8 +89,8 @@ from toil import Config boto3_session = establish_boto3_session() -s3_boto3_resource = boto3_session.resource('s3') -s3_boto3_client = boto3_session.client('s3') +s3_boto3_resource = boto3_session.resource("s3") +s3_boto3_client = boto3_session.client("s3") logger = logging.getLogger(__name__) # Sometimes we have to wait for multipart uploads to become real. How long @@ -114,6 +105,7 @@ class ChecksumError(Exception): class DomainDoesNotExist(Exception): """Raised when a domain that is expected to exist does not exist.""" + def __init__(self, domain_name): super().__init__(f"Expected domain {domain_name} to exist!") @@ -131,14 +123,14 @@ class AWSJobStore(AbstractJobStore): # URLs where the may interfere with the certificate common name. We use a double # underscore as a separator instead. # - bucketNameRe = re.compile(r'^[a-z0-9][a-z0-9-]+[a-z0-9]$') + bucketNameRe = re.compile(r"^[a-z0-9][a-z0-9-]+[a-z0-9]$") # See http://docs.aws.amazon.com/AmazonS3/latest/dev/BucketRestrictions.html # minBucketNameLen = 3 maxBucketNameLen = 63 maxNameLen = 10 - nameSeparator = '--' + nameSeparator = "--" def __init__(self, locator: str, partSize: int = 50 << 20) -> None: """ @@ -149,23 +141,35 @@ def __init__(self, locator: str, partSize: int = 50 << 20) -> None: whole file """ super().__init__(locator) - region, namePrefix = locator.split(':') + region, namePrefix = locator.split(":") regions = EC2Regions.keys() if region not in regions: raise ValueError(f'Region "{region}" is not one of: {regions}') if not self.bucketNameRe.match(namePrefix): - raise ValueError("Invalid name prefix '%s'. Name prefixes must contain only digits, " - "hyphens or lower-case letters and must not start or end in a " - "hyphen." % namePrefix) + raise ValueError( + "Invalid name prefix '%s'. Name prefixes must contain only digits, " + "hyphens or lower-case letters and must not start or end in a " + "hyphen." % namePrefix + ) # Reserve 13 for separator and suffix - if len(namePrefix) > self.maxBucketNameLen - self.maxNameLen - len(self.nameSeparator): - raise ValueError("Invalid name prefix '%s'. Name prefixes may not be longer than 50 " - "characters." % namePrefix) - if '--' in namePrefix: - raise ValueError("Invalid name prefix '%s'. Name prefixes may not contain " - "%s." % (namePrefix, self.nameSeparator)) - logger.debug("Instantiating %s for region %s and name prefix '%s'", - self.__class__, region, namePrefix) + if len(namePrefix) > self.maxBucketNameLen - self.maxNameLen - len( + self.nameSeparator + ): + raise ValueError( + "Invalid name prefix '%s'. Name prefixes may not be longer than 50 " + "characters." % namePrefix + ) + if "--" in namePrefix: + raise ValueError( + "Invalid name prefix '%s'. Name prefixes may not contain " + "%s." % (namePrefix, self.nameSeparator) + ) + logger.debug( + "Instantiating %s for region %s and name prefix '%s'", + self.__class__, + region, + namePrefix, + ) self.region = region self.name_prefix = namePrefix self.part_size = partSize @@ -174,7 +178,7 @@ def __init__(self, locator: str, partSize: int = 50 << 20) -> None: self.files_bucket = None self.db = boto3_session.client(service_name="sdb", region_name=region) - self.s3_resource = boto3_session.resource('s3', region_name=self.region) + self.s3_resource = boto3_session.resource("s3", region_name=self.region) self.s3_client = self.s3_resource.meta.client def initialize(self, config: "Config") -> None: @@ -201,7 +205,12 @@ def resume(self) -> None: self._bind(create=False) super().resume() - def _bind(self, create: bool = False, block: bool = True, check_versioning_consistency: bool = True) -> None: + def _bind( + self, + create: bool = False, + block: bool = True, + check_versioning_consistency: bool = True, + ) -> None: def qualify(name): assert len(name) <= self.maxNameLen return self.name_prefix + self.nameSeparator + name @@ -216,11 +225,13 @@ def qualify(name): self.files_domain_name = qualify("files") self._bindDomain(self.files_domain_name, create=create, block=block) if self.files_bucket is None: - self.files_bucket = self._bindBucket(qualify('files'), - create=create, - block=block, - versioning=True, - check_versioning_consistency=check_versioning_consistency) + self.files_bucket = self._bindBucket( + qualify("files"), + create=create, + block=block, + versioning=True, + check_versioning_consistency=check_versioning_consistency, + ) @property def _registered(self) -> Optional[bool]: @@ -242,9 +253,9 @@ def _registered(self) -> Optional[bool]: # can't handle job stores that were partially created by 3.3.0, though. registry_domain_name = "toil-registry" try: - self._bindDomain(domain_name=registry_domain_name, - create=False, - block=False) + self._bindDomain( + domain_name=registry_domain_name, create=False, block=False + ) except DomainDoesNotExist: return False @@ -256,7 +267,7 @@ def _registered(self) -> Optional[bool]: AttributeNames=["exists"], ConsistentRead=True, ) - attributes: List["AttributeTypeDef"] = get_result.get( + attributes: list["AttributeTypeDef"] = get_result.get( "Attributes", [] ) # the documentation says 'Attributes' should always exist, but this is not true exists: Optional[str] = get_item_from_attributes( @@ -264,9 +275,9 @@ def _registered(self) -> Optional[bool]: ) if exists is None: return False - elif exists == 'True': + elif exists == "True": return True - elif exists == 'False': + elif exists == "False": return None else: assert False @@ -275,31 +286,38 @@ def _registered(self) -> Optional[bool]: def _registered(self, value: bool) -> None: registry_domain_name = "toil-registry" try: - self._bindDomain(domain_name=registry_domain_name, - # Only create registry domain when registering or - # transitioning a store - create=value is not False, - block=False) + self._bindDomain( + domain_name=registry_domain_name, + # Only create registry domain when registering or + # transitioning a store + create=value is not False, + block=False, + ) except DomainDoesNotExist: pass else: for attempt in retry_sdb(): with attempt: if value is False: - self.db.delete_attributes(DomainName=registry_domain_name, - ItemName=self.name_prefix) + self.db.delete_attributes( + DomainName=registry_domain_name, ItemName=self.name_prefix + ) else: if value is True: - attributes: List["ReplaceableAttributeTypeDef"] = [ + attributes: list["ReplaceableAttributeTypeDef"] = [ {"Name": "exists", "Value": "True", "Replace": True} ] elif value is None: - attributes = [{"Name": "exists", "Value": "False", "Replace": True}] + attributes = [ + {"Name": "exists", "Value": "False", "Replace": True} + ] else: assert False - self.db.put_attributes(DomainName=registry_domain_name, - ItemName=self.name_prefix, - Attributes=attributes) + self.db.put_attributes( + DomainName=registry_domain_name, + ItemName=self.name_prefix, + Attributes=attributes, + ) def _checkItem(self, item: "ItemTypeDef", enforce: bool = True) -> None: """ @@ -312,23 +330,30 @@ def _checkItem(self, item: "ItemTypeDef", enforce: bool = True) -> None: self._checkAttributes(item["Attributes"], enforce) def _checkAttributes( - self, attributes: List["AttributeTypeDef"], enforce: bool = True + self, attributes: list["AttributeTypeDef"], enforce: bool = True ) -> None: if get_item_from_attributes(attributes=attributes, name="overlargeID") is None: - logger.error("overlargeID attribute isn't present: either SimpleDB entry is " - "corrupt or jobstore is from an extremely old Toil: %s", attributes) + logger.error( + "overlargeID attribute isn't present: either SimpleDB entry is " + "corrupt or jobstore is from an extremely old Toil: %s", + attributes, + ) if enforce: - raise RuntimeError("encountered SimpleDB entry missing required attribute " - "'overlargeID'; is your job store ancient?") + raise RuntimeError( + "encountered SimpleDB entry missing required attribute " + "'overlargeID'; is your job store ancient?" + ) - def _awsJobFromAttributes(self, attributes: List["AttributeTypeDef"]) -> Job: + def _awsJobFromAttributes(self, attributes: list["AttributeTypeDef"]) -> Job: """ Get a Toil Job object from attributes that are defined in an item from the DB :param attributes: List of attributes :return: Toil job """ self._checkAttributes(attributes) - overlarge_id_value = get_item_from_attributes(attributes=attributes, name="overlargeID") + overlarge_id_value = get_item_from_attributes( + attributes=attributes, name="overlargeID" + ) if overlarge_id_value: assert self.file_exists(overlarge_id_value) # This is an overlarge job, download the actual attributes @@ -351,7 +376,7 @@ def _awsJobFromItem(self, item: "ItemTypeDef") -> Job: """ return self._awsJobFromAttributes(item["Attributes"]) - def _awsJobToAttributes(self, job: JobDescription) -> List["AttributeTypeDef"]: + def _awsJobToAttributes(self, job: JobDescription) -> list["AttributeTypeDef"]: binary = pickle.dumps(job, protocol=pickle.HIGHEST_PROTOCOL) if len(binary) > SDBHelper.maxBinarySize(extraReservedChunks=1): # Store as an overlarge job in S3 @@ -373,16 +398,18 @@ def _awsJobToItem(self, job: JobDescription, name: str) -> "ItemTypeDef": def batch(self) -> None: self._batchedUpdates = [] yield - batches = [self._batchedUpdates[i:i + self.jobsPerBatchInsert] for i in - range(0, len(self._batchedUpdates), self.jobsPerBatchInsert)] + batches = [ + self._batchedUpdates[i : i + self.jobsPerBatchInsert] + for i in range(0, len(self._batchedUpdates), self.jobsPerBatchInsert) + ] for batch in batches: - items: List["ReplaceableItemTypeDef"] = [] + items: list["ReplaceableItemTypeDef"] = [] for jobDescription in batch: - item_attributes: List["ReplaceableAttributeTypeDef"] = [] + item_attributes: list["ReplaceableAttributeTypeDef"] = [] jobDescription.pre_update_hook() item_name = compat_bytes(jobDescription.jobStoreID) - got_job_attributes: List["AttributeTypeDef"] = self._awsJobToAttributes( + got_job_attributes: list["AttributeTypeDef"] = self._awsJobToAttributes( jobDescription ) for each_attribute in got_job_attributes: @@ -392,12 +419,13 @@ def batch(self) -> None: "Replace": True, } item_attributes.append(new_attribute) - items.append({"Name": item_name, - "Attributes": item_attributes}) + items.append({"Name": item_name, "Attributes": item_attributes}) for attempt in retry_sdb(): with attempt: - self.db.batch_put_attributes(DomainName=self.jobs_domain_name, Items=items) + self.db.batch_put_attributes( + DomainName=self.jobs_domain_name, Items=items + ) self._batchedUpdates = None def assign_job_id(self, job_description: JobDescription) -> None: @@ -415,19 +443,28 @@ def create_job(self, job_description: JobDescription) -> JobDescription: def job_exists(self, job_id: Union[bytes, str]) -> bool: for attempt in retry_sdb(): with attempt: - return len(self.db.get_attributes(DomainName=self.jobs_domain_name, - ItemName=compat_bytes(job_id), - AttributeNames=[SDBHelper.presenceIndicator()], - ConsistentRead=True).get("Attributes", [])) > 0 + return ( + len( + self.db.get_attributes( + DomainName=self.jobs_domain_name, + ItemName=compat_bytes(job_id), + AttributeNames=[SDBHelper.presenceIndicator()], + ConsistentRead=True, + ).get("Attributes", []) + ) + > 0 + ) def jobs(self) -> Generator[Job, None, None]: - job_items: Optional[List["ItemTypeDef"]] = None + job_items: Optional[list["ItemTypeDef"]] = None for attempt in retry_sdb(): with attempt: - job_items = boto3_pager(self.db.select, - "Items", - ConsistentRead=True, - SelectExpression="select * from `%s`" % self.jobs_domain_name) + job_items = boto3_pager( + self.db.select, + "Items", + ConsistentRead=True, + SelectExpression="select * from `%s`" % self.jobs_domain_name, + ) assert job_items is not None for jobItem in job_items: yield self._awsJobFromItem(jobItem) @@ -436,9 +473,11 @@ def load_job(self, job_id: FileID) -> Job: item_attributes = None for attempt in retry_sdb(): with attempt: - item_attributes = self.db.get_attributes(DomainName=self.jobs_domain_name, - ItemName=compat_bytes(job_id), - ConsistentRead=True).get("Attributes", []) + item_attributes = self.db.get_attributes( + DomainName=self.jobs_domain_name, + ItemName=compat_bytes(job_id), + ConsistentRead=True, + ).get("Attributes", []) if not item_attributes: raise NoSuchJobException(job_id) job = self._awsJobFromAttributes(item_attributes) @@ -451,13 +490,17 @@ def update_job(self, job_description): logger.debug("Updating job %s", job_description.jobStoreID) job_description.pre_update_hook() job_attributes = self._awsJobToAttributes(job_description) - update_attributes: List["ReplaceableAttributeTypeDef"] = [ + update_attributes: list["ReplaceableAttributeTypeDef"] = [ {"Name": attribute["Name"], "Value": attribute["Value"], "Replace": True} for attribute in job_attributes ] for attempt in retry_sdb(): with attempt: - self.db.put_attributes(DomainName=self.jobs_domain_name, ItemName=compat_bytes(job_description.jobStoreID), Attributes=update_attributes) + self.db.put_attributes( + DomainName=self.jobs_domain_name, + ItemName=compat_bytes(job_description.jobStoreID), + Attributes=update_attributes, + ) itemsPerBatchDelete = 25 @@ -468,53 +511,75 @@ def delete_job(self, job_id): # If the job is overlarge, delete its file from the filestore for attempt in retry_sdb(): with attempt: - attributes = self.db.get_attributes(DomainName=self.jobs_domain_name, - ItemName=compat_bytes(job_id), - ConsistentRead=True).get("Attributes", []) + attributes = self.db.get_attributes( + DomainName=self.jobs_domain_name, + ItemName=compat_bytes(job_id), + ConsistentRead=True, + ).get("Attributes", []) # If the overlargeID has fallen off, maybe we partially deleted the # attributes of the item? Or raced on it? Or hit SimpleDB being merely # eventually consistent? We should still be able to get rid of it. self._checkAttributes(attributes, enforce=False) - overlarge_id_value = get_item_from_attributes(attributes=attributes, name="overlargeID") + overlarge_id_value = get_item_from_attributes( + attributes=attributes, name="overlargeID" + ) if overlarge_id_value: logger.debug("Deleting job from filestore") self.delete_file(overlarge_id_value) for attempt in retry_sdb(): with attempt: - self.db.delete_attributes(DomainName=self.jobs_domain_name, ItemName=compat_bytes(job_id)) - items: Optional[List["ItemTypeDef"]] = None + self.db.delete_attributes( + DomainName=self.jobs_domain_name, ItemName=compat_bytes(job_id) + ) + items: Optional[list["ItemTypeDef"]] = None for attempt in retry_sdb(): with attempt: - items = list(boto3_pager(self.db.select, - "Items", - ConsistentRead=True, - SelectExpression=f"select version from `{self.files_domain_name}` where ownerID='{job_id}'")) + items = list( + boto3_pager( + self.db.select, + "Items", + ConsistentRead=True, + SelectExpression=f"select version from `{self.files_domain_name}` where ownerID='{job_id}'", + ) + ) assert items is not None if items: - logger.debug("Deleting %d file(s) associated with job %s", len(items), job_id) + logger.debug( + "Deleting %d file(s) associated with job %s", len(items), job_id + ) n = self.itemsPerBatchDelete - batches = [items[i:i + n] for i in range(0, len(items), n)] + batches = [items[i : i + n] for i in range(0, len(items), n)] for batch in batches: - delete_items: List["DeletableItemTypeDef"] = [ + delete_items: list["DeletableItemTypeDef"] = [ {"Name": item["Name"]} for item in batch ] for attempt in retry_sdb(): with attempt: - self.db.batch_delete_attributes(DomainName=self.files_domain_name, Items=delete_items) + self.db.batch_delete_attributes( + DomainName=self.files_domain_name, Items=delete_items + ) for item in items: item: "ItemTypeDef" - version = get_item_from_attributes(attributes=item["Attributes"], name="version") + version = get_item_from_attributes( + attributes=item["Attributes"], name="version" + ) for attempt in retry_s3(): with attempt: if version: - self.s3_client.delete_object(Bucket=self.files_bucket.name, - Key=compat_bytes(item["Name"]), - VersionId=version) + self.s3_client.delete_object( + Bucket=self.files_bucket.name, + Key=compat_bytes(item["Name"]), + VersionId=version, + ) else: - self.s3_client.delete_object(Bucket=self.files_bucket.name, - Key=compat_bytes(item["Name"])) + self.s3_client.delete_object( + Bucket=self.files_bucket.name, + Key=compat_bytes(item["Name"]), + ) - def get_empty_file_store_id(self, jobStoreID=None, cleanup=False, basename=None) -> FileID: + def get_empty_file_store_id( + self, jobStoreID=None, cleanup=False, basename=None + ) -> FileID: info = self.FileInfo.create(jobStoreID if cleanup else None) with info.uploadStream() as _: # Empty @@ -523,8 +588,14 @@ def get_empty_file_store_id(self, jobStoreID=None, cleanup=False, basename=None) logger.debug("Created %r.", info) return info.fileID - def _import_file(self, otherCls, uri: ParseResult, shared_file_name: Optional[str] = None, - hardlink: bool = False, symlink: bool = True) -> Optional[FileID]: + def _import_file( + self, + otherCls, + uri: ParseResult, + shared_file_name: Optional[str] = None, + hardlink: bool = False, + symlink: bool = True, + ) -> Optional[FileID]: try: if issubclass(otherCls, AWSJobStore): srcObj = get_object_for_url(uri, existing=True) @@ -534,15 +605,19 @@ def _import_file(self, otherCls, uri: ParseResult, shared_file_name: Optional[st else: self._requireValidSharedFileName(shared_file_name) jobStoreFileID = self._shared_file_id(shared_file_name) - info = self.FileInfo.loadOrCreate(jobStoreFileID=jobStoreFileID, - ownerID=str(self.sharedFileOwnerID), - encrypted=None) + info = self.FileInfo.loadOrCreate( + jobStoreFileID=jobStoreFileID, + ownerID=str(self.sharedFileOwnerID), + encrypted=None, + ) info.copyFrom(srcObj) info.save() return FileID(info.fileID, size) if shared_file_name is None else None except (NoBucketLocationError, ServerSideCopyProhibitedError): # AWS refuses to tell us where the bucket is or do this copy for us - logger.warning("Falling back to copying via the local machine. This could get expensive!") + logger.warning( + "Falling back to copying via the local machine. This could get expensive!" + ) # copy if exception return super()._import_file(otherCls, uri, shared_file_name=shared_file_name) @@ -556,7 +631,9 @@ def _export_file(self, otherCls, file_id: FileID, uri: ParseResult) -> None: return except (NoBucketLocationError, ServerSideCopyProhibitedError): # AWS refuses to tell us where the bucket is or do this copy for us - logger.warning("Falling back to copying via the local machine. This could get expensive!") + logger.warning( + "Falling back to copying via the local machine. This could get expensive!" + ) else: super()._default_export_file(otherCls, file_id, uri) @@ -578,34 +655,35 @@ def _get_size(cls, url: ParseResult) -> int: def _read_from_url(cls, url: ParseResult, writable): srcObj = get_object_for_url(url, existing=True) srcObj.download_fileobj(writable) - return ( - srcObj.content_length, - False # executable bit is always False - ) + return (srcObj.content_length, False) # executable bit is always False @classmethod def _open_url(cls, url: ParseResult) -> IO[bytes]: src_obj = get_object_for_url(url, existing=True) response = src_obj.get() # We should get back a response with a stream in 'Body' - if 'Body' not in response: + if "Body" not in response: raise RuntimeError(f"Could not fetch body stream for {url}") - return response['Body'] + return response["Body"] @classmethod - def _write_to_url(cls, readable, url: ParseResult, executable: bool = False) -> None: + def _write_to_url( + cls, readable, url: ParseResult, executable: bool = False + ) -> None: dstObj = get_object_for_url(url) logger.debug("Uploading %s", dstObj.key) # uploadFile takes care of using multipart upload if the file is larger than partSize (default to 5MB) - uploadFile(readable=readable, - resource=s3_boto3_resource, - bucketName=dstObj.bucket_name, - fileID=dstObj.key, - partSize=5 * 1000 * 1000) + uploadFile( + readable=readable, + resource=s3_boto3_resource, + bucketName=dstObj.bucket_name, + fileID=dstObj.key, + partSize=5 * 1000 * 1000, + ) @classmethod - def _list_url(cls, url: ParseResult) -> List[str]: + def _list_url(cls, url: ParseResult) -> list[str]: return list_objects_for_url(url) @classmethod @@ -616,9 +694,11 @@ def _get_is_directory(cls, url: ParseResult) -> bool: @classmethod def _supports_url(cls, url: ParseResult, export: bool = False) -> bool: - return url.scheme.lower() == 's3' + return url.scheme.lower() == "s3" - def write_file(self, local_path: FileID, job_id: Optional[FileID] = None, cleanup: bool = False) -> FileID: + def write_file( + self, local_path: FileID, job_id: Optional[FileID] = None, cleanup: bool = False + ) -> FileID: info = self.FileInfo.create(job_id if cleanup else None) info.upload(local_path, not self.config.disableJobStoreChecksumVerification) info.save() @@ -626,7 +706,14 @@ def write_file(self, local_path: FileID, job_id: Optional[FileID] = None, cleanu return info.fileID @contextmanager - def write_file_stream(self, job_id: Optional[FileID] = None, cleanup: bool = False, basename=None, encoding=None, errors=None): + def write_file_stream( + self, + job_id: Optional[FileID] = None, + cleanup: bool = False, + basename=None, + encoding=None, + errors=None, + ): info = self.FileInfo.create(job_id if cleanup else None) with info.uploadStream(encoding=encoding, errors=errors) as writable: yield writable, info.fileID @@ -634,11 +721,15 @@ def write_file_stream(self, job_id: Optional[FileID] = None, cleanup: bool = Fal logger.debug("Wrote %r.", info) @contextmanager - def write_shared_file_stream(self, shared_file_name, encrypted=None, encoding=None, errors=None): + def write_shared_file_stream( + self, shared_file_name, encrypted=None, encoding=None, errors=None + ): self._requireValidSharedFileName(shared_file_name) - info = self.FileInfo.loadOrCreate(jobStoreFileID=self._shared_file_id(shared_file_name), - ownerID=str(self.sharedFileOwnerID), - encrypted=encrypted) + info = self.FileInfo.loadOrCreate( + jobStoreFileID=self._shared_file_id(shared_file_name), + ownerID=str(self.sharedFileOwnerID), + encrypted=encrypted, + ) with info.uploadStream(encoding=encoding, errors=errors) as writable: yield writable info.save() @@ -671,7 +762,7 @@ def read_file(self, file_id, local_path, symlink=False): info = self.FileInfo.loadOrFail(file_id) logger.debug("Reading %r into %r.", info, local_path) info.download(local_path, not self.config.disableJobStoreChecksumVerification) - if getattr(file_id, 'executable', False): + if getattr(file_id, "executable", False): os.chmod(local_path, os.stat(local_path).st_mode | stat.S_IXUSR) @contextmanager @@ -686,7 +777,9 @@ def read_shared_file_stream(self, shared_file_name, encoding=None, errors=None): self._requireValidSharedFileName(shared_file_name) jobStoreFileID = self._shared_file_id(shared_file_name) info = self.FileInfo.loadOrFail(jobStoreFileID, customName=shared_file_name) - logger.debug("Reading %r for shared file %r into stream.", info, shared_file_name) + logger.debug( + "Reading %r for shared file %r into stream.", info, shared_file_name + ) with info.downloadStream(encoding=encoding, errors=errors) as readable: yield readable @@ -702,7 +795,7 @@ def write_logs(self, msg): with info.uploadStream(multipart=False) as writeable: if isinstance(msg, str): # This stream is for binary data, so encode any non-encoded things - msg = msg.encode('utf-8', errors='ignore') + msg = msg.encode("utf-8", errors="ignore") writeable.write(msg) info.save() @@ -724,10 +817,12 @@ def _read_logs(self, callback, ownerId): items = None for attempt in retry_sdb(): with attempt: - items = boto3_pager(self.db.select, - "Items", - ConsistentRead=True, - SelectExpression="select * from `{}` where ownerID='{}'".format(self.files_domain_name, str(ownerId))) + items = boto3_pager( + self.db.select, + "Items", + ConsistentRead=True, + SelectExpression=f"select * from `{self.files_domain_name}` where ownerID='{str(ownerId)}'", + ) assert items is not None for item in items: info = self.FileInfo.fromItem(item) @@ -744,13 +839,19 @@ def get_public_url(self, jobStoreFileID): with info.uploadStream(allowInlining=False) as f: f.write(info.content) - self.files_bucket.Object(compat_bytes(jobStoreFileID)).Acl().put(ACL='public-read') + self.files_bucket.Object(compat_bytes(jobStoreFileID)).Acl().put( + ACL="public-read" + ) - url = self.s3_client.generate_presigned_url('get_object', - Params={'Bucket': self.files_bucket.name, - 'Key': compat_bytes(jobStoreFileID), - 'VersionId': info.version}, - ExpiresIn=self.publicUrlExpiration.total_seconds()) + url = self.s3_client.generate_presigned_url( + "get_object", + Params={ + "Bucket": self.files_bucket.name, + "Key": compat_bytes(jobStoreFileID), + "VersionId": info.version, + }, + ExpiresIn=self.publicUrlExpiration.total_seconds(), + ) # boto doesn't properly remove the x-amz-security-token parameter when # query_auth is False when using an IAM role (see issue #2043). Including the @@ -758,12 +859,12 @@ def get_public_url(self, jobStoreFileID): # even if the resource is public, so we need to remove it. scheme, netloc, path, query, fragment = urlsplit(url) params = parse_qs(query) - if 'x-amz-security-token' in params: - del params['x-amz-security-token'] - if 'AWSAccessKeyId' in params: - del params['AWSAccessKeyId'] - if 'Signature' in params: - del params['Signature'] + if "x-amz-security-token" in params: + del params["x-amz-security-token"] + if "AWSAccessKeyId" in params: + del params["AWSAccessKeyId"] + if "Signature" in params: + del params["Signature"] query = urlencode(params, doseq=True) url = urlunsplit((scheme, netloc, path, query, fragment)) return url @@ -772,12 +873,14 @@ def get_shared_public_url(self, shared_file_name): self._requireValidSharedFileName(shared_file_name) return self.get_public_url(self._shared_file_id(shared_file_name)) - def _bindBucket(self, - bucket_name: str, - create: bool = False, - block: bool = True, - versioning: bool = False, - check_versioning_consistency: bool = True): + def _bindBucket( + self, + bucket_name: str, + create: bool = False, + block: bool = True, + versioning: bool = False, + check_versioning_consistency: bool = True, + ): """ Return the Boto Bucket object representing the S3 bucket with the given name. If the bucket does not exist and `create` is True, it will be created. @@ -802,8 +905,7 @@ def bucket_retry_predicate(error): Decide, given an error, whether we should retry binding the bucket. """ - if (isinstance(error, ClientError) and - get_error_status(error) in (404, 409)): + if isinstance(error, ClientError) and get_error_status(error) in (404, 409): # Handle cases where the bucket creation is in a weird state that might let us proceed. # https://github.com/BD2KGenomics/toil/issues/955 # https://github.com/BD2KGenomics/toil/issues/995 @@ -813,7 +915,7 @@ def bucket_retry_predicate(error): # OperationAborted == 409 # NoSuchBucket == 404 return True - if get_error_code(error) == 'SlowDown': + if get_error_code(error) == "SlowDown": # We may get told to SlowDown by AWS when we try to create our # bucket. In that case, we should retry and use the exponential # backoff. @@ -846,15 +948,17 @@ def bucket_retry_predicate(error): # NoSuchBucket. We let that kick us back up to the # main retry loop. assert ( - get_bucket_region(bucket_name) == self.region + get_bucket_region(bucket_name) == self.region ), f"bucket_name: {bucket_name}, {get_bucket_region(bucket_name)} != {self.region}" tags = build_tag_dict_from_env() if tags: flat_tags = flatten_tags(tags) - bucket_tagging = self.s3_resource.BucketTagging(bucket_name) - bucket_tagging.put(Tagging={'TagSet': flat_tags}) + bucket_tagging = self.s3_resource.BucketTagging( + bucket_name + ) + bucket_tagging.put(Tagging={"TagSet": flat_tags}) # Configure bucket so that we can make objects in # it public, which was the historical default. @@ -867,7 +971,9 @@ def bucket_retry_predicate(error): # This is raised if the user attempts to get a bucket in a region outside # the specified one, if the specified one is not `us-east-1`. The us-east-1 # server allows a user to use buckets from any region. - raise BucketLocationConflictException(get_bucket_region(bucket_name)) + raise BucketLocationConflictException( + get_bucket_region(bucket_name) + ) else: raise else: @@ -884,24 +990,32 @@ def bucket_retry_predicate(error): # consistent? time.sleep(1) while not self._getBucketVersioning(bucket_name): - logger.warning(f"Waiting for versioning activation on bucket '{bucket_name}'...") + logger.warning( + f"Waiting for versioning activation on bucket '{bucket_name}'..." + ) time.sleep(1) elif check_versioning_consistency: # now test for versioning consistency # we should never see any of these errors since 'versioning' should always be true bucket_versioning = self._getBucketVersioning(bucket_name) if bucket_versioning != versioning: - assert False, 'Cannot modify versioning on existing bucket' + assert False, "Cannot modify versioning on existing bucket" elif bucket_versioning is None: - assert False, 'Cannot use a bucket with versioning suspended' + assert False, "Cannot use a bucket with versioning suspended" if bucketExisted: - logger.debug(f"Using pre-existing job store bucket '{bucket_name}'.") + logger.debug( + f"Using pre-existing job store bucket '{bucket_name}'." + ) else: - logger.debug(f"Created new job store bucket '{bucket_name}' with versioning state {versioning}.") + logger.debug( + f"Created new job store bucket '{bucket_name}' with versioning state {versioning}." + ) return bucket - def _bindDomain(self, domain_name: str, create: bool = False, block: bool = True) -> None: + def _bindDomain( + self, domain_name: str, create: bool = False, block: bool = True + ) -> None: """ Return the Boto3 domain name representing the SDB domain. When create=True, it will create the domain if it does not exist. @@ -920,9 +1034,11 @@ def _bindDomain(self, domain_name: str, create: bool = False, block: bool = True retry timeout expires. """ logger.debug("Binding to job store domain '%s'.", domain_name) - retryargs = dict(predicate=lambda e: no_such_sdb_domain(e) or sdb_unavailable(e)) + retryargs = dict( + predicate=lambda e: no_such_sdb_domain(e) or sdb_unavailable(e) + ) if not block: - retryargs['timeout'] = 15 + retryargs["timeout"] = 15 for attempt in retry_sdb(**retryargs): with attempt: try: @@ -944,13 +1060,13 @@ def _new_job_id(self): return str(uuid.uuid4()) # A dummy job ID under which all shared files are stored - sharedFileOwnerID = uuid.UUID('891f7db6-e4d9-4221-a58e-ab6cc4395f94') + sharedFileOwnerID = uuid.UUID("891f7db6-e4d9-4221-a58e-ab6cc4395f94") # A dummy job ID under which all unread stats files are stored - statsFileOwnerID = uuid.UUID('bfcf5286-4bc7-41ef-a85d-9ab415b69d53') + statsFileOwnerID = uuid.UUID("bfcf5286-4bc7-41ef-a85d-9ab415b69d53") # A dummy job ID under which all read stats files are stored - readStatsFileOwnerID = uuid.UUID('e77fc3aa-d232-4255-ae04-f64ee8eb0bfa') + readStatsFileOwnerID = uuid.UUID("e77fc3aa-d232-4255-ae04-f64ee8eb0bfa") def _shared_file_id(self, shared_file_name): return str(uuid.uuid5(self.sharedFileOwnerID, shared_file_name)) @@ -960,13 +1076,22 @@ class FileInfo(SDBHelper): """ Represents a file in this job store. """ + outer = None """ :type: AWSJobStore """ - def __init__(self, fileID, ownerID, encrypted, - version=None, content=None, numContentChunks=0, checksum=None): + def __init__( + self, + fileID, + ownerID, + encrypted, + version=None, + content=None, + numContentChunks=0, + checksum=None, + ): """ :type fileID: str :param fileID: the file's ID @@ -1045,24 +1170,30 @@ def content(self, content): assert content is None or isinstance(content, bytes) self._content = content if content is not None: - self.version = '' + self.version = "" @classmethod def create(cls, ownerID: str): - return cls(str(uuid.uuid4()), ownerID, encrypted=cls.outer.sseKeyPath is not None) + return cls( + str(uuid.uuid4()), ownerID, encrypted=cls.outer.sseKeyPath is not None + ) @classmethod def presenceIndicator(cls): - return 'encrypted' + return "encrypted" @classmethod def exists(cls, jobStoreFileID): for attempt in retry_sdb(): with attempt: - return bool(cls.outer.db.get_attributes(DomainName=cls.outer.files_domain_name, - ItemName=compat_bytes(jobStoreFileID), - AttributeNames=[cls.presenceIndicator()], - ConsistentRead=True).get("Attributes", [])) + return bool( + cls.outer.db.get_attributes( + DomainName=cls.outer.files_domain_name, + ItemName=compat_bytes(jobStoreFileID), + AttributeNames=[cls.presenceIndicator()], + ConsistentRead=True, + ).get("Attributes", []) + ) @classmethod def load(cls, jobStoreFileID): @@ -1071,10 +1202,13 @@ def load(cls, jobStoreFileID): self = cls.fromItem( { "Name": compat_bytes(jobStoreFileID), - "Attributes": cls.outer.db.get_attributes(DomainName=cls.outer.files_domain_name, - ItemName=compat_bytes(jobStoreFileID), - ConsistentRead=True).get("Attributes", []) - }) + "Attributes": cls.outer.db.get_attributes( + DomainName=cls.outer.files_domain_name, + ItemName=compat_bytes(jobStoreFileID), + ConsistentRead=True, + ).get("Attributes", []), + } + ) return self @classmethod @@ -1117,7 +1251,9 @@ def strOrNone(s): return s if s is None else str(s) # ownerID and encrypted are the only mandatory attributes - ownerID, encrypted, version, checksum = SDBHelper.get_attributes_from_item(item, ["ownerID", "encrypted", "version", "checksum"]) + ownerID, encrypted, version, checksum = SDBHelper.get_attributes_from_item( + item, ["ownerID", "encrypted", "version", "checksum"] + ) if ownerID is None: assert encrypted is None return None @@ -1127,14 +1263,23 @@ def strOrNone(s): if encrypted: sseKeyPath = cls.outer.sseKeyPath if sseKeyPath is None: - raise AssertionError('Content is encrypted but no key was provided.') + raise AssertionError( + "Content is encrypted but no key was provided." + ) if content is not None: content = encryption.decrypt(content, sseKeyPath) - self = cls(fileID=item["Name"], ownerID=ownerID, encrypted=encrypted, version=version, - content=content, numContentChunks=numContentChunks, checksum=checksum) + self = cls( + fileID=item["Name"], + ownerID=ownerID, + encrypted=encrypted, + version=version, + content=content, + numContentChunks=numContentChunks, + checksum=checksum, + ) return self - def toItem(self) -> Tuple[Dict[str, str], int]: + def toItem(self) -> tuple[dict[str, str], int]: """ Convert this instance to a dictionary of attribute names to values @@ -1146,15 +1291,21 @@ def toItem(self) -> Tuple[Dict[str, str], int]: if self.encrypted and content is not None: sseKeyPath = self.outer.sseKeyPath if sseKeyPath is None: - raise AssertionError('Encryption requested but no key was provided.') + raise AssertionError( + "Encryption requested but no key was provided." + ) content = encryption.encrypt(content, sseKeyPath) assert content is None or isinstance(content, bytes) attributes = self.binaryToAttributes(content) - numChunks = int(attributes['numChunks']) - attributes.update(dict(ownerID=self.ownerID or '', - encrypted=str(self.encrypted), - version=self.version or '', - checksum=self.checksum or '')) + numChunks = int(attributes["numChunks"]) + attributes.update( + dict( + ownerID=self.ownerID or "", + encrypted=str(self.encrypted), + version=self.version or "", + checksum=self.checksum or "", + ) + ) return attributes, numChunks @classmethod @@ -1175,22 +1326,32 @@ def save(self): "Exists": False, } else: - expected = {"Name": 'version', "Value": cast(str, self.previousVersion)} + expected = {"Name": "version", "Value": cast(str, self.previousVersion)} try: for attempt in retry_sdb(): with attempt: - self.outer.db.put_attributes(DomainName=self.outer.files_domain_name, - ItemName=compat_bytes(self.fileID), - Attributes=[{"Name": attribute["Name"], "Value": attribute["Value"], "Replace": True} - for attribute in attributes_boto3], - Expected=expected) + self.outer.db.put_attributes( + DomainName=self.outer.files_domain_name, + ItemName=compat_bytes(self.fileID), + Attributes=[ + { + "Name": attribute["Name"], + "Value": attribute["Value"], + "Replace": True, + } + for attribute in attributes_boto3 + ], + Expected=expected, + ) # clean up the old version of the file if necessary and safe if self.previousVersion and (self.previousVersion != self.version): for attempt in retry_s3(): with attempt: - self.outer.s3_client.delete_object(Bucket=self.outer.files_bucket.name, - Key=compat_bytes(self.fileID), - VersionId=self.previousVersion) + self.outer.s3_client.delete_object( + Bucket=self.outer.files_bucket.name, + Key=compat_bytes(self.fileID), + VersionId=self.previousVersion, + ) self._previousVersion = self._version if numNewContentChunks < self._numContentChunks: residualChunks = range(numNewContentChunks, self._numContentChunks) @@ -1198,19 +1359,26 @@ def save(self): # boto3 requires providing the value as well as the name in the attribute, and we don't store it locally # the php sdk resolves this issue by not requiring the Value key https://github.com/aws/aws-sdk-php/issues/185 # but this doesnt extend to boto3 - delete_attributes = self.outer.db.get_attributes(DomainName=self.outer.files_domain_name, - ItemName=compat_bytes(self.fileID), - AttributeNames=[chunk for chunk in residual_chunk_names]).get("Attributes") + delete_attributes = self.outer.db.get_attributes( + DomainName=self.outer.files_domain_name, + ItemName=compat_bytes(self.fileID), + AttributeNames=[chunk for chunk in residual_chunk_names], + ).get("Attributes") for attempt in retry_sdb(): with attempt: - self.outer.db.delete_attributes(DomainName=self.outer.files_domain_name, - ItemName=compat_bytes(self.fileID), - Attributes=delete_attributes) - self.outer.db.get_attributes(DomainName=self.outer.files_domain_name, ItemName=compat_bytes(self.fileID)) + self.outer.db.delete_attributes( + DomainName=self.outer.files_domain_name, + ItemName=compat_bytes(self.fileID), + Attributes=delete_attributes, + ) + self.outer.db.get_attributes( + DomainName=self.outer.files_domain_name, + ItemName=compat_bytes(self.fileID), + ) self._numContentChunks = numNewContentChunks except ClientError as e: - if get_error_code(e) == 'ConditionalCheckFailed': + if get_error_code(e) == "ConditionalCheckFailed": raise ConcurrentFileModificationException(self.fileID) else: raise @@ -1218,24 +1386,30 @@ def save(self): def upload(self, localFilePath, calculateChecksum=True): file_size, file_time = fileSizeAndTime(localFilePath) if file_size <= self.maxInlinedSize(): - with open(localFilePath, 'rb') as f: + with open(localFilePath, "rb") as f: self.content = f.read() # Clear out any old checksum in case of overwrite - self.checksum = '' + self.checksum = "" else: headerArgs = self._s3EncryptionArgs() # Create a new Resource in case it needs to be on its own thread - resource = boto3_session.resource('s3', region_name=self.outer.region) + resource = boto3_session.resource("s3", region_name=self.outer.region) - self.checksum = self._get_file_checksum(localFilePath) if calculateChecksum else None - self.version = uploadFromPath(localFilePath, - resource=resource, - bucketName=self.outer.files_bucket.name, - fileID=compat_bytes(self.fileID), - headerArgs=headerArgs, - partSize=self.outer.part_size) + self.checksum = ( + self._get_file_checksum(localFilePath) + if calculateChecksum + else None + ) + self.version = uploadFromPath( + localFilePath, + resource=resource, + bucketName=self.outer.files_bucket.name, + fileID=compat_bytes(self.fileID), + headerArgs=headerArgs, + partSize=self.outer.part_size, + ) - def _start_checksum(self, to_match=None, algorithm='sha1'): + def _start_checksum(self, to_match=None, algorithm="sha1"): """ Get a hasher that can be used with _update_checksum and _finish_checksum. @@ -1253,12 +1427,12 @@ def _start_checksum(self, to_match=None, algorithm='sha1'): expected = None if to_match is not None: - parts = to_match.split('$') + parts = to_match.split("$") algorithm = parts[0] expected = parts[1] wrapped = getattr(hashlib, algorithm)() - logger.debug(f'Starting {algorithm} checksum to match {expected}') + logger.debug(f"Starting {algorithm} checksum to match {expected}") return algorithm, wrapped, expected def _update_checksum(self, checksum_in_progress, data): @@ -1275,26 +1449,32 @@ def _finish_checksum(self, checksum_in_progress): result_hash = checksum_in_progress[1].hexdigest() - logger.debug(f'Completed checksum with hash {result_hash} vs. expected {checksum_in_progress[2]}') + logger.debug( + f"Completed checksum with hash {result_hash} vs. expected {checksum_in_progress[2]}" + ) if checksum_in_progress[2] is not None: # We expected a particular hash if result_hash != checksum_in_progress[2]: - raise ChecksumError('Checksum mismatch. Expected: %s Actual: %s' % - (checksum_in_progress[2], result_hash)) + raise ChecksumError( + "Checksum mismatch. Expected: %s Actual: %s" + % (checksum_in_progress[2], result_hash) + ) - return '$'.join([checksum_in_progress[0], result_hash]) + return "$".join([checksum_in_progress[0], result_hash]) def _get_file_checksum(self, localFilePath, to_match=None): - with open(localFilePath, 'rb') as f: + with open(localFilePath, "rb") as f: hasher = self._start_checksum(to_match=to_match) contents = f.read(1024 * 1024) - while contents != b'': + while contents != b"": self._update_checksum(hasher, contents) contents = f.read(1024 * 1024) return self._finish_checksum(hasher) @contextmanager - def uploadStream(self, multipart=True, allowInlining=True, encoding=None, errors=None): + def uploadStream( + self, multipart=True, allowInlining=True, encoding=None, errors=None + ): """ Context manager that gives out a binary or text mode upload stream to upload data. """ @@ -1315,14 +1495,14 @@ def readFrom(self, readable): assert isinstance(buf, bytes) if allowInlining and len(buf) <= info.maxInlinedSize(): - logger.debug('Inlining content of %d bytes', len(buf)) + logger.debug("Inlining content of %d bytes", len(buf)) info.content = buf # There will be no checksum - info.checksum = '' + info.checksum = "" else: # We will compute a checksum hasher = info._start_checksum() - logger.debug('Updating checksum with %d bytes', len(buf)) + logger.debug("Updating checksum with %d bytes", len(buf)) info._update_checksum(hasher, buf) client = store.s3_client @@ -1331,47 +1511,72 @@ def readFrom(self, readable): for attempt in retry_s3(): with attempt: - logger.debug('Starting multipart upload') + logger.debug("Starting multipart upload") # low-level clients are thread safe - upload = client.create_multipart_upload(Bucket=bucket_name, - Key=compat_bytes(info.fileID), - **headerArgs) - uploadId = upload['UploadId'] + upload = client.create_multipart_upload( + Bucket=bucket_name, + Key=compat_bytes(info.fileID), + **headerArgs, + ) + uploadId = upload["UploadId"] parts = [] - logger.debug('Multipart upload started as %s', uploadId) + logger.debug("Multipart upload started as %s", uploadId) for attempt in retry_s3(): with attempt: for i in range(CONSISTENCY_TICKS): # Sometimes we can create a multipart upload and not see it. Wait around for it. - response = client.list_multipart_uploads(Bucket=bucket_name, - MaxUploads=1, - Prefix=compat_bytes(info.fileID)) - if ('Uploads' in response and - len(response['Uploads']) != 0 and - response['Uploads'][0]['UploadId'] == uploadId): - - logger.debug('Multipart upload visible as %s', uploadId) + response = client.list_multipart_uploads( + Bucket=bucket_name, + MaxUploads=1, + Prefix=compat_bytes(info.fileID), + ) + if ( + "Uploads" in response + and len(response["Uploads"]) != 0 + and response["Uploads"][0]["UploadId"] + == uploadId + ): + + logger.debug( + "Multipart upload visible as %s", uploadId + ) break else: - logger.debug('Multipart upload %s is not visible; we see %s', uploadId, response.get('Uploads')) - time.sleep(CONSISTENCY_TIME * 2 ** i) + logger.debug( + "Multipart upload %s is not visible; we see %s", + uploadId, + response.get("Uploads"), + ) + time.sleep(CONSISTENCY_TIME * 2**i) try: for part_num in itertools.count(): for attempt in retry_s3(): with attempt: - logger.debug('Uploading part %d of %d bytes to %s', part_num + 1, len(buf), uploadId) + logger.debug( + "Uploading part %d of %d bytes to %s", + part_num + 1, + len(buf), + uploadId, + ) # TODO: include the Content-MD5 header: # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.complete_multipart_upload - part = client.upload_part(Bucket=bucket_name, - Key=compat_bytes(info.fileID), - PartNumber=part_num + 1, - UploadId=uploadId, - Body=BytesIO(buf), - **headerArgs) - - parts.append({"PartNumber": part_num + 1, "ETag": part["ETag"]}) + part = client.upload_part( + Bucket=bucket_name, + Key=compat_bytes(info.fileID), + PartNumber=part_num + 1, + UploadId=uploadId, + Body=BytesIO(buf), + **headerArgs, + ) + + parts.append( + { + "PartNumber": part_num + 1, + "ETag": part["ETag"], + } + ) # Get the next block of data we want to put buf = readable.read(info.outer.part_size) @@ -1384,15 +1589,21 @@ def readFrom(self, readable): with panic(log=logger): for attempt in retry_s3(): with attempt: - client.abort_multipart_upload(Bucket=bucket_name, - Key=compat_bytes(info.fileID), - UploadId=uploadId) + client.abort_multipart_upload( + Bucket=bucket_name, + Key=compat_bytes(info.fileID), + UploadId=uploadId, + ) else: - while not store._getBucketVersioning(store.files_bucket.name): - logger.warning('Versioning does not appear to be enabled yet. Deferring multipart ' - 'upload completion...') + while not store._getBucketVersioning( + store.files_bucket.name + ): + logger.warning( + "Versioning does not appear to be enabled yet. Deferring multipart " + "upload completion..." + ) time.sleep(1) # Save the checksum @@ -1404,32 +1615,46 @@ def readFrom(self, readable): # in tests # (https://github.com/DataBiosphere/toil/issues/3894) with attempt: - logger.debug('Attempting to complete upload...') + logger.debug("Attempting to complete upload...") completed = client.complete_multipart_upload( Bucket=bucket_name, Key=compat_bytes(info.fileID), UploadId=uploadId, - MultipartUpload={"Parts": parts}) - - logger.debug('Completed upload object of type %s: %s', str(type(completed)), - repr(completed)) - info.version = completed.get('VersionId') - logger.debug('Completed upload with version %s', str(info.version)) + MultipartUpload={"Parts": parts}, + ) + + logger.debug( + "Completed upload object of type %s: %s", + str(type(completed)), + repr(completed), + ) + info.version = completed.get("VersionId") + logger.debug( + "Completed upload with version %s", + str(info.version), + ) if info.version is None: # Somehow we don't know the version. Try and get it. - for attempt in retry_s3(predicate=lambda e: retryable_s3_errors(e) or isinstance(e, AssertionError)): + for attempt in retry_s3( + predicate=lambda e: retryable_s3_errors(e) + or isinstance(e, AssertionError) + ): with attempt: - version = client.head_object(Bucket=bucket_name, - Key=compat_bytes(info.fileID), - **headerArgs).get('VersionId', None) - logger.warning('Loaded key for upload with no version and got version %s', - str(version)) + version = client.head_object( + Bucket=bucket_name, + Key=compat_bytes(info.fileID), + **headerArgs, + ).get("VersionId", None) + logger.warning( + "Loaded key for upload with no version and got version %s", + str(version), + ) info.version = version assert info.version is not None # Make sure we actually wrote something, even if an empty file - assert (bool(info.version) or info.content is not None) + assert bool(info.version) or info.content is not None class SinglePartPipe(WritablePipe): def readFrom(self, readable): @@ -1437,10 +1662,10 @@ def readFrom(self, readable): assert isinstance(buf, bytes) dataLength = len(buf) if allowInlining and dataLength <= info.maxInlinedSize(): - logger.debug('Inlining content of %d bytes', len(buf)) + logger.debug("Inlining content of %d bytes", len(buf)) info.content = buf # There will be no checksum - info.checksum = '' + info.checksum = "" else: # We will compute a checksum hasher = info._start_checksum() @@ -1454,39 +1679,57 @@ def readFrom(self, readable): buf = BytesIO(buf) while not store._getBucketVersioning(bucket_name): - logger.warning('Versioning does not appear to be enabled yet. Deferring single part ' - 'upload...') + logger.warning( + "Versioning does not appear to be enabled yet. Deferring single part " + "upload..." + ) time.sleep(1) for attempt in retry_s3(): with attempt: - logger.debug('Uploading single part of %d bytes', dataLength) - client.upload_fileobj(Bucket=bucket_name, - Key=compat_bytes(info.fileID), - Fileobj=buf, - ExtraArgs=headerArgs) + logger.debug( + "Uploading single part of %d bytes", dataLength + ) + client.upload_fileobj( + Bucket=bucket_name, + Key=compat_bytes(info.fileID), + Fileobj=buf, + ExtraArgs=headerArgs, + ) # use head_object with the SSE headers to access versionId and content_length attributes - headObj = client.head_object(Bucket=bucket_name, - Key=compat_bytes(info.fileID), - **headerArgs) - assert dataLength == headObj.get('ContentLength', None) - info.version = headObj.get('VersionId', None) - logger.debug('Upload received version %s', str(info.version)) + headObj = client.head_object( + Bucket=bucket_name, + Key=compat_bytes(info.fileID), + **headerArgs, + ) + assert dataLength == headObj.get("ContentLength", None) + info.version = headObj.get("VersionId", None) + logger.debug( + "Upload received version %s", str(info.version) + ) if info.version is None: # Somehow we don't know the version - for attempt in retry_s3(predicate=lambda e: retryable_s3_errors(e) or isinstance(e, AssertionError)): + for attempt in retry_s3( + predicate=lambda e: retryable_s3_errors(e) + or isinstance(e, AssertionError) + ): with attempt: - headObj = client.head_object(Bucket=bucket_name, - Key=compat_bytes(info.fileID), - **headerArgs) - info.version = headObj.get('VersionId', None) - logger.warning('Reloaded key with no version and got version %s', str(info.version)) + headObj = client.head_object( + Bucket=bucket_name, + Key=compat_bytes(info.fileID), + **headerArgs, + ) + info.version = headObj.get("VersionId", None) + logger.warning( + "Reloaded key with no version and got version %s", + str(info.version), + ) assert info.version is not None # Make sure we actually wrote something, even if an empty file - assert (bool(info.version) or info.content is not None) + assert bool(info.version) or info.content is not None if multipart: pipe = MultiPartPipe(encoding=encoding, errors=errors) @@ -1497,20 +1740,22 @@ def readFrom(self, readable): yield writable if not pipe.reader_done: - logger.debug(f'Version: {self.version} Content: {self.content}') - raise RuntimeError('Escaped context manager without written data being read!') + logger.debug(f"Version: {self.version} Content: {self.content}") + raise RuntimeError( + "Escaped context manager without written data being read!" + ) # We check our work to make sure we have exactly one of embedded # content or a real object version. if self.content is None: if not bool(self.version): - logger.debug(f'Version: {self.version} Content: {self.content}') - raise RuntimeError('No content added and no version created') + logger.debug(f"Version: {self.version} Content: {self.content}") + raise RuntimeError("No content added and no version created") else: if bool(self.version): - logger.debug(f'Version: {self.version} Content: {self.content}') - raise RuntimeError('Content added and version created') + logger.debug(f"Version: {self.version} Content: {self.content}") + raise RuntimeError("Content added and version created") def copyFrom(self, srcObj): """ @@ -1520,18 +1765,20 @@ def copyFrom(self, srcObj): """ assert srcObj.content_length is not None if srcObj.content_length <= self.maxInlinedSize(): - self.content = srcObj.get().get('Body').read() + self.content = srcObj.get().get("Body").read() else: # Create a new Resource in case it needs to be on its own thread - resource = boto3_session.resource('s3', region_name=self.outer.region) - self.version = copyKeyMultipart(resource, - srcBucketName=compat_bytes(srcObj.bucket_name), - srcKeyName=compat_bytes(srcObj.key), - srcKeyVersion=compat_bytes(srcObj.version_id), - dstBucketName=compat_bytes(self.outer.files_bucket.name), - dstKeyName=compat_bytes(self._fileID), - sseAlgorithm='AES256', - sseKey=self._getSSEKey()) + resource = boto3_session.resource("s3", region_name=self.outer.region) + self.version = copyKeyMultipart( + resource, + srcBucketName=compat_bytes(srcObj.bucket_name), + srcKeyName=compat_bytes(srcObj.key), + srcKeyVersion=compat_bytes(srcObj.version_id), + dstBucketName=compat_bytes(self.outer.files_bucket.name), + dstKeyName=compat_bytes(self._fileID), + sseAlgorithm="AES256", + sseKey=self._getSSEKey(), + ) def copyTo(self, dstObj): """ @@ -1545,35 +1792,43 @@ def copyTo(self, dstObj): dstObj.put(Body=self.content) elif self.version: # Create a new Resource in case it needs to be on its own thread - resource = boto3_session.resource('s3', region_name=self.outer.region) + resource = boto3_session.resource("s3", region_name=self.outer.region) for attempt in retry_s3(): # encrypted = True if self.outer.sseKeyPath else False with attempt: - copyKeyMultipart(resource, - srcBucketName=compat_bytes(self.outer.files_bucket.name), - srcKeyName=compat_bytes(self.fileID), - srcKeyVersion=compat_bytes(self.version), - dstBucketName=compat_bytes(dstObj.bucket_name), - dstKeyName=compat_bytes(dstObj.key), - copySourceSseAlgorithm='AES256', - copySourceSseKey=self._getSSEKey()) + copyKeyMultipart( + resource, + srcBucketName=compat_bytes(self.outer.files_bucket.name), + srcKeyName=compat_bytes(self.fileID), + srcKeyVersion=compat_bytes(self.version), + dstBucketName=compat_bytes(dstObj.bucket_name), + dstKeyName=compat_bytes(dstObj.key), + copySourceSseAlgorithm="AES256", + copySourceSseKey=self._getSSEKey(), + ) else: assert False def download(self, localFilePath, verifyChecksum=True): if self.content is not None: with AtomicFileCreate(localFilePath) as tmpPath: - with open(tmpPath, 'wb') as f: + with open(tmpPath, "wb") as f: f.write(self.content) elif self.version: headerArgs = self._s3EncryptionArgs() obj = self.outer.files_bucket.Object(compat_bytes(self.fileID)) - for attempt in retry_s3(predicate=lambda e: retryable_s3_errors(e) or isinstance(e, ChecksumError)): + for attempt in retry_s3( + predicate=lambda e: retryable_s3_errors(e) + or isinstance(e, ChecksumError) + ): with attempt: with AtomicFileCreate(localFilePath) as tmpPath: - obj.download_file(Filename=tmpPath, ExtraArgs={'VersionId': self.version, **headerArgs}) + obj.download_file( + Filename=tmpPath, + ExtraArgs={"VersionId": self.version, **headerArgs}, + ) if verifyChecksum and self.checksum: try: @@ -1581,7 +1836,10 @@ def download(self, localFilePath, verifyChecksum=True): self._get_file_checksum(localFilePath, self.checksum) except ChecksumError as e: # Annotate checksum mismatches with file name - raise ChecksumError('Checksums do not match for file %s.' % localFilePath) from e + raise ChecksumError( + "Checksums do not match for file %s." + % localFilePath + ) from e # The error will get caught and result in a retry of the download until we run out of retries. # TODO: handle obviously truncated downloads by resuming instead. else: @@ -1603,7 +1861,10 @@ def writeTo(self, writable): obj = info.outer.files_bucket.Object(compat_bytes(info.fileID)) for attempt in retry_s3(): with attempt: - obj.download_fileobj(writable, ExtraArgs={'VersionId': info.version, **headerArgs}) + obj.download_fileobj( + writable, + ExtraArgs={"VersionId": info.version, **headerArgs}, + ) else: assert False @@ -1619,7 +1880,7 @@ class HashingPipe(ReadableTransformingPipe): def transform(self, readable, writable): hasher = info._start_checksum(to_match=info.checksum) contents = readable.read(1024 * 1024) - while contents != b'': + while contents != b"": info._update_checksum(hasher, contents) try: writable.write(contents) @@ -1636,7 +1897,9 @@ def transform(self, readable, writable): if verifyChecksum and self.checksum: with DownloadPipe() as readable: # Interpose a pipe to check the hash - with HashingPipe(readable, encoding=encoding, errors=errors) as verified: + with HashingPipe( + readable, encoding=encoding, errors=errors + ) as verified: yield verified else: # Readable end of pipe produces text mode output if encoding specified @@ -1653,15 +1916,19 @@ def delete(self): } for attempt in retry_sdb(): with attempt: - store.db.delete_attributes(DomainName=store.files_domain_name, - ItemName=compat_bytes(self.fileID), - Expected=expected) + store.db.delete_attributes( + DomainName=store.files_domain_name, + ItemName=compat_bytes(self.fileID), + Expected=expected, + ) if self.previousVersion: for attempt in retry_s3(): with attempt: - store.s3_client.delete_object(Bucket=store.files_bucket.name, - Key=compat_bytes(self.fileID), - VersionId=self.previousVersion) + store.s3_client.delete_object( + Bucket=store.files_bucket.name, + Key=compat_bytes(self.fileID), + VersionId=self.previousVersion, + ) def getSize(self): """ @@ -1680,7 +1947,7 @@ def getSize(self): def _getSSEKey(self) -> Optional[bytes]: sseKeyPath = self.outer.sseKeyPath if sseKeyPath: - with open(sseKeyPath, 'rb') as f: + with open(sseKeyPath, "rb") as f: sseKey = f.read() return sseKey @@ -1689,25 +1956,30 @@ def _s3EncryptionArgs(self): # parameters and will be used to set the http headers if self.encrypted: sseKey = self._getSSEKey() - assert sseKey is not None, 'Content is encrypted but no key was provided.' + assert ( + sseKey is not None + ), "Content is encrypted but no key was provided." assert len(sseKey) == 32 # boto3 encodes the key and calculates the MD5 for us - return {'SSECustomerAlgorithm': 'AES256', 'SSECustomerKey': sseKey} + return {"SSECustomerAlgorithm": "AES256", "SSECustomerKey": sseKey} else: return {} def __repr__(self): r = custom_repr - d = (('fileID', r(self.fileID)), - ('ownerID', r(self.ownerID)), - ('encrypted', r(self.encrypted)), - ('version', r(self.version)), - ('previousVersion', r(self.previousVersion)), - ('content', r(self.content)), - ('checksum', r(self.checksum)), - ('_numContentChunks', r(self._numContentChunks))) - return "{}({})".format(type(self).__name__, - ', '.join(f'{k}={v}' for k, v in d)) + d = ( + ("fileID", r(self.fileID)), + ("ownerID", r(self.ownerID)), + ("encrypted", r(self.encrypted)), + ("version", r(self.version)), + ("previousVersion", r(self.previousVersion)), + ("content", r(self.content)), + ("checksum", r(self.checksum)), + ("_numContentChunks", r(self._numContentChunks)), + ) + return "{}({})".format( + type(self).__name__, ", ".join(f"{k}={v}" for k, v in d) + ) versionings = dict(Enabled=True, Disabled=False, Suspended=None) @@ -1744,7 +2016,7 @@ def destroy(self): if self.files_bucket is not None: self._delete_bucket(self.files_bucket) self.files_bucket = None - for name in 'files_domain_name', 'jobs_domain_name': + for name in "files_domain_name", "jobs_domain_name": domainName = getattr(self, name) if domainName is not None: self._delete_domain(domainName) @@ -1768,12 +2040,14 @@ def _delete_bucket(bucket): for attempt in retry_s3(): with attempt: try: - uploads = s3_boto3_client.list_multipart_uploads(Bucket=bucket.name).get('Uploads') + uploads = s3_boto3_client.list_multipart_uploads( + Bucket=bucket.name + ).get("Uploads") if uploads: for u in uploads: - s3_boto3_client.abort_multipart_upload(Bucket=bucket.name, - Key=u["Key"], - UploadId=u["UploadId"]) + s3_boto3_client.abort_multipart_upload( + Bucket=bucket.name, Key=u["Key"], UploadId=u["UploadId"] + ) bucket.objects.all().delete() bucket.object_versions.delete() @@ -1793,5 +2067,7 @@ def _delete_bucket(bucket): class BucketLocationConflictException(LocatorException): def __init__(self, bucketRegion): super().__init__( - 'A bucket with the same name as the jobstore was found in another region (%s). ' - 'Cannot proceed as the unique bucket name is already in use.', locator=bucketRegion) + "A bucket with the same name as the jobstore was found in another region (%s). " + "Cannot proceed as the unique bucket name is already in use.", + locator=bucketRegion, + ) diff --git a/src/toil/jobStores/aws/utils.py b/src/toil/jobStores/aws/utils.py index 83ea4cf42c..3d890f2138 100644 --- a/src/toil/jobStores/aws/utils.py +++ b/src/toil/jobStores/aws/utils.py @@ -17,7 +17,7 @@ import os import types from ssl import SSLError -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, cast +from typing import TYPE_CHECKING, Optional, cast from boto3.s3.transfer import TransferConfig from botocore.client import Config @@ -49,10 +49,10 @@ # also need to set a special flag to make sure we don't use the generic # s3.amazonaws.com for us-east-1, or else we might not actually end up talking # to us-east-1 when a bucket is there. -DIAL_SPECIFIC_REGION_CONFIG = Config(s3={ - 'addressing_style': 'path', - 'us_east_1_regional_endpoint': 'regional' -}) +DIAL_SPECIFIC_REGION_CONFIG = Config( + s3={"addressing_style": "path", "us_east_1_regional_endpoint": "regional"} +) + class SDBHelper: """ @@ -92,6 +92,7 @@ class SDBHelper: True """ + # The SDB documentation is not clear as to whether the attribute value size limit of 1024 # applies to the base64-encoded value or the raw value. It suggests that responses are # automatically encoded from which I conclude that the limit should apply to the raw, @@ -104,8 +105,8 @@ class SDBHelper: maxValueSize = 1024 maxRawValueSize = maxValueSize * 3 // 4 # Just make sure we don't have a problem with padding or integer truncation: - assert len(base64.b64encode(b' ' * maxRawValueSize)) == 1024 - assert len(base64.b64encode(b' ' * (1 + maxRawValueSize))) > 1024 + assert len(base64.b64encode(b" " * maxRawValueSize)) == 1024 + assert len(base64.b64encode(b" " * (1 + maxRawValueSize))) > 1024 @classmethod def _reservedAttributes(cls): @@ -121,39 +122,44 @@ def _maxChunks(cls): @classmethod def maxBinarySize(cls, extraReservedChunks=0): - return (cls._maxChunks() - extraReservedChunks) * cls.maxRawValueSize - 1 # for the 'C' or 'U' prefix + return ( + cls._maxChunks() - extraReservedChunks + ) * cls.maxRawValueSize - 1 # for the 'C' or 'U' prefix @classmethod def _maxEncodedSize(cls): return cls._maxChunks() * cls.maxValueSize @classmethod - def binaryToAttributes(cls, binary) -> Dict[str, str]: + def binaryToAttributes(cls, binary) -> dict[str, str]: """ Turn a bytestring, or None, into SimpleDB attributes. """ - if binary is None: return {'numChunks': '0'} + if binary is None: + return {"numChunks": "0"} assert isinstance(binary, bytes) assert len(binary) <= cls.maxBinarySize() # The use of compression is just an optimization. We can't include it in the maxValueSize # computation because the compression ratio depends on the input. compressed = bz2.compress(binary) if len(compressed) > len(binary): - compressed = b'U' + binary + compressed = b"U" + binary else: - compressed = b'C' + compressed + compressed = b"C" + compressed encoded = base64.b64encode(compressed) assert len(encoded) <= cls._maxEncodedSize() n = cls.maxValueSize - chunks = (encoded[i:i + n] for i in range(0, len(encoded), n)) - attributes = {cls._chunkName(i): chunk.decode("utf-8") for i, chunk in enumerate(chunks)} - attributes.update({'numChunks': str(len(attributes))}) + chunks = (encoded[i : i + n] for i in range(0, len(encoded), n)) + attributes = { + cls._chunkName(i): chunk.decode("utf-8") for i, chunk in enumerate(chunks) + } + attributes.update({"numChunks": str(len(attributes))}) return attributes @classmethod def attributeDictToList( - cls, attributes: Dict[str, str] - ) -> List["AttributeTypeDef"]: + cls, attributes: dict[str, str] + ) -> list["AttributeTypeDef"]: """ Convert the attribute dict (ex: from binaryToAttributes) into a list of attribute typed dicts to be compatible with boto3 argument syntax @@ -164,8 +170,8 @@ def attributeDictToList( @classmethod def attributeListToDict( - cls, attributes: List["AttributeTypeDef"] - ) -> Dict[str, str]: + cls, attributes: list["AttributeTypeDef"] + ) -> dict[str, str]: """ Convert the attribute boto3 representation of list of attribute typed dicts back to a dictionary with name, value pairs @@ -176,10 +182,12 @@ def attributeListToDict( @classmethod def get_attributes_from_item( - cls, item: "ItemTypeDef", keys: List[str] - ) -> List[Optional[str]]: - return_values: List[Optional[str]] = [None for _ in keys] - mapped_indices: Dict[str, int] = {name: index for index, name in enumerate(keys)} + cls, item: "ItemTypeDef", keys: list[str] + ) -> list[Optional[str]]: + return_values: list[Optional[str]] = [None for _ in keys] + mapped_indices: dict[str, int] = { + name: index for index, name in enumerate(keys) + } for attribute in item["Attributes"]: name = attribute["Name"] value = attribute["Value"] @@ -202,12 +210,12 @@ def presenceIndicator(cls): Assuming that binaryToAttributes() is used with SDB's PutAttributes, the return value of this method could be used to detect the presence/absence of an item in SDB. """ - return 'numChunks' + return "numChunks" @classmethod def attributesToBinary( - cls, attributes: List["AttributeTypeDef"] - ) -> Tuple[bytes, int]: + cls, attributes: list["AttributeTypeDef"] + ) -> tuple[bytes, int]: """ :rtype: (str|None,int) :return: the binary data and the number of chunks it was composed from @@ -223,14 +231,14 @@ def attributesToBinary( numChunks = int(value) chunks.sort() if numChunks: - serializedJob = b''.join(v.encode() for k, v in chunks) + serializedJob = b"".join(v.encode() for k, v in chunks) compressed = base64.b64decode(serializedJob) - if compressed[0] == b'C'[0]: + if compressed[0] == b"C"[0]: binary = bz2.decompress(compressed[1:]) - elif compressed[0] == b'U'[0]: + elif compressed[0] == b"U"[0]: binary = compressed[1:] else: - raise RuntimeError(f'Unexpected prefix {compressed[0]}') + raise RuntimeError(f"Unexpected prefix {compressed[0]}") else: binary = None return binary, numChunks @@ -242,12 +250,14 @@ def fileSizeAndTime(localFilePath): @retry(errors=[AWSServerErrors]) -def uploadFromPath(localFilePath: str, - resource, - bucketName: str, - fileID: str, - headerArgs: Optional[dict] = None, - partSize: int = 50 << 20): +def uploadFromPath( + localFilePath: str, + resource, + bucketName: str, + fileID: str, + headerArgs: Optional[dict] = None, + partSize: int = 50 << 20, +): """ Uploads a file to s3, using multipart uploading if applicable @@ -266,9 +276,13 @@ def uploadFromPath(localFilePath: str, client = resource.meta.client file_size, file_time = fileSizeAndTime(localFilePath) - version = uploadFile(localFilePath, resource, bucketName, fileID, headerArgs, partSize) - info = client.head_object(Bucket=bucketName, Key=compat_bytes(fileID), VersionId=version, **headerArgs) - size = info.get('ContentLength') + version = uploadFile( + localFilePath, resource, bucketName, fileID, headerArgs, partSize + ) + info = client.head_object( + Bucket=bucketName, Key=compat_bytes(fileID), VersionId=version, **headerArgs + ) + size = info.get("ContentLength") assert size == file_size @@ -278,12 +292,14 @@ def uploadFromPath(localFilePath: str, @retry(errors=[AWSServerErrors]) -def uploadFile(readable, - resource, - bucketName: str, - fileID: str, - headerArgs: Optional[dict] = None, - partSize: int = 50 << 20): +def uploadFile( + readable, + resource, + bucketName: str, + fileID: str, + headerArgs: Optional[dict] = None, + partSize: int = 50 << 20, +): """ Upload a readable object to s3, using multipart uploading if applicable. :param readable: a readable stream or a file path to upload to s3 @@ -299,29 +315,32 @@ def uploadFile(readable, client = resource.meta.client config = TransferConfig( - multipart_threshold=partSize, - multipart_chunksize=partSize, - use_threads=True + multipart_threshold=partSize, multipart_chunksize=partSize, use_threads=True ) if isinstance(readable, str): - client.upload_file(Filename=readable, - Bucket=bucketName, - Key=compat_bytes(fileID), - ExtraArgs=headerArgs, - Config=config) + client.upload_file( + Filename=readable, + Bucket=bucketName, + Key=compat_bytes(fileID), + ExtraArgs=headerArgs, + Config=config, + ) else: - client.upload_fileobj(Fileobj=readable, - Bucket=bucketName, - Key=compat_bytes(fileID), - ExtraArgs=headerArgs, - Config=config) + client.upload_fileobj( + Fileobj=readable, + Bucket=bucketName, + Key=compat_bytes(fileID), + ExtraArgs=headerArgs, + Config=config, + ) # Wait until the object exists before calling head_object object_summary = resource.ObjectSummary(bucketName, compat_bytes(fileID)) object_summary.wait_until_exists(**headerArgs) info = client.head_object(Bucket=bucketName, Key=compat_bytes(fileID), **headerArgs) - return info.get('VersionId', None) + return info.get("VersionId", None) + class ServerSideCopyProhibitedError(RuntimeError): """ @@ -329,17 +348,20 @@ class ServerSideCopyProhibitedError(RuntimeError): insists that you pay to download and upload the data yourself instead. """ + @retry(errors=[AWSServerErrors]) -def copyKeyMultipart(resource: "S3ServiceResource", - srcBucketName: str, - srcKeyName: str, - srcKeyVersion: str, - dstBucketName: str, - dstKeyName: str, - sseAlgorithm: Optional[str] = None, - sseKey: Optional[str] = None, - copySourceSseAlgorithm: Optional[str] = None, - copySourceSseKey: Optional[str] = None): +def copyKeyMultipart( + resource: "S3ServiceResource", + srcBucketName: str, + srcKeyName: str, + srcKeyVersion: str, + dstBucketName: str, + dstKeyName: str, + sseAlgorithm: Optional[str] = None, + sseKey: Optional[str] = None, + copySourceSseAlgorithm: Optional[str] = None, + copySourceSseKey: Optional[str] = None, +): """ Copies a key from a source key to a destination key in multiple parts. Note that if the destination key exists it will be overwritten implicitly, and if it does not exist a new @@ -376,9 +398,12 @@ def copyKeyMultipart(resource: "S3ServiceResource", """ dstBucket = resource.Bucket(compat_bytes(dstBucketName)) dstObject = dstBucket.Object(compat_bytes(dstKeyName)) - copySource = {'Bucket': compat_bytes(srcBucketName), 'Key': compat_bytes(srcKeyName)} + copySource = { + "Bucket": compat_bytes(srcBucketName), + "Key": compat_bytes(srcKeyName), + } if srcKeyVersion is not None: - copySource['VersionId'] = compat_bytes(srcKeyVersion) + copySource["VersionId"] = compat_bytes(srcKeyVersion) # Get a client to the source region, which may not be the same as the one # this resource is connected to. We should probably talk to it for source @@ -388,10 +413,8 @@ def copyKeyMultipart(resource: "S3ServiceResource", source_client = cast( "S3Client", session.client( - 's3', - region_name=source_region, - config=DIAL_SPECIFIC_REGION_CONFIG - ) + "s3", region_name=source_region, config=DIAL_SPECIFIC_REGION_CONFIG + ), ) # The boto3 functions don't allow passing parameters as None to @@ -400,19 +423,28 @@ def copyKeyMultipart(resource: "S3ServiceResource", # required. destEncryptionArgs = {} if sseKey is not None: - destEncryptionArgs.update({'SSECustomerAlgorithm': sseAlgorithm, - 'SSECustomerKey': sseKey}) + destEncryptionArgs.update( + {"SSECustomerAlgorithm": sseAlgorithm, "SSECustomerKey": sseKey} + ) copyEncryptionArgs = {} if copySourceSseKey is not None: - copyEncryptionArgs.update({'CopySourceSSECustomerAlgorithm': copySourceSseAlgorithm, - 'CopySourceSSECustomerKey': copySourceSseKey}) + copyEncryptionArgs.update( + { + "CopySourceSSECustomerAlgorithm": copySourceSseAlgorithm, + "CopySourceSSECustomerKey": copySourceSseKey, + } + ) copyEncryptionArgs.update(destEncryptionArgs) try: # Kick off a server-side copy operation - dstObject.copy(copySource, SourceClient=source_client, ExtraArgs=copyEncryptionArgs) + dstObject.copy( + copySource, SourceClient=source_client, ExtraArgs=copyEncryptionArgs + ) except ClientError as e: - if get_error_code(e) == 'AccessDenied' and 'cross-region' in get_error_message(e): + if get_error_code(e) == "AccessDenied" and "cross-region" in get_error_message( + e + ): # We have this problem: # The Internet and AWS docs say that we just can't do a # cross-region CopyObject from inside a VPC with an endpoint. The @@ -422,13 +454,16 @@ def copyKeyMultipart(resource: "S3ServiceResource", # the source region's API servers, they reject it and tell us to # talk to the destination region's API servers instead. Which we # can't reach. - logger.error('Amazon is refusing to perform a server-side copy of %s: %s', copySource, e) + logger.error( + "Amazon is refusing to perform a server-side copy of %s: %s", + copySource, + e, + ) raise ServerSideCopyProhibitedError() else: # Some other ClientError happened raise - # Wait until the object exists before calling head_object object_summary = resource.ObjectSummary(dstObject.bucket_name, dstObject.key) object_summary.wait_until_exists(**destEncryptionArgs) @@ -438,14 +473,15 @@ def copyKeyMultipart(resource: "S3ServiceResource", # after, leaving open the possibility that it may have been # modified again in the few seconds since the copy finished. There # isn't much we can do about it. - info = resource.meta.client.head_object(Bucket=dstObject.bucket_name, - Key=dstObject.key, - **destEncryptionArgs) - return info.get('VersionId', None) + info = resource.meta.client.head_object( + Bucket=dstObject.bucket_name, Key=dstObject.key, **destEncryptionArgs + ) + return info.get("VersionId", None) -def _put_attributes_using_post(self, domain_or_name, item_name, attributes, - replace=True, expected_value=None): +def _put_attributes_using_post( + self, domain_or_name, item_name, attributes, replace=True, expected_value=None +): """ Monkey-patched version of SDBConnection.put_attributes that uses POST instead of GET @@ -455,13 +491,12 @@ def _put_attributes_using_post(self, domain_or_name, item_name, attributes, https://github.com/BD2KGenomics/toil/issues/502 """ domain, domain_name = self.get_domain_and_name(domain_or_name) - params = {'DomainName': domain_name, - 'ItemName': item_name} + params = {"DomainName": domain_name, "ItemName": item_name} self._build_name_value_list(params, attributes, replace) if expected_value: self._build_expected_value(params, expected_value) # The addition of the verb keyword argument is the only difference to put_attributes (Hannes) - return self.get_status('PutAttributes', params, verb='POST') + return self.get_status("PutAttributes", params, verb="POST") def monkeyPatchSdbConnection(sdb): @@ -470,6 +505,7 @@ def monkeyPatchSdbConnection(sdb): """ sdb.put_attributes = types.MethodType(_put_attributes_using_post, sdb) + def sdb_unavailable(e): # Since we're checking against a collection here we absolutely need an # integer status code. This is probably a BotoServerError, but other 500s @@ -478,23 +514,28 @@ def sdb_unavailable(e): def no_such_sdb_domain(e): - return (isinstance(e, ClientError) - and get_error_code(e) - and get_error_code(e).endswith('NoSuchDomain')) + return ( + isinstance(e, ClientError) + and get_error_code(e) + and get_error_code(e).endswith("NoSuchDomain") + ) def retryable_ssl_error(e): # https://github.com/BD2KGenomics/toil/issues/978 - return isinstance(e, SSLError) and e.reason == 'DECRYPTION_FAILED_OR_BAD_RECORD_MAC' + return isinstance(e, SSLError) and e.reason == "DECRYPTION_FAILED_OR_BAD_RECORD_MAC" def retryable_sdb_errors(e): - return (sdb_unavailable(e) - or no_such_sdb_domain(e) - or connection_error(e) - or retryable_ssl_error(e)) + return ( + sdb_unavailable(e) + or no_such_sdb_domain(e) + or connection_error(e) + or retryable_ssl_error(e) + ) -def retry_sdb(delays=DEFAULT_DELAYS, timeout=DEFAULT_TIMEOUT, predicate=retryable_sdb_errors): +def retry_sdb( + delays=DEFAULT_DELAYS, timeout=DEFAULT_TIMEOUT, predicate=retryable_sdb_errors +): return old_retry(delays=delays, timeout=timeout, predicate=predicate) - diff --git a/src/toil/jobStores/conftest.py b/src/toil/jobStores/conftest.py index b90874c8de..a92ca9749b 100644 --- a/src/toil/jobStores/conftest.py +++ b/src/toil/jobStores/conftest.py @@ -18,6 +18,7 @@ try: import boto3 + print(boto3.__file__) # prevent this import from being removed except ImportError: collect_ignore.append("aws") diff --git a/src/toil/jobStores/fileJobStore.py b/src/toil/jobStores/fileJobStore.py index 636db8e782..0ae5c8816b 100644 --- a/src/toil/jobStores/fileJobStore.py +++ b/src/toil/jobStores/fileJobStore.py @@ -19,30 +19,29 @@ import re import shutil import stat -import sys import time import uuid +from collections.abc import Iterable, Iterator from contextlib import contextmanager -from typing import IO, Iterable, Iterator, List, Optional, Union, overload +from typing import IO, Literal, Optional, Union, overload from urllib.parse import ParseResult, quote, unquote -if sys.version_info >= (3, 8): - from typing import Literal -else: - from typing_extensions import Literal - from toil.fileStores import FileID from toil.job import TemporaryID -from toil.jobStores.abstractJobStore import (AbstractJobStore, - JobStoreExistsException, - NoSuchFileException, - NoSuchJobException, - NoSuchJobStoreException) -from toil.lib.io import (AtomicFileCreate, - atomic_copy, - atomic_copyobj, - mkdtemp, - robust_rmtree) +from toil.jobStores.abstractJobStore import ( + AbstractJobStore, + JobStoreExistsException, + NoSuchFileException, + NoSuchJobException, + NoSuchJobStoreException, +) +from toil.lib.io import ( + AtomicFileCreate, + atomic_copy, + atomic_copyobj, + mkdtemp, + robust_rmtree, +) logger = logging.getLogger(__name__) @@ -59,10 +58,10 @@ class FileJobStore(AbstractJobStore): # What prefix should be on the per-job job directories, to distinguish them # from the spray directories? - JOB_DIR_PREFIX = 'instance-' + JOB_DIR_PREFIX = "instance-" # What prefix do we put on the per-job-name directories we sort jobs into? - JOB_NAME_DIR_PREFIX = 'kind-' + JOB_NAME_DIR_PREFIX = "kind-" # 10Mb RAM chunks when reading/writing files BUFFER_SIZE = 10485760 # 10Mb @@ -88,17 +87,17 @@ def __init__(self, path: str, fanOut: int = 1000) -> None: logger.debug("Path to job store directory is '%s'.", self.jobStoreDir) # Directory where actual job files go, and their job-associated temp files - self.jobsDir = os.path.join(self.jobStoreDir, 'jobs') + self.jobsDir = os.path.join(self.jobStoreDir, "jobs") # Directory where stats files go - self.statsDir = os.path.join(self.jobStoreDir, 'stats') + self.statsDir = os.path.join(self.jobStoreDir, "stats") # Directory where non-job-associated files for the file store go - self.filesDir = os.path.join(self.jobStoreDir, 'files/no-job') + self.filesDir = os.path.join(self.jobStoreDir, "files/no-job") # Directory where job-associated files for the file store go. # Each per-job directory in here will have separate directories for # files to clean up and files to not clean up when the job is deleted. - self.jobFilesDir = os.path.join(self.jobStoreDir, 'files/for-job') + self.jobFilesDir = os.path.join(self.jobStoreDir, "files/for-job") # Directory where shared files go - self.sharedFilesDir = os.path.join(self.jobStoreDir, 'files/shared') + self.sharedFilesDir = os.path.join(self.jobStoreDir, "files/shared") self.fanOut = fanOut @@ -107,7 +106,7 @@ def __init__(self, path: str, fanOut: int = 1000) -> None: self.symlink_job_store_reads = None def __repr__(self): - return f'FileJobStore({self.jobStoreDir})' + return f"FileJobStore({self.jobStoreDir})" def initialize(self, config): try: @@ -153,8 +152,10 @@ def assign_job_id(self, job_description): # Make a unique temp directory under a directory for this job name, # possibly sprayed across multiple levels of subdirectories. - absJobDir = mkdtemp(prefix=self.JOB_DIR_PREFIX, - dir=self._get_arbitrary_jobs_dir_for_name(usefulFilename)) + absJobDir = mkdtemp( + prefix=self.JOB_DIR_PREFIX, + dir=self._get_arbitrary_jobs_dir_for_name(usefulFilename), + ) job_description.jobStoreID = self._get_job_id_from_dir(absJobDir) @@ -180,7 +181,9 @@ def _wait_for_exists(self, jobStoreID, maxTries=35, sleepTime=1): Spin-wait and block for a job to appear before returning False if it does not. """ - return self._wait_for_file(self._get_job_file_name(jobStoreID), maxTries=maxTries, sleepTime=sleepTime) + return self._wait_for_file( + self._get_job_file_name(jobStoreID), maxTries=maxTries, sleepTime=sleepTime + ) def _wait_for_file(self, fileName, maxTries=35, sleepTime=1): """ @@ -198,14 +201,18 @@ def _wait_for_file(self, fileName, maxTries=35, sleepTime=1): In practice, the need for retries happens rarely, but it does happen over the course of large workflows with a jobStore on a busy NFS. """ - for iTry in range(1,maxTries+1): + for iTry in range(1, maxTries + 1): if os.path.exists(fileName): return True if iTry >= maxTries: return False elif iTry == 1: - logger.warning(("Path `{}` does not exist (yet). We will try #{} more times with {}s " - "intervals.").format(fileName, maxTries - iTry, sleepTime)) + logger.warning( + ( + "Path `{}` does not exist (yet). We will try #{} more times with {}s " + "intervals." + ).format(fileName, maxTries - iTry, sleepTime) + ) time.sleep(sleepTime) return False @@ -216,7 +223,7 @@ def get_public_url(self, jobStoreFileID): self._check_job_store_file_id(jobStoreFileID) jobStorePath = self._get_file_path_from_id(jobStoreFileID) if os.path.exists(jobStorePath): - return 'file:' + jobStorePath + return "file:" + jobStorePath else: raise NoSuchFileException(jobStoreFileID) @@ -224,7 +231,7 @@ def get_shared_public_url(self, sharedFileName): jobStorePath = os.path.join(self.sharedFilesDir, sharedFileName) if not os.path.exists(jobStorePath): raise NoSuchFileException(sharedFileName) - return 'file:' + jobStorePath + return "file:" + jobStorePath def load_job(self, job_id): # If the job obviously doesn't exist, note that. @@ -232,7 +239,7 @@ def load_job(self, job_id): # Try to load a valid version of the job. jobFile = self._get_job_file_name(job_id) try: - with open(jobFile, 'rb') as fileHandle: + with open(jobFile, "rb") as fileHandle: job = pickle.load(fileHandle) except FileNotFoundError: # We were racing a delete on a non-POSIX-compliant filesystem. @@ -254,7 +261,9 @@ def load_job(self, job_id): def update_job(self, job): assert job.jobStoreID is not None, f"Tried to update job {job} without an ID" - assert not isinstance(job.jobStoreID, TemporaryID), f"Tried to update job {job} without an assigned ID" + assert not isinstance( + job.jobStoreID, TemporaryID + ), f"Tried to update job {job} without an assigned ID" job.pre_update_hook() @@ -267,10 +276,11 @@ def update_job(self, job): # The file is then moved to its correct path. # Atomicity guarantees use the fact the underlying file system's "move" # function is atomic. - with open(dest_filename + ".new", 'xb') as f: + with open(dest_filename + ".new", "xb") as f: pickle.dump(job, f) # This should be atomic for the file system os.rename(dest_filename + ".new", dest_filename) + def delete_job(self, job_id): # The jobStoreID is the relative path to the directory containing the job, # removing this directory deletes the job. @@ -311,7 +321,9 @@ def _copy_or_link(self, src_path, dst_path, hardlink=False, symlink=False): else: atomic_copy(src_path, dst_path) - def _import_file(self, otherCls, uri, shared_file_name=None, hardlink=False, symlink=True): + def _import_file( + self, otherCls, uri, shared_file_name=None, hardlink=False, symlink=True + ): # symlink argument says whether the caller can take symlinks or not. # ex: if false, it means the workflow cannot work with symlinks and we need to hardlink or copy. # TODO: Do we ever actually hardlink? @@ -320,21 +332,30 @@ def _import_file(self, otherCls, uri, shared_file_name=None, hardlink=False, sym if issubclass(otherCls, FileJobStore): if os.path.isdir(uri_path): # Don't allow directories (unless someone is racing us) - raise IsADirectoryError(f"URI {uri} points to a directory but a file was expected") + raise IsADirectoryError( + f"URI {uri} points to a directory but a file was expected" + ) if shared_file_name is None: executable = os.stat(uri_path).st_mode & stat.S_IXUSR != 0 - absPath = self._get_unique_file_path(uri_path) # use this to get a valid path to write to in job store + # use this to get a valid path to write to in job store + absPath = self._get_unique_file_path(uri_path) self._copy_or_link(uri, absPath, hardlink=hardlink, symlink=symlink) # TODO: os.stat(absPath).st_size consistently gives values lower than # getDirSizeRecursively() - return FileID(self._get_file_id_from_path(absPath), os.stat(absPath).st_size, executable) + return FileID( + self._get_file_id_from_path(absPath), + os.stat(absPath).st_size, + executable, + ) else: self._requireValidSharedFileName(shared_file_name) path = self._get_shared_file_path(shared_file_name) self._copy_or_link(uri, path, hardlink=hardlink, symlink=symlink) return None else: - return super()._import_file(otherCls, uri, shared_file_name=shared_file_name) + return super()._import_file( + otherCls, uri, shared_file_name=shared_file_name + ) def _export_file(self, otherCls, file_id, uri): if issubclass(otherCls, FileJobStore): @@ -343,7 +364,7 @@ def _export_file(self, otherCls, file_id, uri): # Make sure we don't need to worry about directories when exporting # to local files, just like for cloud storage. os.makedirs(os.path.dirname(destPath), exist_ok=True) - executable = getattr(file_id, 'executable', False) + executable = getattr(file_id, "executable", False) if self.moveExports: self._move_and_linkback(srcPath, destPath, executable=executable) else: @@ -352,7 +373,11 @@ def _export_file(self, otherCls, file_id, uri): super()._default_export_file(otherCls, file_id, uri) def _move_and_linkback(self, srcPath, destPath, executable): - logger.debug("moveExports option, Moving src=%s to dest=%s ; then symlinking dest to src", srcPath, destPath) + logger.debug( + "moveExports option, Moving src=%s to dest=%s ; then symlinking dest to src", + srcPath, + destPath, + ) shutil.move(srcPath, destPath) os.symlink(destPath, srcPath) if executable: @@ -388,7 +413,7 @@ def _open_url(cls, url: ParseResult) -> IO[bytes]: """ Open a file URL as a binary stream. """ - return open(cls._extract_path_from_url(url), 'rb') + return open(cls._extract_path_from_url(url), "rb") @classmethod def _write_to_url(cls, readable, url, executable=False): @@ -400,20 +425,24 @@ def _write_to_url(cls, readable, url, executable=False): :param object readable: An open file object to read from. """ # we use a ~10Mb buffer to improve speed - atomic_copyobj(readable, - cls._extract_path_from_url(url), - length=cls.BUFFER_SIZE, - executable=executable) + atomic_copyobj( + readable, + cls._extract_path_from_url(url), + length=cls.BUFFER_SIZE, + executable=executable, + ) @classmethod - def _list_url(cls, url: ParseResult) -> List[str]: + def _list_url(cls, url: ParseResult) -> list[str]: path = cls._extract_path_from_url(url) listing = [] for p in os.listdir(path): # We know there are no slashes in these component = quote(p) # Return directories with trailing slashes and files without - listing.append((component + '/') if os.path.isdir(os.path.join(path, p)) else component) + listing.append( + (component + "/") if os.path.isdir(os.path.join(path, p)) else component + ) return listing @classmethod @@ -426,13 +455,13 @@ def _extract_path_from_url(url): """ :return: local file path of file pointed at by the given URL """ - if url.netloc != '' and url.netloc != 'localhost': + if url.netloc != "" and url.netloc != "localhost": raise RuntimeError("The URL '%s' is invalid" % url.geturl()) return unquote(url.path) @classmethod def _supports_url(cls, url, export=False): - return url.scheme.lower() == 'file' + return url.scheme.lower() == "file" def _make_string_filename_safe(self, arbitraryString, maxLength=240): """ @@ -461,7 +490,7 @@ def _make_string_filename_safe(self, arbitraryString, maxLength=240): parts.append("UNPRINTABLE") # Glue it all together, and truncate to length - return '_'.join(parts)[:maxLength] + return "_".join(parts)[:maxLength] def write_file(self, local_path, job_id=None, cleanup=False): absPath = self._get_unique_file_path(local_path, job_id, cleanup) @@ -470,20 +499,30 @@ def write_file(self, local_path, job_id=None, cleanup=False): return relPath @contextmanager - def write_file_stream(self, job_id=None, cleanup=False, basename=None, encoding=None, errors=None): + def write_file_stream( + self, job_id=None, cleanup=False, basename=None, encoding=None, errors=None + ): if not basename: - basename = 'stream' + basename = "stream" absPath = self._get_unique_file_path(basename, job_id, cleanup) relPath = self._get_file_id_from_path(absPath) - with open(absPath, 'wb' if encoding == None else 'wt', encoding=encoding, errors=errors) as f: + with open( + absPath, + "wb" if encoding == None else "wt", + encoding=encoding, + errors=errors, + ) as f: # Don't yield while holding an open file descriptor to the temp # file. That can result in temp files still being open when we try # to clean ourselves up, somehow, for certain workloads. yield f, relPath def get_empty_file_store_id(self, jobStoreID=None, cleanup=False, basename=None): - with self.write_file_stream(jobStoreID, cleanup, basename) as (fileHandle, jobStoreFileID): + with self.write_file_stream(jobStoreID, cleanup, basename) as ( + fileHandle, + jobStoreFileID, + ): return jobStoreFileID def update_file(self, file_id, local_path): @@ -500,13 +539,15 @@ def read_file(self, file_id: str, local_path: str, symlink: bool = False) -> Non self._check_job_store_file_id(file_id) jobStoreFilePath = self._get_file_path_from_id(file_id) localDirPath = os.path.dirname(local_path) - executable = getattr(file_id, 'executable', False) + executable = getattr(file_id, "executable", False) if not symlink and os.path.islink(local_path): # We had a symlink and want to clobber it with a hardlink or copy. os.unlink(local_path) - if os.path.exists(local_path) and os.path.samefile(jobStoreFilePath, local_path): + if os.path.exists(local_path) and os.path.samefile( + jobStoreFilePath, local_path + ): # The files are already the same: same name, hardlinked, or # symlinked. There is nothing to do, and trying to shutil.copyfile # one over the other will fail. @@ -535,7 +576,9 @@ def read_file(self, file_id: str, local_path: str, symlink: bool = False) -> Non # In this case, we try to make a hard link. pass else: - logger.error(f"Unexpected OSError when reading file '{jobStoreFilePath}' from job store") + logger.error( + f"Unexpected OSError when reading file '{jobStoreFilePath}' from job store" + ) raise # If we get here, symlinking isn't an option. @@ -579,7 +622,9 @@ def read_file(self, file_id: str, local_path: str, symlink: bool = False) -> Non # hit the file copy case. pass else: - logger.error(f"Unexpected OSError when reading file '{jobStoreFilePath}' from job store") + logger.error( + f"Unexpected OSError when reading file '{jobStoreFilePath}' from job store" + ) raise # If we get here, neither a symlink nor a hardlink will work. @@ -594,15 +639,17 @@ def delete_file(self, file_id): def file_exists(self, file_id): absPath = self._get_file_path_from_id(file_id) - if (not absPath.startswith(self.jobsDir) and - not absPath.startswith(self.filesDir) and - not absPath.startswith(self.jobFilesDir)): + if ( + not absPath.startswith(self.jobsDir) + and not absPath.startswith(self.filesDir) + and not absPath.startswith(self.jobFilesDir) + ): # Don't even look for it, it is out of bounds. raise NoSuchFileException(file_id) try: st = os.stat(absPath) - except os.error: + except OSError: return False if not stat.S_ISREG(st.st_mode): raise NoSuchFileException(file_id) @@ -612,15 +659,17 @@ def get_file_size(self, file_id): # Duplicate a bunch of fileExists to save on stat calls absPath = self._get_file_path_from_id(file_id) - if (not absPath.startswith(self.jobsDir) and - not absPath.startswith(self.filesDir) and - not absPath.startswith(self.jobFilesDir)): + if ( + not absPath.startswith(self.jobsDir) + and not absPath.startswith(self.filesDir) + and not absPath.startswith(self.jobFilesDir) + ): # Don't even look for it, it is out of bounds. raise NoSuchFileException(file_id) try: st = os.stat(absPath) - except os.error: + except OSError: return 0 return st.st_size @@ -630,7 +679,12 @@ def update_file_stream(self, file_id, encoding=None, errors=None): # File objects are context managers (CM) so we could simply return what open returns. # However, it is better to wrap it in another CM so as to prevent users from accessing # the file object directly, without a with statement. - with open(self._get_file_path_from_id(file_id), 'wb' if encoding == None else 'wt', encoding=encoding, errors=errors) as f: + with open( + self._get_file_path_from_id(file_id), + "wb" if encoding == None else "wt", + encoding=encoding, + errors=errors, + ) as f: yield f @contextmanager @@ -640,15 +694,13 @@ def read_file_stream( file_id: Union[str, FileID], encoding: Literal[None] = None, errors: Optional[str] = None, - ) -> Iterator[IO[bytes]]: - ... + ) -> Iterator[IO[bytes]]: ... @contextmanager @overload def read_file_stream( self, file_id: Union[str, FileID], encoding: str, errors: Optional[str] = None - ) -> Iterator[IO[str]]: - ... + ) -> Iterator[IO[str]]: ... @contextmanager @overload @@ -657,8 +709,7 @@ def read_file_stream( file_id: Union[str, FileID], encoding: Optional[str] = None, errors: Optional[str] = None, - ) -> Union[Iterator[IO[bytes]], Iterator[IO[str]]]: - ... + ) -> Union[Iterator[IO[bytes]], Iterator[IO[str]]]: ... @contextmanager def read_file_stream( @@ -694,18 +745,32 @@ def _get_shared_file_path(self, sharedFileName): return os.path.join(self.sharedFilesDir, sharedFileName) @contextmanager - def write_shared_file_stream(self, shared_file_name, encrypted=None, encoding=None, errors=None): + def write_shared_file_stream( + self, shared_file_name, encrypted=None, encoding=None, errors=None + ): # the isProtected parameter has no effect on the fileStore self._requireValidSharedFileName(shared_file_name) - with AtomicFileCreate(self._get_shared_file_path(shared_file_name)) as tmpSharedFilePath: - with open(tmpSharedFilePath, 'wb' if encoding == None else 'wt', encoding=encoding, errors=None) as f: + with AtomicFileCreate( + self._get_shared_file_path(shared_file_name) + ) as tmpSharedFilePath: + with open( + tmpSharedFilePath, + "wb" if encoding == None else "wt", + encoding=encoding, + errors=None, + ) as f: yield f @contextmanager def read_shared_file_stream(self, shared_file_name, encoding=None, errors=None): self._requireValidSharedFileName(shared_file_name) try: - with open(self._get_shared_file_path(shared_file_name), 'rb' if encoding == None else 'rt', encoding=encoding, errors=errors) as f: + with open( + self._get_shared_file_path(shared_file_name), + "rb" if encoding == None else "rt", + encoding=encoding, + errors=errors, + ) as f: yield f except OSError as e: @@ -743,15 +808,11 @@ def list_all_file_names(self, for_job: Optional[str] = None) -> Iterable[str]: job_id = self._get_job_id_from_files_dir(job_instance_dir) jobs.append(job_id) - for name in os.listdir(self.sharedFilesDir): - # Announce all the shared files - yield name + yield from os.listdir(self.sharedFilesDir) for file_dir_path in self._list_dynamic_spray_dir(self.filesDir): # Run on all the no-job files - for dir_file in os.listdir(file_dir_path): - # There ought to be just one file in here. - yield dir_file + yield from os.listdir(file_dir_path) for job_store_id in jobs: # Files from _get_job_files_dir @@ -763,9 +824,7 @@ def list_all_file_names(self, for_job: Optional[str] = None) -> Iterable[str]: # Except the cleanup directory which we do later. continue file_dir_path = os.path.join(job_files_dir, file_dir) - for dir_file in os.listdir(file_dir_path): - # There ought to be just one file in here. - yield dir_file + yield from os.listdir(file_dir_path) # Files from _get_job_files_cleanup_dir job_cleanup_files_dir = os.path.join(job_files_dir, "cleanup") @@ -773,15 +832,13 @@ def list_all_file_names(self, for_job: Optional[str] = None) -> Iterable[str]: for file_dir in os.listdir(job_cleanup_files_dir): # Each file is in its own directory file_dir_path = os.path.join(job_cleanup_files_dir, file_dir) - for dir_file in os.listdir(file_dir_path): - # There ought to be just one file in here. - yield dir_file + yield from os.listdir(file_dir_path) def write_logs(self, msg): # Temporary files are placed in the stats directory tree tempStatsFileName = "stats" + str(uuid.uuid4().hex) + ".new" tempStatsFile = os.path.join(self._get_arbitrary_stats_dir(), tempStatsFileName) - writeFormat = 'w' if isinstance(msg, str) else 'wb' + writeFormat = "w" if isinstance(msg, str) else "wb" with open(tempStatsFile, writeFormat) as f: f.write(msg) os.rename(tempStatsFile, tempStatsFile[:-4]) # This operation is atomic @@ -790,14 +847,14 @@ def read_logs(self, callback, read_all=False): numberOfFilesProcessed = 0 for tempDir in self._stats_directories(): for tempFile in os.listdir(tempDir): - if tempFile.startswith('stats'): + if tempFile.startswith("stats"): absTempFile = os.path.join(tempDir, tempFile) if os.path.isfile(absTempFile): - if read_all or not tempFile.endswith('.new'): - with open(absTempFile, 'rb') as fH: + if read_all or not tempFile.endswith(".new"): + with open(absTempFile, "rb") as fH: callback(fH) numberOfFilesProcessed += 1 - newName = tempFile.rsplit('.', 1)[0] + '.new' + newName = tempFile.rsplit(".", 1)[0] + ".new" newAbsTempFile = os.path.join(tempDir, newName) # Mark this item as read os.rename(absTempFile, newAbsTempFile) @@ -822,14 +879,14 @@ def _get_job_id_from_dir(self, absPath): :param str absPath: The absolute path to a job directory under self.jobsDir which represents a job. :rtype : string, string is the job ID, which is a path relative to self.jobsDir """ - return absPath[len(self.jobsDir)+1:] + return absPath[len(self.jobsDir) + 1 :] def _get_job_id_from_files_dir(self, absPath: str) -> str: """ :param str absPath: The absolute path to a job directory under self.jobFilesDir which holds a job's files. :rtype : string, string is the job ID """ - return absPath[len(self.jobFilesDir)+1:] + return absPath[len(self.jobFilesDir) + 1 :] def _get_job_file_name(self, jobStoreID): """ @@ -911,7 +968,7 @@ def _get_file_id_from_path(self, absPath): :rtype : string, string is the file ID. """ - return quote(absPath[len(self.jobStoreDir)+1:]) + return quote(absPath[len(self.jobStoreDir) + 1 :]) def _check_job_store_file_id(self, jobStoreFileID): """ @@ -941,11 +998,17 @@ def _get_arbitrary_jobs_dir_for_name(self, jobNameSlug): if len(os.listdir(self.jobsDir)) > self.fanOut: # Make sure that we don't over-fill the root with too many unique job names. # Go in a subdirectory tree, and then go by job name and make another tree. - return self._get_dynamic_spray_dir(os.path.join(self._get_dynamic_spray_dir(self.jobsDir), - self.JOB_NAME_DIR_PREFIX + jobNameSlug)) + return self._get_dynamic_spray_dir( + os.path.join( + self._get_dynamic_spray_dir(self.jobsDir), + self.JOB_NAME_DIR_PREFIX + jobNameSlug, + ) + ) else: # Just go in the root - return self._get_dynamic_spray_dir(os.path.join(self.jobsDir, self.JOB_NAME_DIR_PREFIX + jobNameSlug)) + return self._get_dynamic_spray_dir( + os.path.join(self.jobsDir, self.JOB_NAME_DIR_PREFIX + jobNameSlug) + ) def _get_arbitrary_stats_dir(self): """ @@ -1089,8 +1152,9 @@ def _job_directories(self): continue # Now we have only the directories that are named after jobs. Look inside them. - yield from self._walk_dynamic_spray_dir(os.path.join(jobHoldingDir, jobNameDir)) - + yield from self._walk_dynamic_spray_dir( + os.path.join(jobHoldingDir, jobNameDir) + ) def _stats_directories(self): """ @@ -1140,18 +1204,24 @@ def _get_file_directory(self, jobStoreID=None, cleanup=False): self._check_job_store_id_assigned(jobStoreID) # Find where all its created files should live, depending on if # they need to go away when the job is deleted or not. - jobFilesDir = self._get_job_files_dir(jobStoreID) if not cleanup else self._get_job_files_cleanup_dir(jobStoreID) + jobFilesDir = ( + self._get_job_files_dir(jobStoreID) + if not cleanup + else self._get_job_files_cleanup_dir(jobStoreID) + ) # Lazily create the parent directory. # We don't want our tree filled with confusingly empty directories. os.makedirs(jobFilesDir, exist_ok=True) # Then make a temp directory inside it - filesDir = os.path.join(jobFilesDir, 'file-' + uuid.uuid4().hex) + filesDir = os.path.join(jobFilesDir, "file-" + uuid.uuid4().hex) os.mkdir(filesDir) return filesDir else: # Make a temporary file within the non-job-associated files hierarchy - filesDir = os.path.join(self._get_arbitrary_files_dir(), 'file-' + uuid.uuid4().hex) + filesDir = os.path.join( + self._get_arbitrary_files_dir(), "file-" + uuid.uuid4().hex + ) os.mkdir(filesDir) return filesDir diff --git a/src/toil/jobStores/googleJobStore.py b/src/toil/jobStores/googleJobStore.py index 9eaf9ff07e..1cc2cc2714 100644 --- a/src/toil/jobStores/googleJobStore.py +++ b/src/toil/jobStores/googleJobStore.py @@ -20,20 +20,24 @@ from contextlib import contextmanager from functools import wraps from io import BytesIO -from typing import IO, List, Optional +from typing import IO, Optional from urllib.parse import ParseResult -from google.api_core.exceptions import (GoogleAPICallError, - InternalServerError, - ServiceUnavailable) +from google.api_core.exceptions import ( + GoogleAPICallError, + InternalServerError, + ServiceUnavailable, +) from google.auth.exceptions import DefaultCredentialsError from google.cloud import exceptions, storage -from toil.jobStores.abstractJobStore import (AbstractJobStore, - JobStoreExistsException, - NoSuchFileException, - NoSuchJobException, - NoSuchJobStoreException) +from toil.jobStores.abstractJobStore import ( + AbstractJobStore, + JobStoreExistsException, + NoSuchFileException, + NoSuchJobException, + NoSuchJobStoreException, +) from toil.jobStores.utils import ReadablePipe, WritablePipe from toil.lib.compatibility import compat_bytes from toil.lib.io import AtomicFileCreate @@ -42,7 +46,7 @@ log = logging.getLogger(__name__) -GOOGLE_STORAGE = 'gs' +GOOGLE_STORAGE = "gs" MAX_BATCH_SIZE = 1000 @@ -75,19 +79,22 @@ def google_retry(f): It should wrap any function that makes use of the Google Client API """ + @wraps(f) def wrapper(*args, **kwargs): - for attempt in old_retry(delays=truncExpBackoff(), - timeout=300, - predicate=google_retry_predicate): + for attempt in old_retry( + delays=truncExpBackoff(), timeout=300, predicate=google_retry_predicate + ): with attempt: return f(*args, **kwargs) + return wrapper class GoogleJobStore(AbstractJobStore): - nodeServiceAccountJson = '/root/service_account.json' + nodeServiceAccountJson = "/root/service_account.json" + def __init__(self, locator: str) -> None: super().__init__(locator) @@ -99,20 +106,19 @@ def __init__(self, locator: str) -> None: projectID = None self.projectID = projectID - self.bucketName = namePrefix+"--toil" + self.bucketName = namePrefix + "--toil" log.debug("Instantiating google jobStore with name: %s", self.bucketName) # this is a :class:`~google.cloud.storage.bucket.Bucket` self.bucket = None - self.statsBaseID = 'f16eef0c-b597-4b8b-9b0c-4d605b4f506c' - self.statsReadPrefix = '_' - self.readStatsBaseID = self.statsReadPrefix+self.statsBaseID + self.statsBaseID = "f16eef0c-b597-4b8b-9b0c-4d605b4f506c" + self.statsReadPrefix = "_" + self.readStatsBaseID = self.statsReadPrefix + self.statsBaseID self.sseKey = None self.storageClient = self.create_client() - @classmethod def create_client(cls) -> storage.Client: """ @@ -127,28 +133,36 @@ def create_client(cls) -> storage.Client: # Determine if we have an override environment variable for our credentials. # We get the path to check existence, but Google Storage works out what # to use later by looking at the environment again. - credentials_path: Optional[str] = os.getenv('GOOGLE_APPLICATION_CREDENTIALS', None) + credentials_path: Optional[str] = os.getenv( + "GOOGLE_APPLICATION_CREDENTIALS", None + ) if credentials_path is not None and not os.path.exists(credentials_path): # If the file is missing, complain. # This variable holds a file name and not any sensitive data itself. - log.warning("File '%s' from GOOGLE_APPLICATION_CREDENTIALS is unavailable! " - "We may not be able to authenticate!", - credentials_path) + log.warning( + "File '%s' from GOOGLE_APPLICATION_CREDENTIALS is unavailable! " + "We may not be able to authenticate!", + credentials_path, + ) if credentials_path is None and os.path.exists(cls.nodeServiceAccountJson): try: # load credentials from a particular file on GCE nodes if an override path is not set - return storage.Client.from_service_account_json(cls.nodeServiceAccountJson) + return storage.Client.from_service_account_json( + cls.nodeServiceAccountJson + ) except OSError: # Probably we don't have permission to use the file. - log.warning("File '%s' exists but didn't work to authenticate!", - cls.nodeServiceAccountJson) + log.warning( + "File '%s' exists but didn't work to authenticate!", + cls.nodeServiceAccountJson, + ) # Either a filename is specified, or our fallback file isn't there. try: # See if Google can work out how to authenticate. return storage.Client() - except (DefaultCredentialsError, EnvironmentError): + except (DefaultCredentialsError, OSError): # Depending on which Google codepath or module version (???) # realizes we have no credentials, we can get an EnvironemntError, # or the new DefaultCredentialsError we are supposedly specced to @@ -158,7 +172,6 @@ def create_client(cls) -> storage.Client: # This is likely to happen all the time so don't warn. return storage.Client.create_anonymous_client() - @google_retry def initialize(self, config=None): try: @@ -169,7 +182,7 @@ def initialize(self, config=None): # set up sever side encryption after we set up config in super if self.config.sseKey is not None: - with open(self.config.sseKey, 'rb') as f: + with open(self.config.sseKey, "rb") as f: self.sseKey = compat_bytes(f.read()) assert len(self.sseKey) == 32 @@ -199,13 +212,13 @@ def destroy(self): count = 0 while count < len(blobs_to_delete): with self.storageClient.batch(): - for blob in blobs_to_delete[count:count + MAX_BATCH_SIZE]: + for blob in blobs_to_delete[count : count + MAX_BATCH_SIZE]: blob.delete() count = count + MAX_BATCH_SIZE self.bucket.delete() def _new_job_id(self): - return f'job-{uuid.uuid4()}' + return f"job-{uuid.uuid4()}" def assign_job_id(self, job_description): jobStoreID = self._new_job_id() @@ -219,12 +232,17 @@ def batch(self): def create_job(self, job_description): job_description.pre_update_hook() - self._write_bytes(job_description.jobStoreID, pickle.dumps(job_description, protocol=pickle.HIGHEST_PROTOCOL)) + self._write_bytes( + job_description.jobStoreID, + pickle.dumps(job_description, protocol=pickle.HIGHEST_PROTOCOL), + ) return job_description @google_retry def job_exists(self, job_id): - return self.bucket.blob(compat_bytes(job_id), encryption_key=self.sseKey).exists() + return self.bucket.blob( + compat_bytes(job_id), encryption_key=self.sseKey + ).exists() @google_retry def get_public_url(self, fileName): @@ -251,7 +269,11 @@ def load_job(self, job_id): def update_job(self, job): job.pre_update_hook() - self._write_bytes(job.jobStoreID, pickle.dumps(job, protocol=pickle.HIGHEST_PROTOCOL), update=True) + self._write_bytes( + job.jobStoreID, + pickle.dumps(job, protocol=pickle.HIGHEST_PROTOCOL), + update=True, + ) @google_retry def delete_job(self, job_id): @@ -269,32 +291,40 @@ def get_env(self): env = {} - credentials_path: Optional[str] = os.getenv('GOOGLE_APPLICATION_CREDENTIALS', None) + credentials_path: Optional[str] = os.getenv( + "GOOGLE_APPLICATION_CREDENTIALS", None + ) if credentials_path is not None: # Send along the environment variable that points to the credentials file. # It must be available in the same place on all nodes. - env['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path + env["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path return env @google_retry def jobs(self): - for blob in self.bucket.list_blobs(prefix=b'job-'): + for blob in self.bucket.list_blobs(prefix=b"job-"): jobStoreID = blob.name # TODO: do this better - if len(jobStoreID) == 40 and jobStoreID.startswith('job-'): # 'job-' + uuid length + if len(jobStoreID) == 40 and jobStoreID.startswith( + "job-" + ): # 'job-' + uuid length yield self.load_job(jobStoreID) def write_file(self, local_path, job_id=None, cleanup=False): fileID = self._new_id(isFile=True, jobStoreID=job_id if cleanup else None) - with open(local_path, 'rb') as f: + with open(local_path, "rb") as f: self._write_file(fileID, f) return fileID @contextmanager - def write_file_stream(self, job_id=None, cleanup=False, basename=None, encoding=None, errors=None): + def write_file_stream( + self, job_id=None, cleanup=False, basename=None, encoding=None, errors=None + ): fileID = self._new_id(isFile=True, jobStoreID=job_id if cleanup else None) - with self._upload_stream(fileID, update=False, encoding=encoding, errors=errors) as writable: + with self._upload_stream( + fileID, update=False, encoding=encoding, errors=errors + ) as writable: yield writable, fileID def get_empty_file_store_id(self, jobStoreID=None, cleanup=False, basename=None): @@ -309,16 +339,19 @@ def read_file(self, file_id, local_path, symlink=False): if not self.file_exists(file_id): raise NoSuchFileException(file_id) with AtomicFileCreate(local_path) as tmpPath: - with open(tmpPath, 'wb') as writeable: - blob = self.bucket.get_blob(compat_bytes(file_id), encryption_key=self.sseKey) + with open(tmpPath, "wb") as writeable: + blob = self.bucket.get_blob( + compat_bytes(file_id), encryption_key=self.sseKey + ) blob.download_to_file(writeable) - if getattr(file_id, 'executable', False): + if getattr(file_id, "executable", False): os.chmod(local_path, os.stat(local_path).st_mode | stat.S_IXUSR) @contextmanager def read_file_stream(self, file_id, encoding=None, errors=None): - with self.read_shared_file_stream(file_id, isProtected=True, encoding=encoding, - errors=errors) as readable: + with self.read_shared_file_stream( + file_id, isProtected=True, encoding=encoding, errors=errors + ) as readable: yield readable def delete_file(self, file_id): @@ -326,32 +359,49 @@ def delete_file(self, file_id): @google_retry def file_exists(self, file_id): - return self.bucket.blob(compat_bytes(file_id), encryption_key=self.sseKey).exists() + return self.bucket.blob( + compat_bytes(file_id), encryption_key=self.sseKey + ).exists() @google_retry def get_file_size(self, file_id): if not self.file_exists(file_id): return 0 - return self.bucket.get_blob(compat_bytes(file_id), encryption_key=self.sseKey).size + return self.bucket.get_blob( + compat_bytes(file_id), encryption_key=self.sseKey + ).size def update_file(self, file_id, local_path): - with open(local_path, 'rb') as f: + with open(local_path, "rb") as f: self._write_file(file_id, f, update=True) @contextmanager def update_file_stream(self, file_id, encoding=None, errors=None): - with self._upload_stream(file_id, update=True, encoding=encoding, errors=errors) as writable: + with self._upload_stream( + file_id, update=True, encoding=encoding, errors=errors + ) as writable: yield writable @contextmanager - def write_shared_file_stream(self, shared_file_name, encrypted=True, encoding=None, errors=None): - with self._upload_stream(shared_file_name, encrypt=encrypted, update=True, encoding=encoding, - errors=errors) as writable: + def write_shared_file_stream( + self, shared_file_name, encrypted=True, encoding=None, errors=None + ): + with self._upload_stream( + shared_file_name, + encrypt=encrypted, + update=True, + encoding=encoding, + errors=errors, + ) as writable: yield writable @contextmanager - def read_shared_file_stream(self, shared_file_name, isProtected=True, encoding=None, errors=None): - with self._download_stream(shared_file_name, encrypt=isProtected, encoding=encoding, errors=errors) as readable: + def read_shared_file_stream( + self, shared_file_name, isProtected=True, encoding=None, errors=None + ): + with self._download_stream( + shared_file_name, encrypt=isProtected, encoding=encoding, errors=errors + ) as readable: yield readable @classmethod @@ -374,7 +424,7 @@ def _get_blob_from_url(cls, url, exists=False): fileName = url.path # remove leading '/', which can cause problems if fileName is a path - if fileName.startswith('/'): + if fileName.startswith("/"): fileName = fileName[1:] storageClient = cls.create_client() @@ -413,7 +463,7 @@ def _open_url(cls, url: ParseResult) -> IO[bytes]: @classmethod def _supports_url(cls, url, export=False): - return url.scheme.lower() == 'gs' + return url.scheme.lower() == "gs" @classmethod def _write_to_url(cls, readable: bytes, url: str, executable: bool = False) -> None: @@ -421,12 +471,16 @@ def _write_to_url(cls, readable: bytes, url: str, executable: bool = False) -> N blob.upload_from_file(readable) @classmethod - def _list_url(cls, url: ParseResult) -> List[str]: - raise NotImplementedError("Listing files in Google buckets is not yet implemented!") + def _list_url(cls, url: ParseResult) -> list[str]: + raise NotImplementedError( + "Listing files in Google buckets is not yet implemented!" + ) @classmethod def _get_is_directory(cls, url: ParseResult) -> bool: - raise NotImplementedError("Checking directory status in Google buckets is not yet implemented!") + raise NotImplementedError( + "Checking directory status in Google buckets is not yet implemented!" + ) @google_retry def write_logs(self, msg: bytes) -> None: @@ -456,7 +510,9 @@ def read_logs(self, callback, read_all=False): if not read_all: # rename this file by copying it and deleting the old version to avoid # rereading it - newID = self.readStatsBaseID + blob.name[len(self.statsBaseID):] + newID = ( + self.readStatsBaseID + blob.name[len(self.statsBaseID) :] + ) # NOTE: just copies then deletes old. self.bucket.rename_blob(blob, compat_bytes(newID)) except NoSuchFileException: @@ -472,7 +528,7 @@ def read_logs(self, callback, read_all=False): if lastTry: # this was our second try, we are reasonably sure there aren't any stats # left to gather - break + break # Try one more time in a couple seconds time.sleep(5) lastTry = True @@ -486,11 +542,11 @@ def read_logs(self, callback, read_all=False): @staticmethod def _new_id(isFile=False, jobStoreID=None): if isFile and jobStoreID: # file associated with job - return jobStoreID+str(uuid.uuid4()) + return jobStoreID + str(uuid.uuid4()) elif isFile: # nonassociated file return str(uuid.uuid4()) else: # job id - return f'job-{uuid.uuid4()}' + return f"job-{uuid.uuid4()}" @google_retry def _delete(self, jobStoreFileID): @@ -514,8 +570,12 @@ def _read_contents(self, jobStoreID): return job.download_as_string() @google_retry - def _write_file(self, jobStoreID: str, fileObj: bytes, update=False, encrypt=True) -> None: - blob = self.bucket.blob(compat_bytes(jobStoreID), encryption_key=self.sseKey if encrypt else None) + def _write_file( + self, jobStoreID: str, fileObj: bytes, update=False, encrypt=True + ) -> None: + blob = self.bucket.blob( + compat_bytes(jobStoreID), encryption_key=self.sseKey if encrypt else None + ) if not update: # TODO: should probably raise a special exception and be added to all jobStores assert not blob.exists() @@ -529,7 +589,9 @@ def _write_bytes(self, jobStoreID: str, stringToUpload: bytes, **kwarg) -> None: @contextmanager @google_retry - def _upload_stream(self, fileName, update=False, encrypt=True, encoding=None, errors=None): + def _upload_stream( + self, fileName, update=False, encrypt=True, encoding=None, errors=None + ): """ Yields a context manager that can be used to write to the bucket with a stream. See :class:`~toil.jobStores.utils.WritablePipe` for an example. @@ -555,7 +617,10 @@ def _upload_stream(self, fileName, update=False, encrypt=True, encoding=None, er :return: an instance of WritablePipe. :rtype: :class:`~toil.jobStores.utils.writablePipe` """ - blob = self.bucket.blob(compat_bytes(fileName), encryption_key=self.sseKey if encrypt else None) + blob = self.bucket.blob( + compat_bytes(fileName), encryption_key=self.sseKey if encrypt else None + ) + class UploadPipe(WritablePipe): def readFrom(self, readable): if not update: @@ -591,7 +656,9 @@ def _download_stream(self, fileName, encrypt=True, encoding=None, errors=None): :rtype: :class:`~toil.jobStores.utils.ReadablePipe` """ - blob = self.bucket.get_blob(compat_bytes(fileName), encryption_key=self.sseKey if encrypt else None) + blob = self.bucket.get_blob( + compat_bytes(fileName), encryption_key=self.sseKey if encrypt else None + ) if blob is None: raise NoSuchFileException(fileName) diff --git a/src/toil/jobStores/utils.py b/src/toil/jobStores/utils.py index 5fb5f5db01..946cbc8898 100644 --- a/src/toil/jobStores/utils.py +++ b/src/toil/jobStores/utils.py @@ -10,6 +10,7 @@ log = logging.getLogger(__name__) + class WritablePipe(ABC): """ An object-oriented wrapper for os.pipe. Clients should subclass it, implement @@ -84,7 +85,7 @@ def readFrom(self, readable): raise NotImplementedError() def _reader(self): - with os.fdopen(self.readable_fh, 'rb') as readable: + with os.fdopen(self.readable_fh, "rb") as readable: # TODO: If the reader somehow crashes here, both threads might try # to close readable_fh. Fortunately we don't do anything that # should be able to fail here. @@ -112,7 +113,12 @@ def __init__(self, encoding=None, errors=None): def __enter__(self): self.readable_fh, writable_fh = os.pipe() - self.writable = os.fdopen(writable_fh, 'wb' if self.encoding == None else 'wt', encoding=self.encoding, errors=self.errors) + self.writable = os.fdopen( + writable_fh, + "wb" if self.encoding == None else "wt", + encoding=self.encoding, + errors=self.errors, + ) self.thread = ExceptionalThread(target=self._reader) self.thread.start() return self.writable @@ -132,7 +138,9 @@ def __exit__(self, exc_type, exc_val, exc_tb): # already an exception in the main thread raise else: - log.error('Swallowing additional exception in reader thread: %s', str(e)) + log.error( + "Swallowing additional exception in reader thread: %s", str(e) + ) finally: # The responsibility for closing the readable end is generally that of the reader # thread. To cover the small window before the reader takes over we also close it here. @@ -217,7 +225,7 @@ def writeTo(self, writable): def _writer(self): try: - with os.fdopen(self.writable_fh, 'wb') as writable: + with os.fdopen(self.writable_fh, "wb") as writable: self.writeTo(writable) except OSError as e: # The other side of the pipe may have been closed by the @@ -244,7 +252,12 @@ def __init__(self, encoding=None, errors=None): def __enter__(self): readable_fh, self.writable_fh = os.pipe() - self.readable = os.fdopen(readable_fh, 'rb' if self.encoding == None else 'rt', encoding=self.encoding, errors=self.errors) + self.readable = os.fdopen( + readable_fh, + "rb" if self.encoding == None else "rt", + encoding=self.encoding, + errors=self.errors, + ) self.thread = ExceptionalThread(target=self._writer) self.thread.start() return self.readable @@ -264,6 +277,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): # already an exception in the main thread raise + class ReadableTransformingPipe(ReadablePipe): """ A pipe which is constructed around a readable stream, and which provides a @@ -296,7 +310,6 @@ class ReadableTransformingPipe(ReadablePipe): """ - def __init__(self, source, encoding=None, errors=None): """ :param str encoding: the name of the encoding used to encode the file. Encodings are the same @@ -323,15 +336,17 @@ def transform(self, readable, writable): def writeTo(self, writable): self.transform(self.source, writable) + class JobStoreUnavailableException(RuntimeError): """ Raised when a particular type of job store is requested but can't be used. """ + def generate_locator( job_store_type: str, local_suggestion: Optional[str] = None, - decoration: Optional[str] = None + decoration: Optional[str] = None, ) -> str: """ Generate a random locator for a job store of the given type. Raises an @@ -347,7 +362,7 @@ def generate_locator( """ # Prepare decoration for splicing into strings - decoration = ('-' + decoration) if decoration else '' + decoration = ("-" + decoration) if decoration else "" try: if job_store_type == "google": @@ -363,6 +378,7 @@ def generate_locator( elif job_store_type == "aws": # Make sure we have AWS from toil.jobStores.aws.jobStore import AWSJobStore # noqa + # Find a region from toil.lib.aws import get_current_aws_region @@ -370,7 +386,9 @@ def generate_locator( if not region: # We can't generate an AWS job store without a region - raise JobStoreUnavailableException(f"{job_store_type} job store can't be made without a region") + raise JobStoreUnavailableException( + f"{job_store_type} job store can't be made without a region" + ) # Roll a random name return f"aws:{region}:toil{decoration}-{str(uuid.uuid4())}" @@ -380,11 +398,14 @@ def generate_locator( return local_suggestion else: # Pick a temp path - return os.path.join(tempfile.gettempdir(), 'toil-' + str(uuid.uuid4()) + decoration) + return os.path.join( + tempfile.gettempdir(), "toil-" + str(uuid.uuid4()) + decoration + ) else: - raise JobStoreUnavailableException(f"{job_store_type} job store isn't known") + raise JobStoreUnavailableException( + f"{job_store_type} job store isn't known" + ) except ImportError: - raise JobStoreUnavailableException(f"libraries for {job_store_type} job store are not installed") - - - + raise JobStoreUnavailableException( + f"libraries for {job_store_type} job store are not installed" + ) diff --git a/src/toil/leader.py b/src/toil/leader.py index a488f699d0..337085239a 100644 --- a/src/toil/leader.py +++ b/src/toil/leader.py @@ -21,31 +21,36 @@ import pickle import sys import time -from typing import Any, Dict, List, Optional, Set, Union +from typing import Any, Optional, Union import enlighten from toil import resolveEntryPoint from toil.batchSystems import DeadlockException -from toil.batchSystems.abstractBatchSystem import (AbstractBatchSystem, - BatchJobExitReason, - EXIT_STATUS_UNAVAILABLE_VALUE) -from toil.bus import (JobCompletedMessage, - JobFailedMessage, - JobIssuedMessage, - JobMissingMessage, - JobUpdatedMessage, - QueueSizeMessage, - get_job_kind) +from toil.batchSystems.abstractBatchSystem import ( + EXIT_STATUS_UNAVAILABLE_VALUE, + AbstractBatchSystem, + BatchJobExitReason, +) +from toil.bus import ( + JobCompletedMessage, + JobFailedMessage, + JobIssuedMessage, + JobMissingMessage, + JobUpdatedMessage, + QueueSizeMessage, + get_job_kind, +) from toil.common import Config, ToilMetrics from toil.cwl.utils import CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE from toil.exceptions import FailedJobsException -from toil.job import (CheckpointJobDescription, - JobDescription, - ServiceJobDescription, - TemporaryID) -from toil.jobStores.abstractJobStore import (AbstractJobStore, - NoSuchJobException) +from toil.job import ( + CheckpointJobDescription, + JobDescription, + ServiceJobDescription, + TemporaryID, +) +from toil.jobStores.abstractJobStore import AbstractJobStore, NoSuchJobException from toil.lib.throttle import LocalThrottle from toil.provisioners.abstractProvisioner import AbstractProvisioner from toil.provisioners.clusterScaler import ScalerThread @@ -79,13 +84,15 @@ class Leader: consulting the job store, and issuing them in the batch system. """ - def __init__(self, - config: Config, - batchSystem: AbstractBatchSystem, - provisioner: Optional[AbstractProvisioner], - jobStore: AbstractJobStore, - rootJob: JobDescription, - jobCache: Optional[Dict[Union[str, TemporaryID], JobDescription]] = None) -> None: + def __init__( + self, + config: Config, + batchSystem: AbstractBatchSystem, + provisioner: Optional[AbstractProvisioner], + jobStore: AbstractJobStore, + rootJob: JobDescription, + jobCache: Optional[dict[Union[str, TemporaryID], JobDescription]] = None, + ) -> None: """ Create a Toil Leader object. @@ -117,7 +124,9 @@ def __init__(self, # Message bus messages need to go to the given file. # Keep a reference to the return value so the listener stays alive. - self._message_subscription = self.toilState.bus.connect_output_file(self.config.write_messages) + self._message_subscription = self.toilState.bus.connect_output_file( + self.config.write_messages + ) # Connect to the message bus, so we will get all the messages of these # types in an inbox. @@ -132,17 +141,22 @@ def __init__(self, # this, somehow, so they can also see messages from this? self.toilState.load_workflow(rootJob, jobCache=jobCache) - logger.debug("Found %s jobs to start and %i jobs with successors to run", - self._messages.count(JobUpdatedMessage), len(self.toilState.successorCounts)) + logger.debug( + "Found %s jobs to start and %i jobs with successors to run", + self._messages.count(JobUpdatedMessage), + len(self.toilState.successorCounts), + ) # Batch system self.batchSystem = batchSystem if len(self.batchSystem.getIssuedBatchJobIDs()) != 0: - raise RuntimeError("The initialized batchsystem did not start with 0 active jobs.") + raise RuntimeError( + "The initialized batchsystem did not start with 0 active jobs." + ) logger.debug("Checked batch system has no running jobs and no updated jobs") # Map of batch system IDs to job store IDs - self.issued_jobs_by_batch_system_id: Dict[int, str] = {} + self.issued_jobs_by_batch_system_id: dict[int, str] = {} # Number of preemptible jobs currently being run by batch system self.preemptibleJobsIssued = 0 @@ -150,10 +164,12 @@ def __init__(self, # Tracking the number service jobs issued, # this is used limit the number of services issued to the batch system self.serviceJobsIssued = 0 - self.serviceJobsToBeIssued: List[str] = [] # A queue of IDs of service jobs that await scheduling + self.serviceJobsToBeIssued: list[str] = ( + [] + ) # A queue of IDs of service jobs that await scheduling # Equivalents for service jobs to be run on preemptible nodes self.preemptibleServiceJobsIssued = 0 - self.preemptibleServiceJobsToBeIssued: List[str] = [] + self.preemptibleServiceJobsToBeIssued: list[str] = [] # Timing of the rescuing method self.timeSinceJobsLastRescued = None @@ -161,7 +177,7 @@ def __init__(self, # For each issued job's batch system ID, how many times did we not see # it when we should have? If this hits a threshold, the job is declared # missing and killed and possibly retried. - self.reissueMissingJobs_missingHash: Dict[int, int] = {} + self.reissueMissingJobs_missingHash: dict[int, int] = {} # Class used to create/destroy nodes in the cluster, may be None if # using a statically defined cluster @@ -179,7 +195,7 @@ def __init__(self, self.statsAndLogging = StatsAndLogging(self.jobStore, self.config) # Set used to monitor deadlocked jobs - self.potentialDeadlockedJobs: Set[str] = set() + self.potentialDeadlockedJobs: set[str] = set() self.potentialDeadlockTime = 0 # A dashboard that runs on the leader node in AWS clusters to track the state @@ -187,8 +203,13 @@ def __init__(self, self.toilMetrics: Optional[ToilMetrics] = None # internal jobs we should not expose at top level debugging - self.debugJobNames = ("CWLJob", "CWLWorkflow", "CWLScatter", "CWLGather", - "ResolveIndirect") + self.debugJobNames = ( + "CWLJob", + "CWLWorkflow", + "CWLScatter", + "CWLGather", + "ResolveIndirect", + ) self.deadlockThrottler = LocalThrottle(self.config.deadlockCheckInterval) @@ -206,8 +227,10 @@ def __init__(self, self.GOOD_COLOR = (0, 60, 108) self.BAD_COLOR = (253, 199, 0) # And set a format that shows failures - self.PROGRESS_BAR_FORMAT = ('{desc}{desc_pad}{percentage:3.0f}%|{bar}| {count:{len_total}d}/{total:d} ' - '({count_1:d} failures) [{elapsed}<{eta}, {rate:.2f}{unit_pad}{unit}/s]') + self.PROGRESS_BAR_FORMAT = ( + "{desc}{desc_pad}{percentage:3.0f}%|{bar}| {count:{len_total}d}/{total:d} " + "({count_1:d} failures) [{elapsed}<{eta}, {rate:.2f}{unit_pad}{unit}/s]" + ) # TODO: No way to set background color on the terminal for the bar. # What exit code should the process use if the workflow failed? @@ -225,16 +248,25 @@ def run(self) -> Any: """ self.jobStore.write_kill_flag(kill=False) - with enlighten.get_manager(stream=sys.stderr, enabled=not self.config.disableProgress) as manager: + with enlighten.get_manager( + stream=sys.stderr, enabled=not self.config.disableProgress + ) as manager: # Set up the fancy console UI if desirable - self.progress_overall = manager.counter(total=0, desc='Workflow Progress', unit='jobs', - color=self.GOOD_COLOR, bar_format=self.PROGRESS_BAR_FORMAT) + self.progress_overall = manager.counter( + total=0, + desc="Workflow Progress", + unit="jobs", + color=self.GOOD_COLOR, + bar_format=self.PROGRESS_BAR_FORMAT, + ) self.progress_failed = self.progress_overall.add_subcounter(self.BAD_COLOR) # Start the stats/logging aggregation thread self.statsAndLogging.start() if self.config.metrics: - self.toilMetrics = ToilMetrics(self.toilState.bus, provisioner=self.provisioner) + self.toilMetrics = ToilMetrics( + self.toilState.bus, provisioner=self.provisioner + ) try: @@ -251,10 +283,13 @@ def run(self) -> Any: self.innerLoop() finally: if self.clusterScaler is not None: - logger.debug('Waiting for workers to shutdown.') + logger.debug("Waiting for workers to shutdown.") startTime = time.time() self.clusterScaler.shutdown() - logger.debug('Worker shutdown complete in %s seconds.', time.time() - startTime) + logger.debug( + "Worker shutdown complete in %s seconds.", + time.time() - startTime, + ) finally: # Ensure service manager thread is properly shutdown @@ -267,16 +302,25 @@ def run(self) -> Any: self.toilMetrics.shutdown() # Filter the failed jobs - self.toilState.totalFailedJobs = [j for j in self.toilState.totalFailedJobs if self.toilState.job_exists(j)] + self.toilState.totalFailedJobs = [ + j + for j in self.toilState.totalFailedJobs + if self.toilState.job_exists(j) + ] try: self.create_status_sentinel_file(self.toilState.totalFailedJobs) except OSError as e: - logger.debug(f'Error from importFile with hardlink=True: {e}') + logger.debug(f"Error from importFile with hardlink=True: {e}") - logger.info("Finished toil run %s" % - ("successfully." if not self.toilState.totalFailedJobs \ - else ("with %s failed jobs." % len(self.toilState.totalFailedJobs)))) + logger.info( + "Finished toil run %s" + % ( + "successfully." + if not self.toilState.totalFailedJobs + else ("with %s failed jobs." % len(self.toilState.totalFailedJobs)) + ) + ) if len(self.toilState.totalFailedJobs): failed_jobs = [] @@ -289,19 +333,28 @@ def run(self) -> Any: # Job actually finished and was removed pass - logger.info("Failed jobs at end of the run: %s", ' '.join(str(j) for j in failed_jobs)) - raise FailedJobsException(self.jobStore, failed_jobs, exit_code=self.recommended_fail_exit_code) + logger.info( + "Failed jobs at end of the run: %s", + " ".join(str(j) for j in failed_jobs), + ) + raise FailedJobsException( + self.jobStore, + failed_jobs, + exit_code=self.recommended_fail_exit_code, + ) return self.jobStore.get_root_job_return_value() def create_status_sentinel_file(self, fail: bool) -> None: """Create a file in the jobstore indicating failure or success.""" - logName = 'failed.log' if fail else 'succeeded.log' + logName = "failed.log" if fail else "succeeded.log" localLog = os.path.join(os.getcwd(), logName) - open(localLog, 'w').close() - self.jobStore.import_file('file://' + localLog, logName, hardlink=True) + open(localLog, "w").close() + self.jobStore.import_file("file://" + localLog, logName, hardlink=True) - if os.path.exists(localLog): # Bandaid for Jenkins tests failing stochastically and unexplainably. + if os.path.exists( + localLog + ): # Bandaid for Jenkins tests failing stochastically and unexplainably. os.remove(localLog) def _handledFailedSuccessor(self, successor_id: str, predecessor_id: str) -> bool: @@ -313,8 +366,11 @@ def _handledFailedSuccessor(self, successor_id: str, predecessor_id: str) -> boo :returns: True if there are still active successors. False if all successors have failed and the job is queued to run to handle the failed successors. """ - logger.debug("Successor job: %s of job: %s has failed """ - "predecessors", self.toilState.get_job(successor_id), self.toilState.get_job(predecessor_id)) + logger.debug( + "Successor job: %s of job: %s has failed " "" "predecessors", + self.toilState.get_job(successor_id), + self.toilState.get_job(predecessor_id), + ) # Add the job to the set having failed successors self.toilState.hasFailedSuccessors.add(predecessor_id) @@ -328,9 +384,12 @@ def _handledFailedSuccessor(self, successor_id: str, predecessor_id: str) -> boo # If the job now has no active successors, add to active jobs # so it can be processed as a job with failed successors. if self.toilState.count_pending_successors(predecessor_id) == 0: - logger.debug("Job: %s has no successors to run " - "and some are failed, adding to list of jobs " - "with failed successors", self.toilState.get_job(predecessor_id)) + logger.debug( + "Job: %s has no successors to run " + "and some are failed, adding to list of jobs " + "with failed successors", + self.toilState.get_job(predecessor_id), + ) self._messages.publish(JobUpdatedMessage(predecessor_id, 0)) # Report no successors are running return False @@ -338,7 +397,9 @@ def _handledFailedSuccessor(self, successor_id: str, predecessor_id: str) -> boo # Some successors are still active return True - def _checkSuccessorReadyToRunMultiplePredecessors(self, successor_id: str, predecessor_id: str) -> bool: + def _checkSuccessorReadyToRunMultiplePredecessors( + self, successor_id: str, predecessor_id: str + ) -> bool: """ Check if a successor job is ready to run when there are multiple predecessors. @@ -359,8 +420,11 @@ def _checkSuccessorReadyToRunMultiplePredecessors(self, successor_id: str, prede # Grab the predecessor for reporting predecessor = self.toilState.get_job(predecessor_id) - logger.debug("Successor job: %s of job: %s has multiple " - "predecessors", successor, predecessor) + logger.debug( + "Successor job: %s of job: %s has multiple " "predecessors", + successor, + predecessor, + ) # Add the predecessor as a finished predecessor to the successor successor.predecessorsFinished.add(predecessor_id) @@ -379,13 +443,17 @@ def _checkSuccessorReadyToRunMultiplePredecessors(self, successor_id: str, prede if len(successor.predecessorsFinished) == successor.predecessorNumber: # All the successor's predecessors are done now. # Remove the successor job from the set of waiting multi-predecessor jobs. - self.toilState.jobsToBeScheduledWithMultiplePredecessors.remove(successor_id) + self.toilState.jobsToBeScheduledWithMultiplePredecessors.remove( + successor_id + ) return True else: # The job is not ready to run return False - def _makeJobSuccessorReadyToRun(self, successor_id: str, predecessor_id: str) -> bool: + def _makeJobSuccessorReadyToRun( + self, successor_id: str, predecessor_id: str + ) -> bool: """ Make a successor job ready to run if possible. @@ -393,7 +461,7 @@ def _makeJobSuccessorReadyToRun(self, successor_id: str, predecessor_id: str) -> :param predecessor_id: The job which the successor comes after. :returns: False if the successor job should not yet be run or True otherwise. """ - #Build map from successor to predecessors. + # Build map from successor to predecessors. if successor_id not in self.toilState.successor_to_predecessors: self.toilState.successor_to_predecessors[successor_id] = set() if not isinstance(successor_id, str): @@ -404,9 +472,15 @@ def _makeJobSuccessorReadyToRun(self, successor_id: str, predecessor_id: str) -> # Grab the successor successor = self.toilState.get_job(successor_id) - logger.debug("Added job %s as coming after job %s", successor, self.toilState.get_job(predecessor_id)) + logger.debug( + "Added job %s as coming after job %s", + successor, + self.toilState.get_job(predecessor_id), + ) if successor.predecessorNumber > 1: - return self._checkSuccessorReadyToRunMultiplePredecessors(successor_id, predecessor_id) + return self._checkSuccessorReadyToRunMultiplePredecessors( + successor_id, predecessor_id + ) else: return True @@ -425,13 +499,20 @@ def _runJobSuccessors(self, predecessor_id: str) -> None: next_successors = predecessor.nextSuccessors() if next_successors is None or len(next_successors) == 0: - raise RuntimeError(f"Job {self} trying to run successors, but it doesn't have any") - logger.debug("Job: %s has %i successors to schedule", - predecessor_id, len(next_successors)) - #Record the number of successors that must be completed before - #the job can be considered again + raise RuntimeError( + f"Job {self} trying to run successors, but it doesn't have any" + ) + logger.debug( + "Job: %s has %i successors to schedule", + predecessor_id, + len(next_successors), + ) + # Record the number of successors that must be completed before + # the job can be considered again if self.toilState.count_pending_successors(predecessor_id) != 0: - raise RuntimeError('Attempted to schedule successors of the same job twice!') + raise RuntimeError( + "Attempted to schedule successors of the same job twice!" + ) self.toilState.successors_pending(predecessor_id, len(next_successors)) # For each successor schedule if all predecessors have been completed @@ -442,7 +523,11 @@ def _runJobSuccessors(self, predecessor_id: str) -> None: except NoSuchJobException: # Job already done and gone, but probably shouldn't be. Or maybe isn't visible yet. # TODO: Shouldn't this be an error? - logger.warning("Job %s is a successor of %s but is already done and gone.", successor_id, predecessor_id) + logger.warning( + "Job %s is a successor of %s but is already done and gone.", + successor_id, + predecessor_id, + ) # Don't try and run it continue if self._makeJobSuccessorReadyToRun(successor_id, predecessor_id): @@ -464,46 +549,62 @@ def _processFailedSuccessors(self, predecessor_id: str): # The job has services running; signal for them to be killed. # Once they are killed, then the job will be updated again and then # scheduled to be removed. - logger.warning("Telling job %s to terminate its services due to successor failure", - predecessor) - self.serviceManager.kill_services(self.toilState.servicesIssued[predecessor_id], - error=True) + logger.warning( + "Telling job %s to terminate its services due to successor failure", + predecessor, + ) + self.serviceManager.kill_services( + self.toilState.servicesIssued[predecessor_id], error=True + ) elif self.toilState.count_pending_successors(predecessor_id) > 0: # The job has non-service jobs running; wait for them to finish. # the job will be re-added to the updated jobs when these jobs # are done - logger.debug("Job %s with ID: %s with failed successors still has successor jobs running", - predecessor, predecessor_id) - elif (isinstance(predecessor, CheckpointJobDescription) and - predecessor.checkpoint is not None and - predecessor.remainingTryCount > 1): + logger.debug( + "Job %s with ID: %s with failed successors still has successor jobs running", + predecessor, + predecessor_id, + ) + elif ( + isinstance(predecessor, CheckpointJobDescription) + and predecessor.checkpoint is not None + and predecessor.remainingTryCount > 1 + ): # If the job is a checkpoint and has remaining retries... # The logic behind using > 1 rather than > 0 here: Since this job has # been tried once (without decreasing its try count as the job # itself was successful), and its subtree failed, it shouldn't be retried # unless it has more than 1 try. if predecessor_id in self.toilState.jobs_issued: - logger.debug('Checkpoint job %s was updated while issued', predecessor_id) + logger.debug( + "Checkpoint job %s was updated while issued", predecessor_id + ) else: # It hasn't already been reissued. # This check lets us be robust against repeated job update # messages (such as from services starting *and* failing), by # making sure that we don't stay in a state that where we # reissue the job every time we get one. - logger.warning('Job: %s is being restarted as a checkpoint after the total ' - 'failure of jobs in its subtree.', predecessor_id) + logger.warning( + "Job: %s is being restarted as a checkpoint after the total " + "failure of jobs in its subtree.", + predecessor_id, + ) self.issueJob(predecessor) else: # Mark it totally failed - logger.debug("Job %s is being processed as completely failed", predecessor_id) + logger.debug( + "Job %s is being processed as completely failed", predecessor_id + ) self.processTotallyFailedJob(predecessor_id) def _processReadyJob(self, job_id: str, result_status: int): # We operate on the JobDescription mostly. readyJob = self.toilState.get_job(job_id) - logger.debug('Updating status of job %s with result status: %s', - readyJob, result_status) + logger.debug( + "Updating status of job %s with result status: %s", readyJob, result_status + ) # TODO: Filter out nonexistent successors/services now, so we can tell # if they are all done and the job needs deleting? @@ -516,8 +617,11 @@ def _processReadyJob(self, job_id: str, result_status: int): # want to act on it; we want to wait until it gets the update it # gets when the service manager is done trying to start its # services. - logger.debug("Got a job to update which is still owned by the service " - "manager: %s", readyJob.jobStoreID) + logger.debug( + "Got a job to update which is still owned by the service " + "manager: %s", + readyJob.jobStoreID, + ) elif readyJob.jobStoreID in self.toilState.hasFailedSuccessors: self._processFailedSuccessors(job_id) elif readyJob.has_body() or result_status != 0: @@ -531,8 +635,9 @@ def _processReadyJob(self, job_id: str, result_status: int): # If the job has run out of tries or is a service job whose error flag has # been indicated, fail the job. - if (readyJob.remainingTryCount == 0 or - (isServiceJob and not self.jobStore.file_exists(readyJob.errorJobStoreID))): + if readyJob.remainingTryCount == 0 or ( + isServiceJob and not self.jobStore.file_exists(readyJob.errorJobStoreID) + ): self.processTotallyFailedJob(job_id) logger.warning("Job %s is completely failed", readyJob) else: @@ -543,29 +648,39 @@ def _processReadyJob(self, job_id: str, result_status: int): # Build a map from the service jobs to the job and a map # of the services created for the job if readyJob.jobStoreID in self.toilState.servicesIssued: - raise RuntimeError(f"The ready job: {readyJob.jobStoreID} was already issued.") + raise RuntimeError( + f"The ready job: {readyJob.jobStoreID} was already issued." + ) self.toilState.servicesIssued[readyJob.jobStoreID] = set() for serviceJobList in readyJob.serviceHostIDsInBatches(): for serviceID in serviceJobList: if serviceID in self.toilState.service_to_client: - raise RuntimeError(f"The ready service ID: {serviceID} was already added.") + raise RuntimeError( + f"The ready service ID: {serviceID} was already added." + ) # TODO: Why do we refresh here? self.toilState.reset_job(serviceID) serviceHost = self.toilState.get_job(serviceID) self.toilState.service_to_client[serviceID] = readyJob.jobStoreID self.toilState.servicesIssued[readyJob.jobStoreID].add(serviceID) - logger.debug("Giving job: %s to service manager to schedule its jobs", readyJob) + logger.debug( + "Giving job: %s to service manager to schedule its jobs", readyJob + ) # Use the service manager to start the services self.serviceManager.put_client(job_id) elif readyJob.nextSuccessors() is not None: # There are successors to run self._runJobSuccessors(job_id) elif readyJob.jobStoreID in self.toilState.servicesIssued: - logger.debug("Telling job: %s to terminate its services due to the " - "successful completion of its successor jobs", - readyJob) - self.serviceManager.kill_services(self.toilState.servicesIssued[readyJob.jobStoreID], error=False) + logger.debug( + "Telling job: %s to terminate its services due to the " + "successful completion of its successor jobs", + readyJob, + ) + self.serviceManager.kill_services( + self.toilState.servicesIssued[readyJob.jobStoreID], error=False + ) else: # There are no remaining tasks to schedule within the job. # @@ -594,7 +709,10 @@ def _processReadyJob(self, job_id: str, result_status: int): try: self.toilState.delete_job(readyJob.jobStoreID) except Exception as e: - logger.exception("Re-processing success for job we could not remove: %s", readyJob) + logger.exception( + "Re-processing success for job we could not remove: %s", + readyJob, + ) # Kick it back to being handled as succeeded again. We # don't want to have a failure here cause a Toil-level # retry which causes more actual jobs to try to run. @@ -606,12 +724,18 @@ def _processReadyJob(self, job_id: str, result_status: int): self.processRemovedJob(readyJob, 0) else: self.processTotallyFailedJob(job_id) - logger.error("Job: %s is empty but completely failed - something is very wrong", readyJob.jobStoreID) + logger.error( + "Job: %s is empty but completely failed - something is very wrong", + readyJob.jobStoreID, + ) def _processReadyJobs(self): """Process jobs that are ready to be scheduled/have successors to schedule.""" - logger.debug('Built the jobs list, currently have %i jobs to update and %i jobs issued', - self._messages.count(JobUpdatedMessage), self.getNumberOfJobsIssued()) + logger.debug( + "Built the jobs list, currently have %i jobs to update and %i jobs issued", + self._messages.count(JobUpdatedMessage), + self.getNumberOfJobsIssued(), + ) # Now go through and, for each job that has updated this tick, process it. @@ -626,9 +750,13 @@ def _processReadyJobs(self): if message.job_id in handled_with_status: if handled_with_status[message.job_id] == message.result_status: # This is a harmless duplicate - logger.debug("Job %s already updated this tick with status %s and " - "we've received duplicate message %s", message.job_id, - handled_with_status[message.job_id], message) + logger.debug( + "Job %s already updated this tick with status %s and " + "we've received duplicate message %s", + message.job_id, + handled_with_status[message.job_id], + message, + ) else: # This is a conflicting update. We may have already treated # a job as succeeding but now we've heard it's failed, or @@ -636,9 +764,13 @@ def _processReadyJobs(self): # This probably shouldn't happen, but does because the # scheduler is not correct somehow and hasn't been for a # long time. Complain about it. - logger.warning("Job %s already updated this tick with status %s " - "but we've now received %s", message.job_id, - handled_with_status[message.job_id], message) + logger.warning( + "Job %s already updated this tick with status %s " + "but we've now received %s", + message.job_id, + handled_with_status[message.job_id], + message, + ) # Either way, we only want to handle one update per tick, like # the old dict-based implementation. continue @@ -656,16 +788,21 @@ def _startServiceJobs(self): if service_id is None: break - logger.debug('Launching service job: %s', self.toilState.get_job(service_id)) + logger.debug( + "Launching service job: %s", self.toilState.get_job(service_id) + ) self.issueServiceJob(service_id) def _processJobsWithRunningServices(self): """Get jobs whose services have started.""" while True: client_id = self.serviceManager.get_ready_client(0) - if client_id is None: # Stop trying to get jobs when function returns None + if client_id is None: # Stop trying to get jobs when function returns None break - logger.debug('Job: %s has established its services; all services are running', client_id) + logger.debug( + "Job: %s has established its services; all services are running", + client_id, + ) # Grab the client job description client = self.toilState.get_job(client_id) @@ -678,9 +815,9 @@ def _processJobsWithFailedServices(self): """Get jobs whose services have failed to start.""" while True: client_id = self.serviceManager.get_unservable_client(0) - if client_id is None: # Stop trying to get jobs when function returns None + if client_id is None: # Stop trying to get jobs when function returns None break - logger.debug('Job: %s has failed to establish its services.', client_id) + logger.debug("Job: %s has failed to establish its services.", client_id) # Grab the client job description client = self.toilState.get_job(client_id) @@ -695,20 +832,33 @@ def _processJobsWithFailedServices(self): def _gatherUpdatedJobs(self, updatedJobTuple): """Gather any new, updated JobDescriptions from the batch system.""" bsID, exitStatus, exitReason, wallTime = ( - updatedJobTuple.jobID, updatedJobTuple.exitStatus, updatedJobTuple.exitReason, - updatedJobTuple.wallTime) + updatedJobTuple.jobID, + updatedJobTuple.exitStatus, + updatedJobTuple.exitReason, + updatedJobTuple.wallTime, + ) # easy, track different state try: - updatedJob = self.toilState.get_job(self.issued_jobs_by_batch_system_id[bsID]) + updatedJob = self.toilState.get_job( + self.issued_jobs_by_batch_system_id[bsID] + ) except KeyError: - logger.warning("A result seems to already have been processed for job %s", bsID) + logger.warning( + "A result seems to already have been processed for job %s", bsID + ) else: if exitStatus == 0: - logger.debug('Job ended: %s', updatedJob) + logger.debug("Job ended: %s", updatedJob) else: - status_string = str(exitStatus) if exitStatus != EXIT_STATUS_UNAVAILABLE_VALUE else "" - logger.warning(f'Job failed with exit value {status_string}: {updatedJob}\n' - f'Exit reason: {BatchJobExitReason.to_string(exitReason)}') + status_string = ( + str(exitStatus) + if exitStatus != EXIT_STATUS_UNAVAILABLE_VALUE + else "" + ) + logger.warning( + f"Job failed with exit value {status_string}: {updatedJob}\n" + f"Exit reason: {BatchJobExitReason.to_string(exitReason)}" + ) # This logic is undefined for which of the failing jobs will send its exit code # when there are multiple failing jobs with different exit statuses self.recommended_fail_exit_code = exitStatus @@ -718,10 +868,20 @@ def _gatherUpdatedJobs(self, updatedJobTuple): # exception because of this, make sure to forward along # this exit code. logger.warning("This indicates an unsupported CWL requirement!") - self.recommended_fail_exit_code = CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE + self.recommended_fail_exit_code = ( + CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE + ) # Tell everyone it stopped running. - self._messages.publish(JobCompletedMessage(get_job_kind(updatedJob.get_names()), updatedJob.jobStoreID, exitStatus)) - self.process_finished_job(bsID, exitStatus, wall_time=wallTime, exit_reason=exitReason) + self._messages.publish( + JobCompletedMessage( + get_job_kind(updatedJob.get_names()), + updatedJob.jobStoreID, + exitStatus, + ) + ) + self.process_finished_job( + bsID, exitStatus, wall_time=wallTime, exit_reason=exitReason + ) def _processLostJobs(self): """Process jobs that have gone awry.""" @@ -729,7 +889,9 @@ def _processLostJobs(self): # gather for rescueJobsFrequency seconds) check if there are any jobs # that have run too long (see self.reissueOverLongJobs) or which have # gone missing from the batch system (see self.reissueMissingJobs) - if ((time.time() - self.timeSinceJobsLastRescued) >= self.config.rescueJobsFrequency): + if ( + time.time() - self.timeSinceJobsLastRescued + ) >= self.config.rescueJobsFrequency: # We only rescue jobs every N seconds, and when we have apparently # exhausted the current job supply self.reissueOverLongJobs() @@ -749,9 +911,11 @@ def innerLoop(self): """ self.timeSinceJobsLastRescued = time.time() - while self._messages.count(JobUpdatedMessage) > 0 or \ - self.getNumberOfJobsIssued() or \ - self.serviceManager.get_job_count(): + while ( + self._messages.count(JobUpdatedMessage) > 0 + or self.getNumberOfJobsIssued() + or self.serviceManager.get_job_count() + ): if self._messages.count(JobUpdatedMessage) > 0: self._processReadyJobs() @@ -803,13 +967,21 @@ def innerLoop(self): if not self._messages.empty(): raise RuntimeError(f"Pending messages at shutdown: {self._messages}") if self.toilState.successorCounts != {}: - raise RuntimeError(f"Jobs waiting on successors at shutdown: {self.toilState.successorCounts}") + raise RuntimeError( + f"Jobs waiting on successors at shutdown: {self.toilState.successorCounts}" + ) if self.toilState.successor_to_predecessors != {}: - raise RuntimeError(f"Successors pending for their predecessors at shutdown: {self.toilState.successor_to_predecessors}") + raise RuntimeError( + f"Successors pending for their predecessors at shutdown: {self.toilState.successor_to_predecessors}" + ) if self.toilState.service_to_client != {}: - raise RuntimeError(f"Services pending for their clients at shutdown: {self.toilState.service_to_client}") + raise RuntimeError( + f"Services pending for their clients at shutdown: {self.toilState.service_to_client}" + ) if self.toilState.servicesIssued != {}: - raise RuntimeError(f"Services running at shutdown: {self.toilState.servicesIssued}") + raise RuntimeError( + f"Services running at shutdown: {self.toilState.servicesIssued}" + ) def checkForDeadlocks(self): """Check if the system is deadlocked running service jobs.""" @@ -819,18 +991,22 @@ def checkForDeadlocks(self): # If there are no updated jobs and at least some jobs running if totalServicesIssued >= totalRunningJobs and totalRunningJobs > 0: # Collect all running service job store IDs into a set to compare with the deadlock set - running_service_ids: Set[str] = set() + running_service_ids: set[str] = set() for js_id in self.issued_jobs_by_batch_system_id.values(): job = self.toilState.get_job(js_id) - if isinstance(job, ServiceJobDescription) and self.serviceManager.is_running(js_id): + if isinstance( + job, ServiceJobDescription + ) and self.serviceManager.is_running(js_id): running_service_ids.add(js_id) if len(running_service_ids) > totalRunningJobs: # This is too many services. # TODO: couldn't more jobs have started since we polled the # running job count? - raise RuntimeError(f"Supposedly running {len(running_service_ids)} services, which is" - f"more than the {totalRunningJobs} currently running jobs overall.") + raise RuntimeError( + f"Supposedly running {len(running_service_ids)} services, which is" + f"more than the {totalRunningJobs} currently running jobs overall." + ) # If all the running jobs are active services then we have a potential deadlock if len(running_service_ids) == totalRunningJobs: @@ -844,27 +1020,49 @@ def checkForDeadlocks(self): # Use a generic message if none is available message = "Cluster may be too small." - # See if this is a new potential deadlock if self.potentialDeadlockedJobs != running_service_ids: - logger.warning(("Potential deadlock detected! All %s running jobs are service jobs, " - "with no normal jobs to use them! %s"), totalRunningJobs, message) + logger.warning( + ( + "Potential deadlock detected! All %s running jobs are service jobs, " + "with no normal jobs to use them! %s" + ), + totalRunningJobs, + message, + ) self.potentialDeadlockedJobs = running_service_ids self.potentialDeadlockTime = time.time() else: # We wait self.config.deadlockWait seconds before declaring the system deadlocked stuckFor = time.time() - self.potentialDeadlockTime if stuckFor >= self.config.deadlockWait: - logger.error("We have been deadlocked since %s on these service jobs: %s", - self.potentialDeadlockTime, self.potentialDeadlockedJobs) - raise DeadlockException(("The workflow is service deadlocked - all %d running jobs " - "have been the same active services for at least %s seconds") % (totalRunningJobs, self.config.deadlockWait)) + logger.error( + "We have been deadlocked since %s on these service jobs: %s", + self.potentialDeadlockTime, + self.potentialDeadlockedJobs, + ) + raise DeadlockException( + ( + "The workflow is service deadlocked - all %d running jobs " + "have been the same active services for at least %s seconds" + ) + % (totalRunningJobs, self.config.deadlockWait) + ) else: # Complain that we are still stuck. - waitingNormalJobs = self.getNumberOfJobsIssued() - totalServicesIssued - logger.warning(("Potentially deadlocked for %.0f seconds. Waiting at most %.0f more seconds " - "for any of %d issued non-service jobs to schedule and start. %s"), - stuckFor, self.config.deadlockWait - stuckFor, waitingNormalJobs, message) + waitingNormalJobs = ( + self.getNumberOfJobsIssued() - totalServicesIssued + ) + logger.warning( + ( + "Potentially deadlocked for %.0f seconds. Waiting at most %.0f more seconds " + "for any of %d issued non-service jobs to schedule and start. %s" + ), + stuckFor, + self.config.deadlockWait - stuckFor, + waitingNormalJobs, + message, + ) else: # We have observed non-service jobs running, so reset the potential deadlock self.feed_deadlock_watchdog() @@ -885,29 +1083,38 @@ def issueJob(self, jobNode: JobDescription) -> None: """Add a job to the queue of jobs currently trying to run.""" # Never issue the same job multiple times simultaneously if jobNode.jobStoreID in self.toilState.jobs_issued: - raise RuntimeError(f"Attempted to issue {jobNode} multiple times simultaneously!") + raise RuntimeError( + f"Attempted to issue {jobNode} multiple times simultaneously!" + ) - workerCommand = [resolveEntryPoint('_toil_worker'), - jobNode.jobName, - self.jobStoreLocator, - jobNode.jobStoreID] + workerCommand = [ + resolveEntryPoint("_toil_worker"), + jobNode.jobName, + self.jobStoreLocator, + jobNode.jobStoreID, + ] for context in self.batchSystem.getWorkerContexts(): # For each context manager hook the batch system wants to run in # the worker, serialize and send it. - workerCommand.append('--context') - workerCommand.append(base64.b64encode(pickle.dumps(context)).decode('utf-8')) + workerCommand.append("--context") + workerCommand.append( + base64.b64encode(pickle.dumps(context)).decode("utf-8") + ) - omp_threads = os.environ.get('OMP_NUM_THREADS') \ - or str(max(1, int(jobNode.cores))) # make sure OMP_NUM_THREADS is a positive integer + omp_threads = os.environ.get("OMP_NUM_THREADS") or str( + max(1, int(jobNode.cores)) + ) # make sure OMP_NUM_THREADS is a positive integer job_environment = { # Set the number of cores used by OpenMP applications - 'OMP_NUM_THREADS': omp_threads, + "OMP_NUM_THREADS": omp_threads, } # jobBatchSystemID is an int for each job - jobBatchSystemID = self.batchSystem.issueBatchJob(' '.join(workerCommand), jobNode, job_environment=job_environment) + jobBatchSystemID = self.batchSystem.issueBatchJob( + " ".join(workerCommand), jobNode, job_environment=job_environment + ) # Record the job by the ID the batch system will use to talk about it with us self.issued_jobs_by_batch_system_id[jobBatchSystemID] = jobNode.jobStoreID # Record that this job is issued right now and shouldn't e.g. be issued again. @@ -917,11 +1124,18 @@ def issueJob(self, jobNode: JobDescription) -> None: # so increment this value after the job is added to the issuedJob dict self.preemptibleJobsIssued += 1 cur_logger = logger.debug if jobNode.local else logger.info - cur_logger("Issued job %s with job batch system ID: " - "%s and %s", - jobNode, str(jobBatchSystemID), jobNode.requirements_string()) + cur_logger( + "Issued job %s with job batch system ID: " "%s and %s", + jobNode, + str(jobBatchSystemID), + jobNode.requirements_string(), + ) # Tell everyone it is issued and the queue size changed - self._messages.publish(JobIssuedMessage(get_job_kind(jobNode.get_names()), jobNode.jobStoreID, jobBatchSystemID)) + self._messages.publish( + JobIssuedMessage( + get_job_kind(jobNode.get_names()), jobNode.jobStoreID, jobBatchSystemID + ) + ) self._messages.publish(QueueSizeMessage(self.getNumberOfJobsIssued())) # Tell the user there's another job to do self.progress_overall.total += 1 @@ -941,7 +1155,9 @@ def issueServiceJob(self, service_id: str) -> None: # Grab the service job description service = self.toilState.get_job(service_id) if not isinstance(service, ServiceJobDescription): - raise RuntimeError("The grabbed service job description is not the right type.") + raise RuntimeError( + "The grabbed service job description is not the right type." + ) if service.preemptible: self.preemptibleServiceJobsToBeIssued.append(service_id) @@ -951,14 +1167,23 @@ def issueServiceJob(self, service_id: str) -> None: def issueQueingServiceJobs(self): """Issues any queuing service jobs up to the limit of the maximum allowed.""" - while len(self.serviceJobsToBeIssued) > 0 and self.serviceJobsIssued < self.config.maxServiceJobs: + while ( + len(self.serviceJobsToBeIssued) > 0 + and self.serviceJobsIssued < self.config.maxServiceJobs + ): self.issueJob(self.toilState.get_job(self.serviceJobsToBeIssued.pop())) self.serviceJobsIssued += 1 - while len(self.preemptibleServiceJobsToBeIssued) > 0 and self.preemptibleServiceJobsIssued < self.config.maxPreemptibleServiceJobs: - self.issueJob(self.toilState.get_job(self.preemptibleServiceJobsToBeIssued.pop())) + while ( + len(self.preemptibleServiceJobsToBeIssued) > 0 + and self.preemptibleServiceJobsIssued + < self.config.maxPreemptibleServiceJobs + ): + self.issueJob( + self.toilState.get_job(self.preemptibleServiceJobsToBeIssued.pop()) + ) self.preemptibleServiceJobsIssued += 1 - def getNumberOfJobsIssued(self, preemptible: Optional[bool]=None) -> int: + def getNumberOfJobsIssued(self, preemptible: Optional[bool] = None) -> int: """ Get number of jobs that have been added by issueJob(s) and not removed by removeJob. @@ -1008,12 +1233,16 @@ def removeJob(self, jobBatchSystemID: int) -> JobDescription: """ if jobBatchSystemID not in self.issued_jobs_by_batch_system_id: raise RuntimeError("Job was already removed or was never issued.") - issuedDesc = self.toilState.get_job(self.issued_jobs_by_batch_system_id[jobBatchSystemID]) + issuedDesc = self.toilState.get_job( + self.issued_jobs_by_batch_system_id[jobBatchSystemID] + ) if issuedDesc.preemptible: # len(issued_jobs_by_batch_system_id) should always be greater than or equal to preemptibleJobsIssued, # so decrement this value before removing the job from the issuedJob map if self.preemptibleJobsIssued <= 0: - raise RuntimeError("The number of preemptive issued jobs cannot be negative.") + raise RuntimeError( + "The number of preemptive issued jobs cannot be negative." + ) self.preemptibleJobsIssued -= 1 # It's not issued anymore. del self.issued_jobs_by_batch_system_id[jobBatchSystemID] @@ -1033,19 +1262,24 @@ def removeJob(self, jobBatchSystemID: int) -> JobDescription: return issuedDesc - def getJobs(self, preemptible: Optional[bool] = None) -> List[JobDescription]: + def getJobs(self, preemptible: Optional[bool] = None) -> list[JobDescription]: """ Get all issued jobs. :param preemptible: If specified, select only preemptible or only non-preemptible jobs. """ - jobs = [self.toilState.get_job(job_store_id) for job_store_id in self.issued_jobs_by_batch_system_id.values()] + jobs = [ + self.toilState.get_job(job_store_id) + for job_store_id in self.issued_jobs_by_batch_system_id.values() + ] if preemptible is not None: jobs = [job for job in jobs if job.preemptible == preemptible] return jobs - def killJobs(self, jobsToKill, exit_reason: BatchJobExitReason = BatchJobExitReason.KILLED): + def killJobs( + self, jobsToKill, exit_reason: BatchJobExitReason = BatchJobExitReason.KILLED + ): """ Kills the given set of jobs and then sends them for processing. @@ -1059,7 +1293,9 @@ def killJobs(self, jobsToKill, exit_reason: BatchJobExitReason = BatchJobExitRea self.batchSystem.killBatchJobs(jobsToKill) for jobBatchSystemID in jobsToKill: # Reissue immediately, noting that we killed the job - willRerun = self.process_finished_job(jobBatchSystemID, 1, exit_reason=exit_reason) + willRerun = self.process_finished_job( + jobBatchSystemID, 1, exit_reason=exit_reason + ) if willRerun: # Compose a list of all the jobs that will run again @@ -1067,7 +1303,7 @@ def killJobs(self, jobsToKill, exit_reason: BatchJobExitReason = BatchJobExitRea return jobsRerunning - #Following functions handle error cases for when jobs have gone awry with the batch system. + # Following functions handle error cases for when jobs have gone awry with the batch system. def reissueOverLongJobs(self) -> None: """ @@ -1078,20 +1314,30 @@ def reissueOverLongJobs(self) -> None: """ maxJobDuration = self.config.maxJobDuration jobsToKill = [] - if maxJobDuration < 10000000: # We won't bother doing anything if rescue time > 16 weeks. + if ( + maxJobDuration < 10000000 + ): # We won't bother doing anything if rescue time > 16 weeks. runningJobs = self.batchSystem.getRunningBatchJobIDs() for jobBatchSystemID in list(runningJobs.keys()): if runningJobs[jobBatchSystemID] > maxJobDuration: - logger.warning("The job: %s has been running for: %s seconds, more than the " - "max job duration: %s, we'll kill it", - self.issued_jobs_by_batch_system_id[jobBatchSystemID], - str(runningJobs[jobBatchSystemID]), - str(maxJobDuration)) + logger.warning( + "The job: %s has been running for: %s seconds, more than the " + "max job duration: %s, we'll kill it", + self.issued_jobs_by_batch_system_id[jobBatchSystemID], + str(runningJobs[jobBatchSystemID]), + str(maxJobDuration), + ) jobsToKill.append(jobBatchSystemID) - reissued = self.killJobs(jobsToKill, exit_reason=BatchJobExitReason.MAXJOBDURATION) + reissued = self.killJobs( + jobsToKill, exit_reason=BatchJobExitReason.MAXJOBDURATION + ) if len(jobsToKill) > 0: # Summarize our actions - logger.info("Killed %d over long jobs and reissued %d of them", len(jobsToKill), len(reissued)) + logger.info( + "Killed %d over long jobs and reissued %d of them", + len(jobsToKill), + len(reissued), + ) def reissueMissingJobs(self, killAfterNTimesMissing=3): """ @@ -1103,11 +1349,13 @@ def reissueMissingJobs(self, killAfterNTimesMissing=3): """ issuedJobs = set(self.batchSystem.getIssuedBatchJobIDs()) jobBatchSystemIDsSet = set(list(self.issued_jobs_by_batch_system_id.keys())) - #Clean up the reissueMissingJobs_missingHash hash, getting rid of jobs that have turned up + # Clean up the reissueMissingJobs_missingHash hash, getting rid of jobs that have turned up missingJobIDsSet = set(list(self.reissueMissingJobs_missingHash.keys())) for jobBatchSystemID in missingJobIDsSet.difference(jobBatchSystemIDsSet): self.reissueMissingJobs_missingHash.pop(jobBatchSystemID) - logger.warning("Batch system id: %s is no longer missing", str(jobBatchSystemID)) + logger.warning( + "Batch system id: %s is no longer missing", str(jobBatchSystemID) + ) # checks we have no unexpected jobs running if not issuedJobs.issubset(jobBatchSystemIDsSet): raise RuntimeError("An unexpected job is still running.") @@ -1119,24 +1367,33 @@ def reissueMissingJobs(self, killAfterNTimesMissing=3): else: self.reissueMissingJobs_missingHash[jobBatchSystemID] = 1 timesMissing = self.reissueMissingJobs_missingHash[jobBatchSystemID] - logger.warning("Job store ID %s with batch system id %s is missing for the %i time", - jobStoreID, str(jobBatchSystemID), timesMissing) + logger.warning( + "Job store ID %s with batch system id %s is missing for the %i time", + jobStoreID, + str(jobBatchSystemID), + timesMissing, + ) # Tell everyone it is missing self._messages.publish(JobMissingMessage(jobStoreID)) if timesMissing == killAfterNTimesMissing: self.reissueMissingJobs_missingHash.pop(jobBatchSystemID) jobsToKill.append(jobBatchSystemID) self.killJobs(jobsToKill, exit_reason=BatchJobExitReason.MISSING) - return len( self.reissueMissingJobs_missingHash ) == 0 #We use this to inform - #if there are missing jobs + return len(self.reissueMissingJobs_missingHash) == 0 # We use this to inform + # if there are missing jobs def processRemovedJob(self, issuedJob, result_status): if result_status != 0: - logger.warning("Despite the batch system claiming failure the " - "job %s seems to have finished and been removed", issuedJob) + logger.warning( + "Despite the batch system claiming failure the " + "job %s seems to have finished and been removed", + issuedJob, + ) self._updatePredecessorStatus(issuedJob.jobStoreID) - def process_finished_job(self, batch_system_id, result_status, wall_time=None, exit_reason=None) -> bool: + def process_finished_job( + self, batch_system_id, result_status, wall_time=None, exit_reason=None + ) -> bool: """ Process finished jobs. @@ -1157,12 +1414,18 @@ def process_finished_job(self, batch_system_id, result_status, wall_time=None, e self.progress_failed.update(incr=1) # Delegate to the version that uses a JobDescription - return self.process_finished_job_description(issued_job, result_status, wall_time, exit_reason, batch_system_id) - - def process_finished_job_description(self, finished_job: JobDescription, result_status: int, - wall_time: Optional[float] = None, - exit_reason: Optional[BatchJobExitReason] = None, - batch_system_id: Optional[int] = None) -> bool: + return self.process_finished_job_description( + issued_job, result_status, wall_time, exit_reason, batch_system_id + ) + + def process_finished_job_description( + self, + finished_job: JobDescription, + result_status: int, + wall_time: Optional[float] = None, + exit_reason: Optional[BatchJobExitReason] = None, + batch_system_id: Optional[int] = None, + ) -> bool: """ Process a finished JobDescription based upon its success or failure. @@ -1184,7 +1447,9 @@ def process_finished_job_description(self, finished_job: JobDescription, result_ # TODO: Use message bus? self.clusterScaler.addCompletedJob(finished_job, wall_time) if self.toilState.job_exists(job_store_id): - logger.debug("Job %s continues to exist (i.e. has more to do)", finished_job) + logger.debug( + "Job %s continues to exist (i.e. has more to do)", finished_job + ) try: # Reload the job as modified by the worker if finished_job.has_body(): @@ -1209,24 +1474,22 @@ def process_finished_job_description(self, finished_job: JobDescription, result_ "batch system may have killed (or never started) " "the Toil worker." ) - change_detected = self.toilState.reset_job_expecting_change(job_store_id, timeout) + change_detected = self.toilState.reset_job_expecting_change( + job_store_id, timeout + ) replacement_job = self.toilState.get_job(job_store_id) if not change_detected: - logger.warning( - 'Job %s %s', - replacement_job, - complaint - ) + logger.warning("Job %s %s", replacement_job, complaint) if result_status == 0: # Make the job fail because we ran it and it finished # and we never heard back. logger.error( - 'Marking ostensibly successful job %s that did ' - 'not report in to the job store before ' - '--jobStoreTimeout as having been partitioned ' - 'from us.', - replacement_job + "Marking ostensibly successful job %s that did " + "not report in to the job store before " + "--jobStoreTimeout as having been partitioned " + "from us.", + replacement_job, ) result_status = EXIT_STATUS_UNAVAILABLE_VALUE exit_reason = BatchJobExitReason.PARTITION @@ -1242,7 +1505,9 @@ def process_finished_job_description(self, finished_job: JobDescription, result_ # read from e.g. a non-POSIX-compliant filesystem gave us a # false positive when we checked for its existence. Process the # job from here as any other job removed from the job store. - logger.debug("Job %s is actually complete upon closer inspection", finished_job) + logger.debug( + "Job %s is actually complete upon closer inspection", finished_job + ) self.processRemovedJob(finished_job, result_status) return False if replacement_job.logJobStoreFileID is not None: @@ -1250,18 +1515,31 @@ def process_finished_job_description(self, finished_job: JobDescription, result_ # more memory efficient than read().striplines() while leaving off the # trailing \n left when using readlines() # http://stackoverflow.com/a/15233739 - StatsAndLogging.logWithFormatting(f'Log from job "{job_store_id}"', log_stream, method=logger.warning, - message='The job seems to have left a log file, indicating failure: %s' % replacement_job) + StatsAndLogging.logWithFormatting( + f'Log from job "{job_store_id}"', + log_stream, + method=logger.warning, + message="The job seems to have left a log file, indicating failure: %s" + % replacement_job, + ) if self.config.writeLogs or self.config.writeLogsGzip: with replacement_job.getLogFileHandle(self.jobStore) as log_stream: # Send log data from the job store to each per-job log file involved. - StatsAndLogging.writeLogFiles([names.stats_name for names in replacement_job.get_chain()], log_stream, self.config, failed=True) + StatsAndLogging.writeLogFiles( + [names.stats_name for names in replacement_job.get_chain()], + log_stream, + self.config, + failed=True, + ) if result_status != 0: # If the batch system returned a non-zero exit code then the worker # is assumed not to have captured the failure of the job, so we # reduce the try count here. if replacement_job.logJobStoreFileID is None: - logger.warning("No log file is present, despite job failing: %s", replacement_job) + logger.warning( + "No log file is present, despite job failing: %s", + replacement_job, + ) if batch_system_id is not None: # Look for any standard output/error files created by the batch system. @@ -1270,30 +1548,60 @@ def process_finished_job_description(self, finished_job: JobDescription, result_ # --workDir / TOIL_WORKDIR is on a shared file system. # They live directly in the Toil work directory because that is # guaranteed to exist on the leader and workers. - file_list = glob.glob(self.batchSystem.format_std_out_err_glob(batch_system_id)) + file_list = glob.glob( + self.batchSystem.format_std_out_err_glob(batch_system_id) + ) for log_file in file_list: try: - log_stream = open(log_file, 'rb') + log_stream = open(log_file, "rb") except: - logger.warning('The batch system left a file %s, but it could not be opened' % log_file) + logger.warning( + "The batch system left a file %s, but it could not be opened" + % log_file + ) else: with log_stream: if os.path.getsize(log_file) > 0: - StatsAndLogging.logWithFormatting(f'Log from job "{job_store_id}"', log_stream, method=logger.warning, - message='The batch system left a non-empty file %s:' % log_file) - if self.config.writeLogs or self.config.writeLogsGzip: - file_root, _ = os.path.splitext(os.path.basename(log_file)) - job_names = [names.stats_name for names in replacement_job.get_chain()] + StatsAndLogging.logWithFormatting( + f'Log from job "{job_store_id}"', + log_stream, + method=logger.warning, + message="The batch system left a non-empty file %s:" + % log_file, + ) + if ( + self.config.writeLogs + or self.config.writeLogsGzip + ): + file_root, _ = os.path.splitext( + os.path.basename(log_file) + ) + job_names = [ + names.stats_name + for names in replacement_job.get_chain() + ] # Tack the batch system log file name onto each job's name - job_names = [j + '_' + file_root for j in job_names] + job_names = [ + j + "_" + file_root for j in job_names + ] log_stream.seek(0) - StatsAndLogging.writeLogFiles(job_names, log_stream, self.config, failed=True) + StatsAndLogging.writeLogFiles( + job_names, + log_stream, + self.config, + failed=True, + ) else: - logger.warning('The batch system left an empty file %s' % log_file) + logger.warning( + "The batch system left an empty file %s" + % log_file + ) # Tell the job to reset itself after a failure. # It needs to know the failure reason if available; some are handled specially. - replacement_job.setupJobAfterFailure(exit_status=result_status, exit_reason=exit_reason) + replacement_job.setupJobAfterFailure( + exit_status=result_status, exit_reason=exit_reason + ) self.toilState.commit_job(job_store_id) elif job_store_id in self.toilState.hasFailedSuccessors: @@ -1301,18 +1609,20 @@ def process_finished_job_description(self, finished_job: JobDescription, result_ self.toilState.hasFailedSuccessors.remove(job_store_id) # Now that we know the job is done we can add it to the list of updated jobs - self._messages.publish(JobUpdatedMessage(replacement_job.jobStoreID, result_status)) + self._messages.publish( + JobUpdatedMessage(replacement_job.jobStoreID, result_status) + ) logger.debug("Added job: %s to updated jobs", replacement_job) # Return True if it will rerun (still has retries) and false if it # is completely failed. return replacement_job.remainingTryCount > 0 - else: #The job is done + else: # The job is done self.processRemovedJob(finished_job, result_status) # Being done, it won't run again. return False - def getSuccessors(self, job_id: str, alreadySeenSuccessors: Set[str]) -> Set[str]: + def getSuccessors(self, job_id: str, alreadySeenSuccessors: set[str]) -> set[str]: """ Get successors of the given job by walking the job graph recursively. @@ -1320,6 +1630,7 @@ def getSuccessors(self, job_id: str, alreadySeenSuccessors: Set[str]) -> Set[str :returns: The set of found successors. This set is added to alreadySeenSuccessors. """ successors = set() + def successorRecursion(job_id: str) -> None: # TODO: do we need to reload from the job store here, or is the cache OK? jobDesc = self.toilState.get_job(job_id) @@ -1351,12 +1662,15 @@ def processTotallyFailedJob(self, job_id: str) -> None: # Tell everyone it failed - self._messages.publish(JobFailedMessage(get_job_kind(job_desc.get_names()), job_id)) + self._messages.publish( + JobFailedMessage(get_job_kind(job_desc.get_names()), job_id) + ) if job_id in self.toilState.service_to_client: # Is a service job - logger.debug("Service job is being processed as a totally failed job: %s", job_desc) - + logger.debug( + "Service job is being processed as a totally failed job: %s", job_desc + ) if not isinstance(job_desc, ServiceJobDescription): raise RuntimeError("The service job description type is incorrect.") @@ -1380,8 +1694,13 @@ def processTotallyFailedJob(self, job_id: str) -> None: # properly, and to remember that this service failed with an error # and possibly never started. if client_id in self.toilState.servicesIssued: - self.serviceManager.kill_services(self.toilState.servicesIssued[client_id], error=True) - logger.warning("Job: %s is instructing all other services of its parent job to quit", job_desc) + self.serviceManager.kill_services( + self.toilState.servicesIssued[client_id], error=True + ) + logger.warning( + "Job: %s is instructing all other services of its parent job to quit", + job_desc, + ) # This ensures that the job will not attempt to run any of it's # successors on the stack @@ -1405,9 +1724,14 @@ def processTotallyFailedJob(self, job_id: str) -> None: # Any successor already in toilState.failedSuccessors will not be traversed # All successors traversed will be added to toilState.failedSuccessors and returned # as a set (unseenSuccessors). - unseenSuccessors = self.getSuccessors(job_id, self.toilState.failedSuccessors) - logger.debug("Found new failed successors: %s of job: %s", " ".join( - unseenSuccessors), job_desc) + unseenSuccessors = self.getSuccessors( + job_id, self.toilState.failedSuccessors + ) + logger.debug( + "Found new failed successors: %s of job: %s", + " ".join(unseenSuccessors), + job_desc, + ) # For each newly found successor for successorJobStoreID in unseenSuccessors: @@ -1418,7 +1742,9 @@ def processTotallyFailedJob(self, job_id: str) -> None: # For each such predecessor job # (we remove the successor from toilState.successor_to_predecessors to avoid doing # this multiple times for each failed predecessor) - for predecessor_id in self.toilState.successor_to_predecessors.pop(successorJobStoreID): + for predecessor_id in self.toilState.successor_to_predecessors.pop( + successorJobStoreID + ): predecessor = self.toilState.get_job(predecessor_id) @@ -1427,8 +1753,11 @@ def processTotallyFailedJob(self, job_id: str) -> None: # Indicate that it has failed jobs. self.toilState.hasFailedSuccessors.add(predecessor_id) - logger.debug("Marking job: %s as having failed successors (found by " - "reading successors failed job)", predecessor) + logger.debug( + "Marking job: %s as having failed successors (found by " + "reading successors failed job)", + predecessor, + ) # If the predecessor has no remaining successors, add to list of updated jobs if self.toilState.count_pending_successors(predecessor_id) == 0: @@ -1442,8 +1771,12 @@ def processTotallyFailedJob(self, job_id: str) -> None: # Mark the predecessor as failed self.toilState.hasFailedSuccessors.add(predecessor_id) - logger.debug("Totally failed job: %s is marking direct predecessor: %s " - "as having failed jobs", job_desc, self.toilState.get_job(predecessor_id)) + logger.debug( + "Totally failed job: %s is marking direct predecessor: %s " + "as having failed jobs", + job_desc, + self.toilState.get_job(predecessor_id), + ) self._updatePredecessorStatus(job_id) @@ -1453,38 +1786,59 @@ def _updatePredecessorStatus(self, jobStoreID: str) -> None: # Is a service host job, so its predecessor is its client client_id = self.toilState.service_to_client.pop(jobStoreID) self.toilState.servicesIssued[client_id].remove(jobStoreID) - if len(self.toilState.servicesIssued[client_id]) == 0: # Predecessor job has + if ( + len(self.toilState.servicesIssued[client_id]) == 0 + ): # Predecessor job has # all its services terminated - self.toilState.servicesIssued.pop(client_id) # The job has no running services + self.toilState.servicesIssued.pop( + client_id + ) # The job has no running services - logger.debug('Job %s is no longer waiting on services; all services have stopped', self.toilState.get_job(client_id)) + logger.debug( + "Job %s is no longer waiting on services; all services have stopped", + self.toilState.get_job(client_id), + ) # Now we know the job is done we can add it to the list of # updated job files self._messages.publish(JobUpdatedMessage(client_id, 0)) else: - logger.debug('Job %s is still waiting on %d services', - self.toilState.get_job(client_id), - len(self.toilState.servicesIssued[client_id])) + logger.debug( + "Job %s is still waiting on %d services", + self.toilState.get_job(client_id), + len(self.toilState.servicesIssued[client_id]), + ) elif jobStoreID not in self.toilState.successor_to_predecessors: - #We have reach the root job + # We have reach the root job if self._messages.count(JobUpdatedMessage) != 0: raise RuntimeError("Root job is done but other jobs are still updated") if len(self.toilState.successor_to_predecessors) != 0: - raise RuntimeError("Job {} is finished and had no predecessor, but we have other outstanding jobs " - "with predecessors: {}".format(jobStoreID, self.toilState.successor_to_predecessors.keys())) + raise RuntimeError( + "Job {} is finished and had no predecessor, but we have other outstanding jobs " + "with predecessors: {}".format( + jobStoreID, self.toilState.successor_to_predecessors.keys() + ) + ) if len(self.toilState.successorCounts) != 0: - raise RuntimeError("Root job is done but jobs waiting on successors: {self.toilState.successorCounts}") - logger.debug("Reached root job %s so no predecessors to clean up" % jobStoreID) + raise RuntimeError( + "Root job is done but jobs waiting on successors: {self.toilState.successorCounts}" + ) + logger.debug( + "Reached root job %s so no predecessors to clean up" % jobStoreID + ) else: # Is a non-root, non-service job logger.debug("Cleaning the predecessors of %s" % jobStoreID) # For each predecessor - for predecessor_id in self.toilState.successor_to_predecessors.pop(jobStoreID): + for predecessor_id in self.toilState.successor_to_predecessors.pop( + jobStoreID + ): if not isinstance(predecessor_id, str): - raise RuntimeError("Predecessor ID should be str but is {type(predecessor_id)}") + raise RuntimeError( + "Predecessor ID should be str but is {type(predecessor_id)}" + ) predecessor = self.toilState.get_job(predecessor_id) # Tell the predecessor that this job is done (keep only other successor jobs) diff --git a/src/toil/lib/accelerators.py b/src/toil/lib/accelerators.py index f65ded42a9..dd5d432f68 100644 --- a/src/toil/lib/accelerators.py +++ b/src/toil/lib/accelerators.py @@ -17,7 +17,7 @@ import os import string import subprocess -from typing import Dict, List, Set, Union, cast +from typing import Union, cast from xml.dom import minidom from toil.job import AcceleratorRequirement @@ -34,13 +34,20 @@ def have_working_nvidia_smi() -> bool: it can fulfill a CUDARequirement. """ try: - subprocess.check_call(['nvidia-smi']) - except (FileNotFoundError, PermissionError, subprocess.CalledProcessError, OSError, UnicodeDecodeError): + subprocess.check_call(["nvidia-smi"]) + except ( + FileNotFoundError, + PermissionError, + subprocess.CalledProcessError, + OSError, + UnicodeDecodeError, + ): return False return True + @memoize -def get_host_accelerator_numbers() -> List[int]: +def get_host_accelerator_numbers() -> list[int]: """ Work out what accelerator is what. @@ -52,7 +59,12 @@ def get_host_accelerator_numbers() -> List[int]: right GPUs as seen from a Docker daemon. """ - for number_list_var in ['SLURM_STEP_GPUS', 'SLURM_JOB_GPUS', 'CUDA_VISIBLE_DEVICES', 'NVIDIA_VISIBLE_DEVICES']: + for number_list_var in [ + "SLURM_STEP_GPUS", + "SLURM_JOB_GPUS", + "CUDA_VISIBLE_DEVICES", + "NVIDIA_VISIBLE_DEVICES", + ]: # Any of these can have a list of GPU numbers, but the CUDA/NVIDIA ones # also support a system of GPU GUIDs that we don't support. # TODO: If Slurm confinement is set we ignore any attempt to further @@ -62,7 +74,9 @@ def get_host_accelerator_numbers() -> List[int]: if number_list_var in os.environ: device_string = os.environ[number_list_var] # Parse all the numbers we have - device_numbers = [int(part) for part in device_string.split(',') if part.isnumeric()] + device_numbers = [ + int(part) for part in device_string.split(",") if part.isnumeric() + ] if len(device_numbers) > 0: # We found some numbers, so use those return device_numbers @@ -70,6 +84,7 @@ def get_host_accelerator_numbers() -> List[int]: # If we don't see a set of limits we understand, say we have all nvidia GPUs return list(range(count_nvidia_gpus())) + @memoize def have_working_nvidia_docker_runtime() -> bool: """ @@ -77,11 +92,30 @@ def have_working_nvidia_docker_runtime() -> bool: """ try: # The runtime injects nvidia-smi; it doesn't seem to have to be in the image we use here - subprocess.check_call(['docker', 'run', '--rm', '--runtime', 'nvidia', '--gpus', 'all', 'ubuntu:20.04', 'nvidia-smi']) - except (FileNotFoundError, PermissionError, subprocess.CalledProcessError, OSError, UnicodeDecodeError): + subprocess.check_call( + [ + "docker", + "run", + "--rm", + "--runtime", + "nvidia", + "--gpus", + "all", + "ubuntu:20.04", + "nvidia-smi", + ] + ) + except ( + FileNotFoundError, + PermissionError, + subprocess.CalledProcessError, + OSError, + UnicodeDecodeError, + ): return False return True + @memoize def count_nvidia_gpus() -> int: """ @@ -101,12 +135,13 @@ def count_nvidia_gpus() -> int: .firstChild, ).data ) - except: + except: return 0 # TODO: Parse each gpu > product_name > text content and convert to some # kind of "model" that agrees with e.g. Kubernetes naming. + @memoize def count_amd_gpus() -> int: """ @@ -118,10 +153,18 @@ def count_amd_gpus() -> int: # we believe this is the expected output for amd-smi, but we don't actually have and amd gpu to test against # so we assume the output from the amd-smi documentation: # https://rocm.docs.amd.com/projects/amdsmi/en/latest/how-to/using-AMD-SMI-CLI-tool.html - out = subprocess.check_output((["amd-smi", "static"])) - gpu_count = len([line for line in out.decode("utf-8").split("\n") if line.startswith("gpu")]) + out = subprocess.check_output(["amd-smi", "static"]) + gpu_count = len( + [line for line in out.decode("utf-8").split("\n") if line.startswith("gpu")] + ) return gpu_count - except (FileNotFoundError, PermissionError, subprocess.SubprocessError, OSError, UnicodeDecodeError): + except ( + FileNotFoundError, + PermissionError, + subprocess.SubprocessError, + OSError, + UnicodeDecodeError, + ): # if the amd-smi command fails, try rocm-smi # if a different exception is raised, something other than the subprocess call is wrong pass @@ -129,15 +172,27 @@ def count_amd_gpus() -> int: # similarly, since we don't have an AMD gpu to test against, assume the output from the rocm-smi documentation: # https://rocm.blogs.amd.com/software-tools-optimization/affinity/part-2/README.html#gpu-numa-configuration-rocm-smi-showtoponuma out = subprocess.check_output(["rocm-smi"]) - gpu_count = len([line for line in out.decode("utf-8").split("\n") if len(line)> 0 and line[0] in string.digits]) + gpu_count = len( + [ + line + for line in out.decode("utf-8").split("\n") + if len(line) > 0 and line[0] in string.digits + ] + ) return gpu_count - except (FileNotFoundError, PermissionError, subprocess.SubprocessError, OSError, UnicodeDecodeError): + except ( + FileNotFoundError, + PermissionError, + subprocess.SubprocessError, + OSError, + UnicodeDecodeError, + ): pass return 0 @memoize -def get_individual_local_accelerators() -> List[AcceleratorRequirement]: +def get_individual_local_accelerators() -> list[AcceleratorRequirement]: """ Determine all the local accelerators available. Report each with count 1, in the order of the number that can be used to assign them. @@ -146,11 +201,22 @@ def get_individual_local_accelerators() -> List[AcceleratorRequirement]: accelerator assignment API. """ - gpus: List[AcceleratorRequirement] = [{'kind': 'gpu', 'brand': 'nvidia', 'api': 'cuda', 'count': 1} for _ in range(count_nvidia_gpus())] - gpus.extend([{'kind': 'gpu', 'brand': 'amd', 'api': 'rocm', 'count': 1} for _ in range(count_amd_gpus())]) + gpus: list[AcceleratorRequirement] = [ + {"kind": "gpu", "brand": "nvidia", "api": "cuda", "count": 1} + for _ in range(count_nvidia_gpus()) + ] + gpus.extend( + [ + {"kind": "gpu", "brand": "amd", "api": "rocm", "count": 1} + for _ in range(count_amd_gpus()) + ] + ) return gpus -def get_restrictive_environment_for_local_accelerators(accelerator_numbers : Union[Set[int], List[int]]) -> Dict[str, str]: + +def get_restrictive_environment_for_local_accelerators( + accelerator_numbers: Union[set[int], list[int]] +) -> dict[str, str]: """ Get environment variables which can be applied to a process to restrict it to using only the given accelerator numbers. @@ -161,11 +227,12 @@ def get_restrictive_environment_for_local_accelerators(accelerator_numbers : Uni # Since we only know about nvidia GPUs right now, we can just say our # accelerator numbering space is the same as nvidia's GPU numbering space. - gpu_list = ','.join(str(i) for i in accelerator_numbers) + gpu_list = ",".join(str(i) for i in accelerator_numbers) # Put this in several places: CUDA_VISIBLE_DEVICES for controlling # processes right here, and SINGULARITYENV_CUDA_VISIBLE_DEVICES for # propagating to Singularity containers. - return {'CUDA_VISIBLE_DEVICES': gpu_list, 'SINGULARITYENV_CUDA_VISIBLE_DEVICES': gpu_list} - - + return { + "CUDA_VISIBLE_DEVICES": gpu_list, + "SINGULARITYENV_CUDA_VISIBLE_DEVICES": gpu_list, + } diff --git a/src/toil/lib/aws/__init__.py b/src/toil/lib/aws/__init__.py index 4f757e0235..295d4132a8 100644 --- a/src/toil/lib/aws/__init__.py +++ b/src/toil/lib/aws/__init__.py @@ -16,8 +16,9 @@ import os import re import socket +from collections.abc import MutableMapping from http.client import HTTPException -from typing import TYPE_CHECKING, Dict, Literal, MutableMapping, Optional, Union +from typing import TYPE_CHECKING, Literal, Optional, Union from urllib.error import URLError from urllib.request import urlopen @@ -33,8 +34,7 @@ # These are errors where we think something randomly # went wrong on the AWS side and we ought to retry. AWSServerErrors = toil.lib.retry.ErrorCondition( - error=ClientError, - error_codes=[404, 500, 502, 503, 504] + error=ClientError, error_codes=[404, 500, 502, 503, 504] ) logger = logging.getLogger(__name__) @@ -42,6 +42,7 @@ # This file isn't allowed to import anything that depends on Boto or Boto3, # which may not be installed, because it has to be importable everywhere. + def get_current_aws_region() -> Optional[str]: """ Return the AWS region that the currently configured AWS zone (see @@ -51,11 +52,13 @@ def get_current_aws_region() -> Optional[str]: aws_zone = get_current_aws_zone() return zone_to_region(aws_zone) if aws_zone else None + def get_aws_zone_from_environment() -> Optional[str]: """ Get the AWS zone from TOIL_AWS_ZONE if set. """ - return os.environ.get('TOIL_AWS_ZONE', None) + return os.environ.get("TOIL_AWS_ZONE", None) + def get_aws_zone_from_metadata() -> Optional[str]: """ @@ -70,11 +73,15 @@ def get_aws_zone_from_metadata() -> Optional[str]: # Use the ECS metadata service logger.debug("Fetch AZ from ECS metadata") try: - resp = json.load(urlopen(os.environ['ECS_CONTAINER_METADATA_URI_V4'] + '/task', timeout=1)) + resp = json.load( + urlopen( + os.environ["ECS_CONTAINER_METADATA_URI_V4"] + "/task", timeout=1 + ) + ) logger.debug("ECS metadata: %s", resp) if isinstance(resp, dict): # We found something. Go with that. - return resp.get('AvailabilityZone') + return resp.get("AvailabilityZone") except (json.decoder.JSONDecodeError, KeyError, URLError) as e: # We're on ECS but can't get the metadata. That's odd. logger.warning("Skipping ECS metadata due to error: %s", e) @@ -95,6 +102,7 @@ def get_aws_zone_from_metadata() -> Optional[str]: logger.warning("Skipping EC2 metadata due to error: %s", e) return None + def get_aws_zone_from_boto() -> Optional[str]: """ Get the AWS zone from the Boto3 config file or from AWS_DEFAULT_REGION, if it is configured and the @@ -102,28 +110,30 @@ def get_aws_zone_from_boto() -> Optional[str]: """ try: import boto3 - from session import client + boto3_session = boto3.session.Session() # this should check AWS_DEFAULT_REGION and ~/.aws/config zone = boto3_session.region_name if zone is not None: - zone += 'a' # derive an availability zone in the region + zone += "a" # derive an availability zone in the region return zone except ImportError: pass return None + def get_aws_zone_from_environment_region() -> Optional[str]: """ Pick an AWS zone in the region defined by TOIL_AWS_REGION, if it is set. """ - aws_region = os.environ.get('TOIL_AWS_REGION') + aws_region = os.environ.get("TOIL_AWS_REGION") if aws_region is not None: # If a region is specified, use the first zone in the region. - return aws_region + 'a' + return aws_region + "a" # Otherwise, don't pick a region and let us fall back on the next method. return None + def get_current_aws_zone() -> Optional[str]: """ Get the currently configured or occupied AWS zone to use. @@ -141,62 +151,76 @@ def get_current_aws_zone() -> Optional[str]: Returns 'us-east-1a' if no method can produce a zone to use. """ - return get_aws_zone_from_environment() or \ - get_aws_zone_from_metadata() or \ - get_aws_zone_from_environment_region() or \ - get_aws_zone_from_boto() or \ - 'us-east-1a' # AWS's native default + return ( + get_aws_zone_from_environment() + or get_aws_zone_from_metadata() + or get_aws_zone_from_environment_region() + or get_aws_zone_from_boto() + or "us-east-1a" + ) # AWS's native default + def zone_to_region(zone: str) -> AWSRegionName: """Get a region (e.g. us-west-2) from a zone (e.g. us-west-1c).""" # re.compile() caches the regex internally so we don't have to - availability_zone = re.compile(r'^([a-z]{2}-[a-z]+-[1-9][0-9]*)([a-z])$') + availability_zone = re.compile(r"^([a-z]{2}-[a-z]+-[1-9][0-9]*)([a-z])$") m = availability_zone.match(zone) if not m: raise ValueError(f"Can't extract region from availability zone '{zone}'") return m.group(1) + def running_on_ec2() -> bool: """ Return True if we are currently running on EC2, and false otherwise. """ + # TODO: Move this to toil.lib.ec2 and make toil.lib.ec2 importable without boto? def file_begins_with(path, prefix): with open(path) as f: return f.read(len(prefix)) == prefix - hv_uuid_path = '/sys/hypervisor/uuid' - if os.path.exists(hv_uuid_path) and file_begins_with(hv_uuid_path, 'ec2'): + hv_uuid_path = "/sys/hypervisor/uuid" + if os.path.exists(hv_uuid_path) and file_begins_with(hv_uuid_path, "ec2"): return True # Some instances do not have the /sys/hypervisor/uuid file, so check the identity document instead. # See https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-identity-documents.html try: - urlopen('http://169.254.169.254/latest/dynamic/instance-identity/document', timeout=1) + urlopen( + "http://169.254.169.254/latest/dynamic/instance-identity/document", + timeout=1, + ) return True except (URLError, socket.timeout, HTTPException): return False + def running_on_ecs() -> bool: """ Return True if we are currently running on Amazon ECS, and false otherwise. """ # We only care about relatively current ECS - return 'ECS_CONTAINER_METADATA_URI_V4' in os.environ + return "ECS_CONTAINER_METADATA_URI_V4" in os.environ + -def build_tag_dict_from_env(environment: MutableMapping[str, str] = os.environ) -> Dict[str, str]: +def build_tag_dict_from_env( + environment: MutableMapping[str, str] = os.environ +) -> dict[str, str]: tags = dict() - owner_tag = environment.get('TOIL_OWNER_TAG') + owner_tag = environment.get("TOIL_OWNER_TAG") if owner_tag: - tags.update({'Owner': owner_tag}) + tags.update({"Owner": owner_tag}) - user_tags = environment.get('TOIL_AWS_TAGS') + user_tags = environment.get("TOIL_AWS_TAGS") if user_tags: try: json_user_tags = json.loads(user_tags) if isinstance(json_user_tags, dict): tags.update(json.loads(user_tags)) else: - logger.error('TOIL_AWS_TAGS must be in JSON format: {"key" : "value", ...}') + logger.error( + 'TOIL_AWS_TAGS must be in JSON format: {"key" : "value", ...}' + ) exit(1) except json.decoder.JSONDecodeError: logger.error('TOIL_AWS_TAGS must be in JSON format: {"key" : "value", ...}') diff --git a/src/toil/lib/aws/ami.py b/src/toil/lib/aws/ami.py index 528b0418fa..126c4faf09 100644 --- a/src/toil/lib/aws/ami.py +++ b/src/toil/lib/aws/ami.py @@ -2,7 +2,8 @@ import logging import os import urllib.request -from typing import Dict, Iterator, Optional, cast +from collections.abc import Iterator +from typing import Optional, cast from urllib.error import HTTPError, URLError from botocore.client import BaseClient @@ -13,7 +14,7 @@ logger = logging.getLogger(__name__) -def get_flatcar_ami(ec2_client: BaseClient, architecture: str = 'amd64') -> str: +def get_flatcar_ami(ec2_client: BaseClient, architecture: str = "amd64") -> str: """ Retrieve the flatcar AMI image to use as the base for all Toil autoscaling instances. @@ -31,69 +32,93 @@ def get_flatcar_ami(ec2_client: BaseClient, architecture: str = 'amd64') -> str: """ # Take a user override - ami = os.environ.get('TOIL_AWS_AMI') + ami = os.environ.get("TOIL_AWS_AMI") try_number = 0 if not ami: - logger.debug('No AMI found in TOIL_AWS_AMI; checking stable Flatcar release feed') - ami = feed_flatcar_ami_release(ec2_client=ec2_client, architecture=architecture, source='stable') + logger.debug( + "No AMI found in TOIL_AWS_AMI; checking stable Flatcar release feed" + ) + ami = feed_flatcar_ami_release( + ec2_client=ec2_client, architecture=architecture, source="stable" + ) if not ami: - logger.warning('No available AMI found in Flatcar release feed; checking marketplace') - ami = aws_marketplace_flatcar_ami_search(ec2_client=ec2_client, architecture=architecture) + logger.warning( + "No available AMI found in Flatcar release feed; checking marketplace" + ) + ami = aws_marketplace_flatcar_ami_search( + ec2_client=ec2_client, architecture=architecture + ) if not ami: - logger.debug('No AMI found in marketplace; checking Toil Flatcar release feed') - ami = feed_flatcar_ami_release(ec2_client=ec2_client, architecture=architecture, source='toil') + logger.debug("No AMI found in marketplace; checking Toil Flatcar release feed") + ami = feed_flatcar_ami_release( + ec2_client=ec2_client, architecture=architecture, source="toil" + ) if not ami: - logger.debug('No AMI found in Toil project feed; checking beta Flatcar release feed') - ami = feed_flatcar_ami_release(ec2_client=ec2_client, architecture=architecture, source='beta') + logger.debug( + "No AMI found in Toil project feed; checking beta Flatcar release feed" + ) + ami = feed_flatcar_ami_release( + ec2_client=ec2_client, architecture=architecture, source="beta" + ) if not ami: - logger.debug('No AMI found in beta Flatcar release feed; checking archived Flatcar release feed') - ami = feed_flatcar_ami_release(ec2_client=ec2_client, architecture=architecture, source='archive') + logger.debug( + "No AMI found in beta Flatcar release feed; checking archived Flatcar release feed" + ) + ami = feed_flatcar_ami_release( + ec2_client=ec2_client, architecture=architecture, source="archive" + ) if not ami: - logger.critical('No available Flatcar AMI in any source!') - raise RuntimeError(f'Unable to fetch the latest flatcar image. Upload ' - f'https://stable.release.flatcar-linux.net/{architecture}-usr/current/flatcar_production_ami_image.bin.bz2 ' - f'to AWS as am AMI and set TOIL_AWS_AMI in the environment to its AMI ID.') - logger.info('Selected Flatcar AMI: %s', ami) + logger.critical("No available Flatcar AMI in any source!") + raise RuntimeError( + f"Unable to fetch the latest flatcar image. Upload " + f"https://stable.release.flatcar-linux.net/{architecture}-usr/current/flatcar_production_ami_image.bin.bz2 " + f"to AWS as am AMI and set TOIL_AWS_AMI in the environment to its AMI ID." + ) + logger.info("Selected Flatcar AMI: %s", ami) return ami + @retry(errors=[HTTPError]) -def _fetch_flatcar_feed(architecture: str = 'amd64', source: str = 'stable') -> bytes: +def _fetch_flatcar_feed(architecture: str = "amd64", source: str = "stable") -> bytes: """ Get the binary data of the Flatcar release feed for the given architecture. - + :param source: can be set to a Flatcar release channel ('stable', 'beta', or 'alpha'), 'archive' to check the Internet Archive for a feed, and 'toil' to check if the Toil project has put up a feed. - + :raises HTTPError: if the feed cannot be fetched. """ - + # We have a few places we know to get the feed from. JSON_FEED_URL = { - 'stable': f'https://stable.release.flatcar-linux.net/{architecture}-usr/current/flatcar_production_ami_all.json', - 'beta': f'https://beta.release.flatcar-linux.net/{architecture}-usr/current/flatcar_production_ami_all.json', - 'alpha': f'https://alpha.release.flatcar-linux.net/{architecture}-usr/current/flatcar_production_ami_all.json', - 'archive': f'https://web.archive.org/web/20220625112618if_/https://stable.release.flatcar-linux.net/{architecture}-usr/current/flatcar_production_ami_all.json', - 'toil': f'https://raw.githubusercontent.com/DataBiosphere/toil/master/contrib/flatcar/{architecture}-usr/current/flatcar_production_ami_all.json' + "stable": f"https://stable.release.flatcar-linux.net/{architecture}-usr/current/flatcar_production_ami_all.json", + "beta": f"https://beta.release.flatcar-linux.net/{architecture}-usr/current/flatcar_production_ami_all.json", + "alpha": f"https://alpha.release.flatcar-linux.net/{architecture}-usr/current/flatcar_production_ami_all.json", + "archive": f"https://web.archive.org/web/20220625112618if_/https://stable.release.flatcar-linux.net/{architecture}-usr/current/flatcar_production_ami_all.json", + "toil": f"https://raw.githubusercontent.com/DataBiosphere/toil/master/contrib/flatcar/{architecture}-usr/current/flatcar_production_ami_all.json", }[source] return cast(bytes, urllib.request.urlopen(JSON_FEED_URL).read()) -def flatcar_release_feed_amis(region: str, architecture: str = 'amd64', source: str = 'stable') -> Iterator[str]: + +def flatcar_release_feed_amis( + region: str, architecture: str = "amd64", source: str = "stable" +) -> Iterator[str]: """ Yield AMI IDs for the given architecture from the Flatcar release feed. - + :param source: can be set to a Flatcar release channel ('stable', 'beta', or 'alpha'), 'archive' to check the Internet Archive for a feed, and 'toil' to check if the Toil project has put up a feed. - + Retries if the release feed cannot be fetched. If the release feed has a permanent error, yields nothing. If some entries in the release feed are unparseable, yields the others. """ - + # If we get non-JSON content we want to retry. MAX_TRIES = 3 - + try_number = 0 while try_number < MAX_TRIES: try: @@ -101,45 +126,48 @@ def flatcar_release_feed_amis(region: str, architecture: str = 'amd64', source: break except HTTPError: # Flatcar servers did not return the feed - logger.exception(f'Could not retrieve {source} Flatcar release feed JSON') + logger.exception(f"Could not retrieve {source} Flatcar release feed JSON") # Don't retry return except json.JSONDecodeError: # Feed is not JSON - logger.exception(f'Could not decode {source} Flatcar release feed JSON') + logger.exception(f"Could not decode {source} Flatcar release feed JSON") # Try again try_number += 1 continue except URLError: # Could be a connection timeout - logger.exception(f'Failed to retrieve {source} Flatcar release feed JSON') + logger.exception(f"Failed to retrieve {source} Flatcar release feed JSON") # Try again try_number += 1 continue if try_number == MAX_TRIES: # We could not get the JSON - logger.error(f'Could not get a readable {source} Flatcar release feed JSON') + logger.error(f"Could not get a readable {source} Flatcar release feed JSON") # Bail on this method return - for ami_record in feed.get('amis', []): + for ami_record in feed.get("amis", []): # Scan the list of regions - if ami_record.get('name', None) == region: + if ami_record.get("name", None) == region: # When we find ours, return the AMI ID - if 'hvm' in ami_record: - yield ami_record['hvm'] + if "hvm" in ami_record: + yield ami_record["hvm"] # And stop, there should be one per region. return # We didn't find our region - logger.warning(f'Flatcar {source} release feed does not have an image for region {region}') - - + logger.warning( + f"Flatcar {source} release feed does not have an image for region {region}" + ) + @retry() # TODO: What errors do we get for timeout, JSON parse failure, etc? -def feed_flatcar_ami_release(ec2_client: BaseClient, architecture: str = 'amd64', source: str = 'stable') -> Optional[str]: +def feed_flatcar_ami_release( + ec2_client: BaseClient, architecture: str = "amd64", source: str = "stable" +) -> Optional[str]: """ Check a Flatcar release feed for the latest flatcar AMI. - + Verify it's on AWS. :param ec2_client: Boto3 EC2 Client @@ -152,40 +180,55 @@ def feed_flatcar_ami_release(ec2_client: BaseClient, architecture: str = 'amd64' # Rather than hardcode a list of AMIs by region that will die, we use # their JSON feed of the current ones. - region = ec2_client._client_config.region_name # type: ignore - + region = ec2_client._client_config.region_name # type: ignore + for ami in flatcar_release_feed_amis(region, architecture, source): # verify it exists on AWS try: - response = ec2_client.describe_images(Filters=[{'Name': 'image-id', 'Values': [ami]}]) # type: ignore - if len(response['Images']) == 1 and response['Images'][0]['State'] == 'available': + response = ec2_client.describe_images(Filters=[{"Name": "image-id", "Values": [ami]}]) # type: ignore + if ( + len(response["Images"]) == 1 + and response["Images"][0]["State"] == "available" + ): return ami else: - logger.warning(f'Flatcar release feed suggests image {ami} which does not exist on AWS in {region}') + logger.warning( + f"Flatcar release feed suggests image {ami} which does not exist on AWS in {region}" + ) except ClientError: # Sometimes we get back nonsense like: # botocore.exceptions.ClientError: An error occurred (AuthFailure) when calling the DescribeImages operation: AWS was not able to validate the provided access credentials # Don't hold that against the AMI. - logger.exception(f'Unable to check if AMI {ami} exists on AWS in {region}; assuming it does') + logger.exception( + f"Unable to check if AMI {ami} exists on AWS in {region}; assuming it does" + ) return ami # We didn't find it - logger.warning(f'Flatcar release feed does not have an image for region {region} that exists on AWS') + logger.warning( + f"Flatcar release feed does not have an image for region {region} that exists on AWS" + ) return None @retry() # TODO: What errors do we get for timeout, JSON parse failure, etc? -def aws_marketplace_flatcar_ami_search(ec2_client: BaseClient, architecture: str = 'amd64') -> Optional[str]: +def aws_marketplace_flatcar_ami_search( + ec2_client: BaseClient, architecture: str = "amd64" +) -> Optional[str]: """Query AWS for all AMI names matching ``Flatcar-stable-*`` and return the most recent one.""" # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ec2.html#EC2.Client.describe_images # Possible arch choices on AWS: 'i386'|'x86_64'|'arm64'|'x86_64_mac' - architecture_mapping = {'amd64': 'x86_64', - 'arm64': 'arm64'} - response: dict = ec2_client.describe_images(Owners=['aws-marketplace'], # type: ignore - Filters=[{'Name': 'name', 'Values': ['Flatcar-stable-*']}]) - latest: Dict[str, str] = {'CreationDate': '0lder than atoms.'} - for image in response['Images']: - if image['Architecture'] == architecture_mapping[architecture] and image['State'] == 'available': - if image['CreationDate'] > latest['CreationDate']: + architecture_mapping = {"amd64": "x86_64", "arm64": "arm64"} + response = ec2_client.describe_images( # type: ignore[attr-defined] + Owners=["aws-marketplace"], + Filters=[{"Name": "name", "Values": ["Flatcar-stable-*"]}], + ) + latest: dict[str, str] = {"CreationDate": "0lder than atoms."} + for image in response["Images"]: + if ( + image["Architecture"] == architecture_mapping[architecture] + and image["State"] == "available" + ): + if image["CreationDate"] > latest["CreationDate"]: latest = image - return latest.get('ImageId', None) + return latest.get("ImageId", None) diff --git a/src/toil/lib/aws/iam.py b/src/toil/lib/aws/iam.py index fcaaa59706..b091cc6236 100644 --- a/src/toil/lib/aws/iam.py +++ b/src/toil/lib/aws/iam.py @@ -1,17 +1,17 @@ import fnmatch import json import logging -import boto3 - -from botocore.exceptions import ClientError from collections import defaultdict from functools import lru_cache -from typing import TYPE_CHECKING, Dict, List, Optional, Union, Any +from typing import TYPE_CHECKING, Any, Optional, Union + +import boto3 +from botocore.exceptions import ClientError from toil.lib.aws import AWSServerErrors, session -from toil.lib.misc import printq -from toil.lib.retry import retry, get_error_status, get_error_code from toil.lib.aws.session import client as get_client +from toil.lib.misc import printq +from toil.lib.retry import get_error_code, get_error_status, retry if TYPE_CHECKING: from mypy_boto3_iam import IAMClient @@ -23,7 +23,7 @@ logger = logging.getLogger(__name__) -#TODO Make this comprehensive +# TODO Make this comprehensive CLUSTER_LAUNCHING_PERMISSIONS = [ "iam:CreateRole", "iam:CreateInstanceProfile", @@ -59,23 +59,30 @@ "ec2:TerminateInstances", ] -AllowedActionCollection = Dict[str, Dict[str, List[str]]] +AllowedActionCollection = dict[str, dict[str, list[str]]] @retry(errors=[AWSServerErrors]) -def delete_iam_instance_profile(instance_profile_name: str, region: Optional[str] = None, quiet: bool = True) -> None: +def delete_iam_instance_profile( + instance_profile_name: str, region: Optional[str] = None, quiet: bool = True +) -> None: iam_resource = session.resource("iam", region_name=region) instance_profile = iam_resource.InstanceProfile(instance_profile_name) if instance_profile.roles is not None: for role in instance_profile.roles: - printq(f'Now dissociating role: {role.name} from instance profile {instance_profile_name}', quiet) + printq( + f"Now dissociating role: {role.name} from instance profile {instance_profile_name}", + quiet, + ) instance_profile.remove_role(RoleName=role.name) instance_profile.delete() printq(f'Instance profile "{instance_profile_name}" successfully deleted.', quiet) @retry(errors=[AWSServerErrors]) -def delete_iam_role(role_name: str, region: Optional[str] = None, quiet: bool = True) -> None: +def delete_iam_role( + role_name: str, region: Optional[str] = None, quiet: bool = True +) -> None: """ Deletes an AWS IAM role. Any separate policies are detached from the role, and any inline policies are deleted. @@ -84,22 +91,36 @@ def delete_iam_role(role_name: str, region: Optional[str] = None, quiet: bool = :param quiet: Whether or not to print/log information about the deletion to stdout. """ # TODO: This function could benefit from less complex Boto3 type hints - iam_client = session.client('iam', region_name=region) - iam_resource = session.resource('iam', region_name=region) + iam_client = session.client("iam", region_name=region) + iam_resource = session.resource("iam", region_name=region) role = iam_resource.Role(role_name) # normal policies for attached_policy in role.attached_policies.all(): - printq(f'Now dissociating policy: {attached_policy.policy_name} from role {role.name}', quiet) + printq( + f"Now dissociating policy: {attached_policy.policy_name} from role {role.name}", + quiet, + ) role.detach_policy(PolicyArn=attached_policy.arn) # inline policies for inline_policy in role.policies.all(): - printq(f'Deleting inline policy: {inline_policy.policy_name} from role {role.name}', quiet) - iam_client.delete_role_policy(RoleName=role.name, PolicyName=inline_policy.policy_name) + printq( + f"Deleting inline policy: {inline_policy.policy_name} from role {role.name}", + quiet, + ) + iam_client.delete_role_policy( + RoleName=role.name, PolicyName=inline_policy.policy_name + ) iam_client.delete_role(RoleName=role_name) - printq(f'Role {role_name} successfully deleted.', quiet) + printq(f"Role {role_name} successfully deleted.", quiet) + # "PolicyDocumentDictTypeDef" -def create_iam_role(role_name: str, assume_role_policy_document: str, policies: Dict[str, Any], region: Optional[str] = None) -> str: +def create_iam_role( + role_name: str, + assume_role_policy_document: str, + policies: dict[str, Any], + region: Optional[str] = None, +) -> str: """ Creates an AWS IAM role. Any separate policies are detached from the role, and any inline policies are deleted. @@ -108,15 +129,17 @@ def create_iam_role(role_name: str, assume_role_policy_document: str, policies: :param assume_role_policy_document: Policies to create inline with the role. :param policies: Global policies to attach to the role. """ - iam_client = session.client('iam', region_name=region) + iam_client = session.client("iam", region_name=region) try: # Make the role - logger.debug('Creating IAM role %s...', role_name) - iam_client.create_role(RoleName=role_name, AssumeRolePolicyDocument=assume_role_policy_document) - logger.debug('Created new IAM role') + logger.debug("Creating IAM role %s...", role_name) + iam_client.create_role( + RoleName=role_name, AssumeRolePolicyDocument=assume_role_policy_document + ) + logger.debug("Created new IAM role") except ClientError as e: - if get_error_status(e) == 409 and get_error_code(e) == 'EntityAlreadyExists': - logger.debug('IAM role already exists. Reusing.') + if get_error_status(e) == 409 and get_error_code(e) == "EntityAlreadyExists": + logger.debug("IAM role already exists. Reusing.") else: raise @@ -129,44 +152,55 @@ def create_iam_role(role_name: str, assume_role_policy_document: str, policies: for policy_name, policy in policies.items(): current_policy = None try: - current_policy = iam_client.get_role_policy(RoleName=role_name, PolicyName=policy_name)["PolicyDocument"] + current_policy = iam_client.get_role_policy( + RoleName=role_name, PolicyName=policy_name + )["PolicyDocument"] except iam_client.exceptions.NoSuchEntityException: pass if current_policy != policy: - iam_client.put_role_policy(RoleName=role_name, PolicyName=policy_name, PolicyDocument=json.dumps(policy)) + iam_client.put_role_policy( + RoleName=role_name, + PolicyName=policy_name, + PolicyDocument=json.dumps(policy), + ) # Now the role has the right policies so it is ready. return role_name def init_action_collection() -> AllowedActionCollection: - ''' + """ Initialization of an action collection, an action collection contains allowed Actions and NotActions by resource, these are patterns containing wildcards, an Action explicitly allows a matched pattern, eg ec2:* will explicitly allow all ec2 permissions A NotAction will explicitly allow all actions that don't match a specific pattern eg iam:* allows all non iam actions - ''' - return defaultdict(lambda: {'Action': [], 'NotAction': []}) + """ + return defaultdict(lambda: {"Action": [], "NotAction": []}) -def add_to_action_collection(a: AllowedActionCollection, b: AllowedActionCollection) -> AllowedActionCollection: - ''' + +def add_to_action_collection( + a: AllowedActionCollection, b: AllowedActionCollection +) -> AllowedActionCollection: + """ Combines two action collections - ''' + """ to_return = init_action_collection() for key in a.keys(): - to_return[key]['Action'] += a[key]['Action'] - to_return[key]['NotAction'] += a[key]['NotAction'] + to_return[key]["Action"] += a[key]["Action"] + to_return[key]["NotAction"] += a[key]["NotAction"] for key in b.keys(): - to_return[key]['Action'] += b[key]['Action'] - to_return[key]['NotAction'] += b[key]['NotAction'] + to_return[key]["Action"] += b[key]["Action"] + to_return[key]["NotAction"] += b[key]["NotAction"] return to_return -def policy_permissions_allow(given_permissions: AllowedActionCollection, required_permissions: List[str] = []) -> bool: +def policy_permissions_allow( + given_permissions: AllowedActionCollection, required_permissions: list[str] = [] +) -> bool: """ Check whether given set of actions are a subset of another given set of actions, returns true if they are otherwise false and prints a warning. @@ -176,24 +210,31 @@ def policy_permissions_allow(given_permissions: AllowedActionCollection, require """ # We only check actions explicitly allowed on all resources here, - #TODO: Add a resource parameter to check for actions allowed by resource + # TODO: Add a resource parameter to check for actions allowed by resource resource = "*" missing_perms = [] for permission in required_permissions: - if not permission_matches_any(permission, given_permissions[resource]['Action']): - if given_permissions[resource]['NotAction'] == [] or permission_matches_any(permission, given_permissions[resource]["NotAction"]): + if not permission_matches_any( + permission, given_permissions[resource]["Action"] + ): + if given_permissions[resource]["NotAction"] == [] or permission_matches_any( + permission, given_permissions[resource]["NotAction"] + ): missing_perms.append(permission) if missing_perms: - logger.warning('You appear to lack the folowing AWS permissions: %s', ', '.join(missing_perms)) + logger.warning( + "You appear to lack the folowing AWS permissions: %s", + ", ".join(missing_perms), + ) return False return True -def permission_matches_any(perm: str, list_perms: List[str]) -> bool: +def permission_matches_any(perm: str, list_perms: list[str]) -> bool: """ Takes a permission and checks whether it's contained within a list of given permissions Returns True if it is otherwise False @@ -235,14 +276,14 @@ def get_actions_from_policy_document( if isinstance(statement[key], list): # type: ignore[literal-required] allowed_actions[resource][key] += statement[key] # type: ignore[literal-required] else: - #Assumes that if it isn't a list it's probably a string + # Assumes that if it isn't a list it's probably a string allowed_actions[resource][key].append(statement[key]) # type: ignore[literal-required] return allowed_actions def allowed_actions_attached( - iam: "IAMClient", attached_policies: List["AttachedPolicyTypeDef"] + iam: "IAMClient", attached_policies: list["AttachedPolicyTypeDef"] ) -> AllowedActionCollection: """ Go through all attached policy documents and create an AllowedActionCollection representing granted permissions. @@ -253,17 +294,20 @@ def allowed_actions_attached( allowed_actions: AllowedActionCollection = init_action_collection() for policy in attached_policies: - policy_desc = iam.get_policy(PolicyArn=policy['PolicyArn']) - policy_ver = iam.get_policy_version(PolicyArn=policy_desc['Policy']['Arn'], VersionId=policy_desc['Policy']['DefaultVersionId']) - policy_document = policy_ver['PolicyVersion']['Document'] - #TODO whenever boto fixes the typing, stop ignoring this line in typecheck - allowed_actions = add_to_action_collection(allowed_actions, get_actions_from_policy_document(policy_document)) # type: ignore + policy_desc = iam.get_policy(PolicyArn=policy["PolicyArn"]) + policy_ver = iam.get_policy_version( + PolicyArn=policy_desc["Policy"]["Arn"], + VersionId=policy_desc["Policy"]["DefaultVersionId"], + ) + policy_document = policy_ver["PolicyVersion"]["Document"] + # TODO whenever boto fixes the typing, stop ignoring this line in typecheck + allowed_actions = add_to_action_collection(allowed_actions, get_actions_from_policy_document(policy_document)) # type: ignore return allowed_actions def allowed_actions_roles( - iam: "IAMClient", policy_names: List[str], role_name: str + iam: "IAMClient", policy_names: list[str], role_name: str ) -> AllowedActionCollection: """ Returns a dictionary containing a list of all aws actions allowed for a given role. @@ -276,10 +320,7 @@ def allowed_actions_roles( allowed_actions: AllowedActionCollection = init_action_collection() for policy_name in policy_names: - role_policy = iam.get_role_policy( - RoleName=role_name, - PolicyName=policy_name - ) + role_policy = iam.get_role_policy(RoleName=role_name, PolicyName=policy_name) logger.debug("Checking role policy") # PolicyDocument is now a TypedDict, but an instance of TypedDict is not an instance of dict? if isinstance(role_policy["PolicyDocument"], str): @@ -287,13 +328,15 @@ def allowed_actions_roles( else: policy_document = role_policy["PolicyDocument"] - allowed_actions = add_to_action_collection(allowed_actions, get_actions_from_policy_document(policy_document)) + allowed_actions = add_to_action_collection( + allowed_actions, get_actions_from_policy_document(policy_document) + ) return allowed_actions def collect_policy_actions( - policy_documents: List[Union[str, "PolicyDocumentDictTypeDef"]] + policy_documents: list[Union[str, "PolicyDocumentDictTypeDef"]] ) -> AllowedActionCollection: """ Collect all of the actions allowed by the given policy documents into one AllowedActionCollection. @@ -305,12 +348,14 @@ def collect_policy_actions( policy_dict = json.loads(policy_str) else: policy_dict = policy_str - allowed_actions = add_to_action_collection(allowed_actions, get_actions_from_policy_document(policy_dict)) + allowed_actions = add_to_action_collection( + allowed_actions, get_actions_from_policy_document(policy_dict) + ) return allowed_actions def allowed_actions_user( - iam: "IAMClient", policy_names: List[str], user_name: str + iam: "IAMClient", policy_names: list[str], user_name: str ) -> AllowedActionCollection: """ Gets all allowed actions for a user given by user_name, returns a dictionary, keyed by resource, @@ -321,17 +366,16 @@ def allowed_actions_user( :param user_name: Name of user to get associated policies """ user_policies = [ - iam.get_user_policy( - UserName=user_name, - PolicyName=policy_name - )["PolicyDocument"] + iam.get_user_policy(UserName=user_name, PolicyName=policy_name)[ + "PolicyDocument" + ] for policy_name in policy_names ] return collect_policy_actions(user_policies) def allowed_actions_group( - iam: "IAMClient", policy_names: List[str], group_name: str + iam: "IAMClient", policy_names: list[str], group_name: str ) -> AllowedActionCollection: """ Gets all allowed actions for a group given by group_name, returns a dictionary, keyed by resource, @@ -342,10 +386,9 @@ def allowed_actions_group( :param group_name: Name of group to get associated policies """ group_policies = [ - iam.get_group_policy( - GroupName=group_name, - PolicyName=policy_name - )["PolicyDocument"] + iam.get_group_policy(GroupName=group_name, PolicyName=policy_name)[ + "PolicyDocument" + ] for policy_name in policy_names ] return collect_policy_actions(group_policies) @@ -368,22 +411,42 @@ def get_policy_permissions(region: str) -> AllowedActionCollection: try: # If successful then we assume we are operating as a user, and grab the associated permissions user = iam.get_user() - list_policies = iam.list_user_policies(UserName=user['User']['UserName']) - attached_policies = iam.list_attached_user_policies(UserName=user['User']['UserName']) - user_attached_policies = allowed_actions_attached(iam, attached_policies['AttachedPolicies']) - allowed_actions = add_to_action_collection(allowed_actions, user_attached_policies) - user_inline_policies = allowed_actions_user(iam, list_policies['PolicyNames'], user['User']['UserName']) - allowed_actions = add_to_action_collection(allowed_actions, user_inline_policies) + list_policies = iam.list_user_policies(UserName=user["User"]["UserName"]) + attached_policies = iam.list_attached_user_policies( + UserName=user["User"]["UserName"] + ) + user_attached_policies = allowed_actions_attached( + iam, attached_policies["AttachedPolicies"] + ) + allowed_actions = add_to_action_collection( + allowed_actions, user_attached_policies + ) + user_inline_policies = allowed_actions_user( + iam, list_policies["PolicyNames"], user["User"]["UserName"] + ) + allowed_actions = add_to_action_collection( + allowed_actions, user_inline_policies + ) # grab group policies associated with the user - groups = iam.list_groups_for_user(UserName=user['User']['UserName']) + groups = iam.list_groups_for_user(UserName=user["User"]["UserName"]) for group in groups["Groups"]: - list_policies = iam.list_group_policies(GroupName=group['GroupName']) - attached_policies = iam.list_attached_group_policies(GroupName=group['GroupName']) - group_attached_policies = allowed_actions_attached(iam, attached_policies['AttachedPolicies']) - allowed_actions = add_to_action_collection(allowed_actions, group_attached_policies) - group_inline_policies = allowed_actions_group(iam, list_policies['PolicyNames'], group['GroupName']) - allowed_actions = add_to_action_collection(allowed_actions, group_inline_policies) + list_policies = iam.list_group_policies(GroupName=group["GroupName"]) + attached_policies = iam.list_attached_group_policies( + GroupName=group["GroupName"] + ) + group_attached_policies = allowed_actions_attached( + iam, attached_policies["AttachedPolicies"] + ) + allowed_actions = add_to_action_collection( + allowed_actions, group_attached_policies + ) + group_inline_policies = allowed_actions_group( + iam, list_policies["PolicyNames"], group["GroupName"] + ) + allowed_actions = add_to_action_collection( + allowed_actions, group_inline_policies + ) except: # If not successful, we check the role associated with an instance profile @@ -395,19 +458,28 @@ def get_policy_permissions(region: str) -> AllowedActionCollection: role_name = role["Arn"].split("/")[1] list_policies = iam.list_role_policies(RoleName=role_name) attached_policies = iam.list_attached_role_policies(RoleName=role_name) - role_attached_policies = allowed_actions_attached(iam, attached_policies['AttachedPolicies']) - allowed_actions = add_to_action_collection(allowed_actions, role_attached_policies) - role_inline_policies = allowed_actions_roles(iam, list_policies['PolicyNames'], role_name) - allowed_actions = add_to_action_collection(allowed_actions, role_inline_policies) + role_attached_policies = allowed_actions_attached( + iam, attached_policies["AttachedPolicies"] + ) + allowed_actions = add_to_action_collection( + allowed_actions, role_attached_policies + ) + role_inline_policies = allowed_actions_roles( + iam, list_policies["PolicyNames"], role_name + ) + allowed_actions = add_to_action_collection( + allowed_actions, role_inline_policies + ) except: logger.exception("Exception when trying to get role policies") logger.debug("Allowed actions: %s", allowed_actions) return allowed_actions -@lru_cache() + +@lru_cache def get_aws_account_num() -> Optional[str]: """ Returns AWS account num """ - return boto3.client('sts').get_caller_identity().get('Account') + return boto3.client("sts").get_caller_identity().get("Account") diff --git a/src/toil/lib/aws/s3.py b/src/toil/lib/aws/s3.py index 77cb94d56e..b911a18dc9 100644 --- a/src/toil/lib/aws/s3.py +++ b/src/toil/lib/aws/s3.py @@ -12,17 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. import logging -from typing import List from mypy_boto3_s3.type_defs import ListMultipartUploadsOutputTypeDef -from toil.lib.aws import session, AWSServerErrors +from toil.lib.aws import AWSServerErrors, session from toil.lib.retry import retry logger = logging.getLogger(__name__) @retry(errors=[AWSServerErrors]) -def list_multipart_uploads(bucket: str, region: str, prefix: str, max_uploads: int = 1) -> ListMultipartUploadsOutputTypeDef: +def list_multipart_uploads( + bucket: str, region: str, prefix: str, max_uploads: int = 1 +) -> ListMultipartUploadsOutputTypeDef: s3_client = session.client("s3", region_name=region) - return s3_client.list_multipart_uploads(Bucket=bucket, MaxUploads=max_uploads, Prefix=prefix) + return s3_client.list_multipart_uploads( + Bucket=bucket, MaxUploads=max_uploads, Prefix=prefix + ) diff --git a/src/toil/lib/aws/session.py b/src/toil/lib/aws/session.py index dc8b837d4d..a442648322 100644 --- a/src/toil/lib/aws/session.py +++ b/src/toil/lib/aws/session.py @@ -15,17 +15,7 @@ import logging import os import threading -from typing import ( - TYPE_CHECKING, - Dict, - Literal, - Optional, - Tuple, - TypeVar, - Union, - cast, - overload, -) +from typing import TYPE_CHECKING, Literal, Optional, cast, overload import boto3 import boto3.resources.base @@ -61,6 +51,7 @@ # initializing Boto3 (or Boto2) things at a time. _init_lock = threading.RLock() + def _new_boto3_session(region_name: Optional[str] = None) -> Session: """ This is the One True Place where new Boto3 sessions should be made, and @@ -76,10 +67,16 @@ def _new_boto3_session(region_name: Optional[str] = None) -> Session: with _init_lock: botocore_session = get_session() - botocore_session.get_component('credential_provider').get_provider( - 'assume-role').cache = JSONFileCache() + botocore_session.get_component("credential_provider").get_provider( + "assume-role" + ).cache = JSONFileCache() + + return Session( + botocore_session=botocore_session, + region_name=region_name, + profile_name=os.environ.get("TOIL_AWS_PROFILE", None), + ) - return Session(botocore_session=botocore_session, region_name=region_name, profile_name=os.environ.get("TOIL_AWS_PROFILE", None)) class AWSConnectionManager: """ @@ -115,23 +112,31 @@ def __init__(self) -> None: """ # This stores Boto3 sessions in .item of a thread-local storage, by # region. - self.sessions_by_region: Dict[Optional[str], threading.local] = collections.defaultdict(threading.local) + self.sessions_by_region: dict[Optional[str], threading.local] = ( + collections.defaultdict(threading.local) + ) # This stores Boto3 resources in .item of a thread-local storage, by # (region, service name, endpoint URL) tuples - self.resource_cache: Dict[Tuple[Optional[str], str, Optional[str]], threading.local] = collections.defaultdict(threading.local) + self.resource_cache: dict[ + tuple[Optional[str], str, Optional[str]], threading.local + ] = collections.defaultdict(threading.local) # This stores Boto3 clients in .item of a thread-local storage, by # (region, service name, endpoint URL) tuples - self.client_cache: Dict[Tuple[Optional[str], str, Optional[str]], threading.local] = collections.defaultdict(threading.local) + self.client_cache: dict[ + tuple[Optional[str], str, Optional[str]], threading.local + ] = collections.defaultdict(threading.local) # This stores Boto 2 connections in .item of a thread-local storage, by # (region, service name) tuples. - self.boto2_cache: Dict[Tuple[Optional[str], str], threading.local] = collections.defaultdict(threading.local) + self.boto2_cache: dict[tuple[Optional[str], str], threading.local] = ( + collections.defaultdict(threading.local) + ) def session(self, region: Optional[str]) -> boto3.session.Session: """ Get the Boto3 Session to use for the given region. """ storage = self.sessions_by_region[region] - if not hasattr(storage, 'item'): + if not hasattr(storage, "item"): # This is the first time this thread wants to talk to this region # through this manager storage.item = _new_boto3_session(region_name=region) @@ -159,7 +164,12 @@ def resource( endpoint_url: Optional[str] = None, ) -> "EC2ServiceResource": ... - def resource(self, region: Optional[str], service_name: str, endpoint_url: Optional[str] = None) -> boto3.resources.base.ServiceResource: + def resource( + self, + region: Optional[str], + service_name: str, + endpoint_url: Optional[str] = None, + ) -> boto3.resources.base.ServiceResource: """ Get the Boto3 Resource to use with the given service (like 'ec2') in the given region. @@ -168,7 +178,7 @@ def resource(self, region: Optional[str], service_name: str, endpoint_url: Optio """ key = (region, service_name, endpoint_url) storage = self.resource_cache[key] - if not hasattr(storage, 'item'): + if not hasattr(storage, "item"): with _init_lock: # We lock inside the if check; we don't care if the memoization # sometimes results in multiple different copies leaking out. @@ -178,10 +188,10 @@ def resource(self, region: Optional[str], service_name: str, endpoint_url: Optio # The Boto3 stubs are missing an overload for `resource` that takes # a non-literal string. See # - storage.item = self.session(region).resource(service_name, endpoint_url=endpoint_url) # type: ignore + storage.item = self.session(region).resource(service_name, endpoint_url=endpoint_url) # type: ignore else: # We might not be able to pass None to Boto3 and have it be the same as no argument. - storage.item = self.session(region).resource(service_name) # type: ignore + storage.item = self.session(region).resource(service_name) # type: ignore return cast(boto3.resources.base.ServiceResource, storage.item) @@ -234,9 +244,13 @@ def client( config: Optional[Config] = None, ) -> "AutoScalingClient": ... - - def client(self, region: Optional[str], service_name: Literal["ec2", "iam", "s3", "sts", "sdb", "autoscaling"], endpoint_url: Optional[str] = None, - config: Optional[Config] = None) -> botocore.client.BaseClient: + def client( + self, + region: Optional[str], + service_name: Literal["ec2", "iam", "s3", "sts", "sdb", "autoscaling"], + endpoint_url: Optional[str] = None, + config: Optional[Config] = None, + ) -> botocore.client.BaseClient: """ Get the Boto3 Client to use with the given service (like 'ec2') in the given region. @@ -249,29 +263,34 @@ def client(self, region: Optional[str], service_name: Literal["ec2", "iam", "s3" # Don't try and memoize if a custom config is used with _init_lock: if endpoint_url is not None: - return self.session(region).client(service_name, endpoint_url=endpoint_url, config=config) + return self.session(region).client( + service_name, endpoint_url=endpoint_url, config=config + ) else: return self.session(region).client(service_name, config=config) key = (region, service_name, endpoint_url) storage = self.client_cache[key] - if not hasattr(storage, 'item'): + if not hasattr(storage, "item"): with _init_lock: # We lock because we call .client() if endpoint_url is not None: # The Boto3 stubs are probably missing an overload here too. See: # - storage.item = self.session(region).client(service_name, endpoint_url=endpoint_url) + storage.item = self.session(region).client( + service_name, endpoint_url=endpoint_url + ) else: # We might not be able to pass None to Boto3 and have it be the same as no argument. storage.item = self.session(region).client(service_name) - return cast(botocore.client.BaseClient , storage.item) + return cast(botocore.client.BaseClient, storage.item) # If you don't want your own AWSConnectionManager, we have a global one and some global functions _global_manager = AWSConnectionManager() + def establish_boto3_session(region_name: Optional[str] = None) -> Session: """ Get a Boto 3 session usable by the current thread. @@ -282,6 +301,7 @@ def establish_boto3_session(region_name: Optional[str] = None) -> Session: # Just use a global version of the manager. Note that we change the argument order! return _global_manager.session(region_name) + @overload def client( service_name: Literal["ec2"], @@ -325,7 +345,13 @@ def client( config: Optional[Config] = None, ) -> "AutoScalingClient": ... -def client(service_name: Literal["ec2", "iam", "s3", "sts", "sdb", "autoscaling"], region_name: Optional[str] = None, endpoint_url: Optional[str] = None, config: Optional[Config] = None) -> botocore.client.BaseClient: + +def client( + service_name: Literal["ec2", "iam", "s3", "sts", "sdb", "autoscaling"], + region_name: Optional[str] = None, + endpoint_url: Optional[str] = None, + config: Optional[Config] = None, +) -> botocore.client.BaseClient: """ Get a Boto 3 client for a particular AWS service, usable by the current thread. @@ -333,7 +359,10 @@ def client(service_name: Literal["ec2", "iam", "s3", "sts", "sdb", "autoscaling" """ # Just use a global version of the manager. Note that we change the argument order! - return _global_manager.client(region_name, service_name, endpoint_url=endpoint_url, config=config) + return _global_manager.client( + region_name, service_name, endpoint_url=endpoint_url, config=config + ) + @overload def resource( @@ -354,7 +383,12 @@ def resource( endpoint_url: Optional[str] = None, ) -> "EC2ServiceResource": ... -def resource(service_name: Literal["s3", "iam", "ec2"], region_name: Optional[str] = None, endpoint_url: Optional[str] = None) -> boto3.resources.base.ServiceResource: + +def resource( + service_name: Literal["s3", "iam", "ec2"], + region_name: Optional[str] = None, + endpoint_url: Optional[str] = None, +) -> boto3.resources.base.ServiceResource: """ Get a Boto 3 resource for a particular AWS service, usable by the current thread. @@ -362,4 +396,6 @@ def resource(service_name: Literal["s3", "iam", "ec2"], region_name: Optional[st """ # Just use a global version of the manager. Note that we change the argument order! - return _global_manager.resource(region_name, service_name, endpoint_url=endpoint_url) + return _global_manager.resource( + region_name, service_name, endpoint_url=endpoint_url + ) diff --git a/src/toil/lib/aws/utils.py b/src/toil/lib/aws/utils.py index 1c094ba9c8..a742a9e8ae 100644 --- a/src/toil/lib/aws/utils.py +++ b/src/toil/lib/aws/utils.py @@ -15,20 +15,8 @@ import logging import os import socket -from typing import ( - TYPE_CHECKING, - Any, - Callable, - ContextManager, - Dict, - Iterable, - Iterator, - List, - Optional, - Set, - Tuple, - cast, -) +from collections.abc import Iterable, Iterator +from typing import TYPE_CHECKING, Any, Callable, ContextManager, Optional, cast from urllib.parse import ParseResult from toil.lib.aws import AWSRegionName, AWSServerErrors, session @@ -36,7 +24,6 @@ from toil.lib.retry import ( DEFAULT_DELAYS, DEFAULT_TIMEOUT, - ErrorCondition, get_error_code, get_error_status, old_retry, @@ -61,20 +48,20 @@ # These are error codes we expect from AWS if we are making requests too fast. # https://github.com/boto/botocore/blob/49f87350d54f55b687969ec8bf204df785975077/botocore/retries/standard.py#L316 THROTTLED_ERROR_CODES = [ - 'Throttling', - 'ThrottlingException', - 'ThrottledException', - 'RequestThrottledException', - 'TooManyRequestsException', - 'ProvisionedThroughputExceededException', - 'TransactionInProgressException', - 'RequestLimitExceeded', - 'BandwidthLimitExceeded', - 'LimitExceededException', - 'RequestThrottled', - 'SlowDown', - 'PriorRequestNotComplete', - 'EC2ThrottledException', + "Throttling", + "ThrottlingException", + "ThrottledException", + "RequestThrottledException", + "TooManyRequestsException", + "ProvisionedThroughputExceededException", + "TransactionInProgressException", + "RequestLimitExceeded", + "BandwidthLimitExceeded", + "LimitExceededException", + "RequestThrottled", + "SlowDown", + "PriorRequestNotComplete", + "EC2ThrottledException", ] @@ -96,12 +83,12 @@ def connection_reset(e: Exception) -> bool: # errno is listed as 104. To be safe, we check for both: return isinstance(e, socket.error) and e.errno in (errno.ECONNRESET, 104) + def connection_error(e: Exception) -> bool: """ Return True if an error represents a failure to make a network connection. """ - return (connection_reset(e) - or isinstance(e, EndpointConnectionError)) + return connection_reset(e) or isinstance(e, EndpointConnectionError) # TODO: Replace with: @retry and ErrorCondition @@ -109,34 +96,47 @@ def retryable_s3_errors(e: Exception) -> bool: """ Return true if this is an error from S3 that looks like we ought to retry our request. """ - return (connection_error(e) - or (isinstance(e, ClientError) and get_error_status(e) in (429, 500)) - or (isinstance(e, ClientError) and get_error_code(e) in THROTTLED_ERROR_CODES) - # boto3 errors - or (isinstance(e, ClientError) and get_error_code(e) in THROTTLED_ERROR_CODES) - or (isinstance(e, ClientError) and 'BucketNotEmpty' in str(e)) - or (isinstance(e, ClientError) and e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 409 and 'try again' in str(e)) - or (isinstance(e, ClientError) and e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') in (404, 429, 500, 502, 503, 504))) + return ( + connection_error(e) + or (isinstance(e, ClientError) and get_error_status(e) in (429, 500)) + or (isinstance(e, ClientError) and get_error_code(e) in THROTTLED_ERROR_CODES) + # boto3 errors + or (isinstance(e, ClientError) and get_error_code(e) in THROTTLED_ERROR_CODES) + or (isinstance(e, ClientError) and "BucketNotEmpty" in str(e)) + or ( + isinstance(e, ClientError) + and e.response.get("ResponseMetadata", {}).get("HTTPStatusCode") == 409 + and "try again" in str(e) + ) + or ( + isinstance(e, ClientError) + and e.response.get("ResponseMetadata", {}).get("HTTPStatusCode") + in (404, 429, 500, 502, 503, 504) + ) + ) -def retry_s3(delays: Iterable[float] = DEFAULT_DELAYS, timeout: float = DEFAULT_TIMEOUT, predicate: Callable[[Exception], bool] = retryable_s3_errors) -> Iterator[ContextManager[None]]: +def retry_s3( + delays: Iterable[float] = DEFAULT_DELAYS, + timeout: float = DEFAULT_TIMEOUT, + predicate: Callable[[Exception], bool] = retryable_s3_errors, +) -> Iterator[ContextManager[None]]: """ Retry iterator of context managers specifically for S3 operations. """ return old_retry(delays=delays, timeout=timeout, predicate=predicate) + @retry(errors=[AWSServerErrors]) def delete_s3_bucket( - s3_resource: "S3ServiceResource", - bucket: str, - quiet: bool = True + s3_resource: "S3ServiceResource", bucket: str, quiet: bool = True ) -> None: """ Delete the given S3 bucket. """ - printq(f'Deleting s3 bucket: {bucket}', quiet) + printq(f"Deleting s3 bucket: {bucket}", quiet) - paginator = s3_resource.meta.client.get_paginator('list_object_versions') + paginator = s3_resource.meta.client.get_paginator("list_object_versions") try: for response in paginator.paginate(Bucket=bucket): # Versions and delete markers can both go in here to be deleted. @@ -144,15 +144,20 @@ def delete_s3_bucket( # defined for them in the stubs to express that. See # . So we # have to do gymnastics to get them into the same list. - to_delete: List[Dict[str, Any]] = cast(List[Dict[str, Any]], response.get('Versions', [])) + \ - cast(List[Dict[str, Any]], response.get('DeleteMarkers', [])) + to_delete: list[dict[str, Any]] = cast( + list[dict[str, Any]], response.get("Versions", []) + ) + cast(list[dict[str, Any]], response.get("DeleteMarkers", [])) for entry in to_delete: - printq(f" Deleting {entry['Key']} version {entry['VersionId']}", quiet) - s3_resource.meta.client.delete_object(Bucket=bucket, Key=entry['Key'], VersionId=entry['VersionId']) + printq( + f" Deleting {entry['Key']} version {entry['VersionId']}", quiet + ) + s3_resource.meta.client.delete_object( + Bucket=bucket, Key=entry["Key"], VersionId=entry["VersionId"] + ) s3_resource.Bucket(bucket).delete() - printq(f'\n * Deleted s3 bucket successfully: {bucket}\n\n', quiet) + printq(f"\n * Deleted s3 bucket successfully: {bucket}\n\n", quiet) except s3_resource.meta.client.exceptions.NoSuchBucket: - printq(f'\n * S3 bucket no longer exists: {bucket}\n\n', quiet) + printq(f"\n * S3 bucket no longer exists: {bucket}\n\n", quiet) def create_s3_bucket( @@ -178,6 +183,7 @@ def create_s3_bucket( ) return bucket + @retry(errors=[ClientError]) def enable_public_objects(bucket_name: str) -> None: """ @@ -201,7 +207,7 @@ def enable_public_objects(bucket_name: str) -> None: would be a very awkward way to do it. So we restore the old behavior. """ - s3_client = session.client('s3') + s3_client = session.client("s3") # Even though the new default is for public access to be prohibited, this # is implemented by adding new things attached to the bucket. If we remove @@ -214,13 +220,18 @@ def enable_public_objects(bucket_name: str) -> None: # Stop using an ownership controls setting that prohibits ACLs. s3_client.delete_bucket_ownership_controls(Bucket=bucket_name) + class NoBucketLocationError(Exception): """ Error to represent that we could not get a location for a bucket. """ - pass -def get_bucket_region(bucket_name: str, endpoint_url: Optional[str] = None, only_strategies: Optional[Set[int]] = None) -> str: + +def get_bucket_region( + bucket_name: str, + endpoint_url: Optional[str] = None, + only_strategies: Optional[set[int]] = None, +) -> str: """ Get the AWS region name associated with the given S3 bucket, or raise NoBucketLocationError. @@ -231,13 +242,15 @@ def get_bucket_region(bucket_name: str, endpoint_url: Optional[str] = None, only :param only_strategies: For testing, use only strategies with 1-based numbers in this set. """ - s3_client = session.client('s3', endpoint_url=endpoint_url) + s3_client = session.client("s3", endpoint_url=endpoint_url) def attempt_get_bucket_location() -> Optional[str]: """ Try and get the bucket location from the normal API call. """ - return s3_client.get_bucket_location(Bucket=bucket_name).get('LocationConstraint', None) + return s3_client.get_bucket_location(Bucket=bucket_name).get( + "LocationConstraint", None + ) def attempt_get_bucket_location_from_us_east_1() -> Optional[str]: """ @@ -253,8 +266,10 @@ def attempt_get_bucket_location_from_us_east_1() -> Optional[str]: # It could also be because AWS open data buckets (which we tend to # encounter this problem for) tend to actually themselves be in # us-east-1. - backup_s3_client = session.client('s3', region_name='us-east-1') - return backup_s3_client.get_bucket_location(Bucket=bucket_name).get('LocationConstraint', None) + backup_s3_client = session.client("s3", region_name="us-east-1") + return backup_s3_client.get_bucket_location(Bucket=bucket_name).get( + "LocationConstraint", None + ) def attempt_head_bucket() -> Optional[str]: """ @@ -266,11 +281,11 @@ def attempt_head_bucket() -> Optional[str]: # us where the bucket is. See # info = s3_client.head_bucket(Bucket=bucket_name) - return info['ResponseMetadata']['HTTPHeaders']['x-amz-bucket-region'] + return info["ResponseMetadata"]["HTTPHeaders"]["x-amz-bucket-region"] # Compose a list of strategies we want to try in order, which may work. # None is an acceptable return type that actually means something. - strategies: List[Callable[[], Optional[str]]] = [] + strategies: list[Callable[[], Optional[str]]] = [] strategies.append(attempt_get_bucket_location) if not endpoint_url: # We should only try to talk to us-east-1 if we don't have a custom @@ -278,20 +293,24 @@ def attempt_head_bucket() -> Optional[str]: strategies.append(attempt_get_bucket_location_from_us_east_1) strategies.append(attempt_head_bucket) - error_logs: List[Tuple[int, str]] = [] + error_logs: list[tuple[int, str]] = [] for attempt in retry_s3(): with attempt: for i, strategy in enumerate(strategies): - if only_strategies is not None and i+1 not in only_strategies: + if only_strategies is not None and i + 1 not in only_strategies: # We want to test running without this strategy. continue try: location = bucket_location_to_region(strategy()) - logger.debug('Got bucket location from strategy %d', i + 1) + logger.debug("Got bucket location from strategy %d", i + 1) return location except ClientError as e: - if get_error_code(e) == 'AccessDenied' and not endpoint_url: - logger.debug('Strategy %d to get bucket location did not work: %s', i + 1, e) + if get_error_code(e) == "AccessDenied" and not endpoint_url: + logger.debug( + "Strategy %d to get bucket location did not work: %s", + i + 1, + e, + ) error_logs.append((i + 1, str(e))) last_error: Exception = e # We were blocked with this strategy. Move on to the @@ -301,135 +320,147 @@ def attempt_head_bucket() -> Optional[str]: raise except KeyError as e: # If we get a weird head response we will have a KeyError - logger.debug('Strategy %d to get bucket location did not work: %s', i + 1, e) + logger.debug( + "Strategy %d to get bucket location did not work: %s", i + 1, e + ) error_logs.append((i + 1, str(e))) last_error = e error_messages = [] for rank, message in error_logs: - error_messages.append(f"Strategy {rank} failed to get bucket location because: {message}") + error_messages.append( + f"Strategy {rank} failed to get bucket location because: {message}" + ) # If we get here we ran out of attempts. - raise NoBucketLocationError("Could not get bucket location: " + "\n".join(error_messages)) from last_error + raise NoBucketLocationError( + "Could not get bucket location: " + "\n".join(error_messages) + ) from last_error + def region_to_bucket_location(region: str) -> str: - return '' if region == 'us-east-1' else region + return "" if region == "us-east-1" else region + def bucket_location_to_region(location: Optional[str]) -> str: return "us-east-1" if location == "" or location is None else location def get_object_for_url(url: ParseResult, existing: Optional[bool] = None) -> "S3Object": - """ - Extracts a key (object) from a given parsed s3:// URL. + """ + Extracts a key (object) from a given parsed s3:// URL. - If existing is true and the object does not exist, raises FileNotFoundError. + If existing is true and the object does not exist, raises FileNotFoundError. - :param bool existing: If True, key is expected to exist. If False, key is expected not to - exists and it will be created. If None, the key will be created if it doesn't exist. - """ + :param bool existing: If True, key is expected to exist. If False, key is expected not to + exists and it will be created. If None, the key will be created if it doesn't exist. + """ + + key_name = url.path[1:] + bucket_name = url.netloc + + # Decide if we need to override Boto's built-in URL here. + endpoint_url: Optional[str] = None + host = os.environ.get("TOIL_S3_HOST", None) + port = os.environ.get("TOIL_S3_PORT", None) + protocol = "https" + if os.environ.get("TOIL_S3_USE_SSL", True) == "False": + protocol = "http" + if host: + endpoint_url = f"{protocol}://{host}" + f":{port}" if port else "" + + # TODO: OrdinaryCallingFormat equivalent in boto3? + # if botoargs: + # botoargs['calling_format'] = boto.s3.connection.OrdinaryCallingFormat() - key_name = url.path[1:] - bucket_name = url.netloc - - # Decide if we need to override Boto's built-in URL here. - endpoint_url: Optional[str] = None - host = os.environ.get('TOIL_S3_HOST', None) - port = os.environ.get('TOIL_S3_PORT', None) - protocol = 'https' - if os.environ.get('TOIL_S3_USE_SSL', True) == 'False': - protocol = 'http' - if host: - endpoint_url = f'{protocol}://{host}' + f':{port}' if port else '' - - # TODO: OrdinaryCallingFormat equivalent in boto3? - # if botoargs: - # botoargs['calling_format'] = boto.s3.connection.OrdinaryCallingFormat() - - try: - # Get the bucket's region to avoid a redirect per request - region = get_bucket_region(bucket_name, endpoint_url=endpoint_url) - s3 = session.resource('s3', region_name=region, endpoint_url=endpoint_url) - except NoBucketLocationError as e: - # Probably don't have permission. - # TODO: check if it is that - logger.debug("Couldn't get bucket location: %s", e) - logger.debug("Fall back to not specifying location") - s3 = session.resource('s3', endpoint_url=endpoint_url) - - obj = s3.Object(bucket_name, key_name) - objExists = True - - try: - obj.load() - except ClientError as e: - if get_error_status(e) == 404: - objExists = False - else: - raise - if existing is True and not objExists: - raise FileNotFoundError(f"Key '{key_name}' does not exist in bucket '{bucket_name}'.") - elif existing is False and objExists: - raise RuntimeError(f"Key '{key_name}' exists in bucket '{bucket_name}'.") - - if not objExists: - obj.put() # write an empty file - return obj + try: + # Get the bucket's region to avoid a redirect per request + region = get_bucket_region(bucket_name, endpoint_url=endpoint_url) + s3 = session.resource("s3", region_name=region, endpoint_url=endpoint_url) + except NoBucketLocationError as e: + # Probably don't have permission. + # TODO: check if it is that + logger.debug("Couldn't get bucket location: %s", e) + logger.debug("Fall back to not specifying location") + s3 = session.resource("s3", endpoint_url=endpoint_url) + + obj = s3.Object(bucket_name, key_name) + objExists = True + + try: + obj.load() + except ClientError as e: + if get_error_status(e) == 404: + objExists = False + else: + raise + if existing is True and not objExists: + raise FileNotFoundError( + f"Key '{key_name}' does not exist in bucket '{bucket_name}'." + ) + elif existing is False and objExists: + raise RuntimeError(f"Key '{key_name}' exists in bucket '{bucket_name}'.") + + if not objExists: + obj.put() # write an empty file + return obj @retry(errors=[AWSServerErrors]) -def list_objects_for_url(url: ParseResult) -> List[str]: - """ - Extracts a key (object) from a given parsed s3:// URL. The URL will be - supplemented with a trailing slash if it is missing. - """ - key_name = url.path[1:] - bucket_name = url.netloc - - if key_name != '' and not key_name.endswith('/'): - # Make sure to put the trailing slash on the key, or else we'll see - # a prefix of just it. - key_name = key_name + '/' - - # Decide if we need to override Boto's built-in URL here. - # TODO: Deduplicate with get_object_for_url, or push down into session module - endpoint_url: Optional[str] = None - host = os.environ.get('TOIL_S3_HOST', None) - port = os.environ.get('TOIL_S3_PORT', None) - protocol = 'https' - if os.environ.get('TOIL_S3_USE_SSL', True) == 'False': - protocol = 'http' - if host: - endpoint_url = f'{protocol}://{host}' + f':{port}' if port else '' - - client = session.client('s3', endpoint_url=endpoint_url) - - listing = [] - - paginator = client.get_paginator('list_objects_v2') - result = paginator.paginate(Bucket=bucket_name, Prefix=key_name, Delimiter='/') - for page in result: - if 'CommonPrefixes' in page: - for prefix_item in page['CommonPrefixes']: - listing.append(prefix_item['Prefix'][len(key_name):]) - if 'Contents' in page: - for content_item in page['Contents']: - if content_item['Key'] == key_name: - # Ignore folder name itself - continue - listing.append(content_item['Key'][len(key_name):]) +def list_objects_for_url(url: ParseResult) -> list[str]: + """ + Extracts a key (object) from a given parsed s3:// URL. The URL will be + supplemented with a trailing slash if it is missing. + """ + key_name = url.path[1:] + bucket_name = url.netloc + + if key_name != "" and not key_name.endswith("/"): + # Make sure to put the trailing slash on the key, or else we'll see + # a prefix of just it. + key_name = key_name + "/" + + # Decide if we need to override Boto's built-in URL here. + # TODO: Deduplicate with get_object_for_url, or push down into session module + endpoint_url: Optional[str] = None + host = os.environ.get("TOIL_S3_HOST", None) + port = os.environ.get("TOIL_S3_PORT", None) + protocol = "https" + if os.environ.get("TOIL_S3_USE_SSL", True) == "False": + protocol = "http" + if host: + endpoint_url = f"{protocol}://{host}" + f":{port}" if port else "" + + client = session.client("s3", endpoint_url=endpoint_url) + + listing = [] + + paginator = client.get_paginator("list_objects_v2") + result = paginator.paginate(Bucket=bucket_name, Prefix=key_name, Delimiter="/") + for page in result: + if "CommonPrefixes" in page: + for prefix_item in page["CommonPrefixes"]: + listing.append(prefix_item["Prefix"][len(key_name) :]) + if "Contents" in page: + for content_item in page["Contents"]: + if content_item["Key"] == key_name: + # Ignore folder name itself + continue + listing.append(content_item["Key"][len(key_name) :]) + + logger.debug("Found in %s items: %s", url, listing) + return listing - logger.debug('Found in %s items: %s', url, listing) - return listing -def flatten_tags(tags: Dict[str, str]) -> List[Dict[str, str]]: +def flatten_tags(tags: dict[str, str]) -> list[dict[str, str]]: """ Convert tags from a key to value dict into a list of 'Key': xxx, 'Value': xxx dicts. """ - return [{'Key': k, 'Value': v} for k, v in tags.items()] + return [{"Key": k, "Value": v} for k, v in tags.items()] -def boto3_pager(requestor_callable: Callable[..., Any], result_attribute_name: str, - **kwargs: Any) -> Iterable[Any]: +def boto3_pager( + requestor_callable: Callable[..., Any], result_attribute_name: str, **kwargs: Any +) -> Iterable[Any]: """ Yield all the results from calling the given Boto 3 method with the given keyword arguments, paging through the results using the Marker or @@ -450,7 +481,7 @@ def boto3_pager(requestor_callable: Callable[..., Any], result_attribute_name: s yield from page.get(result_attribute_name, []) -def get_item_from_attributes(attributes: List["AttributeTypeDef"], name: str) -> Any: +def get_item_from_attributes(attributes: list["AttributeTypeDef"], name: str) -> Any: """ Given a list of attributes, find the attribute associated with the name and return its corresponding value. @@ -464,4 +495,7 @@ def get_item_from_attributes(attributes: List["AttributeTypeDef"], name: str) -> :param name: name of the attribute :return: value of the attribute """ - return next((attribute["Value"] for attribute in attributes if attribute["Name"] == name), None) + return next( + (attribute["Value"] for attribute in attributes if attribute["Name"] == name), + None, + ) diff --git a/src/toil/lib/aws/utils.py.orig b/src/toil/lib/aws/utils.py.orig new file mode 100644 index 0000000000..4cef4637e0 --- /dev/null +++ b/src/toil/lib/aws/utils.py.orig @@ -0,0 +1,504 @@ +# Copyright (C) 2015-2021 Regents of the University of California +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import errno +import logging +import os +import socket +from typing import (Any, + Callable, + ContextManager, + Dict, + Iterable, + Iterator, + List, + Optional, + Set, +<<<<<<< HEAD + Tuple, + cast) +======= + cast, + TYPE_CHECKING) +>>>>>>> ce9c91c31 (Allow for not installing the mypy_boto3_* packages) +from urllib.parse import ParseResult + +from toil.lib.aws import session, AWSRegionName, AWSServerErrors +from toil.lib.misc import printq +from toil.lib.retry import (DEFAULT_DELAYS, + DEFAULT_TIMEOUT, + get_error_code, + get_error_status, + old_retry, + retry, ErrorCondition) + +if TYPE_CHECKING: + from mypy_boto3_sdb.type_defs import AttributeTypeDef + from mypy_boto3_s3.service_resource import Bucket, Object as S3Object + +try: + from botocore.exceptions import ClientError, EndpointConnectionError +except ImportError: + ClientError = None # type: ignore + EndpointConnectionError = None # type: ignore + # AWS/boto extra is not installed + +logger = logging.getLogger(__name__) + +# These are error codes we expect from AWS if we are making requests too fast. +# https://github.com/boto/botocore/blob/49f87350d54f55b687969ec8bf204df785975077/botocore/retries/standard.py#L316 +THROTTLED_ERROR_CODES = [ + 'Throttling', + 'ThrottlingException', + 'ThrottledException', + 'RequestThrottledException', + 'TooManyRequestsException', + 'ProvisionedThroughputExceededException', + 'TransactionInProgressException', + 'RequestLimitExceeded', + 'BandwidthLimitExceeded', + 'LimitExceededException', + 'RequestThrottled', + 'SlowDown', + 'PriorRequestNotComplete', + 'EC2ThrottledException', +] + +@retry(errors=[AWSServerErrors]) +def delete_iam_role( + role_name: str, region: Optional[str] = None, quiet: bool = True +) -> None: + # TODO: the Boto3 type hints are a bit oversealous here; they want hundreds + # of overloads of the client-getting methods to exist based on the literal + # string passed in, to return exactly the right kind of client or resource. + # So we end up having to wrap all the calls in casts, which kind of defeats + # the point of a nice fluent method you can call with the name of the thing + # you want; we should have been calling iam_client() and so on all along if + # we wanted MyPy to be able to understand us. So at some point we should + # consider revising our API here to be less annoying to explain to the type + # checker. + iam_client = session.client('iam', region_name=region) + iam_resource = session.resource('iam', region_name=region) + role = iam_resource.Role(role_name) + # normal policies + for attached_policy in role.attached_policies.all(): + printq(f'Now dissociating policy: {attached_policy.policy_name} from role {role.name}', quiet) + role.detach_policy(PolicyArn=attached_policy.arn) + # inline policies + for inline_policy in role.policies.all(): + printq(f'Deleting inline policy: {inline_policy.policy_name} from role {role.name}', quiet) + iam_client.delete_role_policy(RoleName=role.name, PolicyName=inline_policy.policy_name) + iam_client.delete_role(RoleName=role_name) + printq(f'Role {role_name} successfully deleted.', quiet) + + +@retry(errors=[AWSServerErrors]) +def delete_iam_instance_profile( + instance_profile_name: str, region: Optional[str] = None, quiet: bool = True +) -> None: + iam_resource = session.resource("iam", region_name=region) + instance_profile = iam_resource.InstanceProfile(instance_profile_name) + if instance_profile.roles is not None: + for role in instance_profile.roles: + printq(f'Now dissociating role: {role.name} from instance profile {instance_profile_name}', quiet) + instance_profile.remove_role(RoleName=role.name) + instance_profile.delete() + printq(f'Instance profile "{instance_profile_name}" successfully deleted.', quiet) + + +@retry(errors=[AWSServerErrors]) +def delete_sdb_domain( + sdb_domain_name: str, region: Optional[str] = None, quiet: bool = True +) -> None: + sdb_client = session.client("sdb", region_name=region) + sdb_client.delete_domain(DomainName=sdb_domain_name) + printq(f'SBD Domain: "{sdb_domain_name}" successfully deleted.', quiet) + + +def connection_reset(e: Exception) -> bool: + """ + Return true if an error is a connection reset error. + """ + # For some reason we get 'error: [Errno 104] Connection reset by peer' where the + # English description suggests that errno is 54 (ECONNRESET) while the actual + # errno is listed as 104. To be safe, we check for both: + return isinstance(e, socket.error) and e.errno in (errno.ECONNRESET, 104) + +def connection_error(e: Exception) -> bool: + """ + Return True if an error represents a failure to make a network connection. + """ + return (connection_reset(e) + or isinstance(e, EndpointConnectionError)) + + +# TODO: Replace with: @retry and ErrorCondition +def retryable_s3_errors(e: Exception) -> bool: + """ + Return true if this is an error from S3 that looks like we ought to retry our request. + """ + return (connection_error(e) + or (isinstance(e, ClientError) and get_error_status(e) in (429, 500)) + or (isinstance(e, ClientError) and get_error_code(e) in THROTTLED_ERROR_CODES) + # boto3 errors + or (isinstance(e, ClientError) and get_error_code(e) in THROTTLED_ERROR_CODES) + or (isinstance(e, ClientError) and 'BucketNotEmpty' in str(e)) + or (isinstance(e, ClientError) and e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 409 and 'try again' in str(e)) + or (isinstance(e, ClientError) and e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') in (404, 429, 500, 502, 503, 504))) + + +def retry_s3(delays: Iterable[float] = DEFAULT_DELAYS, timeout: float = DEFAULT_TIMEOUT, predicate: Callable[[Exception], bool] = retryable_s3_errors) -> Iterator[ContextManager[None]]: + """ + Retry iterator of context managers specifically for S3 operations. + """ + return old_retry(delays=delays, timeout=timeout, predicate=predicate) + +@retry(errors=[AWSServerErrors]) +def delete_s3_bucket( + s3_resource: "S3ServiceResource", + bucket: str, + quiet: bool = True +) -> None: + """ + Delete the given S3 bucket. + """ + printq(f'Deleting s3 bucket: {bucket}', quiet) + + paginator = s3_resource.meta.client.get_paginator('list_object_versions') + try: + for response in paginator.paginate(Bucket=bucket): + # Versions and delete markers can both go in here to be deleted. + # They both have Key and VersionId, but there's no shared base type + # defined for them in the stubs to express that. See + # . So we + # have to do gymnastics to get them into the same list. + to_delete: List[Dict[str, Any]] = cast(List[Dict[str, Any]], response.get('Versions', [])) + \ + cast(List[Dict[str, Any]], response.get('DeleteMarkers', [])) + for entry in to_delete: + printq(f" Deleting {entry['Key']} version {entry['VersionId']}", quiet) + s3_resource.meta.client.delete_object(Bucket=bucket, Key=entry['Key'], VersionId=entry['VersionId']) + s3_resource.Bucket(bucket).delete() + printq(f'\n * Deleted s3 bucket successfully: {bucket}\n\n', quiet) + except s3_resource.meta.client.exceptions.NoSuchBucket: + printq(f'\n * S3 bucket no longer exists: {bucket}\n\n', quiet) + + +def create_s3_bucket( + s3_resource: "S3ServiceResource", + bucket_name: str, + region: AWSRegionName, +) -> "Bucket": + """ + Create an AWS S3 bucket, using the given Boto3 S3 session, with the + given name, in the given region. + + Supports the us-east-1 region, where bucket creation is special. + + *ALL* S3 bucket creation should use this function. + """ + logger.debug("Creating bucket '%s' in region %s.", bucket_name, region) + if region == "us-east-1": # see https://github.com/boto/boto3/issues/125 + bucket = s3_resource.create_bucket(Bucket=bucket_name) + else: + bucket = s3_resource.create_bucket( + Bucket=bucket_name, + CreateBucketConfiguration={"LocationConstraint": region}, + ) + return bucket + +@retry(errors=[ClientError]) +def enable_public_objects(bucket_name: str) -> None: + """ + Enable a bucket to contain objects which are public. + + This adjusts the bucket's Public Access Block setting to not block all + public access, and also adjusts the bucket's Object Ownership setting to a + setting which enables object ACLs. + + Does *not* touch the *account*'s Public Access Block setting, which can + also interfere here. That is probably best left to the account + administrator. + + This configuration used to be the default, and is what most of Toil's code + is written to expect, but it was changed so that new buckets default to the + more restrictive setting + , + with the expectation that people would write IAM policies for the buckets + to allow public access if needed. Toil expects to be able to make arbitrary + objects in arbitrary places public, and naming them all in an IAM policy + would be a very awkward way to do it. So we restore the old behavior. + """ + + s3_client = session.client('s3') + + # Even though the new default is for public access to be prohibited, this + # is implemented by adding new things attached to the bucket. If we remove + # those things the bucket will default to the old defaults. See + # . + + # Stop blocking public access + s3_client.delete_public_access_block(Bucket=bucket_name) + + # Stop using an ownership controls setting that prohibits ACLs. + s3_client.delete_bucket_ownership_controls(Bucket=bucket_name) + +class NoBucketLocationError(Exception): + """ + Error to represent that we could not get a location for a bucket. + """ + pass + +def get_bucket_region(bucket_name: str, endpoint_url: Optional[str] = None, only_strategies: Optional[Set[int]] = None) -> str: + """ + Get the AWS region name associated with the given S3 bucket, or raise NoBucketLocationError. + + Does not log at info level or above when this does not work; failures are expected in some contexts. + + Takes an optional S3 API URL override. + + :param only_strategies: For testing, use only strategies with 1-based numbers in this set. + """ + + s3_client = session.client('s3', endpoint_url=endpoint_url) + + def attempt_get_bucket_location() -> Optional[str]: + """ + Try and get the bucket location from the normal API call. + """ + return s3_client.get_bucket_location(Bucket=bucket_name).get('LocationConstraint', None) + + def attempt_get_bucket_location_from_us_east_1() -> Optional[str]: + """ + Try and get the bucket location from the normal API call, but against us-east-1 + """ + # Sometimes we aren't allowed to GetBucketLocation. At least some of + # the time, that's only true when we talk to whatever S3 API servers we + # usually use, and we can get around this lack of permission by talking + # to us-east-1 instead. We've been told that this is because us-east-1 + # is special and will answer the question when other regions won't. + # See: + # + # It could also be because AWS open data buckets (which we tend to + # encounter this problem for) tend to actually themselves be in + # us-east-1. + backup_s3_client = session.client('s3', region_name='us-east-1') + return backup_s3_client.get_bucket_location(Bucket=bucket_name).get('LocationConstraint', None) + + def attempt_head_bucket() -> Optional[str]: + """ + Try and get the bucket location from calling HeadBucket and inspecting + the headers. + """ + # If that also doesn't work, we can try HEAD-ing the bucket and looking + # for an 'x-amz-bucket-region' header on the response, which can tell + # us where the bucket is. See + # + info = s3_client.head_bucket(Bucket=bucket_name) + return info['ResponseMetadata']['HTTPHeaders']['x-amz-bucket-region'] + + # Compose a list of strategies we want to try in order, which may work. + # None is an acceptable return type that actually means something. + strategies: List[Callable[[], Optional[str]]] = [] + strategies.append(attempt_get_bucket_location) + if not endpoint_url: + # We should only try to talk to us-east-1 if we don't have a custom + # URL. + strategies.append(attempt_get_bucket_location_from_us_east_1) + strategies.append(attempt_head_bucket) + + error_logs: List[Tuple[int, str]] = [] + for attempt in retry_s3(): + with attempt: + for i, strategy in enumerate(strategies): + if only_strategies is not None and i+1 not in only_strategies: + # We want to test running without this strategy. + continue + try: + location = bucket_location_to_region(strategy()) + logger.debug('Got bucket location from strategy %d', i + 1) + return location + except ClientError as e: + if get_error_code(e) == 'AccessDenied' and not endpoint_url: + logger.debug('Strategy %d to get bucket location did not work: %s', i + 1, e) + error_logs.append((i + 1, str(e))) + last_error: Exception = e + # We were blocked with this strategy. Move on to the + # next strategy which might work. + continue + else: + raise + except KeyError as e: + # If we get a weird head response we will have a KeyError + logger.debug('Strategy %d to get bucket location did not work: %s', i + 1, e) + error_logs.append((i + 1, str(e))) + last_error = e + + error_messages = [] + for rank, message in error_logs: + error_messages.append(f"Strategy {rank} failed to get bucket location because: {message}") + # If we get here we ran out of attempts. + raise NoBucketLocationError("Could not get bucket location: " + "\n".join(error_messages)) from last_error + +def region_to_bucket_location(region: str) -> str: + return '' if region == 'us-east-1' else region + +def bucket_location_to_region(location: Optional[str]) -> str: + return "us-east-1" if location == "" or location is None else location + +def get_object_for_url(url: ParseResult, existing: Optional[bool] = None) -> "S3Object": + """ + Extracts a key (object) from a given parsed s3:// URL. + + If existing is true and the object does not exist, raises FileNotFoundError. + + :param bool existing: If True, key is expected to exist. If False, key is expected not to + exists and it will be created. If None, the key will be created if it doesn't exist. + """ + + key_name = url.path[1:] + bucket_name = url.netloc + + # Decide if we need to override Boto's built-in URL here. + endpoint_url: Optional[str] = None + host = os.environ.get('TOIL_S3_HOST', None) + port = os.environ.get('TOIL_S3_PORT', None) + protocol = 'https' + if os.environ.get('TOIL_S3_USE_SSL', True) == 'False': + protocol = 'http' + if host: + endpoint_url = f'{protocol}://{host}' + f':{port}' if port else '' + + # TODO: OrdinaryCallingFormat equivalent in boto3? + # if botoargs: + # botoargs['calling_format'] = boto.s3.connection.OrdinaryCallingFormat() + + try: + # Get the bucket's region to avoid a redirect per request + region = get_bucket_region(bucket_name, endpoint_url=endpoint_url) + s3 = session.resource('s3', region_name=region, endpoint_url=endpoint_url) + except NoBucketLocationError as e: + # Probably don't have permission. + # TODO: check if it is that + logger.debug("Couldn't get bucket location: %s", e) + logger.debug("Fall back to not specifying location") + s3 = session.resource('s3', endpoint_url=endpoint_url) + + obj = s3.Object(bucket_name, key_name) + objExists = True + + try: + obj.load() + except ClientError as e: + if get_error_status(e) == 404: + objExists = False + else: + raise + if existing is True and not objExists: + raise FileNotFoundError(f"Key '{key_name}' does not exist in bucket '{bucket_name}'.") + elif existing is False and objExists: + raise RuntimeError(f"Key '{key_name}' exists in bucket '{bucket_name}'.") + + if not objExists: + obj.put() # write an empty file + return obj + + +@retry(errors=[AWSServerErrors]) +def list_objects_for_url(url: ParseResult) -> List[str]: + """ + Extracts a key (object) from a given parsed s3:// URL. The URL will be + supplemented with a trailing slash if it is missing. + """ + key_name = url.path[1:] + bucket_name = url.netloc + + if key_name != '' and not key_name.endswith('/'): + # Make sure to put the trailing slash on the key, or else we'll see + # a prefix of just it. + key_name = key_name + '/' + + # Decide if we need to override Boto's built-in URL here. + # TODO: Deduplicate with get_object_for_url, or push down into session module + endpoint_url: Optional[str] = None + host = os.environ.get('TOIL_S3_HOST', None) + port = os.environ.get('TOIL_S3_PORT', None) + protocol = 'https' + if os.environ.get('TOIL_S3_USE_SSL', True) == 'False': + protocol = 'http' + if host: + endpoint_url = f'{protocol}://{host}' + f':{port}' if port else '' + + client = session.client('s3', endpoint_url=endpoint_url) + + listing = [] + + paginator = client.get_paginator('list_objects_v2') + result = paginator.paginate(Bucket=bucket_name, Prefix=key_name, Delimiter='/') + for page in result: + if 'CommonPrefixes' in page: + for prefix_item in page['CommonPrefixes']: + listing.append(prefix_item['Prefix'][len(key_name):]) + if 'Contents' in page: + for content_item in page['Contents']: + if content_item['Key'] == key_name: + # Ignore folder name itself + continue + listing.append(content_item['Key'][len(key_name):]) + + logger.debug('Found in %s items: %s', url, listing) + return listing + +def flatten_tags(tags: Dict[str, str]) -> List[Dict[str, str]]: + """ + Convert tags from a key to value dict into a list of 'Key': xxx, 'Value': xxx dicts. + """ + return [{'Key': k, 'Value': v} for k, v in tags.items()] + + +def boto3_pager(requestor_callable: Callable[..., Any], result_attribute_name: str, + **kwargs: Any) -> Iterable[Any]: + """ + Yield all the results from calling the given Boto 3 method with the + given keyword arguments, paging through the results using the Marker or + NextToken, and fetching out and looping over the list in the response + with the given attribute name. + """ + + # Recover the Boto3 client, and the name of the operation + client = requestor_callable.__self__ # type: ignore[attr-defined] + op_name = requestor_callable.__name__ + + # grab a Boto 3 built-in paginator. See + # + paginator = client.get_paginator(op_name) + + for page in paginator.paginate(**kwargs): + # Invoke it and go through the pages, yielding from them + yield from page.get(result_attribute_name, []) + + +def get_item_from_attributes(attributes: List["AttributeTypeDef"], name: str) -> Any: + """ + Given a list of attributes, find the attribute associated with the name and return its corresponding value. + + The `attribute_list` will be a list of TypedDict's (which boto3 SDB functions commonly return), + where each TypedDict has a "Name" and "Value" key value pair. + This function grabs the value out of the associated TypedDict. + + If the attribute with the name does not exist, the function will return None. + + :param attributes: list of attributes + :param name: name of the attribute + :return: value of the attribute + """ + return next((attribute["Value"] for attribute in attributes if attribute["Name"] == name), None) diff --git a/src/toil/lib/bioio.py b/src/toil/lib/bioio.py index ccb5708976..1b1529f8bb 100644 --- a/src/toil/lib/bioio.py +++ b/src/toil/lib/bioio.py @@ -27,15 +27,19 @@ def system(command): will be passed to subprocess.check_call. :type command: str | sequence[string] """ - logger.warning('Deprecated toil method that will be moved/replaced in a future release."') - logger.debug(f'Running: {command}') + logger.warning( + 'Deprecated toil method that will be moved/replaced in a future release."' + ) + logger.debug(f"Running: {command}") subprocess.check_call(command, shell=isinstance(command, str), bufsize=-1) # Used by cactus; now a wrapper and not used in Toil. # TODO: Remove from cactus and then remove from Toil. def getLogLevelString(logger=None): - root_logger.warning('Deprecated toil method. Please call "logging.getLevelName" directly.') + root_logger.warning( + 'Deprecated toil method. Please call "logging.getLevelName" directly.' + ) if logger is None: logger = root_logger return logging.getLevelName(logger.getEffectiveLevel()) @@ -44,12 +48,16 @@ def getLogLevelString(logger=None): # Used by cactus; now a wrapper and not used in Toil. # TODO: Remove from cactus and then remove from Toil. def setLoggingFromOptions(options): - logger.warning('Deprecated toil method. Please use "toil.statsAndLogging.set_logging_from_options()" instead."') + logger.warning( + 'Deprecated toil method. Please use "toil.statsAndLogging.set_logging_from_options()" instead."' + ) set_logging_from_options(options) # Used by cactus; now a wrapper and not used in Toil. # TODO: Remove from cactus and then remove from Toil. def getTempFile(suffix="", rootDir=None): - logger.warning('Deprecated toil method. Please use "toil.test.get_temp_file()" instead."') + logger.warning( + 'Deprecated toil method. Please use "toil.test.get_temp_file()" instead."' + ) return get_temp_file(suffix, rootDir) diff --git a/src/toil/lib/compatibility.py b/src/toil/lib/compatibility.py index 2dd9166c73..df14d3334d 100644 --- a/src/toil/lib/compatibility.py +++ b/src/toil/lib/compatibility.py @@ -7,15 +7,20 @@ def deprecated(new_function_name: str) -> Callable[..., Any]: def decorate(func: Callable[..., Any]) -> Callable[..., Any]: @functools.wraps(func) def call(*args: Any, **kwargs: Any) -> Any: - warnings.warn(f'WARNING: "{func.__name__}()" is deprecated. Please use "{new_function_name}()" instead.', - DeprecationWarning) + warnings.warn( + f'WARNING: "{func.__name__}()" is deprecated. Please use "{new_function_name}()" instead.', + DeprecationWarning, + ) return func(*args, **kwargs) + return call + return decorate def compat_bytes(s: Union[bytes, str]) -> str: - return s.decode('utf-8') if isinstance(s, bytes) else s + return s.decode("utf-8") if isinstance(s, bytes) else s + # MyPy can't yet support the recursive type we would need to say "we go through # any structure of dicts, tuples, lists, and sets and convert all bytes types @@ -28,13 +33,13 @@ def compat_bytes_recursive(data: Any) -> Any: """ if isinstance(data, dict): # Keyed collection - return type(data)((compat_bytes_recursive(i) for i in data.items())) + return type(data)(compat_bytes_recursive(i) for i in data.items()) elif isinstance(data, (tuple, list, set)): # Flat collection - return type(data)((compat_bytes_recursive(i) for i in data)) + return type(data)(compat_bytes_recursive(i) for i in data) elif isinstance(data, bytes): # Leaf bytes - return data.decode('utf-8') + return data.decode("utf-8") else: # Leaf non-bytes return data diff --git a/src/toil/lib/conversions.py b/src/toil/lib/conversions.py index 345a4d72c2..f391f5ade4 100644 --- a/src/toil/lib/conversions.py +++ b/src/toil/lib/conversions.py @@ -4,68 +4,99 @@ """ import math -from typing import SupportsInt, Tuple, Union, Optional +from typing import Optional, SupportsInt, Union # See https://en.wikipedia.org/wiki/Binary_prefix -BINARY_PREFIXES = ['ki', 'mi', 'gi', 'ti', 'pi', 'ei', 'kib', 'mib', 'gib', 'tib', 'pib', 'eib'] -DECIMAL_PREFIXES = ['b', 'k', 'm', 'g', 't', 'p', 'e', 'kb', 'mb', 'gb', 'tb', 'pb', 'eb'] +BINARY_PREFIXES = [ + "ki", + "mi", + "gi", + "ti", + "pi", + "ei", + "kib", + "mib", + "gib", + "tib", + "pib", + "eib", +] +DECIMAL_PREFIXES = [ + "b", + "k", + "m", + "g", + "t", + "p", + "e", + "kb", + "mb", + "gb", + "tb", + "pb", + "eb", +] VALID_PREFIXES = BINARY_PREFIXES + DECIMAL_PREFIXES -def bytes_in_unit(unit: str = 'B') -> int: +def bytes_in_unit(unit: str = "B") -> int: num_bytes = 1 - if unit.lower() in ['ki', 'kib']: + if unit.lower() in ["ki", "kib"]: num_bytes = 1 << 10 - if unit.lower() in ['mi', 'mib']: + if unit.lower() in ["mi", "mib"]: num_bytes = 1 << 20 - if unit.lower() in ['gi', 'gib']: + if unit.lower() in ["gi", "gib"]: num_bytes = 1 << 30 - if unit.lower() in ['ti', 'tib']: + if unit.lower() in ["ti", "tib"]: num_bytes = 1 << 40 - if unit.lower() in ['pi', 'pib']: + if unit.lower() in ["pi", "pib"]: num_bytes = 1 << 50 - if unit.lower() in ['ei', 'eib']: + if unit.lower() in ["ei", "eib"]: num_bytes = 1 << 60 - if unit.lower() in ['k', 'kb']: + if unit.lower() in ["k", "kb"]: num_bytes = 1000 - if unit.lower() in ['m', 'mb']: - num_bytes = 1000 ** 2 - if unit.lower() in ['g', 'gb']: - num_bytes = 1000 ** 3 - if unit.lower() in ['t', 'tb']: - num_bytes = 1000 ** 4 - if unit.lower() in ['p', 'pb']: - num_bytes = 1000 ** 5 - if unit.lower() in ['e', 'eb']: - num_bytes = 1000 ** 6 + if unit.lower() in ["m", "mb"]: + num_bytes = 1000**2 + if unit.lower() in ["g", "gb"]: + num_bytes = 1000**3 + if unit.lower() in ["t", "tb"]: + num_bytes = 1000**4 + if unit.lower() in ["p", "pb"]: + num_bytes = 1000**5 + if unit.lower() in ["e", "eb"]: + num_bytes = 1000**6 return num_bytes -def convert_units(num: float, - src_unit: str, - dst_unit: str = 'B') -> float: +def convert_units(num: float, src_unit: str, dst_unit: str = "B") -> float: """Returns a float representing the converted input in dst_units.""" if not src_unit.lower() in VALID_PREFIXES: - raise RuntimeError(f"{src_unit} not a valid unit, valid units are {VALID_PREFIXES}.") + raise RuntimeError( + f"{src_unit} not a valid unit, valid units are {VALID_PREFIXES}." + ) if not dst_unit.lower() in VALID_PREFIXES: - raise RuntimeError(f"{dst_unit} not a valid unit, valid units are {VALID_PREFIXES}.") + raise RuntimeError( + f"{dst_unit} not a valid unit, valid units are {VALID_PREFIXES}." + ) return (num * bytes_in_unit(src_unit)) / bytes_in_unit(dst_unit) -def parse_memory_string(string: str) -> Tuple[float, str]: +def parse_memory_string(string: str) -> tuple[float, str]: """ Given a string representation of some memory (i.e. '1024 Mib'), return the number and unit. """ for i, character in enumerate(string): # find the first character of the unit - if character not in '0123456789.-_ ': + if character not in "0123456789.-_ ": units = string[i:].strip() if not units.lower() in VALID_PREFIXES: - raise RuntimeError(f"{units} not a valid unit, valid units are {VALID_PREFIXES}.") + raise RuntimeError( + f"{units} not a valid unit, valid units are {VALID_PREFIXES}." + ) return float(string[:i]), units - return float(string), 'b' + return float(string), "b" def human2bytes(string: str) -> int: @@ -75,7 +106,7 @@ def human2bytes(string: str) -> int: """ value, unit = parse_memory_string(string) - return int(convert_units(value, src_unit=unit, dst_unit='b')) + return int(convert_units(value, src_unit=unit, dst_unit="b")) def bytes2human(n: SupportsInt) -> str: @@ -84,48 +115,51 @@ def bytes2human(n: SupportsInt) -> str: if n < 0: raise ValueError("n < 0") elif n < 1: - return '0 b' + return "0 b" power_level = math.floor(math.log(n, 1024)) - units = ('b', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei') + units = ("b", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei") unit = units[power_level if power_level < len(units) else -1] value = convert_units(n, "b", unit) - return f'{value:.1f} {unit}' - + return f"{value:.1f} {unit}" + + def b_to_mib(n: Union[int, float]) -> float: """ Convert a number from bytes to mibibytes. """ - return convert_units(n, 'b', 'mib') + return convert_units(n, "b", "mib") def mib_to_b(n: Union[int, float]) -> float: """ Convert a number from mibibytes to bytes. """ - return convert_units(n, 'mib', 'b') + return convert_units(n, "mib", "b") + -#General Conversions +# General Conversions -def hms_duration_to_seconds(hms: str) -> float: + +def hms_duration_to_seconds(hms: str) -> float: """ - Parses a given time string in hours:minutes:seconds, + Parses a given time string in hours:minutes:seconds, returns an equivalent total seconds value """ - vals_to_convert = hms.split(':') + vals_to_convert = hms.split(":") seconds = 0.0 - + for val in vals_to_convert: - if(float(val) < 0): + if float(val) < 0: raise ValueError("Invalid Time, negative value") - if(len(vals_to_convert) != 3): + if len(vals_to_convert) != 3: raise ValueError("Invalid amount of fields, function takes input in 'hh:mm:ss'") - seconds += float(vals_to_convert[0]) * 60 * 60 - seconds += float(vals_to_convert[1]) * 60 - seconds += float(vals_to_convert[2]) + seconds += float(vals_to_convert[0]) * 60 * 60 + seconds += float(vals_to_convert[1]) * 60 + seconds += float(vals_to_convert[2]) return seconds @@ -133,7 +167,7 @@ def hms_duration_to_seconds(hms: str) -> float: def strtobool(val: str) -> bool: """ Make a human-readable string into a bool. - + Convert a string along the lines of "y", "1", "ON", "TrUe", or "Yes" to True, and the corresponding false-ish values to False. """ @@ -145,7 +179,7 @@ def strtobool(val: str) -> bool: for prefix in prefixes: if lowered.startswith(prefix): return result - raise ValueError(f"Cannot convert \"{val}\" to a bool") + raise ValueError(f'Cannot convert "{val}" to a bool') def opt_strtobool(b: Optional[str]) -> Optional[bool]: diff --git a/src/toil/lib/docker.py b/src/toil/lib/docker.py index c1ab30edca..6eac2684e1 100644 --- a/src/toil/lib/docker.py +++ b/src/toil/lib/docker.py @@ -17,15 +17,17 @@ import re import struct from shlex import quote -from typing import List, Optional +from typing import Optional import requests import docker -from docker.errors import (ContainerError, - ImageNotFound, - NotFound, - create_api_error_from_http_exception) +from docker.errors import ( + ContainerError, + ImageNotFound, + NotFound, + create_api_error_from_http_exception, +) from docker.utils.socket import consume_socket_output, demux_adaptor from toil.lib.accelerators import get_host_accelerator_numbers @@ -37,42 +39,49 @@ def dockerCheckOutput(*args, **kwargs): - raise RuntimeError("dockerCheckOutput() using subprocess.check_output() has been removed, " - "please switch to apiDockerCall().") + raise RuntimeError( + "dockerCheckOutput() using subprocess.check_output() has been removed, " + "please switch to apiDockerCall()." + ) def dockerCall(*args, **kwargs): - raise RuntimeError("dockerCall() using subprocess.check_output() has been removed, " - "please switch to apiDockerCall().") + raise RuntimeError( + "dockerCall() using subprocess.check_output() has been removed, " + "please switch to apiDockerCall()." + ) def subprocessDockerCall(*args, **kwargs): - raise RuntimeError("subprocessDockerCall() has been removed, " - "please switch to apiDockerCall().") - - -def apiDockerCall(job, - image, - parameters=None, - deferParam=None, - volumes=None, - working_dir=None, - containerName=None, - entrypoint=None, - detach=False, - log_config=None, - auto_remove=None, - remove=False, - user=None, - environment=None, - stdout=None, - stderr=False, - stream=False, - demux=False, - streamfile=None, - accelerators: Optional[List[int]] = None, - timeout=365 * 24 * 60 * 60, - **kwargs): + raise RuntimeError( + "subprocessDockerCall() has been removed, " "please switch to apiDockerCall()." + ) + + +def apiDockerCall( + job, + image, + parameters=None, + deferParam=None, + volumes=None, + working_dir=None, + containerName=None, + entrypoint=None, + detach=False, + log_config=None, + auto_remove=None, + remove=False, + user=None, + environment=None, + stdout=None, + stderr=False, + stream=False, + demux=False, + streamfile=None, + accelerators: Optional[list[int]] = None, + timeout=365 * 24 * 60 * 60, + **kwargs, +): """ A toil wrapper for the python docker API. @@ -184,7 +193,7 @@ def toil_job(job): working_dir = os.getcwd() if volumes is None: - volumes = {working_dir: {'bind': '/data', 'mode': 'rw'}} + volumes = {working_dir: {"bind": "/data", "mode": "rw"}} for volume in volumes: if not os.path.exists(volume): @@ -197,11 +206,11 @@ def toil_job(job): # and chain with pipes. if len(parameters) > 0 and type(parameters[0]) is list: if entrypoint is None: - entrypoint = ['/bin/bash', '-c'] - chain_params = \ - [' '.join(quote(arg) for arg in command) \ - for command in parameters] - command = ' | '.join(chain_params) + entrypoint = ["/bin/bash", "-c"] + chain_params = [ + " ".join(quote(arg) for arg in command) for command in parameters + ] + command = " | ".join(chain_params) pipe_prefix = "set -eo pipefail && " command = [pipe_prefix + command] logger.debug("Calling docker with: " + repr(command)) @@ -213,7 +222,7 @@ def toil_job(job): # practice: # http://docker-py.readthedocs.io/en/stable/containers.html elif len(parameters) > 0 and type(parameters) is list: - command = ' '.join(quote(arg) for arg in parameters) + command = " ".join(quote(arg) for arg in parameters) logger.debug("Calling docker with: " + repr(command)) # If the 'parameters' lists are empty, they are respecified as None, which @@ -226,9 +235,9 @@ def toil_job(job): # Ensure the user has passed a valid value for deferParam if deferParam not in (None, FORGO, STOP, RM): - raise RuntimeError('Please provide a valid value for deferParam.') + raise RuntimeError("Please provide a valid value for deferParam.") - client = docker.from_env(version='auto', timeout=timeout) + client = docker.from_env(version="auto", timeout=timeout) if deferParam is None: deferParam = RM @@ -263,8 +272,7 @@ def toil_job(job): # TODO: Here we assume that the host accelerators are all GPUs device_requests.append( docker.types.DeviceRequest( - device_ids=[','.join(host_accelerators)], - capabilities=[['gpu']] + device_ids=[",".join(host_accelerators)], capabilities=[["gpu"]] ) ) @@ -275,24 +283,26 @@ def toil_job(job): # 'hello world\n' if stdout is None: stdout = True - out = client.containers.run(image=image, - command=command, - working_dir=working_dir, - entrypoint=entrypoint, - name=containerName, - detach=False, - volumes=volumes, - auto_remove=auto_remove, - stdout=stdout, - stderr=stderr, - # to get the generator if demux=True - stream=stream or demux, - remove=remove, - log_config=log_config, - user=user, - environment=environment, - device_requests=device_requests, - **kwargs) + out = client.containers.run( + image=image, + command=command, + working_dir=working_dir, + entrypoint=entrypoint, + name=containerName, + detach=False, + volumes=volumes, + auto_remove=auto_remove, + stdout=stdout, + stderr=stderr, + # to get the generator if demux=True + stream=stream or demux, + remove=remove, + log_config=log_config, + user=user, + environment=environment, + device_requests=device_requests, + **kwargs, + ) if demux is False: return out @@ -300,7 +310,10 @@ def toil_job(job): # If demux is True (i.e.: we want STDOUT and STDERR separated), we need to decode # the raw response from the docker API and preserve the stream type this time. response = out._response - gen = (demux_adaptor(*frame) for frame in _multiplexed_response_stream_helper(response)) + gen = ( + demux_adaptor(*frame) + for frame in _multiplexed_response_stream_helper(response) + ) if stream: return gen @@ -309,9 +322,11 @@ def toil_job(job): else: if (stdout or stderr) and log_config is None: - logger.warning('stdout or stderr specified, but log_config is not set. ' - 'Defaulting to "journald".') - log_config = dict(type='journald') + logger.warning( + "stdout or stderr specified, but log_config is not set. " + 'Defaulting to "journald".' + ) + log_config = dict(type="journald") if stdout is None: stdout = False @@ -319,30 +334,34 @@ def toil_job(job): # When detach is True, this returns a container object: # >>> client.containers.run("bfirsh/reticulate-splines", detach=True) # - container = client.containers.run(image=image, - command=command, - working_dir=working_dir, - entrypoint=entrypoint, - name=containerName, - detach=True, - volumes=volumes, - auto_remove=auto_remove, - stdout=stdout, - stderr=stderr, - stream=stream, - remove=remove, - log_config=log_config, - user=user, - environment=environment, - device_requests=device_requests, - **kwargs) + container = client.containers.run( + image=image, + command=command, + working_dir=working_dir, + entrypoint=entrypoint, + name=containerName, + detach=True, + volumes=volumes, + auto_remove=auto_remove, + stdout=stdout, + stderr=stderr, + stream=stream, + remove=remove, + log_config=log_config, + user=user, + environment=environment, + device_requests=device_requests, + **kwargs, + ) if stdout or stderr: if streamfile is None: - streamfile = 'output.log' - with open(streamfile, 'wb') as f: + streamfile = "output.log" + with open(streamfile, "wb") as f: # stream=True makes this loop blocking; we will loop until # the container stops and there is no more output. - for line in container.logs(stdout=stdout, stderr=stderr, stream=True): + for line in container.logs( + stdout=stdout, stderr=stderr, stream=True + ): f.write(line.encode() if isinstance(line, str) else line) # If we didn't capture output, the caller will need to .wait() on @@ -361,10 +380,12 @@ def toil_job(job): raise create_api_error_from_http_exception(e) -def dockerKill(container_name: str, - gentleKill: bool = False, - remove: bool = False, - timeout: int = 365 * 24 * 60 * 60) -> None: +def dockerKill( + container_name: str, + gentleKill: bool = False, + remove: bool = False, + timeout: int = 365 * 24 * 60 * 60, +) -> None: """ Immediately kills a container. Equivalent to "docker kill": https://docs.docker.com/engine/reference/commandline/kill/ @@ -378,10 +399,10 @@ def dockerKill(container_name: str, to respect the timeout. Defaults to 1 year (i.e. wait essentially indefinitely). """ - client = docker.from_env(version='auto', timeout=timeout) + client = docker.from_env(version="auto", timeout=timeout) try: this_container = client.containers.get(container_name) - while this_container.status == 'running': + while this_container.status == "running": if gentleKill is False: client.containers.get(container_name).kill() else: @@ -390,9 +411,13 @@ def dockerKill(container_name: str, if remove: this_container.remove() except NotFound: - logger.debug(f"Attempted to stop container ({container_name}), but container != exist.") + logger.debug( + f"Attempted to stop container ({container_name}), but container != exist." + ) except requests.exceptions.HTTPError as e: - logger.debug(f"Attempted to stop container ({container_name}), but server gave an error:") + logger.debug( + f"Attempted to stop container ({container_name}), but server gave an error:" + ) raise create_api_error_from_http_exception(e) @@ -419,10 +444,10 @@ def containerIsRunning(container_name: str, timeout: int = 365 * 24 * 60 * 60): :returns: True if status is 'running', False if status is anything else, and None if the container does not exist. """ - client = docker.from_env(version='auto', timeout=timeout) + client = docker.from_env(version="auto", timeout=timeout) try: this_container = client.containers.get(container_name) - if this_container.status == 'running': + if this_container.status == "running": return True else: # this_container.status == 'exited', 'restarting', or 'paused' @@ -430,8 +455,7 @@ def containerIsRunning(container_name: str, timeout: int = 365 * 24 * 60 * 60): except NotFound: return None except requests.exceptions.HTTPError as e: - logger.debug("Server error attempting to call container: %s", - container_name) + logger.debug("Server error attempting to call container: %s", container_name) raise create_api_error_from_http_exception(e) @@ -440,8 +464,12 @@ def getContainerName(job): Create a random string including the job name, and return it. Name will match ``[a-zA-Z0-9][a-zA-Z0-9_.-]``. """ - parts = ['toil', str(job.description), base64.b64encode(os.urandom(9), b'-_').decode('utf-8')] - name = re.sub('[^a-zA-Z0-9_.-]', '', '--'.join(parts)) + parts = [ + "toil", + str(job.description), + base64.b64encode(os.urandom(9), b"-_").decode("utf-8"), + ] + name = re.sub("[^a-zA-Z0-9_.-]", "", "--".join(parts)) return name @@ -459,7 +487,7 @@ def _multiplexed_response_stream_helper(response): break # header is 8 bytes with format: {STREAM_TYPE, 0, 0, 0, SIZE1, SIZE2, SIZE3, SIZE4} # protocol: https://docs.docker.com/engine/api/v1.24/#attach-to-a-container - stream_type, length = struct.unpack('>BxxxL', header) + stream_type, length = struct.unpack(">BxxxL", header) if not length: continue data = response.raw.read(length) diff --git a/src/toil/lib/ec2.py b/src/toil/lib/ec2.py index 78b79f25c8..a3ed68a351 100644 --- a/src/toil/lib/ec2.py +++ b/src/toil/lib/ec2.py @@ -1,22 +1,8 @@ import logging import time from base64 import b64encode -from operator import itemgetter -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Dict, - Generator, - Iterable, - List, - Mapping, - Optional, - Union, -) - -import botocore.client -from boto3.resources.base import ServiceResource +from collections.abc import Generator, Iterable, Mapping +from typing import TYPE_CHECKING, Any, Callable, Optional, Union from toil.lib.aws.session import establish_boto3_session from toil.lib.aws.utils import flatten_tags @@ -48,43 +34,44 @@ class UserError(RuntimeError): def __init__(self, message=None, cause=None): if (message is None) == (cause is None): raise RuntimeError("Must pass either message or cause.") - super().__init__( - message if cause is None else cause.message) + super().__init__(message if cause is None else cause.message) def not_found(e): try: - return get_error_code(e).endswith('.NotFound') + return get_error_code(e).endswith(".NotFound") except ValueError: # Not the right kind of error return False def inconsistencies_detected(e): - if get_error_code(e) == 'InvalidGroup.NotFound': + if get_error_code(e) == "InvalidGroup.NotFound": return True m = get_error_message(e).lower() - matches = ('invalid iam instance profile' in m) or ('no associated iam roles' in m) + matches = ("invalid iam instance profile" in m) or ("no associated iam roles" in m) return matches # We also define these error categories for the new retry decorator -INCONSISTENCY_ERRORS = [ErrorCondition(boto_error_codes=['InvalidGroup.NotFound']), - ErrorCondition(error_message_must_include='Invalid IAM Instance Profile'), - ErrorCondition(error_message_must_include='no associated IAM Roles')] +INCONSISTENCY_ERRORS = [ + ErrorCondition(boto_error_codes=["InvalidGroup.NotFound"]), + ErrorCondition(error_message_must_include="Invalid IAM Instance Profile"), + ErrorCondition(error_message_must_include="no associated IAM Roles"), +] def retry_ec2(t=a_short_time, retry_for=10 * a_short_time, retry_while=not_found): - return old_retry(delays=(t, t, t * 2, t * 4), - timeout=retry_for, - predicate=retry_while) + return old_retry( + delays=(t, t, t * 2, t * 4), timeout=retry_for, predicate=retry_while + ) class UnexpectedResourceState(Exception): def __init__(self, resource, to_state, state): super().__init__( - "Expected state of %s to be '%s' but got '%s'" % - (resource, to_state, state)) + "Expected state of %s to be '%s' but got '%s'" % (resource, to_state, state) + ) def wait_transition( @@ -113,7 +100,9 @@ def wait_transition( for attempt in retry_ec2(): with attempt: described = boto3_ec2.describe_instances(InstanceIds=[instance_id]) - resource = described["Reservations"][0]["Instances"][0] # there should only be one requested + resource = described["Reservations"][0]["Instances"][ + 0 + ] # there should only be one requested state = state_getter(resource) if state != to_state: raise UnexpectedResourceState(resource, to_state, state) @@ -135,29 +124,41 @@ def wait_instances_running( pending_ids = set() for i in instances: i: "InstanceTypeDef" - if i['State']['Name'] == 'pending': - pending_ids.add(i['InstanceId']) - elif i['State']['Name'] == 'running': - if i['InstanceId'] in running_ids: - raise RuntimeError("An instance was already added to the list of running instance IDs. Maybe there is a duplicate.") - running_ids.add(i['InstanceId']) + if i["State"]["Name"] == "pending": + pending_ids.add(i["InstanceId"]) + elif i["State"]["Name"] == "running": + if i["InstanceId"] in running_ids: + raise RuntimeError( + "An instance was already added to the list of running instance IDs. Maybe there is a duplicate." + ) + running_ids.add(i["InstanceId"]) yield i else: - if i['InstanceId'] in other_ids: - raise RuntimeError("An instance was already added to the list of other instances. Maybe there is a duplicate.") - other_ids.add(i['InstanceId']) + if i["InstanceId"] in other_ids: + raise RuntimeError( + "An instance was already added to the list of other instances. Maybe there is a duplicate." + ) + other_ids.add(i["InstanceId"]) yield i - logger.info('%i instance(s) pending, %i running, %i other.', - *list(map(len, (pending_ids, running_ids, other_ids)))) + logger.info( + "%i instance(s) pending, %i running, %i other.", + *list(map(len, (pending_ids, running_ids, other_ids))), + ) if not pending_ids: break seconds = max(a_short_time, min(len(pending_ids), 10 * a_short_time)) - logger.info('Sleeping for %is', seconds) + logger.info("Sleeping for %is", seconds) time.sleep(seconds) for attempt in retry_ec2(): with attempt: - described_instances = boto3_ec2.describe_instances(InstanceIds=list(pending_ids)) - instances = [instance for reservation in described_instances["Reservations"] for instance in reservation["Instances"]] + described_instances = boto3_ec2.describe_instances( + InstanceIds=list(pending_ids) + ) + instances = [ + instance + for reservation in described_instances["Reservations"] + for instance in reservation["Instances"] + ] def wait_spot_requests_active( @@ -165,7 +166,7 @@ def wait_spot_requests_active( requests: Iterable["SpotInstanceRequestTypeDef"], timeout: float = None, tentative: bool = False, -) -> Iterable[List["SpotInstanceRequestTypeDef"]]: +) -> Iterable[list["SpotInstanceRequestTypeDef"]]: """ Wait until no spot request in the given iterator is in the 'open' state or, optionally, a timeout occurs. Yield spot requests as soon as they leave the 'open' state. @@ -188,11 +189,11 @@ def wait_spot_requests_active( open_ids = None def cancel() -> None: - logger.warning('Cancelling remaining %i spot requests.', len(open_ids)) + logger.warning("Cancelling remaining %i spot requests.", len(open_ids)) boto3_ec2.cancel_spot_instance_requests(SpotInstanceRequestIds=list(open_ids)) def spot_request_not_found(e: Exception) -> bool: - return get_error_code(e) == 'InvalidSpotInstanceRequestID.NotFound' + return get_error_code(e) == "InvalidSpotInstanceRequestID.NotFound" try: while True: @@ -200,42 +201,55 @@ def spot_request_not_found(e: Exception) -> bool: batch = [] for r in requests: r: "SpotInstanceRequestTypeDef" # pycharm thinks it is a string - if r['State'] == 'open': - open_ids.add(r['InstanceId']) - if r['Status'] == 'pending-evaluation': - eval_ids.add(r['InstanceId']) - elif r['Status'] == 'pending-fulfillment': - fulfill_ids.add(r['InstanceId']) + if r["State"] == "open": + open_ids.add(r["InstanceId"]) + if r["Status"] == "pending-evaluation": + eval_ids.add(r["InstanceId"]) + elif r["Status"] == "pending-fulfillment": + fulfill_ids.add(r["InstanceId"]) else: logger.info( - 'Request %s entered status %s indicating that it will not be ' - 'fulfilled anytime soon.', r['InstanceId'], r['Status']) - elif r['State'] == 'active': - if r['InstanceId'] in active_ids: - raise RuntimeError("A request was already added to the list of active requests. Maybe there are duplicate requests.") - active_ids.add(r['InstanceId']) + "Request %s entered status %s indicating that it will not be " + "fulfilled anytime soon.", + r["InstanceId"], + r["Status"], + ) + elif r["State"] == "active": + if r["InstanceId"] in active_ids: + raise RuntimeError( + "A request was already added to the list of active requests. Maybe there are duplicate requests." + ) + active_ids.add(r["InstanceId"]) batch.append(r) else: - if r['InstanceId'] in other_ids: - raise RuntimeError("A request was already added to the list of other IDs. Maybe there are duplicate requests.") - other_ids.add(r['InstanceId']) + if r["InstanceId"] in other_ids: + raise RuntimeError( + "A request was already added to the list of other IDs. Maybe there are duplicate requests." + ) + other_ids.add(r["InstanceId"]) batch.append(r) if batch: yield batch - logger.info('%i spot requests(s) are open (%i of which are pending evaluation and %i ' - 'are pending fulfillment), %i are active and %i are in another state.', - *list(map(len, (open_ids, eval_ids, fulfill_ids, active_ids, other_ids)))) + logger.info( + "%i spot requests(s) are open (%i of which are pending evaluation and %i " + "are pending fulfillment), %i are active and %i are in another state.", + *list( + map(len, (open_ids, eval_ids, fulfill_ids, active_ids, other_ids)) + ), + ) if not open_ids or tentative and not eval_ids and not fulfill_ids: break sleep_time = 2 * a_short_time if timeout is not None and time.time() + sleep_time >= timeout: - logger.warning('Timed out waiting for spot requests.') + logger.warning("Timed out waiting for spot requests.") break - logger.info('Sleeping for %is', sleep_time) + logger.info("Sleeping for %is", sleep_time) time.sleep(sleep_time) for attempt in retry_ec2(retry_while=spot_request_not_found): with attempt: - requests = boto3_ec2.describe_spot_instance_requests(SpotInstanceRequestIds=list(open_ids)) + requests = boto3_ec2.describe_spot_instance_requests( + SpotInstanceRequestIds=list(open_ids) + ) except BaseException: if open_ids: with panic(logger): @@ -261,18 +275,22 @@ def create_spot_instances( """ def spotRequestNotFound(e): - return getattr(e, 'error_code', None) == "InvalidSpotInstanceRequestID.NotFound" - - spec['LaunchSpecification'].update({'ImageId': image_id}) # boto3 image id is in the launch specification - for attempt in retry_ec2(retry_for=a_long_time, - retry_while=inconsistencies_detected): + return getattr(e, "error_code", None) == "InvalidSpotInstanceRequestID.NotFound" + + spec["LaunchSpecification"].update( + {"ImageId": image_id} + ) # boto3 image id is in the launch specification + for attempt in retry_ec2( + retry_for=a_long_time, retry_while=inconsistencies_detected + ): with attempt: requests_dict = boto3_ec2.request_spot_instances( - SpotPrice=price, InstanceCount=num_instances, **spec) - requests = requests_dict['SpotInstanceRequests'] + SpotPrice=price, InstanceCount=num_instances, **spec + ) + requests = requests_dict["SpotInstanceRequests"] if tags is not None: - for requestID in (request['SpotInstanceRequestId'] for request in requests): + for requestID in (request["SpotInstanceRequestId"] for request in requests): for attempt in retry_ec2(retry_while=spotRequestNotFound): with attempt: boto3_ec2.create_tags(Resources=[requestID], Tags=tags) @@ -280,21 +298,21 @@ def spotRequestNotFound(e): num_active, num_other = 0, 0 # noinspection PyUnboundLocalVariable,PyTypeChecker # request_spot_instances's type annotation is wrong - for batch in wait_spot_requests_active(boto3_ec2, - requests, - timeout=timeout, - tentative=tentative): + for batch in wait_spot_requests_active( + boto3_ec2, requests, timeout=timeout, tentative=tentative + ): instance_ids = [] for request in batch: request: "SpotInstanceRequestTypeDef" - if request["State"] == 'active': + if request["State"] == "active": instance_ids.append(request["InstanceId"]) num_active += 1 else: logger.info( - 'Request %s in unexpected state %s.', + "Request %s in unexpected state %s.", request["InstanceId"], - request["State"]) + request["State"], + ) num_other += 1 if instance_ids: # This next line is the reason we batch. It's so we can get multiple instances in @@ -303,16 +321,18 @@ def spotRequestNotFound(e): for attempt in retry_ec2(): with attempt: # Increase hop limit from 1 to use Instance Metadata V2 - boto3_ec2.modify_instance_metadata_options(InstanceId=instance_id, HttpPutResponseHopLimit=3) + boto3_ec2.modify_instance_metadata_options( + InstanceId=instance_id, HttpPutResponseHopLimit=3 + ) yield boto3_ec2.describe_instances(InstanceIds=instance_ids) if not num_active: - message = 'None of the spot requests entered the active state' + message = "None of the spot requests entered the active state" if tentative: - logger.warning(message + '.') + logger.warning(message + ".") else: raise RuntimeError(message) if num_other: - logger.warning('%i request(s) entered a state other than active.', num_other) + logger.warning("%i request(s) entered a state other than active.", num_other) def create_ondemand_instances( @@ -320,18 +340,19 @@ def create_ondemand_instances( image_id: str, spec: Mapping[str, Any], num_instances: int = 1, -) -> List["InstanceTypeDef"]: +) -> list["InstanceTypeDef"]: """ Requests the RunInstances EC2 API call but accounts for the race between recently created instance profiles, IAM roles and an instance creation that refers to them. """ - instance_type = spec['InstanceType'] - logger.info('Creating %s instance(s) ... ', instance_type) + instance_type = spec["InstanceType"] + logger.info("Creating %s instance(s) ... ", instance_type) boto_instance_list = [] - for attempt in retry_ec2(retry_for=a_long_time, - retry_while=inconsistencies_detected): + for attempt in retry_ec2( + retry_for=a_long_time, retry_while=inconsistencies_detected + ): with attempt: - boto_instance_list: List["InstanceTypeDef"] = boto3_ec2.run_instances( + boto_instance_list: list["InstanceTypeDef"] = boto3_ec2.run_instances( ImageId=image_id, MinCount=num_instances, MaxCount=num_instances, **spec )["Instances"] @@ -339,7 +360,7 @@ def create_ondemand_instances( def increase_instance_hop_limit( - boto3_ec2: "EC2Client", boto_instance_list: List["InstanceTypeDef"] + boto3_ec2: "EC2Client", boto_instance_list: list["InstanceTypeDef"] ) -> None: """ Increase the default HTTP hop limit, as we are running Toil and Kubernetes inside a Docker container, so the default @@ -351,11 +372,13 @@ def increase_instance_hop_limit( :return: """ for boto_instance in boto_instance_list: - instance_id = boto_instance['InstanceId'] + instance_id = boto_instance["InstanceId"] for attempt in retry_ec2(): with attempt: # Increase hop limit from 1 to use Instance Metadata V2 - boto3_ec2.modify_instance_metadata_options(InstanceId=instance_id, HttpPutResponseHopLimit=3) + boto3_ec2.modify_instance_metadata_options( + InstanceId=instance_id, HttpPutResponseHopLimit=3 + ) def prune(bushy: dict) -> dict: @@ -372,15 +395,17 @@ def prune(bushy: dict) -> dict: # We need a module-level client to get the dynamically-generated error types to # catch, and to wait on IAM items. -iam_client = establish_boto3_session().client('iam') +iam_client = establish_boto3_session().client("iam") # exception is generated by a factory so we weirdly need a client instance to reference it -@retry(errors=[iam_client.exceptions.NoSuchEntityException], - intervals=[1, 1, 2, 4, 8, 16, 32, 64]) +@retry( + errors=[iam_client.exceptions.NoSuchEntityException], + intervals=[1, 1, 2, 4, 8, 16, 32, 64], +) def wait_until_instance_profile_arn_exists(instance_profile_arn: str): # TODO: We have no guarantee that the ARN contains the name. - instance_profile_name = instance_profile_arn.split(':instance-profile/')[-1] + instance_profile_name = instance_profile_arn.split(":instance-profile/")[-1] logger.debug("Checking for instance profile %s...", instance_profile_name) iam_client.get_instance_profile(InstanceProfileName=instance_profile_name) logger.debug("Instance profile found") @@ -393,14 +418,14 @@ def create_instances( key_name: str, instance_type: str, num_instances: int = 1, - security_group_ids: Optional[List] = None, + security_group_ids: Optional[list] = None, user_data: Optional[Union[str, bytes]] = None, - block_device_map: Optional[List[Dict]] = None, + block_device_map: Optional[list[dict]] = None, instance_profile_arn: Optional[str] = None, placement_az: Optional[str] = None, subnet_id: str = None, - tags: Optional[Dict[str, str]] = None, -) -> List["Instance"]: + tags: Optional[dict[str, str]] = None, +) -> list["Instance"]: """ Replaces create_ondemand_instances. Uses boto3 and returns a list of Boto3 instance dicts. @@ -411,23 +436,25 @@ def create_instances( Tags, if given, are applied to the instances, and all volumes. """ - logger.info('Creating %s instance(s) ... ', instance_type) + logger.info("Creating %s instance(s) ... ", instance_type) if isinstance(user_data, str): - user_data = user_data.encode('utf-8') - - request = {'ImageId': image_id, - 'MinCount': num_instances, - 'MaxCount': num_instances, - 'KeyName': key_name, - 'SecurityGroupIds': security_group_ids, - 'InstanceType': instance_type, - 'UserData': user_data, - 'BlockDeviceMappings': block_device_map, - 'SubnetId': subnet_id, - # Metadata V2 defaults hops to 1, which is an issue when running inside a docker container - # https://github.com/adamchainz/ec2-metadata?tab=readme-ov-file#instance-metadata-service-version-2 - 'MetadataOptions': {'HttpPutResponseHopLimit': 3}} + user_data = user_data.encode("utf-8") + + request = { + "ImageId": image_id, + "MinCount": num_instances, + "MaxCount": num_instances, + "KeyName": key_name, + "SecurityGroupIds": security_group_ids, + "InstanceType": instance_type, + "UserData": user_data, + "BlockDeviceMappings": block_device_map, + "SubnetId": subnet_id, + # Metadata V2 defaults hops to 1, which is an issue when running inside a docker container + # https://github.com/adamchainz/ec2-metadata?tab=readme-ov-file#instance-metadata-service-version-2 + "MetadataOptions": {"HttpPutResponseHopLimit": 3}, + } if instance_profile_arn: # We could just retry when we get an error because the ARN doesn't @@ -435,16 +462,18 @@ def create_instances( wait_until_instance_profile_arn_exists(instance_profile_arn) # Add it to the request - request['IamInstanceProfile'] = {'Arn': instance_profile_arn} + request["IamInstanceProfile"] = {"Arn": instance_profile_arn} if placement_az: - request['Placement'] = {'AvailabilityZone': placement_az} + request["Placement"] = {"AvailabilityZone": placement_az} if tags: # Tag everything when we make it. flat_tags = flatten_tags(tags) - request['TagSpecifications'] = [{'ResourceType': 'instance', 'Tags': flat_tags}, - {'ResourceType': 'volume', 'Tags': flat_tags}] + request["TagSpecifications"] = [ + {"ResourceType": "instance", "Tags": flat_tags}, + {"ResourceType": "volume", "Tags": flat_tags}, + ] return ec2_resource.create_instances(**prune(request)) @@ -456,13 +485,13 @@ def create_launch_template( image_id: str, key_name: str, instance_type: str, - security_group_ids: Optional[List] = None, + security_group_ids: Optional[list] = None, user_data: Optional[Union[str, bytes]] = None, - block_device_map: Optional[List[Dict]] = None, + block_device_map: Optional[list[dict]] = None, instance_profile_arn: Optional[str] = None, placement_az: Optional[str] = None, subnet_id: Optional[str] = None, - tags: Optional[Dict[str, str]] = None, + tags: Optional[dict[str, str]] = None, ) -> str: """ Creates a launch template with the given name for launching instances with the given parameters. @@ -479,25 +508,26 @@ def create_launch_template( """ - logger.info('Creating launch template for %s instances ... ', instance_type) + logger.info("Creating launch template for %s instances ... ", instance_type) if isinstance(user_data, str): # Make sure we have bytes - user_data = user_data.encode('utf-8') + user_data = user_data.encode("utf-8") # Then base64 and decode back to str. - user_data = b64encode(user_data).decode('utf-8') - - template = {'ImageId': image_id, - 'KeyName': key_name, - 'SecurityGroupIds': security_group_ids, - 'InstanceType': instance_type, - 'UserData': user_data, - 'BlockDeviceMappings': block_device_map, - 'SubnetId': subnet_id, - # Increase hop limit from 1 to use Instance Metadata V2 - 'MetadataOptions': {'HttpPutResponseHopLimit': 3} - } + user_data = b64encode(user_data).decode("utf-8") + + template = { + "ImageId": image_id, + "KeyName": key_name, + "SecurityGroupIds": security_group_ids, + "InstanceType": instance_type, + "UserData": user_data, + "BlockDeviceMappings": block_device_map, + "SubnetId": subnet_id, + # Increase hop limit from 1 to use Instance Metadata V2 + "MetadataOptions": {"HttpPutResponseHopLimit": 3}, + } if instance_profile_arn: # We could just retry when we get an error because the ARN doesn't @@ -505,39 +535,47 @@ def create_launch_template( wait_until_instance_profile_arn_exists(instance_profile_arn) # Add it to the request - template['IamInstanceProfile'] = {'Arn': instance_profile_arn} + template["IamInstanceProfile"] = {"Arn": instance_profile_arn} if placement_az: - template['Placement'] = {'AvailabilityZone': placement_az} + template["Placement"] = {"AvailabilityZone": placement_az} flat_tags = [] if tags: # Tag everything when we make it. flat_tags = flatten_tags(tags) - template['TagSpecifications'] = [{'ResourceType': 'instance', 'Tags': flat_tags}, - {'ResourceType': 'volume', 'Tags': flat_tags}] + template["TagSpecifications"] = [ + {"ResourceType": "instance", "Tags": flat_tags}, + {"ResourceType": "volume", "Tags": flat_tags}, + ] - request = {'LaunchTemplateData': prune(template), - 'LaunchTemplateName': template_name} + request = { + "LaunchTemplateData": prune(template), + "LaunchTemplateName": template_name, + } if tags: - request['TagSpecifications'] = [{'ResourceType': 'launch-template', 'Tags': flat_tags}] + request["TagSpecifications"] = [ + {"ResourceType": "launch-template", "Tags": flat_tags} + ] - return ec2_client.create_launch_template(**request)['LaunchTemplate']['LaunchTemplateId'] + return ec2_client.create_launch_template(**request)["LaunchTemplate"][ + "LaunchTemplateId" + ] @retry(intervals=[5, 5, 10, 20, 20, 20, 20], errors=INCONSISTENCY_ERRORS) def create_auto_scaling_group( autoscaling_client: "AutoScalingClient", asg_name: str, - launch_template_ids: Dict[str, str], - vpc_subnets: List[str], + launch_template_ids: dict[str, str], + vpc_subnets: list[str], min_size: int, max_size: int, instance_types: Optional[Iterable[str]] = None, spot_bid: Optional[float] = None, spot_cheapest: bool = False, - tags: Optional[Dict[str, str]] = None, + tags: Optional[dict[str, str]] = None, ) -> None: """ Create a new Auto Scaling Group with the given name (which is also its @@ -571,19 +609,26 @@ def create_auto_scaling_group( """ if instance_types is None: - instance_types: List[str] = [] + instance_types: list[str] = [] if instance_types is not None and len(instance_types) > 20: - raise RuntimeError(f"Too many instance types ({len(instance_types)}) in group; AWS supports only 20.") + raise RuntimeError( + f"Too many instance types ({len(instance_types)}) in group; AWS supports only 20." + ) if len(vpc_subnets) == 0: - raise RuntimeError("No VPC subnets specified to launch into; not clear where to put instances") + raise RuntimeError( + "No VPC subnets specified to launch into; not clear where to put instances" + ) def get_launch_template_spec(instance_type): """ Get a LaunchTemplateSpecification for the given instance type. """ - return {'LaunchTemplateId': launch_template_ids[instance_type], 'Version': '$Default'} + return { + "LaunchTemplateId": launch_template_ids[instance_type], + "Version": "$Default", + } # We always write the ASG with a MixedInstancesPolicy even when we have only one type. # And we use a separate launch template for every instance type, and apply it as an override. @@ -592,24 +637,42 @@ def get_launch_template_spec(instance_type): # We need to use a launch template per instance type so that different # instance types with specified EBS storage size overrides will get their # storage. - mip = {'LaunchTemplate': {'LaunchTemplateSpecification': get_launch_template_spec(next(iter(instance_types))), # noqa - 'Overrides': [{'InstanceType': t, 'LaunchTemplateSpecification': get_launch_template_spec(t)} for t in instance_types]}} # noqa + mip = { + "LaunchTemplate": { + "LaunchTemplateSpecification": get_launch_template_spec( + next(iter(instance_types)) + ), # noqa + "Overrides": [ + { + "InstanceType": t, + "LaunchTemplateSpecification": get_launch_template_spec(t), + } + for t in instance_types + ], + } + } # noqa if spot_bid is not None: # Ask for spot instances by saying everything above base capacity of 0 should be spot. - mip['InstancesDistribution'] = {'OnDemandPercentageAboveBaseCapacity': 0, - 'SpotAllocationStrategy': 'capacity-optimized' if not spot_cheapest else 'lowest-price', - 'SpotMaxPrice': str(spot_bid)} - - asg = {'AutoScalingGroupName': asg_name, - 'MixedInstancesPolicy': prune(mip), - 'MinSize': min_size, - 'MaxSize': max_size, - 'VPCZoneIdentifier': ','.join(vpc_subnets)} + mip["InstancesDistribution"] = { + "OnDemandPercentageAboveBaseCapacity": 0, + "SpotAllocationStrategy": ( + "capacity-optimized" if not spot_cheapest else "lowest-price" + ), + "SpotMaxPrice": str(spot_bid), + } + + asg = { + "AutoScalingGroupName": asg_name, + "MixedInstancesPolicy": prune(mip), + "MinSize": min_size, + "MaxSize": max_size, + "VPCZoneIdentifier": ",".join(vpc_subnets), + } if tags: # Tag the ASG itself. - asg['Tags'] = flatten_tags(tags) + asg["Tags"] = flatten_tags(tags) logger.debug("Creating Autoscaling Group across subnets: %s", vpc_subnets) diff --git a/src/toil/lib/ec2nodes.py b/src/toil/lib/ec2nodes.py index 56577b6860..ac38bd7395 100644 --- a/src/toil/lib/ec2nodes.py +++ b/src/toil/lib/ec2nodes.py @@ -16,74 +16,90 @@ import logging import os import re -import textwrap -import requests import shutil -import enlighten # type: ignore - -from typing import Dict, List, Tuple, Union, Any +import textwrap +from typing import Any, Union +import enlighten # type: ignore +import requests logger = logging.getLogger(__name__) manager = enlighten.get_manager() dirname = os.path.dirname(__file__) -region_json_dirname = os.path.join(dirname, 'region_jsons') - - -EC2Regions = {'us-west-1': 'US West (N. California)', - 'us-west-2': 'US West (Oregon)', - 'us-east-1': 'US East (N. Virginia)', - 'us-east-2': 'US East (Ohio)', - 'us-gov-west-1': 'AWS GovCloud (US)', - 'ca-central-1': 'Canada (Central)', - 'ap-northeast-1': 'Asia Pacific (Tokyo)', - 'ap-northeast-2': 'Asia Pacific (Seoul)', - 'ap-northeast-3': 'Asia Pacific (Osaka-Local)', - 'ap-southeast-1': 'Asia Pacific (Singapore)', - 'ap-southeast-2': 'Asia Pacific (Sydney)', - 'ap-south-1': 'Asia Pacific (Mumbai)', - 'eu-west-1': 'EU (Ireland)', - 'eu-west-2': 'EU (London)', - 'eu-west-3': 'EU (Paris)', - 'eu-central-1': 'EU (Frankfurt)', - 'sa-east-1': 'South America (Sao Paulo)'} +region_json_dirname = os.path.join(dirname, "region_jsons") + + +EC2Regions = { + "us-west-1": "US West (N. California)", + "us-west-2": "US West (Oregon)", + "us-east-1": "US East (N. Virginia)", + "us-east-2": "US East (Ohio)", + "us-gov-west-1": "AWS GovCloud (US)", + "ca-central-1": "Canada (Central)", + "ap-northeast-1": "Asia Pacific (Tokyo)", + "ap-northeast-2": "Asia Pacific (Seoul)", + "ap-northeast-3": "Asia Pacific (Osaka-Local)", + "ap-southeast-1": "Asia Pacific (Singapore)", + "ap-southeast-2": "Asia Pacific (Sydney)", + "ap-south-1": "Asia Pacific (Mumbai)", + "eu-west-1": "EU (Ireland)", + "eu-west-2": "EU (London)", + "eu-west-3": "EU (Paris)", + "eu-central-1": "EU (Frankfurt)", + "sa-east-1": "South America (Sao Paulo)", +} class InstanceType: - __slots__ = ('name', 'cores', 'memory', 'disks', 'disk_capacity', 'architecture') - - def __init__(self, name: str, cores: int, memory: float, disks: float, disk_capacity: float, architecture: str): + __slots__ = ("name", "cores", "memory", "disks", "disk_capacity", "architecture") + + def __init__( + self, + name: str, + cores: int, + memory: float, + disks: float, + disk_capacity: float, + architecture: str, + ): self.name = name # the API name of the instance type self.cores = cores # the number of cores self.memory = memory # RAM in GiB self.disks = disks # the number of ephemeral (aka 'instance store') volumes - self.disk_capacity = disk_capacity # the capacity of each ephemeral volume in GiB - self.architecture = architecture # the architecture of the instance type. Can be either amd64 or arm64 + self.disk_capacity = ( + disk_capacity # the capacity of each ephemeral volume in GiB + ) + self.architecture = architecture # the architecture of the instance type. Can be either amd64 or arm64 def __str__(self) -> str: - return ("Type: {}\n" - "Cores: {}\n" - "Disks: {}\n" - "Memory: {}\n" - "Disk Capacity: {}\n" - "Architecture: {}\n" - "".format( + return ( + "Type: {}\n" + "Cores: {}\n" + "Disks: {}\n" + "Memory: {}\n" + "Disk Capacity: {}\n" + "Architecture: {}\n" + "".format( self.name, self.cores, self.disks, self.memory, self.disk_capacity, - self.architecture)) + self.architecture, + ) + ) def __eq__(self, other: object) -> bool: if not isinstance(other, InstanceType): return NotImplemented - if (self.name == other.name and - self.cores == other.cores and - self.memory == other.memory and - self.disks == other.disks and - self.disk_capacity == other.disk_capacity and - self.architecture == other.architecture): + if ( + self.name == other.name + and self.cores == other.cores + and self.memory == other.memory + and self.disks == other.disks + and self.disk_capacity == other.disk_capacity + and self.architecture == other.architecture + ): return True return False @@ -95,7 +111,7 @@ def is_number(s: str) -> bool: :param s: Any unicode string. :return: True if s represents a number, False otherwise. """ - s = s.replace(',', '') + s = s.replace(",", "") try: float(s) return True @@ -103,6 +119,7 @@ def is_number(s: str) -> bool: pass try: import unicodedata + unicodedata.numeric(s) return True except (TypeError, ValueError) as e: @@ -110,7 +127,9 @@ def is_number(s: str) -> bool: return False -def parse_storage(storage_info: str) -> Union[List[int], Tuple[Union[int, float], float]]: +def parse_storage( + storage_info: str, +) -> Union[list[int], tuple[Union[int, float], float]]: """ Parses EC2 JSON storage param string into a number. @@ -129,12 +148,19 @@ def parse_storage(storage_info: str) -> Union[List[int], Tuple[Union[int, float] return [0, 0] else: specs = storage_info.strip().split() - if is_number(specs[0]) and specs[1] == 'x' and is_number(specs[2]): - return float(specs[0].replace(',', '')), float(specs[2].replace(',', '')) - elif is_number(specs[0]) and specs[1] == 'GB' and specs[2] == 'NVMe' and specs[3] == 'SSD': - return 1, float(specs[0].replace(',', '')) + if is_number(specs[0]) and specs[1] == "x" and is_number(specs[2]): + return float(specs[0].replace(",", "")), float(specs[2].replace(",", "")) + elif ( + is_number(specs[0]) + and specs[1] == "GB" + and specs[2] == "NVMe" + and specs[3] == "SSD" + ): + return 1, float(specs[0].replace(",", "")) else: - raise RuntimeError('EC2 JSON format has likely changed. Error parsing disk specs.') + raise RuntimeError( + "EC2 JSON format has likely changed. Error parsing disk specs." + ) def parse_memory(mem_info: str) -> float: @@ -148,14 +174,14 @@ def parse_memory(mem_info: str) -> float: :param mem_info: EC2 JSON memory param string. :return: A float representing memory in GiB. """ - mem = mem_info.replace(',', '').split() - if mem[1] == 'GiB': + mem = mem_info.replace(",", "").split() + if mem[1] == "GiB": return float(mem[0]) else: - raise RuntimeError('EC2 JSON format has likely changed. Error parsing memory.') + raise RuntimeError("EC2 JSON format has likely changed. Error parsing memory.") -def download_region_json(filename: str, region: str = 'us-east-1') -> None: +def download_region_json(filename: str, region: str = "us-east-1") -> None: """ Downloads and writes the AWS Billing JSON to a file using the AWS pricing API. @@ -165,18 +191,25 @@ def download_region_json(filename: str, region: str = 'us-east-1') -> None: aws instance name (example: 't2.micro'), and the value is an InstanceType object representing that aws instance name. """ - response = requests.get(f'https://pricing.us-east-1.amazonaws.com/offers/v1.0/aws/AmazonEC2/current/{region}/index.json', stream=True) + response = requests.get( + f"https://pricing.us-east-1.amazonaws.com/offers/v1.0/aws/AmazonEC2/current/{region}/index.json", + stream=True, + ) file_size = int(response.headers.get("content-length", 0)) - print(f'Downloading ~{file_size / 1000000000}Gb {region} AWS billing file to: {filename}') + print( + f"Downloading ~{file_size / 1000000000}Gb {region} AWS billing file to: {filename}" + ) - with manager.counter(total=file_size, desc=os.path.basename(filename), unit='bytes', leave=False) as progress_bar: + with manager.counter( + total=file_size, desc=os.path.basename(filename), unit="bytes", leave=False + ) as progress_bar: with open(filename, "wb") as file: for data in response.iter_content(1048576): progress_bar.update(len(data)) file.write(data) -def reduce_region_json_size(filename:str) -> List[Dict[str, Any]]: +def reduce_region_json_size(filename: str) -> list[dict[str, Any]]: """ Deletes information in the json file that we don't need, and rewrites it. This makes the file smaller. @@ -185,23 +218,31 @@ def reduce_region_json_size(filename:str) -> List[Dict[str, Any]]: (with AWS's new Query API), but even those may eventually one day grow ridiculously large, so we do what we can to keep the file sizes down (and thus also the amount loaded into memory) to keep this script working for longer. """ - with open(filename, 'r') as f: - aws_products = json.loads(f.read())['products'] + with open(filename) as f: + aws_products = json.loads(f.read())["products"] aws_product_list = list() for k in list(aws_products.keys()): - ec2_attributes = aws_products[k]['attributes'] - if (ec2_attributes.get('tenancy') == 'Shared' and - ec2_attributes.get('operatingSystem') == 'Linux' and - ec2_attributes.get('operation') == 'RunInstances' and - ec2_attributes.get('usagetype').endswith('BoxUsage:' + ec2_attributes['instanceType'])): - aws_product_list.append(dict(disk=ec2_attributes["storage"], - loc=ec2_attributes["location"], - name=ec2_attributes["instanceType"], - mem=ec2_attributes["memory"], - cpu=ec2_attributes["vcpu"])) + ec2_attributes = aws_products[k]["attributes"] + if ( + ec2_attributes.get("tenancy") == "Shared" + and ec2_attributes.get("operatingSystem") == "Linux" + and ec2_attributes.get("operation") == "RunInstances" + and ec2_attributes.get("usagetype").endswith( + "BoxUsage:" + ec2_attributes["instanceType"] + ) + ): + aws_product_list.append( + dict( + disk=ec2_attributes["storage"], + loc=ec2_attributes["location"], + name=ec2_attributes["instanceType"], + mem=ec2_attributes["memory"], + cpu=ec2_attributes["vcpu"], + ) + ) del aws_products[k] del aws_products - with open(filename, 'w') as f: + with open(filename, "w") as f: f.write(json.dumps(dict(aws=aws_product_list), indent=2)) return aws_product_list @@ -214,14 +255,20 @@ def updateStaticEC2Instances() -> None: :return: Nothing. Writes a new 'generatedEC2Lists.py' file. """ - print("Updating Toil's EC2 lists to the most current version from AWS's bulk API.\n" - "This may take a while, depending on your internet connection.\n") - - original_aws_instance_list = os.path.join(dirname, 'generatedEC2Lists.py') # original + print( + "Updating Toil's EC2 lists to the most current version from AWS's bulk API.\n" + "This may take a while, depending on your internet connection.\n" + ) + + original_aws_instance_list = os.path.join( + dirname, "generatedEC2Lists.py" + ) # original if not os.path.exists(original_aws_instance_list): raise RuntimeError(f"Path {original_aws_instance_list} does not exist.") # use a temporary file until all info is fetched - updated_aws_instance_list = os.path.join(dirname, 'generatedEC2Lists_tmp.py') # temp + updated_aws_instance_list = os.path.join( + dirname, "generatedEC2Lists_tmp.py" + ) # temp if os.path.exists(updated_aws_instance_list): os.remove(updated_aws_instance_list) @@ -229,15 +276,15 @@ def updateStaticEC2Instances() -> None: os.mkdir(region_json_dirname) currentEC2List = [] - instancesByRegion: Dict[str, List[str]] = {} + instancesByRegion: dict[str, list[str]] = {} for region in EC2Regions.keys(): - region_json = os.path.join(region_json_dirname, f'{region}.json') + region_json = os.path.join(region_json_dirname, f"{region}.json") if os.path.exists(region_json): try: - with open(region_json, 'r') as f: - aws_products = json.loads(f.read())['aws'] - print(f'Reusing previously downloaded json @: {region_json}') + with open(region_json) as f: + aws_products = json.loads(f.read())["aws"] + print(f"Reusing previously downloaded json @: {region_json}") except: os.remove(region_json) download_region_json(filename=region_json, region=region) @@ -251,14 +298,24 @@ def updateStaticEC2Instances() -> None: disks, disk_capacity = parse_storage(i["disk"]) # Determines whether the instance type is from an ARM or AMD family # ARM instance names include a digit followed by a 'g' before the instance size - architecture = 'arm64' if re.search(r".*\dg.*\..*", i["name"]) else 'amd64' - ec2InstanceList.append(InstanceType(name=i["name"], - cores=i["cpu"], - memory=parse_memory(i["mem"]), - disks=disks, - disk_capacity=disk_capacity, - architecture=architecture)) - print('Finished for ' + str(region) + '. ' + str(len(ec2InstanceList)) + ' added.\n') + architecture = "arm64" if re.search(r".*\dg.*\..*", i["name"]) else "amd64" + ec2InstanceList.append( + InstanceType( + name=i["name"], + cores=i["cpu"], + memory=parse_memory(i["mem"]), + disks=disks, + disk_capacity=disk_capacity, + architecture=architecture, + ) + ) + print( + "Finished for " + + str(region) + + ". " + + str(len(ec2InstanceList)) + + " added.\n" + ) currentEC2Dict = {_.name: _ for _ in ec2InstanceList} for instanceName, instanceTypeObj in currentEC2Dict.items(): if instanceTypeObj not in currentEC2List: @@ -266,8 +323,10 @@ def updateStaticEC2Instances() -> None: instancesByRegion.setdefault(region, []).append(instanceName) # write provenance note, copyright and imports - with open(updated_aws_instance_list, 'w') as f: - f.write(textwrap.dedent(''' + with open(updated_aws_instance_list, "w") as f: + f.write( + textwrap.dedent( + """ # !!! AUTOGENERATED FILE !!! # Update with: src/toil/utils/toilUpdateEC2Instances.py # @@ -284,36 +343,41 @@ def updateStaticEC2Instances() -> None: # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from toil.lib.ec2nodes import InstanceType\n\n\n''').format(year=datetime.date.today().strftime("%Y"))[1:]) + from toil.lib.ec2nodes import InstanceType\n\n\n""" + ).format(year=datetime.date.today().strftime("%Y"))[1:] + ) # write header of total EC2 instance type list - genString = f'# {len(currentEC2List)} Instance Types. Generated {datetime.datetime.now()}.\n' + genString = f"# {len(currentEC2List)} Instance Types. Generated {datetime.datetime.now()}.\n" genString = genString + "E2Instances = {\n" sortedCurrentEC2List = sorted(currentEC2List, key=lambda x: x.name) # write the list of all instances types for i in sortedCurrentEC2List: - genString = genString + f" '{i.name}': InstanceType(name='{i.name}', cores={i.cores}, memory={i.memory}, disks={i.disks}, disk_capacity={i.disk_capacity}, architecture='{i.architecture}'),\n" - genString = genString + '}\n\n' + genString = ( + genString + + f" '{i.name}': InstanceType(name='{i.name}', cores={i.cores}, memory={i.memory}, disks={i.disks}, disk_capacity={i.disk_capacity}, architecture='{i.architecture}'),\n" + ) + genString = genString + "}\n\n" - genString = genString + 'regionDict = {\n' + genString = genString + "regionDict = {\n" for regionName, instanceList in instancesByRegion.items(): genString = genString + f" '{regionName}': [" for instance in sorted(instanceList): genString = genString + f"'{instance}', " - if genString.endswith(', '): + if genString.endswith(", "): genString = genString[:-2] - genString = genString + '],\n' - if genString.endswith(',\n'): - genString = genString[:-len(',\n')] - genString = genString + '}\n' - with open(updated_aws_instance_list, 'a+') as f: + genString = genString + "],\n" + if genString.endswith(",\n"): + genString = genString[: -len(",\n")] + genString = genString + "}\n" + with open(updated_aws_instance_list, "a+") as f: f.write(genString) # append key for fetching at the end - regionKey = '\nec2InstancesByRegion = {region: [E2Instances[i] for i in instances] for region, instances in regionDict.items()}\n' + regionKey = "\nec2InstancesByRegion = {region: [E2Instances[i] for i in instances] for region, instances in regionDict.items()}\n" - with open(updated_aws_instance_list, 'a+') as f: + with open(updated_aws_instance_list, "a+") as f: f.write(regionKey) # replace the instance list with a current list @@ -321,5 +385,7 @@ def updateStaticEC2Instances() -> None: # delete the aws region json file directory if os.path.exists(region_json_dirname): - print(f'Update Successful! Removing AWS Region JSON Files @: {region_json_dirname}') + print( + f"Update Successful! Removing AWS Region JSON Files @: {region_json_dirname}" + ) shutil.rmtree(region_json_dirname) diff --git a/src/toil/lib/encryption/_dummy.py b/src/toil/lib/encryption/_dummy.py index 6a3e64ea89..f105d215b1 100644 --- a/src/toil/lib/encryption/_dummy.py +++ b/src/toil/lib/encryption/_dummy.py @@ -27,6 +27,8 @@ def decrypt(ciphertext: bytes, keyPath: str) -> bytes: def _bail(): - raise NotImplementedError("Encryption support is not installed. Consider re-installing toil " - "with the 'encryption' extra along with any other extras you might " - "want, e.g. 'pip install toil[encryption,...]'.") + raise NotImplementedError( + "Encryption support is not installed. Consider re-installing toil " + "with the 'encryption' extra along with any other extras you might " + "want, e.g. 'pip install toil[encryption,...]'." + ) diff --git a/src/toil/lib/encryption/_nacl.py b/src/toil/lib/encryption/_nacl.py index d9ec9eb585..9a6ad267d2 100644 --- a/src/toil/lib/encryption/_nacl.py +++ b/src/toil/lib/encryption/_nacl.py @@ -40,11 +40,13 @@ def encrypt(message: bytes, keyPath: str) -> bytes: >>> import os >>> os.remove(k) """ - with open(keyPath, 'rb') as f: + with open(keyPath, "rb") as f: key = f.read() if len(key) != SecretBox.KEY_SIZE: - raise ValueError("Key is %d bytes, but must be exactly %d bytes" % (len(key), - SecretBox.KEY_SIZE)) + raise ValueError( + "Key is %d bytes, but must be exactly %d bytes" + % (len(key), SecretBox.KEY_SIZE) + ) sb = SecretBox(key) # We generate the nonce using secure random bits. For long enough # nonce size, the chance of a random nonce collision becomes @@ -87,11 +89,13 @@ def decrypt(ciphertext: bytes, keyPath: str) -> bytes: >>> import os >>> os.remove(k) """ - with open(keyPath, 'rb') as f: + with open(keyPath, "rb") as f: key = f.read() if len(key) != SecretBox.KEY_SIZE: - raise ValueError("Key is %d bytes, but must be exactly %d bytes" % (len(key), - SecretBox.KEY_SIZE)) + raise ValueError( + "Key is %d bytes, but must be exactly %d bytes" + % (len(key), SecretBox.KEY_SIZE) + ) sb = SecretBox(key) # The nonce is kept with the message. return sb.decrypt(ciphertext) diff --git a/src/toil/lib/encryption/conftest.py b/src/toil/lib/encryption/conftest.py index a0b2a02e22..48d416f7e6 100644 --- a/src/toil/lib/encryption/conftest.py +++ b/src/toil/lib/encryption/conftest.py @@ -4,6 +4,7 @@ try: import nacl + print(nacl.__file__) # to keep this import from being removed except ImportError: collect_ignore.append("_nacl.py") diff --git a/src/toil/lib/exceptions.py b/src/toil/lib/exceptions.py index adaecc6eee..8140838939 100644 --- a/src/toil/lib/exceptions.py +++ b/src/toil/lib/exceptions.py @@ -38,20 +38,21 @@ class panic: the primary exception will be reraised. """ - def __init__( self, log=None ): - super().__init__( ) + def __init__(self, log=None): + super().__init__() self.log = log self.exc_info = None - def __enter__( self ): - self.exc_info = sys.exc_info( ) + def __enter__(self): + self.exc_info = sys.exc_info() - def __exit__( self, *exc_info ): - if self.log is not None and exc_info and exc_info[ 0 ]: - self.log.warning( "Exception during panic", exc_info=exc_info ) + def __exit__(self, *exc_info): + if self.log is not None and exc_info and exc_info[0]: + self.log.warning("Exception during panic", exc_info=exc_info) exc_type, exc_value, traceback = self.exc_info raise_(exc_type, exc_value, traceback) + def raise_(exc_type, exc_value, traceback) -> None: if exc_value is not None: exc = exc_value diff --git a/src/toil/lib/expando.py b/src/toil/lib/expando.py index b543b18f30..e48d6e198f 100644 --- a/src/toil/lib/expando.py +++ b/src/toil/lib/expando.py @@ -14,6 +14,7 @@ # 5.14.2018: copied into Toil from https://github.com/BD2KGenomics/bd2k-python-lib + class Expando(dict): """ Pass initial attributes to the constructor: @@ -100,14 +101,15 @@ class Expando(dict): True """ - def __init__( self, *args, **kwargs ): - super().__init__( *args, **kwargs ) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) self.__slots__ = None self.__dict__ = self def copy(self): return type(self)(self) + class MagicExpando(Expando): """ Use MagicExpando for chained attribute access. diff --git a/src/toil/lib/generatedEC2Lists.py b/src/toil/lib/generatedEC2Lists.py index e1bab8957d..b5a5465131 100644 --- a/src/toil/lib/generatedEC2Lists.py +++ b/src/toil/lib/generatedEC2Lists.py @@ -16,800 +16,15805 @@ # limitations under the License. from toil.lib.ec2nodes import InstanceType - # 772 Instance Types. Generated 2024-02-23 13:20:11.748439. E2Instances = { - 'a1.2xlarge': InstanceType(name='a1.2xlarge', cores=8, memory=16.0, disks=0, disk_capacity=0, architecture='amd64'), - 'a1.4xlarge': InstanceType(name='a1.4xlarge', cores=16, memory=32.0, disks=0, disk_capacity=0, architecture='amd64'), - 'a1.large': InstanceType(name='a1.large', cores=2, memory=4.0, disks=0, disk_capacity=0, architecture='amd64'), - 'a1.medium': InstanceType(name='a1.medium', cores=1, memory=2.0, disks=0, disk_capacity=0, architecture='amd64'), - 'a1.metal': InstanceType(name='a1.metal', cores=16, memory=32.0, disks=0, disk_capacity=0, architecture='amd64'), - 'a1.xlarge': InstanceType(name='a1.xlarge', cores=4, memory=8.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c1.medium': InstanceType(name='c1.medium', cores=2, memory=1.7, disks=1.0, disk_capacity=350.0, architecture='amd64'), - 'c1.xlarge': InstanceType(name='c1.xlarge', cores=8, memory=7.0, disks=4.0, disk_capacity=420.0, architecture='amd64'), - 'c3.2xlarge': InstanceType(name='c3.2xlarge', cores=8, memory=15.0, disks=2.0, disk_capacity=80.0, architecture='amd64'), - 'c3.4xlarge': InstanceType(name='c3.4xlarge', cores=16, memory=30.0, disks=2.0, disk_capacity=160.0, architecture='amd64'), - 'c3.8xlarge': InstanceType(name='c3.8xlarge', cores=32, memory=60.0, disks=2.0, disk_capacity=320.0, architecture='amd64'), - 'c3.large': InstanceType(name='c3.large', cores=2, memory=3.75, disks=2.0, disk_capacity=16.0, architecture='amd64'), - 'c3.xlarge': InstanceType(name='c3.xlarge', cores=4, memory=7.5, disks=2.0, disk_capacity=40.0, architecture='amd64'), - 'c4.2xlarge': InstanceType(name='c4.2xlarge', cores=8, memory=15.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c4.4xlarge': InstanceType(name='c4.4xlarge', cores=16, memory=30.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c4.8xlarge': InstanceType(name='c4.8xlarge', cores=36, memory=60.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c4.large': InstanceType(name='c4.large', cores=2, memory=3.75, disks=0, disk_capacity=0, architecture='amd64'), - 'c4.xlarge': InstanceType(name='c4.xlarge', cores=4, memory=7.5, disks=0, disk_capacity=0, architecture='amd64'), - 'c5.12xlarge': InstanceType(name='c5.12xlarge', cores=48, memory=96.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c5.18xlarge': InstanceType(name='c5.18xlarge', cores=72, memory=144.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c5.24xlarge': InstanceType(name='c5.24xlarge', cores=96, memory=192.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c5.2xlarge': InstanceType(name='c5.2xlarge', cores=8, memory=16.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c5.4xlarge': InstanceType(name='c5.4xlarge', cores=16, memory=32.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c5.9xlarge': InstanceType(name='c5.9xlarge', cores=36, memory=72.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c5.large': InstanceType(name='c5.large', cores=2, memory=4.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c5.metal': InstanceType(name='c5.metal', cores=96, memory=192.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c5.xlarge': InstanceType(name='c5.xlarge', cores=4, memory=8.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c5a.12xlarge': InstanceType(name='c5a.12xlarge', cores=48, memory=96.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c5a.16xlarge': InstanceType(name='c5a.16xlarge', cores=64, memory=128.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c5a.24xlarge': InstanceType(name='c5a.24xlarge', cores=96, memory=192.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c5a.2xlarge': InstanceType(name='c5a.2xlarge', cores=8, memory=16.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c5a.4xlarge': InstanceType(name='c5a.4xlarge', cores=16, memory=32.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c5a.8xlarge': InstanceType(name='c5a.8xlarge', cores=32, memory=64.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c5a.large': InstanceType(name='c5a.large', cores=2, memory=4.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c5a.xlarge': InstanceType(name='c5a.xlarge', cores=4, memory=8.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c5ad.12xlarge': InstanceType(name='c5ad.12xlarge', cores=48, memory=96.0, disks=2.0, disk_capacity=900.0, architecture='amd64'), - 'c5ad.16xlarge': InstanceType(name='c5ad.16xlarge', cores=64, memory=128.0, disks=2.0, disk_capacity=1200.0, architecture='amd64'), - 'c5ad.24xlarge': InstanceType(name='c5ad.24xlarge', cores=96, memory=192.0, disks=2.0, disk_capacity=1900.0, architecture='amd64'), - 'c5ad.2xlarge': InstanceType(name='c5ad.2xlarge', cores=8, memory=16.0, disks=1.0, disk_capacity=300.0, architecture='amd64'), - 'c5ad.4xlarge': InstanceType(name='c5ad.4xlarge', cores=16, memory=32.0, disks=2.0, disk_capacity=300.0, architecture='amd64'), - 'c5ad.8xlarge': InstanceType(name='c5ad.8xlarge', cores=32, memory=64.0, disks=2.0, disk_capacity=600.0, architecture='amd64'), - 'c5ad.large': InstanceType(name='c5ad.large', cores=2, memory=4.0, disks=1.0, disk_capacity=75.0, architecture='amd64'), - 'c5ad.xlarge': InstanceType(name='c5ad.xlarge', cores=4, memory=8.0, disks=1.0, disk_capacity=150.0, architecture='amd64'), - 'c5d.12xlarge': InstanceType(name='c5d.12xlarge', cores=48, memory=96.0, disks=2.0, disk_capacity=900.0, architecture='amd64'), - 'c5d.18xlarge': InstanceType(name='c5d.18xlarge', cores=72, memory=144.0, disks=2.0, disk_capacity=900.0, architecture='amd64'), - 'c5d.24xlarge': InstanceType(name='c5d.24xlarge', cores=96, memory=192.0, disks=4.0, disk_capacity=900.0, architecture='amd64'), - 'c5d.2xlarge': InstanceType(name='c5d.2xlarge', cores=8, memory=16.0, disks=1.0, disk_capacity=200.0, architecture='amd64'), - 'c5d.4xlarge': InstanceType(name='c5d.4xlarge', cores=16, memory=32.0, disks=1.0, disk_capacity=400.0, architecture='amd64'), - 'c5d.9xlarge': InstanceType(name='c5d.9xlarge', cores=36, memory=72.0, disks=1.0, disk_capacity=900.0, architecture='amd64'), - 'c5d.large': InstanceType(name='c5d.large', cores=2, memory=4.0, disks=1.0, disk_capacity=50.0, architecture='amd64'), - 'c5d.metal': InstanceType(name='c5d.metal', cores=96, memory=192.0, disks=4.0, disk_capacity=900.0, architecture='amd64'), - 'c5d.xlarge': InstanceType(name='c5d.xlarge', cores=4, memory=8.0, disks=1.0, disk_capacity=100.0, architecture='amd64'), - 'c5n.18xlarge': InstanceType(name='c5n.18xlarge', cores=72, memory=192.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c5n.2xlarge': InstanceType(name='c5n.2xlarge', cores=8, memory=21.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c5n.4xlarge': InstanceType(name='c5n.4xlarge', cores=16, memory=42.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c5n.9xlarge': InstanceType(name='c5n.9xlarge', cores=36, memory=96.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c5n.large': InstanceType(name='c5n.large', cores=2, memory=5.25, disks=0, disk_capacity=0, architecture='amd64'), - 'c5n.metal': InstanceType(name='c5n.metal', cores=72, memory=192.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c5n.xlarge': InstanceType(name='c5n.xlarge', cores=4, memory=10.5, disks=0, disk_capacity=0, architecture='amd64'), - 'c6a.12xlarge': InstanceType(name='c6a.12xlarge', cores=48, memory=96.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c6a.16xlarge': InstanceType(name='c6a.16xlarge', cores=64, memory=128.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c6a.24xlarge': InstanceType(name='c6a.24xlarge', cores=96, memory=192.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c6a.2xlarge': InstanceType(name='c6a.2xlarge', cores=8, memory=16.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c6a.32xlarge': InstanceType(name='c6a.32xlarge', cores=128, memory=256.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c6a.48xlarge': InstanceType(name='c6a.48xlarge', cores=192, memory=384.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c6a.4xlarge': InstanceType(name='c6a.4xlarge', cores=16, memory=32.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c6a.8xlarge': InstanceType(name='c6a.8xlarge', cores=32, memory=64.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c6a.large': InstanceType(name='c6a.large', cores=2, memory=4.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c6a.metal': InstanceType(name='c6a.metal', cores=192, memory=384.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c6a.xlarge': InstanceType(name='c6a.xlarge', cores=4, memory=8.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c6g.12xlarge': InstanceType(name='c6g.12xlarge', cores=48, memory=96.0, disks=0, disk_capacity=0, architecture='arm64'), - 'c6g.16xlarge': InstanceType(name='c6g.16xlarge', cores=64, memory=128.0, disks=0, disk_capacity=0, architecture='arm64'), - 'c6g.2xlarge': InstanceType(name='c6g.2xlarge', cores=8, memory=16.0, disks=0, disk_capacity=0, architecture='arm64'), - 'c6g.4xlarge': InstanceType(name='c6g.4xlarge', cores=16, memory=32.0, disks=0, disk_capacity=0, architecture='arm64'), - 'c6g.8xlarge': InstanceType(name='c6g.8xlarge', cores=32, memory=64.0, disks=0, disk_capacity=0, architecture='arm64'), - 'c6g.large': InstanceType(name='c6g.large', cores=2, memory=4.0, disks=0, disk_capacity=0, architecture='arm64'), - 'c6g.medium': InstanceType(name='c6g.medium', cores=1, memory=2.0, disks=0, disk_capacity=0, architecture='arm64'), - 'c6g.metal': InstanceType(name='c6g.metal', cores=64, memory=128.0, disks=0, disk_capacity=0, architecture='arm64'), - 'c6g.xlarge': InstanceType(name='c6g.xlarge', cores=4, memory=8.0, disks=0, disk_capacity=0, architecture='arm64'), - 'c6gd.12xlarge': InstanceType(name='c6gd.12xlarge', cores=48, memory=96.0, disks=2.0, disk_capacity=1425.0, architecture='arm64'), - 'c6gd.16xlarge': InstanceType(name='c6gd.16xlarge', cores=64, memory=128.0, disks=2.0, disk_capacity=1900.0, architecture='arm64'), - 'c6gd.2xlarge': InstanceType(name='c6gd.2xlarge', cores=8, memory=16.0, disks=1.0, disk_capacity=475.0, architecture='arm64'), - 'c6gd.4xlarge': InstanceType(name='c6gd.4xlarge', cores=16, memory=32.0, disks=1.0, disk_capacity=950.0, architecture='arm64'), - 'c6gd.8xlarge': InstanceType(name='c6gd.8xlarge', cores=32, memory=64.0, disks=1.0, disk_capacity=1900.0, architecture='arm64'), - 'c6gd.large': InstanceType(name='c6gd.large', cores=2, memory=4.0, disks=1.0, disk_capacity=118.0, architecture='arm64'), - 'c6gd.medium': InstanceType(name='c6gd.medium', cores=1, memory=2.0, disks=1.0, disk_capacity=59.0, architecture='arm64'), - 'c6gd.metal': InstanceType(name='c6gd.metal', cores=64, memory=128.0, disks=2.0, disk_capacity=1900.0, architecture='arm64'), - 'c6gd.xlarge': InstanceType(name='c6gd.xlarge', cores=4, memory=8.0, disks=1.0, disk_capacity=237.0, architecture='arm64'), - 'c6gn.12xlarge': InstanceType(name='c6gn.12xlarge', cores=48, memory=96.0, disks=0, disk_capacity=0, architecture='arm64'), - 'c6gn.16xlarge': InstanceType(name='c6gn.16xlarge', cores=64, memory=128.0, disks=0, disk_capacity=0, architecture='arm64'), - 'c6gn.2xlarge': InstanceType(name='c6gn.2xlarge', cores=8, memory=16.0, disks=0, disk_capacity=0, architecture='arm64'), - 'c6gn.4xlarge': InstanceType(name='c6gn.4xlarge', cores=16, memory=32.0, disks=0, disk_capacity=0, architecture='arm64'), - 'c6gn.8xlarge': InstanceType(name='c6gn.8xlarge', cores=32, memory=64.0, disks=0, disk_capacity=0, architecture='arm64'), - 'c6gn.large': InstanceType(name='c6gn.large', cores=2, memory=4.0, disks=0, disk_capacity=0, architecture='arm64'), - 'c6gn.medium': InstanceType(name='c6gn.medium', cores=1, memory=2.0, disks=0, disk_capacity=0, architecture='arm64'), - 'c6gn.metal': InstanceType(name='c6gn.metal', cores=64, memory=128.0, disks=0, disk_capacity=0, architecture='arm64'), - 'c6gn.xlarge': InstanceType(name='c6gn.xlarge', cores=4, memory=8.0, disks=0, disk_capacity=0, architecture='arm64'), - 'c6i.12xlarge': InstanceType(name='c6i.12xlarge', cores=48, memory=96.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c6i.16xlarge': InstanceType(name='c6i.16xlarge', cores=64, memory=128.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c6i.24xlarge': InstanceType(name='c6i.24xlarge', cores=96, memory=192.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c6i.2xlarge': InstanceType(name='c6i.2xlarge', cores=8, memory=16.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c6i.32xlarge': InstanceType(name='c6i.32xlarge', cores=128, memory=256.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c6i.4xlarge': InstanceType(name='c6i.4xlarge', cores=16, memory=32.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c6i.8xlarge': InstanceType(name='c6i.8xlarge', cores=32, memory=64.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c6i.large': InstanceType(name='c6i.large', cores=2, memory=4.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c6i.metal': InstanceType(name='c6i.metal', cores=128, memory=256.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c6i.xlarge': InstanceType(name='c6i.xlarge', cores=4, memory=8.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c6id.12xlarge': InstanceType(name='c6id.12xlarge', cores=48, memory=96.0, disks=2.0, disk_capacity=1425.0, architecture='amd64'), - 'c6id.16xlarge': InstanceType(name='c6id.16xlarge', cores=64, memory=128.0, disks=2.0, disk_capacity=1900.0, architecture='amd64'), - 'c6id.24xlarge': InstanceType(name='c6id.24xlarge', cores=96, memory=192.0, disks=4.0, disk_capacity=1425.0, architecture='amd64'), - 'c6id.2xlarge': InstanceType(name='c6id.2xlarge', cores=8, memory=16.0, disks=1.0, disk_capacity=474.0, architecture='amd64'), - 'c6id.32xlarge': InstanceType(name='c6id.32xlarge', cores=128, memory=256.0, disks=4.0, disk_capacity=1900.0, architecture='amd64'), - 'c6id.4xlarge': InstanceType(name='c6id.4xlarge', cores=16, memory=32.0, disks=1.0, disk_capacity=950.0, architecture='amd64'), - 'c6id.8xlarge': InstanceType(name='c6id.8xlarge', cores=32, memory=64.0, disks=1.0, disk_capacity=1900.0, architecture='amd64'), - 'c6id.large': InstanceType(name='c6id.large', cores=2, memory=4.0, disks=1.0, disk_capacity=118.0, architecture='amd64'), - 'c6id.metal': InstanceType(name='c6id.metal', cores=128, memory=256.0, disks=4.0, disk_capacity=1900.0, architecture='amd64'), - 'c6id.xlarge': InstanceType(name='c6id.xlarge', cores=4, memory=8.0, disks=1.0, disk_capacity=237.0, architecture='amd64'), - 'c6in.12xlarge': InstanceType(name='c6in.12xlarge', cores=48, memory=96.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c6in.16xlarge': InstanceType(name='c6in.16xlarge', cores=64, memory=128.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c6in.24xlarge': InstanceType(name='c6in.24xlarge', cores=96, memory=192.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c6in.2xlarge': InstanceType(name='c6in.2xlarge', cores=8, memory=16.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c6in.32xlarge': InstanceType(name='c6in.32xlarge', cores=128, memory=256.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c6in.4xlarge': InstanceType(name='c6in.4xlarge', cores=16, memory=32.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c6in.8xlarge': InstanceType(name='c6in.8xlarge', cores=32, memory=64.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c6in.large': InstanceType(name='c6in.large', cores=2, memory=4.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c6in.metal': InstanceType(name='c6in.metal', cores=128, memory=256.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c6in.xlarge': InstanceType(name='c6in.xlarge', cores=4, memory=8.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c7a.12xlarge': InstanceType(name='c7a.12xlarge', cores=48, memory=96.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c7a.16xlarge': InstanceType(name='c7a.16xlarge', cores=64, memory=128.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c7a.24xlarge': InstanceType(name='c7a.24xlarge', cores=96, memory=192.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c7a.2xlarge': InstanceType(name='c7a.2xlarge', cores=8, memory=16.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c7a.32xlarge': InstanceType(name='c7a.32xlarge', cores=128, memory=256.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c7a.48xlarge': InstanceType(name='c7a.48xlarge', cores=192, memory=384.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c7a.4xlarge': InstanceType(name='c7a.4xlarge', cores=16, memory=32.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c7a.8xlarge': InstanceType(name='c7a.8xlarge', cores=32, memory=64.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c7a.large': InstanceType(name='c7a.large', cores=2, memory=4.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c7a.medium': InstanceType(name='c7a.medium', cores=1, memory=2.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c7a.metal-48xl': InstanceType(name='c7a.metal-48xl', cores=192, memory=384.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c7a.xlarge': InstanceType(name='c7a.xlarge', cores=4, memory=8.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c7g.12xlarge': InstanceType(name='c7g.12xlarge', cores=48, memory=96.0, disks=0, disk_capacity=0, architecture='arm64'), - 'c7g.16xlarge': InstanceType(name='c7g.16xlarge', cores=64, memory=128.0, disks=0, disk_capacity=0, architecture='arm64'), - 'c7g.2xlarge': InstanceType(name='c7g.2xlarge', cores=8, memory=16.0, disks=0, disk_capacity=0, architecture='arm64'), - 'c7g.4xlarge': InstanceType(name='c7g.4xlarge', cores=16, memory=32.0, disks=0, disk_capacity=0, architecture='arm64'), - 'c7g.8xlarge': InstanceType(name='c7g.8xlarge', cores=32, memory=64.0, disks=0, disk_capacity=0, architecture='arm64'), - 'c7g.large': InstanceType(name='c7g.large', cores=2, memory=4.0, disks=0, disk_capacity=0, architecture='arm64'), - 'c7g.medium': InstanceType(name='c7g.medium', cores=1, memory=2.0, disks=0, disk_capacity=0, architecture='arm64'), - 'c7g.metal': InstanceType(name='c7g.metal', cores=64, memory=128.0, disks=0, disk_capacity=0, architecture='arm64'), - 'c7g.xlarge': InstanceType(name='c7g.xlarge', cores=4, memory=8.0, disks=0, disk_capacity=0, architecture='arm64'), - 'c7gd.12xlarge': InstanceType(name='c7gd.12xlarge', cores=48, memory=96.0, disks=2.0, disk_capacity=1425.0, architecture='arm64'), - 'c7gd.16xlarge': InstanceType(name='c7gd.16xlarge', cores=64, memory=128.0, disks=2.0, disk_capacity=1900.0, architecture='arm64'), - 'c7gd.2xlarge': InstanceType(name='c7gd.2xlarge', cores=8, memory=16.0, disks=1.0, disk_capacity=475.0, architecture='arm64'), - 'c7gd.4xlarge': InstanceType(name='c7gd.4xlarge', cores=16, memory=32.0, disks=1.0, disk_capacity=950.0, architecture='arm64'), - 'c7gd.8xlarge': InstanceType(name='c7gd.8xlarge', cores=32, memory=64.0, disks=1.0, disk_capacity=1900.0, architecture='arm64'), - 'c7gd.large': InstanceType(name='c7gd.large', cores=2, memory=4.0, disks=1.0, disk_capacity=118.0, architecture='arm64'), - 'c7gd.medium': InstanceType(name='c7gd.medium', cores=1, memory=2.0, disks=1.0, disk_capacity=59.0, architecture='arm64'), - 'c7gd.xlarge': InstanceType(name='c7gd.xlarge', cores=4, memory=8.0, disks=1.0, disk_capacity=237.0, architecture='arm64'), - 'c7gn.12xlarge': InstanceType(name='c7gn.12xlarge', cores=48, memory=96.0, disks=0, disk_capacity=0, architecture='arm64'), - 'c7gn.16xlarge': InstanceType(name='c7gn.16xlarge', cores=64, memory=128.0, disks=0, disk_capacity=0, architecture='arm64'), - 'c7gn.2xlarge': InstanceType(name='c7gn.2xlarge', cores=8, memory=16.0, disks=0, disk_capacity=0, architecture='arm64'), - 'c7gn.4xlarge': InstanceType(name='c7gn.4xlarge', cores=16, memory=32.0, disks=0, disk_capacity=0, architecture='arm64'), - 'c7gn.8xlarge': InstanceType(name='c7gn.8xlarge', cores=32, memory=64.0, disks=0, disk_capacity=0, architecture='arm64'), - 'c7gn.large': InstanceType(name='c7gn.large', cores=2, memory=4.0, disks=0, disk_capacity=0, architecture='arm64'), - 'c7gn.medium': InstanceType(name='c7gn.medium', cores=1, memory=2.0, disks=0, disk_capacity=0, architecture='arm64'), - 'c7gn.xlarge': InstanceType(name='c7gn.xlarge', cores=4, memory=8.0, disks=0, disk_capacity=0, architecture='arm64'), - 'c7i.12xlarge': InstanceType(name='c7i.12xlarge', cores=48, memory=96.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c7i.16xlarge': InstanceType(name='c7i.16xlarge', cores=64, memory=128.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c7i.24xlarge': InstanceType(name='c7i.24xlarge', cores=96, memory=192.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c7i.2xlarge': InstanceType(name='c7i.2xlarge', cores=8, memory=16.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c7i.48xlarge': InstanceType(name='c7i.48xlarge', cores=192, memory=384.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c7i.4xlarge': InstanceType(name='c7i.4xlarge', cores=16, memory=32.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c7i.8xlarge': InstanceType(name='c7i.8xlarge', cores=32, memory=64.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c7i.large': InstanceType(name='c7i.large', cores=2, memory=4.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c7i.metal-24xl': InstanceType(name='c7i.metal-24xl', cores=96, memory=192.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c7i.metal-48xl': InstanceType(name='c7i.metal-48xl', cores=192, memory=384.0, disks=0, disk_capacity=0, architecture='amd64'), - 'c7i.xlarge': InstanceType(name='c7i.xlarge', cores=4, memory=8.0, disks=0, disk_capacity=0, architecture='amd64'), - 'cc2.8xlarge': InstanceType(name='cc2.8xlarge', cores=32, memory=60.5, disks=4.0, disk_capacity=840.0, architecture='amd64'), - 'cr1.8xlarge': InstanceType(name='cr1.8xlarge', cores=32, memory=244.0, disks=2.0, disk_capacity=120.0, architecture='amd64'), - 'd2.2xlarge': InstanceType(name='d2.2xlarge', cores=8, memory=61.0, disks=6.0, disk_capacity=2000.0, architecture='amd64'), - 'd2.4xlarge': InstanceType(name='d2.4xlarge', cores=16, memory=122.0, disks=12.0, disk_capacity=2000.0, architecture='amd64'), - 'd2.8xlarge': InstanceType(name='d2.8xlarge', cores=36, memory=244.0, disks=24.0, disk_capacity=2000.0, architecture='amd64'), - 'd2.xlarge': InstanceType(name='d2.xlarge', cores=4, memory=30.5, disks=3.0, disk_capacity=2000.0, architecture='amd64'), - 'd3.2xlarge': InstanceType(name='d3.2xlarge', cores=8, memory=64.0, disks=6.0, disk_capacity=2000.0, architecture='amd64'), - 'd3.4xlarge': InstanceType(name='d3.4xlarge', cores=16, memory=128.0, disks=12.0, disk_capacity=2000.0, architecture='amd64'), - 'd3.8xlarge': InstanceType(name='d3.8xlarge', cores=32, memory=256.0, disks=24.0, disk_capacity=2000.0, architecture='amd64'), - 'd3.xlarge': InstanceType(name='d3.xlarge', cores=4, memory=32.0, disks=3.0, disk_capacity=2000.0, architecture='amd64'), - 'd3en.12xlarge': InstanceType(name='d3en.12xlarge', cores=48, memory=192.0, disks=24.0, disk_capacity=14000.0, architecture='amd64'), - 'd3en.2xlarge': InstanceType(name='d3en.2xlarge', cores=8, memory=32.0, disks=4.0, disk_capacity=14000.0, architecture='amd64'), - 'd3en.4xlarge': InstanceType(name='d3en.4xlarge', cores=16, memory=64.0, disks=8.0, disk_capacity=14000.0, architecture='amd64'), - 'd3en.6xlarge': InstanceType(name='d3en.6xlarge', cores=24, memory=96.0, disks=12.0, disk_capacity=14000.0, architecture='amd64'), - 'd3en.8xlarge': InstanceType(name='d3en.8xlarge', cores=32, memory=128.0, disks=16.0, disk_capacity=14000.0, architecture='amd64'), - 'd3en.xlarge': InstanceType(name='d3en.xlarge', cores=4, memory=16.0, disks=2.0, disk_capacity=14000.0, architecture='amd64'), - 'dl1.24xlarge': InstanceType(name='dl1.24xlarge', cores=96, memory=768.0, disks=4.0, disk_capacity=1000.0, architecture='amd64'), - 'dl2q.24xlarge': InstanceType(name='dl2q.24xlarge', cores=96, memory=768.0, disks=0, disk_capacity=0, architecture='amd64'), - 'f1.16xlarge': InstanceType(name='f1.16xlarge', cores=64, memory=976.0, disks=4.0, disk_capacity=940.0, architecture='amd64'), - 'f1.2xlarge': InstanceType(name='f1.2xlarge', cores=8, memory=122.0, disks=1.0, disk_capacity=470.0, architecture='amd64'), - 'f1.4xlarge': InstanceType(name='f1.4xlarge', cores=16, memory=244.0, disks=1.0, disk_capacity=940.0, architecture='amd64'), - 'g2.2xlarge': InstanceType(name='g2.2xlarge', cores=8, memory=15.0, disks=1.0, disk_capacity=60.0, architecture='amd64'), - 'g2.8xlarge': InstanceType(name='g2.8xlarge', cores=32, memory=60.0, disks=2.0, disk_capacity=120.0, architecture='amd64'), - 'g3.16xlarge': InstanceType(name='g3.16xlarge', cores=64, memory=488.0, disks=0, disk_capacity=0, architecture='amd64'), - 'g3.4xlarge': InstanceType(name='g3.4xlarge', cores=16, memory=122.0, disks=0, disk_capacity=0, architecture='amd64'), - 'g3.8xlarge': InstanceType(name='g3.8xlarge', cores=32, memory=244.0, disks=0, disk_capacity=0, architecture='amd64'), - 'g3s.xlarge': InstanceType(name='g3s.xlarge', cores=4, memory=30.5, disks=0, disk_capacity=0, architecture='amd64'), - 'g4ad.16xlarge': InstanceType(name='g4ad.16xlarge', cores=64, memory=256.0, disks=1, disk_capacity=2400.0, architecture='amd64'), - 'g4ad.2xlarge': InstanceType(name='g4ad.2xlarge', cores=8, memory=32.0, disks=1, disk_capacity=300.0, architecture='amd64'), - 'g4ad.4xlarge': InstanceType(name='g4ad.4xlarge', cores=16, memory=64.0, disks=1, disk_capacity=600.0, architecture='amd64'), - 'g4ad.8xlarge': InstanceType(name='g4ad.8xlarge', cores=32, memory=128.0, disks=1, disk_capacity=1200.0, architecture='amd64'), - 'g4ad.xlarge': InstanceType(name='g4ad.xlarge', cores=4, memory=16.0, disks=1, disk_capacity=150.0, architecture='amd64'), - 'g4dn.12xlarge': InstanceType(name='g4dn.12xlarge', cores=48, memory=192.0, disks=1, disk_capacity=900.0, architecture='amd64'), - 'g4dn.16xlarge': InstanceType(name='g4dn.16xlarge', cores=64, memory=256.0, disks=1, disk_capacity=900.0, architecture='amd64'), - 'g4dn.2xlarge': InstanceType(name='g4dn.2xlarge', cores=8, memory=32.0, disks=1, disk_capacity=225.0, architecture='amd64'), - 'g4dn.4xlarge': InstanceType(name='g4dn.4xlarge', cores=16, memory=64.0, disks=1, disk_capacity=225.0, architecture='amd64'), - 'g4dn.8xlarge': InstanceType(name='g4dn.8xlarge', cores=32, memory=128.0, disks=1, disk_capacity=900.0, architecture='amd64'), - 'g4dn.metal': InstanceType(name='g4dn.metal', cores=96, memory=384.0, disks=2.0, disk_capacity=900.0, architecture='amd64'), - 'g4dn.xlarge': InstanceType(name='g4dn.xlarge', cores=4, memory=16.0, disks=1, disk_capacity=125.0, architecture='amd64'), - 'g5.12xlarge': InstanceType(name='g5.12xlarge', cores=48, memory=192.0, disks=1.0, disk_capacity=3800.0, architecture='amd64'), - 'g5.16xlarge': InstanceType(name='g5.16xlarge', cores=64, memory=256.0, disks=1.0, disk_capacity=1900.0, architecture='amd64'), - 'g5.24xlarge': InstanceType(name='g5.24xlarge', cores=96, memory=384.0, disks=1.0, disk_capacity=3800.0, architecture='amd64'), - 'g5.2xlarge': InstanceType(name='g5.2xlarge', cores=8, memory=32.0, disks=1.0, disk_capacity=450.0, architecture='amd64'), - 'g5.48xlarge': InstanceType(name='g5.48xlarge', cores=192, memory=768.0, disks=2.0, disk_capacity=3800.0, architecture='amd64'), - 'g5.4xlarge': InstanceType(name='g5.4xlarge', cores=16, memory=64.0, disks=1.0, disk_capacity=600.0, architecture='amd64'), - 'g5.8xlarge': InstanceType(name='g5.8xlarge', cores=32, memory=128.0, disks=1.0, disk_capacity=900.0, architecture='amd64'), - 'g5.xlarge': InstanceType(name='g5.xlarge', cores=4, memory=16.0, disks=1.0, disk_capacity=250.0, architecture='amd64'), - 'g5g.16xlarge': InstanceType(name='g5g.16xlarge', cores=64, memory=128.0, disks=0, disk_capacity=0, architecture='arm64'), - 'g5g.2xlarge': InstanceType(name='g5g.2xlarge', cores=8, memory=16.0, disks=0, disk_capacity=0, architecture='arm64'), - 'g5g.4xlarge': InstanceType(name='g5g.4xlarge', cores=16, memory=32.0, disks=0, disk_capacity=0, architecture='arm64'), - 'g5g.8xlarge': InstanceType(name='g5g.8xlarge', cores=32, memory=64.0, disks=0, disk_capacity=0, architecture='arm64'), - 'g5g.metal': InstanceType(name='g5g.metal', cores=64, memory=128.0, disks=0, disk_capacity=0, architecture='arm64'), - 'g5g.xlarge': InstanceType(name='g5g.xlarge', cores=4, memory=8.0, disks=0, disk_capacity=0, architecture='arm64'), - 'h1.16xlarge': InstanceType(name='h1.16xlarge', cores=64, memory=256.0, disks=8.0, disk_capacity=2000.0, architecture='amd64'), - 'h1.2xlarge': InstanceType(name='h1.2xlarge', cores=8, memory=32.0, disks=1.0, disk_capacity=2000.0, architecture='amd64'), - 'h1.4xlarge': InstanceType(name='h1.4xlarge', cores=16, memory=64.0, disks=2.0, disk_capacity=2000.0, architecture='amd64'), - 'h1.8xlarge': InstanceType(name='h1.8xlarge', cores=32, memory=128.0, disks=4.0, disk_capacity=2000.0, architecture='amd64'), - 'hpc6a.48xlarge': InstanceType(name='hpc6a.48xlarge', cores=96, memory=384.0, disks=0, disk_capacity=0, architecture='amd64'), - 'hpc6id.32xlarge': InstanceType(name='hpc6id.32xlarge', cores=64, memory=1024.0, disks=4.0, disk_capacity=3800.0, architecture='amd64'), - 'hpc7a.12xlarge': InstanceType(name='hpc7a.12xlarge', cores=24, memory=768.0, disks=0, disk_capacity=0, architecture='amd64'), - 'hpc7a.24xlarge': InstanceType(name='hpc7a.24xlarge', cores=48, memory=768.0, disks=0, disk_capacity=0, architecture='amd64'), - 'hpc7a.48xlarge': InstanceType(name='hpc7a.48xlarge', cores=96, memory=768.0, disks=0, disk_capacity=0, architecture='amd64'), - 'hpc7a.96xlarge': InstanceType(name='hpc7a.96xlarge', cores=192, memory=768.0, disks=0, disk_capacity=0, architecture='amd64'), - 'hpc7g.16xlarge': InstanceType(name='hpc7g.16xlarge', cores=64, memory=128.0, disks=0, disk_capacity=0, architecture='arm64'), - 'hpc7g.4xlarge': InstanceType(name='hpc7g.4xlarge', cores=16, memory=128.0, disks=0, disk_capacity=0, architecture='arm64'), - 'hpc7g.8xlarge': InstanceType(name='hpc7g.8xlarge', cores=32, memory=128.0, disks=0, disk_capacity=0, architecture='arm64'), - 'hs1.8xlarge': InstanceType(name='hs1.8xlarge', cores=16, memory=117.0, disks=24.0, disk_capacity=2000.0, architecture='amd64'), - 'i2.2xlarge': InstanceType(name='i2.2xlarge', cores=8, memory=61.0, disks=2.0, disk_capacity=800.0, architecture='amd64'), - 'i2.4xlarge': InstanceType(name='i2.4xlarge', cores=16, memory=122.0, disks=4.0, disk_capacity=800.0, architecture='amd64'), - 'i2.8xlarge': InstanceType(name='i2.8xlarge', cores=32, memory=244.0, disks=8.0, disk_capacity=800.0, architecture='amd64'), - 'i2.xlarge': InstanceType(name='i2.xlarge', cores=4, memory=30.5, disks=1.0, disk_capacity=800.0, architecture='amd64'), - 'i3.16xlarge': InstanceType(name='i3.16xlarge', cores=64, memory=488.0, disks=8.0, disk_capacity=1900.0, architecture='amd64'), - 'i3.2xlarge': InstanceType(name='i3.2xlarge', cores=8, memory=61.0, disks=1.0, disk_capacity=1900.0, architecture='amd64'), - 'i3.4xlarge': InstanceType(name='i3.4xlarge', cores=16, memory=122.0, disks=2.0, disk_capacity=1900.0, architecture='amd64'), - 'i3.8xlarge': InstanceType(name='i3.8xlarge', cores=32, memory=244.0, disks=4.0, disk_capacity=1900.0, architecture='amd64'), - 'i3.large': InstanceType(name='i3.large', cores=2, memory=15.25, disks=1.0, disk_capacity=475.0, architecture='amd64'), - 'i3.metal': InstanceType(name='i3.metal', cores=72, memory=512.0, disks=8.0, disk_capacity=1900.0, architecture='amd64'), - 'i3.xlarge': InstanceType(name='i3.xlarge', cores=4, memory=30.5, disks=1.0, disk_capacity=950.0, architecture='amd64'), - 'i3en.12xlarge': InstanceType(name='i3en.12xlarge', cores=48, memory=384.0, disks=4.0, disk_capacity=7500.0, architecture='amd64'), - 'i3en.24xlarge': InstanceType(name='i3en.24xlarge', cores=96, memory=768.0, disks=8.0, disk_capacity=7500.0, architecture='amd64'), - 'i3en.2xlarge': InstanceType(name='i3en.2xlarge', cores=8, memory=64.0, disks=2.0, disk_capacity=2500.0, architecture='amd64'), - 'i3en.3xlarge': InstanceType(name='i3en.3xlarge', cores=12, memory=96.0, disks=1.0, disk_capacity=7500.0, architecture='amd64'), - 'i3en.6xlarge': InstanceType(name='i3en.6xlarge', cores=24, memory=192.0, disks=2.0, disk_capacity=7500.0, architecture='amd64'), - 'i3en.large': InstanceType(name='i3en.large', cores=2, memory=16.0, disks=1.0, disk_capacity=1250.0, architecture='amd64'), - 'i3en.metal': InstanceType(name='i3en.metal', cores=96, memory=768.0, disks=8.0, disk_capacity=7500.0, architecture='amd64'), - 'i3en.xlarge': InstanceType(name='i3en.xlarge', cores=4, memory=32.0, disks=1.0, disk_capacity=2500.0, architecture='amd64'), - 'i3p.16xlarge': InstanceType(name='i3p.16xlarge', cores=64, memory=488.0, disks=0, disk_capacity=0, architecture='amd64'), - 'i4g.16xlarge': InstanceType(name='i4g.16xlarge', cores=64, memory=512.0, disks=4.0, disk_capacity=3750.0, architecture='arm64'), - 'i4g.2xlarge': InstanceType(name='i4g.2xlarge', cores=8, memory=64.0, disks=1.0, disk_capacity=1875.0, architecture='arm64'), - 'i4g.4xlarge': InstanceType(name='i4g.4xlarge', cores=16, memory=128.0, disks=1.0, disk_capacity=3750.0, architecture='arm64'), - 'i4g.8xlarge': InstanceType(name='i4g.8xlarge', cores=32, memory=256.0, disks=2.0, disk_capacity=3750.0, architecture='arm64'), - 'i4g.large': InstanceType(name='i4g.large', cores=2, memory=16.0, disks=1.0, disk_capacity=468.0, architecture='arm64'), - 'i4g.xlarge': InstanceType(name='i4g.xlarge', cores=4, memory=32.0, disks=1.0, disk_capacity=937.0, architecture='arm64'), - 'i4i.12xlarge': InstanceType(name='i4i.12xlarge', cores=48, memory=384.0, disks=3.0, disk_capacity=3750.0, architecture='amd64'), - 'i4i.16xlarge': InstanceType(name='i4i.16xlarge', cores=64, memory=512.0, disks=4.0, disk_capacity=3750.0, architecture='amd64'), - 'i4i.24xlarge': InstanceType(name='i4i.24xlarge', cores=96, memory=768.0, disks=6.0, disk_capacity=3750.0, architecture='amd64'), - 'i4i.2xlarge': InstanceType(name='i4i.2xlarge', cores=8, memory=64.0, disks=1.0, disk_capacity=1875.0, architecture='amd64'), - 'i4i.32xlarge': InstanceType(name='i4i.32xlarge', cores=128, memory=1024.0, disks=8.0, disk_capacity=3750.0, architecture='amd64'), - 'i4i.4xlarge': InstanceType(name='i4i.4xlarge', cores=16, memory=128.0, disks=1.0, disk_capacity=3750.0, architecture='amd64'), - 'i4i.8xlarge': InstanceType(name='i4i.8xlarge', cores=32, memory=256.0, disks=2.0, disk_capacity=3750.0, architecture='amd64'), - 'i4i.large': InstanceType(name='i4i.large', cores=2, memory=16.0, disks=1.0, disk_capacity=468.0, architecture='amd64'), - 'i4i.metal': InstanceType(name='i4i.metal', cores=128, memory=1024.0, disks=8.0, disk_capacity=3750.0, architecture='amd64'), - 'i4i.xlarge': InstanceType(name='i4i.xlarge', cores=4, memory=32.0, disks=1.0, disk_capacity=937.0, architecture='amd64'), - 'im4gn.16xlarge': InstanceType(name='im4gn.16xlarge', cores=64, memory=256.0, disks=4.0, disk_capacity=7500.0, architecture='arm64'), - 'im4gn.2xlarge': InstanceType(name='im4gn.2xlarge', cores=8, memory=32.0, disks=1.0, disk_capacity=3750.0, architecture='arm64'), - 'im4gn.4xlarge': InstanceType(name='im4gn.4xlarge', cores=16, memory=64.0, disks=1.0, disk_capacity=7500.0, architecture='arm64'), - 'im4gn.8xlarge': InstanceType(name='im4gn.8xlarge', cores=32, memory=128.0, disks=2.0, disk_capacity=7500.0, architecture='arm64'), - 'im4gn.large': InstanceType(name='im4gn.large', cores=2, memory=8.0, disks=1.0, disk_capacity=937.0, architecture='arm64'), - 'im4gn.xlarge': InstanceType(name='im4gn.xlarge', cores=4, memory=16.0, disks=1.0, disk_capacity=1875.0, architecture='arm64'), - 'inf1.24xlarge': InstanceType(name='inf1.24xlarge', cores=96, memory=192.0, disks=0, disk_capacity=0, architecture='amd64'), - 'inf1.2xlarge': InstanceType(name='inf1.2xlarge', cores=8, memory=16.0, disks=0, disk_capacity=0, architecture='amd64'), - 'inf1.6xlarge': InstanceType(name='inf1.6xlarge', cores=24, memory=48.0, disks=0, disk_capacity=0, architecture='amd64'), - 'inf1.xlarge': InstanceType(name='inf1.xlarge', cores=4, memory=8.0, disks=0, disk_capacity=0, architecture='amd64'), - 'inf2.24xlarge': InstanceType(name='inf2.24xlarge', cores=96, memory=384.0, disks=0, disk_capacity=0, architecture='amd64'), - 'inf2.48xlarge': InstanceType(name='inf2.48xlarge', cores=192, memory=768.0, disks=0, disk_capacity=0, architecture='amd64'), - 'inf2.8xlarge': InstanceType(name='inf2.8xlarge', cores=32, memory=128.0, disks=0, disk_capacity=0, architecture='amd64'), - 'inf2.xlarge': InstanceType(name='inf2.xlarge', cores=4, memory=16.0, disks=0, disk_capacity=0, architecture='amd64'), - 'is4gen.2xlarge': InstanceType(name='is4gen.2xlarge', cores=8, memory=48.0, disks=1.0, disk_capacity=7500.0, architecture='arm64'), - 'is4gen.4xlarge': InstanceType(name='is4gen.4xlarge', cores=16, memory=96.0, disks=2.0, disk_capacity=7500.0, architecture='arm64'), - 'is4gen.8xlarge': InstanceType(name='is4gen.8xlarge', cores=32, memory=192.0, disks=4.0, disk_capacity=7500.0, architecture='arm64'), - 'is4gen.large': InstanceType(name='is4gen.large', cores=2, memory=12.0, disks=1.0, disk_capacity=1875.0, architecture='arm64'), - 'is4gen.medium': InstanceType(name='is4gen.medium', cores=1, memory=6.0, disks=1.0, disk_capacity=937.0, architecture='arm64'), - 'is4gen.xlarge': InstanceType(name='is4gen.xlarge', cores=4, memory=24.0, disks=1.0, disk_capacity=3750.0, architecture='arm64'), - 'm1.large': InstanceType(name='m1.large', cores=2, memory=7.5, disks=2.0, disk_capacity=420.0, architecture='amd64'), - 'm1.medium': InstanceType(name='m1.medium', cores=1, memory=3.75, disks=1.0, disk_capacity=410.0, architecture='amd64'), - 'm1.xlarge': InstanceType(name='m1.xlarge', cores=4, memory=15.0, disks=4.0, disk_capacity=420.0, architecture='amd64'), - 'm2.2xlarge': InstanceType(name='m2.2xlarge', cores=4, memory=34.2, disks=1.0, disk_capacity=850.0, architecture='amd64'), - 'm2.4xlarge': InstanceType(name='m2.4xlarge', cores=8, memory=68.4, disks=2.0, disk_capacity=840.0, architecture='amd64'), - 'm2.xlarge': InstanceType(name='m2.xlarge', cores=2, memory=17.1, disks=1.0, disk_capacity=420.0, architecture='amd64'), - 'm3.2xlarge': InstanceType(name='m3.2xlarge', cores=8, memory=30.0, disks=2.0, disk_capacity=80.0, architecture='amd64'), - 'm3.large': InstanceType(name='m3.large', cores=2, memory=7.5, disks=1.0, disk_capacity=32.0, architecture='amd64'), - 'm3.medium': InstanceType(name='m3.medium', cores=1, memory=3.75, disks=1.0, disk_capacity=4.0, architecture='amd64'), - 'm3.xlarge': InstanceType(name='m3.xlarge', cores=4, memory=15.0, disks=2.0, disk_capacity=40.0, architecture='amd64'), - 'm4.10xlarge': InstanceType(name='m4.10xlarge', cores=40, memory=160.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm4.16xlarge': InstanceType(name='m4.16xlarge', cores=64, memory=256.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm4.2xlarge': InstanceType(name='m4.2xlarge', cores=8, memory=32.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm4.4xlarge': InstanceType(name='m4.4xlarge', cores=16, memory=64.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm4.large': InstanceType(name='m4.large', cores=2, memory=8.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm4.xlarge': InstanceType(name='m4.xlarge', cores=4, memory=16.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm5.12xlarge': InstanceType(name='m5.12xlarge', cores=48, memory=192.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm5.16xlarge': InstanceType(name='m5.16xlarge', cores=64, memory=256.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm5.24xlarge': InstanceType(name='m5.24xlarge', cores=96, memory=384.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm5.2xlarge': InstanceType(name='m5.2xlarge', cores=8, memory=32.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm5.4xlarge': InstanceType(name='m5.4xlarge', cores=16, memory=64.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm5.8xlarge': InstanceType(name='m5.8xlarge', cores=32, memory=128.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm5.large': InstanceType(name='m5.large', cores=2, memory=8.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm5.metal': InstanceType(name='m5.metal', cores=96, memory=384.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm5.xlarge': InstanceType(name='m5.xlarge', cores=4, memory=16.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm5a.12xlarge': InstanceType(name='m5a.12xlarge', cores=48, memory=192.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm5a.16xlarge': InstanceType(name='m5a.16xlarge', cores=64, memory=256.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm5a.24xlarge': InstanceType(name='m5a.24xlarge', cores=96, memory=384.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm5a.2xlarge': InstanceType(name='m5a.2xlarge', cores=8, memory=32.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm5a.4xlarge': InstanceType(name='m5a.4xlarge', cores=16, memory=64.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm5a.8xlarge': InstanceType(name='m5a.8xlarge', cores=32, memory=128.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm5a.large': InstanceType(name='m5a.large', cores=2, memory=8.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm5a.xlarge': InstanceType(name='m5a.xlarge', cores=4, memory=16.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm5ad.12xlarge': InstanceType(name='m5ad.12xlarge', cores=48, memory=192.0, disks=2.0, disk_capacity=900.0, architecture='amd64'), - 'm5ad.16xlarge': InstanceType(name='m5ad.16xlarge', cores=64, memory=256.0, disks=4.0, disk_capacity=600.0, architecture='amd64'), - 'm5ad.24xlarge': InstanceType(name='m5ad.24xlarge', cores=96, memory=384.0, disks=4.0, disk_capacity=900.0, architecture='amd64'), - 'm5ad.2xlarge': InstanceType(name='m5ad.2xlarge', cores=8, memory=32.0, disks=1.0, disk_capacity=300.0, architecture='amd64'), - 'm5ad.4xlarge': InstanceType(name='m5ad.4xlarge', cores=16, memory=64.0, disks=2.0, disk_capacity=300.0, architecture='amd64'), - 'm5ad.8xlarge': InstanceType(name='m5ad.8xlarge', cores=32, memory=128.0, disks=2.0, disk_capacity=600.0, architecture='amd64'), - 'm5ad.large': InstanceType(name='m5ad.large', cores=2, memory=8.0, disks=1.0, disk_capacity=75.0, architecture='amd64'), - 'm5ad.xlarge': InstanceType(name='m5ad.xlarge', cores=4, memory=16.0, disks=1.0, disk_capacity=150.0, architecture='amd64'), - 'm5d.12xlarge': InstanceType(name='m5d.12xlarge', cores=48, memory=192.0, disks=2.0, disk_capacity=900.0, architecture='amd64'), - 'm5d.16xlarge': InstanceType(name='m5d.16xlarge', cores=64, memory=256.0, disks=4.0, disk_capacity=600.0, architecture='amd64'), - 'm5d.24xlarge': InstanceType(name='m5d.24xlarge', cores=96, memory=384.0, disks=4.0, disk_capacity=900.0, architecture='amd64'), - 'm5d.2xlarge': InstanceType(name='m5d.2xlarge', cores=8, memory=32.0, disks=1.0, disk_capacity=300.0, architecture='amd64'), - 'm5d.4xlarge': InstanceType(name='m5d.4xlarge', cores=16, memory=64.0, disks=2.0, disk_capacity=300.0, architecture='amd64'), - 'm5d.8xlarge': InstanceType(name='m5d.8xlarge', cores=32, memory=128.0, disks=2.0, disk_capacity=600.0, architecture='amd64'), - 'm5d.large': InstanceType(name='m5d.large', cores=2, memory=8.0, disks=1.0, disk_capacity=75.0, architecture='amd64'), - 'm5d.metal': InstanceType(name='m5d.metal', cores=96, memory=384.0, disks=4.0, disk_capacity=900.0, architecture='amd64'), - 'm5d.xlarge': InstanceType(name='m5d.xlarge', cores=4, memory=16.0, disks=1.0, disk_capacity=150.0, architecture='amd64'), - 'm5dn.12xlarge': InstanceType(name='m5dn.12xlarge', cores=48, memory=192.0, disks=2.0, disk_capacity=900.0, architecture='amd64'), - 'm5dn.16xlarge': InstanceType(name='m5dn.16xlarge', cores=64, memory=256.0, disks=4.0, disk_capacity=600.0, architecture='amd64'), - 'm5dn.24xlarge': InstanceType(name='m5dn.24xlarge', cores=96, memory=384.0, disks=4.0, disk_capacity=900.0, architecture='amd64'), - 'm5dn.2xlarge': InstanceType(name='m5dn.2xlarge', cores=8, memory=32.0, disks=1.0, disk_capacity=300.0, architecture='amd64'), - 'm5dn.4xlarge': InstanceType(name='m5dn.4xlarge', cores=16, memory=64.0, disks=2.0, disk_capacity=300.0, architecture='amd64'), - 'm5dn.8xlarge': InstanceType(name='m5dn.8xlarge', cores=32, memory=128.0, disks=2.0, disk_capacity=600.0, architecture='amd64'), - 'm5dn.large': InstanceType(name='m5dn.large', cores=2, memory=8.0, disks=1.0, disk_capacity=75.0, architecture='amd64'), - 'm5dn.metal': InstanceType(name='m5dn.metal', cores=96, memory=384.0, disks=4.0, disk_capacity=900.0, architecture='amd64'), - 'm5dn.xlarge': InstanceType(name='m5dn.xlarge', cores=4, memory=16.0, disks=1.0, disk_capacity=150.0, architecture='amd64'), - 'm5n.12xlarge': InstanceType(name='m5n.12xlarge', cores=48, memory=192.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm5n.16xlarge': InstanceType(name='m5n.16xlarge', cores=64, memory=256.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm5n.24xlarge': InstanceType(name='m5n.24xlarge', cores=96, memory=384.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm5n.2xlarge': InstanceType(name='m5n.2xlarge', cores=8, memory=32.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm5n.4xlarge': InstanceType(name='m5n.4xlarge', cores=16, memory=64.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm5n.8xlarge': InstanceType(name='m5n.8xlarge', cores=32, memory=128.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm5n.large': InstanceType(name='m5n.large', cores=2, memory=8.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm5n.metal': InstanceType(name='m5n.metal', cores=96, memory=384.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm5n.xlarge': InstanceType(name='m5n.xlarge', cores=4, memory=16.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm5zn.12xlarge': InstanceType(name='m5zn.12xlarge', cores=48, memory=192.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm5zn.2xlarge': InstanceType(name='m5zn.2xlarge', cores=8, memory=32.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm5zn.3xlarge': InstanceType(name='m5zn.3xlarge', cores=12, memory=48.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm5zn.6xlarge': InstanceType(name='m5zn.6xlarge', cores=24, memory=96.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm5zn.large': InstanceType(name='m5zn.large', cores=2, memory=8.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm5zn.metal': InstanceType(name='m5zn.metal', cores=48, memory=192.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm5zn.xlarge': InstanceType(name='m5zn.xlarge', cores=4, memory=16.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm6a.12xlarge': InstanceType(name='m6a.12xlarge', cores=48, memory=192.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm6a.16xlarge': InstanceType(name='m6a.16xlarge', cores=64, memory=256.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm6a.24xlarge': InstanceType(name='m6a.24xlarge', cores=96, memory=384.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm6a.2xlarge': InstanceType(name='m6a.2xlarge', cores=8, memory=32.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm6a.32xlarge': InstanceType(name='m6a.32xlarge', cores=128, memory=512.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm6a.48xlarge': InstanceType(name='m6a.48xlarge', cores=192, memory=768.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm6a.4xlarge': InstanceType(name='m6a.4xlarge', cores=16, memory=64.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm6a.8xlarge': InstanceType(name='m6a.8xlarge', cores=32, memory=128.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm6a.large': InstanceType(name='m6a.large', cores=2, memory=8.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm6a.metal': InstanceType(name='m6a.metal', cores=192, memory=768.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm6a.xlarge': InstanceType(name='m6a.xlarge', cores=4, memory=16.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm6g.12xlarge': InstanceType(name='m6g.12xlarge', cores=48, memory=192.0, disks=0, disk_capacity=0, architecture='arm64'), - 'm6g.16xlarge': InstanceType(name='m6g.16xlarge', cores=64, memory=256.0, disks=0, disk_capacity=0, architecture='arm64'), - 'm6g.2xlarge': InstanceType(name='m6g.2xlarge', cores=8, memory=32.0, disks=0, disk_capacity=0, architecture='arm64'), - 'm6g.4xlarge': InstanceType(name='m6g.4xlarge', cores=16, memory=64.0, disks=0, disk_capacity=0, architecture='arm64'), - 'm6g.8xlarge': InstanceType(name='m6g.8xlarge', cores=32, memory=128.0, disks=0, disk_capacity=0, architecture='arm64'), - 'm6g.large': InstanceType(name='m6g.large', cores=2, memory=8.0, disks=0, disk_capacity=0, architecture='arm64'), - 'm6g.medium': InstanceType(name='m6g.medium', cores=1, memory=4.0, disks=0, disk_capacity=0, architecture='arm64'), - 'm6g.metal': InstanceType(name='m6g.metal', cores=64, memory=256.0, disks=0, disk_capacity=0, architecture='arm64'), - 'm6g.xlarge': InstanceType(name='m6g.xlarge', cores=4, memory=16.0, disks=0, disk_capacity=0, architecture='arm64'), - 'm6gd.12xlarge': InstanceType(name='m6gd.12xlarge', cores=48, memory=192.0, disks=2.0, disk_capacity=1425.0, architecture='arm64'), - 'm6gd.16xlarge': InstanceType(name='m6gd.16xlarge', cores=64, memory=256.0, disks=2.0, disk_capacity=1900.0, architecture='arm64'), - 'm6gd.2xlarge': InstanceType(name='m6gd.2xlarge', cores=8, memory=32.0, disks=1.0, disk_capacity=475.0, architecture='arm64'), - 'm6gd.4xlarge': InstanceType(name='m6gd.4xlarge', cores=16, memory=64.0, disks=1.0, disk_capacity=950.0, architecture='arm64'), - 'm6gd.8xlarge': InstanceType(name='m6gd.8xlarge', cores=32, memory=128.0, disks=1.0, disk_capacity=1900.0, architecture='arm64'), - 'm6gd.large': InstanceType(name='m6gd.large', cores=2, memory=8.0, disks=1.0, disk_capacity=118.0, architecture='arm64'), - 'm6gd.medium': InstanceType(name='m6gd.medium', cores=1, memory=4.0, disks=1.0, disk_capacity=59.0, architecture='arm64'), - 'm6gd.metal': InstanceType(name='m6gd.metal', cores=64, memory=256.0, disks=2.0, disk_capacity=1900.0, architecture='arm64'), - 'm6gd.xlarge': InstanceType(name='m6gd.xlarge', cores=4, memory=16.0, disks=1.0, disk_capacity=237.0, architecture='arm64'), - 'm6i.12xlarge': InstanceType(name='m6i.12xlarge', cores=48, memory=192.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm6i.16xlarge': InstanceType(name='m6i.16xlarge', cores=64, memory=256.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm6i.24xlarge': InstanceType(name='m6i.24xlarge', cores=96, memory=384.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm6i.2xlarge': InstanceType(name='m6i.2xlarge', cores=8, memory=32.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm6i.32xlarge': InstanceType(name='m6i.32xlarge', cores=128, memory=512.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm6i.4xlarge': InstanceType(name='m6i.4xlarge', cores=16, memory=64.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm6i.8xlarge': InstanceType(name='m6i.8xlarge', cores=32, memory=128.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm6i.large': InstanceType(name='m6i.large', cores=2, memory=8.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm6i.metal': InstanceType(name='m6i.metal', cores=128, memory=512.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm6i.xlarge': InstanceType(name='m6i.xlarge', cores=4, memory=16.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm6id.12xlarge': InstanceType(name='m6id.12xlarge', cores=48, memory=192.0, disks=2.0, disk_capacity=1425.0, architecture='amd64'), - 'm6id.16xlarge': InstanceType(name='m6id.16xlarge', cores=64, memory=256.0, disks=2.0, disk_capacity=1900.0, architecture='amd64'), - 'm6id.24xlarge': InstanceType(name='m6id.24xlarge', cores=96, memory=384.0, disks=4.0, disk_capacity=1425.0, architecture='amd64'), - 'm6id.2xlarge': InstanceType(name='m6id.2xlarge', cores=8, memory=32.0, disks=1.0, disk_capacity=474.0, architecture='amd64'), - 'm6id.32xlarge': InstanceType(name='m6id.32xlarge', cores=128, memory=512.0, disks=4.0, disk_capacity=1900.0, architecture='amd64'), - 'm6id.4xlarge': InstanceType(name='m6id.4xlarge', cores=16, memory=64.0, disks=1.0, disk_capacity=950.0, architecture='amd64'), - 'm6id.8xlarge': InstanceType(name='m6id.8xlarge', cores=32, memory=128.0, disks=1.0, disk_capacity=1900.0, architecture='amd64'), - 'm6id.large': InstanceType(name='m6id.large', cores=2, memory=8.0, disks=1.0, disk_capacity=118.0, architecture='amd64'), - 'm6id.metal': InstanceType(name='m6id.metal', cores=128, memory=512.0, disks=4.0, disk_capacity=1900.0, architecture='amd64'), - 'm6id.xlarge': InstanceType(name='m6id.xlarge', cores=4, memory=16.0, disks=1.0, disk_capacity=237.0, architecture='amd64'), - 'm6idn.12xlarge': InstanceType(name='m6idn.12xlarge', cores=48, memory=192.0, disks=2.0, disk_capacity=1425.0, architecture='amd64'), - 'm6idn.16xlarge': InstanceType(name='m6idn.16xlarge', cores=64, memory=256.0, disks=2.0, disk_capacity=1900.0, architecture='amd64'), - 'm6idn.24xlarge': InstanceType(name='m6idn.24xlarge', cores=96, memory=384.0, disks=4.0, disk_capacity=1425.0, architecture='amd64'), - 'm6idn.2xlarge': InstanceType(name='m6idn.2xlarge', cores=8, memory=32.0, disks=1.0, disk_capacity=474.0, architecture='amd64'), - 'm6idn.32xlarge': InstanceType(name='m6idn.32xlarge', cores=128, memory=512.0, disks=4.0, disk_capacity=1900.0, architecture='amd64'), - 'm6idn.4xlarge': InstanceType(name='m6idn.4xlarge', cores=16, memory=64.0, disks=1.0, disk_capacity=950.0, architecture='amd64'), - 'm6idn.8xlarge': InstanceType(name='m6idn.8xlarge', cores=32, memory=128.0, disks=1.0, disk_capacity=1900.0, architecture='amd64'), - 'm6idn.large': InstanceType(name='m6idn.large', cores=2, memory=8.0, disks=1.0, disk_capacity=118.0, architecture='amd64'), - 'm6idn.metal': InstanceType(name='m6idn.metal', cores=128, memory=512.0, disks=4.0, disk_capacity=1900.0, architecture='amd64'), - 'm6idn.xlarge': InstanceType(name='m6idn.xlarge', cores=4, memory=16.0, disks=1.0, disk_capacity=237.0, architecture='amd64'), - 'm6in.12xlarge': InstanceType(name='m6in.12xlarge', cores=48, memory=192.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm6in.16xlarge': InstanceType(name='m6in.16xlarge', cores=64, memory=256.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm6in.24xlarge': InstanceType(name='m6in.24xlarge', cores=96, memory=384.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm6in.2xlarge': InstanceType(name='m6in.2xlarge', cores=8, memory=32.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm6in.32xlarge': InstanceType(name='m6in.32xlarge', cores=128, memory=512.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm6in.4xlarge': InstanceType(name='m6in.4xlarge', cores=16, memory=64.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm6in.8xlarge': InstanceType(name='m6in.8xlarge', cores=32, memory=128.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm6in.large': InstanceType(name='m6in.large', cores=2, memory=8.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm6in.metal': InstanceType(name='m6in.metal', cores=128, memory=512.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm6in.xlarge': InstanceType(name='m6in.xlarge', cores=4, memory=16.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm7a.12xlarge': InstanceType(name='m7a.12xlarge', cores=48, memory=192.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm7a.16xlarge': InstanceType(name='m7a.16xlarge', cores=64, memory=256.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm7a.24xlarge': InstanceType(name='m7a.24xlarge', cores=96, memory=384.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm7a.2xlarge': InstanceType(name='m7a.2xlarge', cores=8, memory=32.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm7a.32xlarge': InstanceType(name='m7a.32xlarge', cores=128, memory=512.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm7a.48xlarge': InstanceType(name='m7a.48xlarge', cores=192, memory=768.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm7a.4xlarge': InstanceType(name='m7a.4xlarge', cores=16, memory=64.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm7a.8xlarge': InstanceType(name='m7a.8xlarge', cores=32, memory=128.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm7a.large': InstanceType(name='m7a.large', cores=2, memory=8.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm7a.medium': InstanceType(name='m7a.medium', cores=1, memory=4.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm7a.metal-48xl': InstanceType(name='m7a.metal-48xl', cores=192, memory=768.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm7a.xlarge': InstanceType(name='m7a.xlarge', cores=4, memory=16.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm7g.12xlarge': InstanceType(name='m7g.12xlarge', cores=48, memory=192.0, disks=0, disk_capacity=0, architecture='arm64'), - 'm7g.16xlarge': InstanceType(name='m7g.16xlarge', cores=64, memory=256.0, disks=0, disk_capacity=0, architecture='arm64'), - 'm7g.2xlarge': InstanceType(name='m7g.2xlarge', cores=8, memory=32.0, disks=0, disk_capacity=0, architecture='arm64'), - 'm7g.4xlarge': InstanceType(name='m7g.4xlarge', cores=16, memory=64.0, disks=0, disk_capacity=0, architecture='arm64'), - 'm7g.8xlarge': InstanceType(name='m7g.8xlarge', cores=32, memory=128.0, disks=0, disk_capacity=0, architecture='arm64'), - 'm7g.large': InstanceType(name='m7g.large', cores=2, memory=8.0, disks=0, disk_capacity=0, architecture='arm64'), - 'm7g.medium': InstanceType(name='m7g.medium', cores=1, memory=4.0, disks=0, disk_capacity=0, architecture='arm64'), - 'm7g.metal': InstanceType(name='m7g.metal', cores=64, memory=256.0, disks=0, disk_capacity=0, architecture='arm64'), - 'm7g.xlarge': InstanceType(name='m7g.xlarge', cores=4, memory=16.0, disks=0, disk_capacity=0, architecture='arm64'), - 'm7gd.12xlarge': InstanceType(name='m7gd.12xlarge', cores=48, memory=192.0, disks=2.0, disk_capacity=1425.0, architecture='arm64'), - 'm7gd.16xlarge': InstanceType(name='m7gd.16xlarge', cores=64, memory=256.0, disks=2.0, disk_capacity=1900.0, architecture='arm64'), - 'm7gd.2xlarge': InstanceType(name='m7gd.2xlarge', cores=8, memory=32.0, disks=1.0, disk_capacity=475.0, architecture='arm64'), - 'm7gd.4xlarge': InstanceType(name='m7gd.4xlarge', cores=16, memory=64.0, disks=1.0, disk_capacity=950.0, architecture='arm64'), - 'm7gd.8xlarge': InstanceType(name='m7gd.8xlarge', cores=32, memory=128.0, disks=1.0, disk_capacity=1900.0, architecture='arm64'), - 'm7gd.large': InstanceType(name='m7gd.large', cores=2, memory=8.0, disks=1.0, disk_capacity=118.0, architecture='arm64'), - 'm7gd.medium': InstanceType(name='m7gd.medium', cores=1, memory=4.0, disks=1.0, disk_capacity=59.0, architecture='arm64'), - 'm7gd.xlarge': InstanceType(name='m7gd.xlarge', cores=4, memory=16.0, disks=1.0, disk_capacity=237.0, architecture='arm64'), - 'm7i-flex.2xlarge': InstanceType(name='m7i-flex.2xlarge', cores=8, memory=32.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm7i-flex.4xlarge': InstanceType(name='m7i-flex.4xlarge', cores=16, memory=64.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm7i-flex.8xlarge': InstanceType(name='m7i-flex.8xlarge', cores=32, memory=128.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm7i-flex.large': InstanceType(name='m7i-flex.large', cores=2, memory=8.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm7i-flex.xlarge': InstanceType(name='m7i-flex.xlarge', cores=4, memory=16.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm7i.12xlarge': InstanceType(name='m7i.12xlarge', cores=48, memory=192.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm7i.16xlarge': InstanceType(name='m7i.16xlarge', cores=64, memory=256.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm7i.24xlarge': InstanceType(name='m7i.24xlarge', cores=96, memory=384.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm7i.2xlarge': InstanceType(name='m7i.2xlarge', cores=8, memory=32.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm7i.48xlarge': InstanceType(name='m7i.48xlarge', cores=192, memory=768.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm7i.4xlarge': InstanceType(name='m7i.4xlarge', cores=16, memory=64.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm7i.8xlarge': InstanceType(name='m7i.8xlarge', cores=32, memory=128.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm7i.large': InstanceType(name='m7i.large', cores=2, memory=8.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm7i.metal-24xl': InstanceType(name='m7i.metal-24xl', cores=96, memory=384.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm7i.metal-48xl': InstanceType(name='m7i.metal-48xl', cores=192, memory=768.0, disks=0, disk_capacity=0, architecture='amd64'), - 'm7i.xlarge': InstanceType(name='m7i.xlarge', cores=4, memory=16.0, disks=0, disk_capacity=0, architecture='amd64'), - 'p2.16xlarge': InstanceType(name='p2.16xlarge', cores=64, memory=732.0, disks=0, disk_capacity=0, architecture='amd64'), - 'p2.8xlarge': InstanceType(name='p2.8xlarge', cores=32, memory=488.0, disks=0, disk_capacity=0, architecture='amd64'), - 'p2.xlarge': InstanceType(name='p2.xlarge', cores=4, memory=61.0, disks=0, disk_capacity=0, architecture='amd64'), - 'p3.16xlarge': InstanceType(name='p3.16xlarge', cores=64, memory=488.0, disks=0, disk_capacity=0, architecture='amd64'), - 'p3.2xlarge': InstanceType(name='p3.2xlarge', cores=8, memory=61.0, disks=0, disk_capacity=0, architecture='amd64'), - 'p3.8xlarge': InstanceType(name='p3.8xlarge', cores=32, memory=244.0, disks=0, disk_capacity=0, architecture='amd64'), - 'p3dn.24xlarge': InstanceType(name='p3dn.24xlarge', cores=96, memory=768.0, disks=2.0, disk_capacity=900.0, architecture='amd64'), - 'p4d.24xlarge': InstanceType(name='p4d.24xlarge', cores=96, memory=1152.0, disks=8.0, disk_capacity=1000.0, architecture='amd64'), - 'p4de.24xlarge': InstanceType(name='p4de.24xlarge', cores=96, memory=1152.0, disks=8.0, disk_capacity=1000.0, architecture='amd64'), - 'p5.48xlarge': InstanceType(name='p5.48xlarge', cores=192, memory=2048.0, disks=8.0, disk_capacity=3840.0, architecture='amd64'), - 'r3.2xlarge': InstanceType(name='r3.2xlarge', cores=8, memory=61.0, disks=1.0, disk_capacity=160.0, architecture='amd64'), - 'r3.4xlarge': InstanceType(name='r3.4xlarge', cores=16, memory=122.0, disks=1.0, disk_capacity=320.0, architecture='amd64'), - 'r3.8xlarge': InstanceType(name='r3.8xlarge', cores=32, memory=244.0, disks=2.0, disk_capacity=320.0, architecture='amd64'), - 'r3.large': InstanceType(name='r3.large', cores=2, memory=15.25, disks=1.0, disk_capacity=32.0, architecture='amd64'), - 'r3.xlarge': InstanceType(name='r3.xlarge', cores=4, memory=30.5, disks=1.0, disk_capacity=80.0, architecture='amd64'), - 'r4.16xlarge': InstanceType(name='r4.16xlarge', cores=64, memory=488.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r4.2xlarge': InstanceType(name='r4.2xlarge', cores=8, memory=61.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r4.4xlarge': InstanceType(name='r4.4xlarge', cores=16, memory=122.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r4.8xlarge': InstanceType(name='r4.8xlarge', cores=32, memory=244.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r4.large': InstanceType(name='r4.large', cores=2, memory=15.25, disks=0, disk_capacity=0, architecture='amd64'), - 'r4.xlarge': InstanceType(name='r4.xlarge', cores=4, memory=30.5, disks=0, disk_capacity=0, architecture='amd64'), - 'r5.12xlarge': InstanceType(name='r5.12xlarge', cores=48, memory=384.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r5.16xlarge': InstanceType(name='r5.16xlarge', cores=64, memory=512.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r5.24xlarge': InstanceType(name='r5.24xlarge', cores=96, memory=768.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r5.2xlarge': InstanceType(name='r5.2xlarge', cores=8, memory=64.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r5.4xlarge': InstanceType(name='r5.4xlarge', cores=16, memory=128.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r5.8xlarge': InstanceType(name='r5.8xlarge', cores=32, memory=256.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r5.large': InstanceType(name='r5.large', cores=2, memory=16.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r5.metal': InstanceType(name='r5.metal', cores=96, memory=768.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r5.xlarge': InstanceType(name='r5.xlarge', cores=4, memory=32.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r5a.12xlarge': InstanceType(name='r5a.12xlarge', cores=48, memory=384.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r5a.16xlarge': InstanceType(name='r5a.16xlarge', cores=64, memory=512.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r5a.24xlarge': InstanceType(name='r5a.24xlarge', cores=96, memory=768.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r5a.2xlarge': InstanceType(name='r5a.2xlarge', cores=8, memory=64.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r5a.4xlarge': InstanceType(name='r5a.4xlarge', cores=16, memory=128.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r5a.8xlarge': InstanceType(name='r5a.8xlarge', cores=32, memory=256.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r5a.large': InstanceType(name='r5a.large', cores=2, memory=16.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r5a.xlarge': InstanceType(name='r5a.xlarge', cores=4, memory=32.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r5ad.12xlarge': InstanceType(name='r5ad.12xlarge', cores=48, memory=384.0, disks=2.0, disk_capacity=900.0, architecture='amd64'), - 'r5ad.16xlarge': InstanceType(name='r5ad.16xlarge', cores=64, memory=512.0, disks=4.0, disk_capacity=600.0, architecture='amd64'), - 'r5ad.24xlarge': InstanceType(name='r5ad.24xlarge', cores=96, memory=768.0, disks=4.0, disk_capacity=900.0, architecture='amd64'), - 'r5ad.2xlarge': InstanceType(name='r5ad.2xlarge', cores=8, memory=64.0, disks=1.0, disk_capacity=300.0, architecture='amd64'), - 'r5ad.4xlarge': InstanceType(name='r5ad.4xlarge', cores=16, memory=128.0, disks=2.0, disk_capacity=300.0, architecture='amd64'), - 'r5ad.8xlarge': InstanceType(name='r5ad.8xlarge', cores=32, memory=256.0, disks=2.0, disk_capacity=600.0, architecture='amd64'), - 'r5ad.large': InstanceType(name='r5ad.large', cores=2, memory=16.0, disks=1.0, disk_capacity=75.0, architecture='amd64'), - 'r5ad.xlarge': InstanceType(name='r5ad.xlarge', cores=4, memory=32.0, disks=1.0, disk_capacity=150.0, architecture='amd64'), - 'r5b.12xlarge': InstanceType(name='r5b.12xlarge', cores=48, memory=384.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r5b.16xlarge': InstanceType(name='r5b.16xlarge', cores=64, memory=512.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r5b.24xlarge': InstanceType(name='r5b.24xlarge', cores=96, memory=768.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r5b.2xlarge': InstanceType(name='r5b.2xlarge', cores=8, memory=64.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r5b.4xlarge': InstanceType(name='r5b.4xlarge', cores=16, memory=128.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r5b.8xlarge': InstanceType(name='r5b.8xlarge', cores=32, memory=256.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r5b.large': InstanceType(name='r5b.large', cores=2, memory=16.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r5b.metal': InstanceType(name='r5b.metal', cores=96, memory=768.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r5b.xlarge': InstanceType(name='r5b.xlarge', cores=4, memory=32.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r5d.12xlarge': InstanceType(name='r5d.12xlarge', cores=48, memory=384.0, disks=2.0, disk_capacity=900.0, architecture='amd64'), - 'r5d.16xlarge': InstanceType(name='r5d.16xlarge', cores=64, memory=512.0, disks=4.0, disk_capacity=600.0, architecture='amd64'), - 'r5d.24xlarge': InstanceType(name='r5d.24xlarge', cores=96, memory=768.0, disks=4.0, disk_capacity=900.0, architecture='amd64'), - 'r5d.2xlarge': InstanceType(name='r5d.2xlarge', cores=8, memory=64.0, disks=1.0, disk_capacity=300.0, architecture='amd64'), - 'r5d.4xlarge': InstanceType(name='r5d.4xlarge', cores=16, memory=128.0, disks=2.0, disk_capacity=300.0, architecture='amd64'), - 'r5d.8xlarge': InstanceType(name='r5d.8xlarge', cores=32, memory=256.0, disks=2.0, disk_capacity=600.0, architecture='amd64'), - 'r5d.large': InstanceType(name='r5d.large', cores=2, memory=16.0, disks=1.0, disk_capacity=75.0, architecture='amd64'), - 'r5d.metal': InstanceType(name='r5d.metal', cores=96, memory=768.0, disks=4.0, disk_capacity=900.0, architecture='amd64'), - 'r5d.xlarge': InstanceType(name='r5d.xlarge', cores=4, memory=32.0, disks=1.0, disk_capacity=150.0, architecture='amd64'), - 'r5dn.12xlarge': InstanceType(name='r5dn.12xlarge', cores=48, memory=384.0, disks=2.0, disk_capacity=900.0, architecture='amd64'), - 'r5dn.16xlarge': InstanceType(name='r5dn.16xlarge', cores=64, memory=512.0, disks=4.0, disk_capacity=600.0, architecture='amd64'), - 'r5dn.24xlarge': InstanceType(name='r5dn.24xlarge', cores=96, memory=768.0, disks=4.0, disk_capacity=900.0, architecture='amd64'), - 'r5dn.2xlarge': InstanceType(name='r5dn.2xlarge', cores=8, memory=64.0, disks=1.0, disk_capacity=300.0, architecture='amd64'), - 'r5dn.4xlarge': InstanceType(name='r5dn.4xlarge', cores=16, memory=128.0, disks=2.0, disk_capacity=300.0, architecture='amd64'), - 'r5dn.8xlarge': InstanceType(name='r5dn.8xlarge', cores=32, memory=256.0, disks=2.0, disk_capacity=600.0, architecture='amd64'), - 'r5dn.large': InstanceType(name='r5dn.large', cores=2, memory=16.0, disks=1.0, disk_capacity=75.0, architecture='amd64'), - 'r5dn.metal': InstanceType(name='r5dn.metal', cores=96, memory=768.0, disks=4.0, disk_capacity=900.0, architecture='amd64'), - 'r5dn.xlarge': InstanceType(name='r5dn.xlarge', cores=4, memory=32.0, disks=1.0, disk_capacity=150.0, architecture='amd64'), - 'r5n.12xlarge': InstanceType(name='r5n.12xlarge', cores=48, memory=384.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r5n.16xlarge': InstanceType(name='r5n.16xlarge', cores=64, memory=512.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r5n.24xlarge': InstanceType(name='r5n.24xlarge', cores=96, memory=768.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r5n.2xlarge': InstanceType(name='r5n.2xlarge', cores=8, memory=64.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r5n.4xlarge': InstanceType(name='r5n.4xlarge', cores=16, memory=128.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r5n.8xlarge': InstanceType(name='r5n.8xlarge', cores=32, memory=256.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r5n.large': InstanceType(name='r5n.large', cores=2, memory=16.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r5n.metal': InstanceType(name='r5n.metal', cores=96, memory=768.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r5n.xlarge': InstanceType(name='r5n.xlarge', cores=4, memory=32.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r6a.12xlarge': InstanceType(name='r6a.12xlarge', cores=48, memory=384.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r6a.16xlarge': InstanceType(name='r6a.16xlarge', cores=64, memory=512.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r6a.24xlarge': InstanceType(name='r6a.24xlarge', cores=96, memory=768.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r6a.2xlarge': InstanceType(name='r6a.2xlarge', cores=8, memory=64.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r6a.32xlarge': InstanceType(name='r6a.32xlarge', cores=128, memory=1024.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r6a.48xlarge': InstanceType(name='r6a.48xlarge', cores=192, memory=1536.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r6a.4xlarge': InstanceType(name='r6a.4xlarge', cores=16, memory=128.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r6a.8xlarge': InstanceType(name='r6a.8xlarge', cores=32, memory=256.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r6a.large': InstanceType(name='r6a.large', cores=2, memory=16.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r6a.metal': InstanceType(name='r6a.metal', cores=192, memory=1536.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r6a.xlarge': InstanceType(name='r6a.xlarge', cores=4, memory=32.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r6g.12xlarge': InstanceType(name='r6g.12xlarge', cores=48, memory=384.0, disks=0, disk_capacity=0, architecture='arm64'), - 'r6g.16xlarge': InstanceType(name='r6g.16xlarge', cores=64, memory=512.0, disks=0, disk_capacity=0, architecture='arm64'), - 'r6g.2xlarge': InstanceType(name='r6g.2xlarge', cores=8, memory=64.0, disks=0, disk_capacity=0, architecture='arm64'), - 'r6g.4xlarge': InstanceType(name='r6g.4xlarge', cores=16, memory=128.0, disks=0, disk_capacity=0, architecture='arm64'), - 'r6g.8xlarge': InstanceType(name='r6g.8xlarge', cores=32, memory=256.0, disks=0, disk_capacity=0, architecture='arm64'), - 'r6g.large': InstanceType(name='r6g.large', cores=2, memory=16.0, disks=0, disk_capacity=0, architecture='arm64'), - 'r6g.medium': InstanceType(name='r6g.medium', cores=1, memory=8.0, disks=0, disk_capacity=0, architecture='arm64'), - 'r6g.metal': InstanceType(name='r6g.metal', cores=64, memory=512.0, disks=0, disk_capacity=0, architecture='arm64'), - 'r6g.xlarge': InstanceType(name='r6g.xlarge', cores=4, memory=32.0, disks=0, disk_capacity=0, architecture='arm64'), - 'r6gd.12xlarge': InstanceType(name='r6gd.12xlarge', cores=48, memory=384.0, disks=2.0, disk_capacity=1425.0, architecture='arm64'), - 'r6gd.16xlarge': InstanceType(name='r6gd.16xlarge', cores=64, memory=512.0, disks=2.0, disk_capacity=1900.0, architecture='arm64'), - 'r6gd.2xlarge': InstanceType(name='r6gd.2xlarge', cores=8, memory=64.0, disks=1.0, disk_capacity=475.0, architecture='arm64'), - 'r6gd.4xlarge': InstanceType(name='r6gd.4xlarge', cores=16, memory=128.0, disks=1.0, disk_capacity=950.0, architecture='arm64'), - 'r6gd.8xlarge': InstanceType(name='r6gd.8xlarge', cores=32, memory=256.0, disks=1.0, disk_capacity=1900.0, architecture='arm64'), - 'r6gd.large': InstanceType(name='r6gd.large', cores=2, memory=16.0, disks=1.0, disk_capacity=118.0, architecture='arm64'), - 'r6gd.medium': InstanceType(name='r6gd.medium', cores=1, memory=8.0, disks=1.0, disk_capacity=59.0, architecture='arm64'), - 'r6gd.metal': InstanceType(name='r6gd.metal', cores=64, memory=512.0, disks=2.0, disk_capacity=1900.0, architecture='arm64'), - 'r6gd.xlarge': InstanceType(name='r6gd.xlarge', cores=4, memory=32.0, disks=1.0, disk_capacity=237.0, architecture='arm64'), - 'r6i.12xlarge': InstanceType(name='r6i.12xlarge', cores=48, memory=384.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r6i.16xlarge': InstanceType(name='r6i.16xlarge', cores=64, memory=512.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r6i.24xlarge': InstanceType(name='r6i.24xlarge', cores=96, memory=768.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r6i.2xlarge': InstanceType(name='r6i.2xlarge', cores=8, memory=64.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r6i.32xlarge': InstanceType(name='r6i.32xlarge', cores=128, memory=1024.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r6i.4xlarge': InstanceType(name='r6i.4xlarge', cores=16, memory=128.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r6i.8xlarge': InstanceType(name='r6i.8xlarge', cores=32, memory=256.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r6i.large': InstanceType(name='r6i.large', cores=2, memory=16.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r6i.metal': InstanceType(name='r6i.metal', cores=128, memory=1024.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r6i.xlarge': InstanceType(name='r6i.xlarge', cores=4, memory=32.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r6id.12xlarge': InstanceType(name='r6id.12xlarge', cores=48, memory=384.0, disks=2.0, disk_capacity=1425.0, architecture='amd64'), - 'r6id.16xlarge': InstanceType(name='r6id.16xlarge', cores=64, memory=512.0, disks=2.0, disk_capacity=1900.0, architecture='amd64'), - 'r6id.24xlarge': InstanceType(name='r6id.24xlarge', cores=96, memory=768.0, disks=4.0, disk_capacity=1425.0, architecture='amd64'), - 'r6id.2xlarge': InstanceType(name='r6id.2xlarge', cores=8, memory=64.0, disks=1.0, disk_capacity=474.0, architecture='amd64'), - 'r6id.32xlarge': InstanceType(name='r6id.32xlarge', cores=128, memory=1024.0, disks=4.0, disk_capacity=1900.0, architecture='amd64'), - 'r6id.4xlarge': InstanceType(name='r6id.4xlarge', cores=16, memory=128.0, disks=1.0, disk_capacity=950.0, architecture='amd64'), - 'r6id.8xlarge': InstanceType(name='r6id.8xlarge', cores=32, memory=256.0, disks=1.0, disk_capacity=1900.0, architecture='amd64'), - 'r6id.large': InstanceType(name='r6id.large', cores=2, memory=16.0, disks=1.0, disk_capacity=118.0, architecture='amd64'), - 'r6id.metal': InstanceType(name='r6id.metal', cores=128, memory=1024.0, disks=4.0, disk_capacity=1900.0, architecture='amd64'), - 'r6id.xlarge': InstanceType(name='r6id.xlarge', cores=4, memory=32.0, disks=1.0, disk_capacity=237.0, architecture='amd64'), - 'r6idn.12xlarge': InstanceType(name='r6idn.12xlarge', cores=48, memory=384.0, disks=2.0, disk_capacity=1425.0, architecture='amd64'), - 'r6idn.16xlarge': InstanceType(name='r6idn.16xlarge', cores=64, memory=512.0, disks=2.0, disk_capacity=1900.0, architecture='amd64'), - 'r6idn.24xlarge': InstanceType(name='r6idn.24xlarge', cores=96, memory=768.0, disks=4.0, disk_capacity=1425.0, architecture='amd64'), - 'r6idn.2xlarge': InstanceType(name='r6idn.2xlarge', cores=8, memory=64.0, disks=1.0, disk_capacity=474.0, architecture='amd64'), - 'r6idn.32xlarge': InstanceType(name='r6idn.32xlarge', cores=128, memory=1024.0, disks=4.0, disk_capacity=1900.0, architecture='amd64'), - 'r6idn.4xlarge': InstanceType(name='r6idn.4xlarge', cores=16, memory=128.0, disks=1.0, disk_capacity=950.0, architecture='amd64'), - 'r6idn.8xlarge': InstanceType(name='r6idn.8xlarge', cores=32, memory=256.0, disks=1.0, disk_capacity=1900.0, architecture='amd64'), - 'r6idn.large': InstanceType(name='r6idn.large', cores=2, memory=16.0, disks=1.0, disk_capacity=118.0, architecture='amd64'), - 'r6idn.metal': InstanceType(name='r6idn.metal', cores=128, memory=1024.0, disks=4.0, disk_capacity=1900.0, architecture='amd64'), - 'r6idn.xlarge': InstanceType(name='r6idn.xlarge', cores=4, memory=32.0, disks=1.0, disk_capacity=237.0, architecture='amd64'), - 'r6in.12xlarge': InstanceType(name='r6in.12xlarge', cores=48, memory=384.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r6in.16xlarge': InstanceType(name='r6in.16xlarge', cores=64, memory=512.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r6in.24xlarge': InstanceType(name='r6in.24xlarge', cores=96, memory=768.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r6in.2xlarge': InstanceType(name='r6in.2xlarge', cores=8, memory=64.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r6in.32xlarge': InstanceType(name='r6in.32xlarge', cores=128, memory=1024.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r6in.4xlarge': InstanceType(name='r6in.4xlarge', cores=16, memory=128.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r6in.8xlarge': InstanceType(name='r6in.8xlarge', cores=32, memory=256.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r6in.large': InstanceType(name='r6in.large', cores=2, memory=16.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r6in.metal': InstanceType(name='r6in.metal', cores=128, memory=1024.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r6in.xlarge': InstanceType(name='r6in.xlarge', cores=4, memory=32.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r7a.12xlarge': InstanceType(name='r7a.12xlarge', cores=48, memory=384.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r7a.16xlarge': InstanceType(name='r7a.16xlarge', cores=64, memory=512.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r7a.24xlarge': InstanceType(name='r7a.24xlarge', cores=96, memory=768.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r7a.2xlarge': InstanceType(name='r7a.2xlarge', cores=8, memory=64.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r7a.32xlarge': InstanceType(name='r7a.32xlarge', cores=128, memory=1024.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r7a.48xlarge': InstanceType(name='r7a.48xlarge', cores=192, memory=1536.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r7a.4xlarge': InstanceType(name='r7a.4xlarge', cores=16, memory=128.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r7a.8xlarge': InstanceType(name='r7a.8xlarge', cores=32, memory=256.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r7a.large': InstanceType(name='r7a.large', cores=2, memory=16.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r7a.medium': InstanceType(name='r7a.medium', cores=1, memory=8.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r7a.metal-48xl': InstanceType(name='r7a.metal-48xl', cores=192, memory=1536.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r7a.xlarge': InstanceType(name='r7a.xlarge', cores=4, memory=32.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r7g.12xlarge': InstanceType(name='r7g.12xlarge', cores=48, memory=384.0, disks=0, disk_capacity=0, architecture='arm64'), - 'r7g.16xlarge': InstanceType(name='r7g.16xlarge', cores=64, memory=512.0, disks=0, disk_capacity=0, architecture='arm64'), - 'r7g.2xlarge': InstanceType(name='r7g.2xlarge', cores=8, memory=64.0, disks=0, disk_capacity=0, architecture='arm64'), - 'r7g.4xlarge': InstanceType(name='r7g.4xlarge', cores=16, memory=128.0, disks=0, disk_capacity=0, architecture='arm64'), - 'r7g.8xlarge': InstanceType(name='r7g.8xlarge', cores=32, memory=256.0, disks=0, disk_capacity=0, architecture='arm64'), - 'r7g.large': InstanceType(name='r7g.large', cores=2, memory=16.0, disks=0, disk_capacity=0, architecture='arm64'), - 'r7g.medium': InstanceType(name='r7g.medium', cores=1, memory=8.0, disks=0, disk_capacity=0, architecture='arm64'), - 'r7g.metal': InstanceType(name='r7g.metal', cores=64, memory=512.0, disks=0, disk_capacity=0, architecture='arm64'), - 'r7g.xlarge': InstanceType(name='r7g.xlarge', cores=4, memory=32.0, disks=0, disk_capacity=0, architecture='arm64'), - 'r7gd.12xlarge': InstanceType(name='r7gd.12xlarge', cores=48, memory=384.0, disks=2.0, disk_capacity=1425.0, architecture='arm64'), - 'r7gd.16xlarge': InstanceType(name='r7gd.16xlarge', cores=64, memory=512.0, disks=2.0, disk_capacity=1900.0, architecture='arm64'), - 'r7gd.2xlarge': InstanceType(name='r7gd.2xlarge', cores=8, memory=64.0, disks=1.0, disk_capacity=475.0, architecture='arm64'), - 'r7gd.4xlarge': InstanceType(name='r7gd.4xlarge', cores=16, memory=128.0, disks=1.0, disk_capacity=950.0, architecture='arm64'), - 'r7gd.8xlarge': InstanceType(name='r7gd.8xlarge', cores=32, memory=256.0, disks=1.0, disk_capacity=1900.0, architecture='arm64'), - 'r7gd.large': InstanceType(name='r7gd.large', cores=2, memory=16.0, disks=1.0, disk_capacity=118.0, architecture='arm64'), - 'r7gd.medium': InstanceType(name='r7gd.medium', cores=1, memory=8.0, disks=1.0, disk_capacity=59.0, architecture='arm64'), - 'r7gd.xlarge': InstanceType(name='r7gd.xlarge', cores=4, memory=32.0, disks=1.0, disk_capacity=237.0, architecture='arm64'), - 'r7i.12xlarge': InstanceType(name='r7i.12xlarge', cores=48, memory=384.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r7i.16xlarge': InstanceType(name='r7i.16xlarge', cores=64, memory=512.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r7i.24xlarge': InstanceType(name='r7i.24xlarge', cores=96, memory=768.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r7i.2xlarge': InstanceType(name='r7i.2xlarge', cores=8, memory=64.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r7i.48xlarge': InstanceType(name='r7i.48xlarge', cores=192, memory=1536.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r7i.4xlarge': InstanceType(name='r7i.4xlarge', cores=16, memory=128.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r7i.8xlarge': InstanceType(name='r7i.8xlarge', cores=32, memory=256.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r7i.large': InstanceType(name='r7i.large', cores=2, memory=16.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r7i.metal-24xl': InstanceType(name='r7i.metal-24xl', cores=96, memory=768.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r7i.metal-48xl': InstanceType(name='r7i.metal-48xl', cores=192, memory=1536.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r7i.xlarge': InstanceType(name='r7i.xlarge', cores=4, memory=32.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r7iz.12xlarge': InstanceType(name='r7iz.12xlarge', cores=48, memory=384.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r7iz.16xlarge': InstanceType(name='r7iz.16xlarge', cores=64, memory=512.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r7iz.2xlarge': InstanceType(name='r7iz.2xlarge', cores=8, memory=64.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r7iz.32xlarge': InstanceType(name='r7iz.32xlarge', cores=128, memory=1024.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r7iz.4xlarge': InstanceType(name='r7iz.4xlarge', cores=16, memory=128.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r7iz.8xlarge': InstanceType(name='r7iz.8xlarge', cores=32, memory=256.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r7iz.large': InstanceType(name='r7iz.large', cores=2, memory=16.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r7iz.metal-16xl': InstanceType(name='r7iz.metal-16xl', cores=64, memory=512.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r7iz.metal-32xl': InstanceType(name='r7iz.metal-32xl', cores=128, memory=1024.0, disks=0, disk_capacity=0, architecture='amd64'), - 'r7iz.xlarge': InstanceType(name='r7iz.xlarge', cores=4, memory=32.0, disks=0, disk_capacity=0, architecture='amd64'), - 't1.micro': InstanceType(name='t1.micro', cores=1, memory=0.613, disks=0, disk_capacity=0, architecture='amd64'), - 't2.2xlarge': InstanceType(name='t2.2xlarge', cores=8, memory=32.0, disks=0, disk_capacity=0, architecture='amd64'), - 't2.large': InstanceType(name='t2.large', cores=2, memory=8.0, disks=0, disk_capacity=0, architecture='amd64'), - 't2.medium': InstanceType(name='t2.medium', cores=2, memory=4.0, disks=0, disk_capacity=0, architecture='amd64'), - 't2.micro': InstanceType(name='t2.micro', cores=1, memory=1.0, disks=0, disk_capacity=0, architecture='amd64'), - 't2.nano': InstanceType(name='t2.nano', cores=1, memory=0.5, disks=0, disk_capacity=0, architecture='amd64'), - 't2.small': InstanceType(name='t2.small', cores=1, memory=2.0, disks=0, disk_capacity=0, architecture='amd64'), - 't2.xlarge': InstanceType(name='t2.xlarge', cores=4, memory=16.0, disks=0, disk_capacity=0, architecture='amd64'), - 't3.2xlarge': InstanceType(name='t3.2xlarge', cores=8, memory=32.0, disks=0, disk_capacity=0, architecture='amd64'), - 't3.large': InstanceType(name='t3.large', cores=2, memory=8.0, disks=0, disk_capacity=0, architecture='amd64'), - 't3.medium': InstanceType(name='t3.medium', cores=2, memory=4.0, disks=0, disk_capacity=0, architecture='amd64'), - 't3.micro': InstanceType(name='t3.micro', cores=2, memory=1.0, disks=0, disk_capacity=0, architecture='amd64'), - 't3.nano': InstanceType(name='t3.nano', cores=2, memory=0.5, disks=0, disk_capacity=0, architecture='amd64'), - 't3.small': InstanceType(name='t3.small', cores=2, memory=2.0, disks=0, disk_capacity=0, architecture='amd64'), - 't3.xlarge': InstanceType(name='t3.xlarge', cores=4, memory=16.0, disks=0, disk_capacity=0, architecture='amd64'), - 't3a.2xlarge': InstanceType(name='t3a.2xlarge', cores=8, memory=32.0, disks=0, disk_capacity=0, architecture='amd64'), - 't3a.large': InstanceType(name='t3a.large', cores=2, memory=8.0, disks=0, disk_capacity=0, architecture='amd64'), - 't3a.medium': InstanceType(name='t3a.medium', cores=2, memory=4.0, disks=0, disk_capacity=0, architecture='amd64'), - 't3a.micro': InstanceType(name='t3a.micro', cores=2, memory=1.0, disks=0, disk_capacity=0, architecture='amd64'), - 't3a.nano': InstanceType(name='t3a.nano', cores=2, memory=0.5, disks=0, disk_capacity=0, architecture='amd64'), - 't3a.small': InstanceType(name='t3a.small', cores=2, memory=2.0, disks=0, disk_capacity=0, architecture='amd64'), - 't3a.xlarge': InstanceType(name='t3a.xlarge', cores=4, memory=16.0, disks=0, disk_capacity=0, architecture='amd64'), - 't4g.2xlarge': InstanceType(name='t4g.2xlarge', cores=8, memory=32.0, disks=0, disk_capacity=0, architecture='arm64'), - 't4g.large': InstanceType(name='t4g.large', cores=2, memory=8.0, disks=0, disk_capacity=0, architecture='arm64'), - 't4g.medium': InstanceType(name='t4g.medium', cores=2, memory=4.0, disks=0, disk_capacity=0, architecture='arm64'), - 't4g.micro': InstanceType(name='t4g.micro', cores=2, memory=1.0, disks=0, disk_capacity=0, architecture='arm64'), - 't4g.nano': InstanceType(name='t4g.nano', cores=2, memory=0.5, disks=0, disk_capacity=0, architecture='arm64'), - 't4g.small': InstanceType(name='t4g.small', cores=2, memory=2.0, disks=0, disk_capacity=0, architecture='arm64'), - 't4g.xlarge': InstanceType(name='t4g.xlarge', cores=4, memory=16.0, disks=0, disk_capacity=0, architecture='arm64'), - 'trn1.2xlarge': InstanceType(name='trn1.2xlarge', cores=8, memory=32.0, disks=1.0, disk_capacity=475.0, architecture='amd64'), - 'trn1.32xlarge': InstanceType(name='trn1.32xlarge', cores=128, memory=512.0, disks=4.0, disk_capacity=1900.0, architecture='amd64'), - 'trn1n.32xlarge': InstanceType(name='trn1n.32xlarge', cores=128, memory=512.0, disks=4.0, disk_capacity=1900.0, architecture='amd64'), - 'u-12tb1.112xlarge': InstanceType(name='u-12tb1.112xlarge', cores=448, memory=12288.0, disks=0, disk_capacity=0, architecture='amd64'), - 'u-18tb1.112xlarge': InstanceType(name='u-18tb1.112xlarge', cores=448, memory=18432.0, disks=0, disk_capacity=0, architecture='amd64'), - 'u-24tb1.112xlarge': InstanceType(name='u-24tb1.112xlarge', cores=448, memory=24576.0, disks=0, disk_capacity=0, architecture='amd64'), - 'u-3tb1.56xlarge': InstanceType(name='u-3tb1.56xlarge', cores=224, memory=3072.0, disks=0, disk_capacity=0, architecture='amd64'), - 'u-6tb1.112xlarge': InstanceType(name='u-6tb1.112xlarge', cores=448, memory=6144.0, disks=0, disk_capacity=0, architecture='amd64'), - 'u-6tb1.56xlarge': InstanceType(name='u-6tb1.56xlarge', cores=224, memory=6144.0, disks=0, disk_capacity=0, architecture='amd64'), - 'u-9tb1.112xlarge': InstanceType(name='u-9tb1.112xlarge', cores=448, memory=9216.0, disks=0, disk_capacity=0, architecture='amd64'), - 'vt1.24xlarge': InstanceType(name='vt1.24xlarge', cores=96, memory=192.0, disks=0, disk_capacity=0, architecture='amd64'), - 'vt1.3xlarge': InstanceType(name='vt1.3xlarge', cores=12, memory=24.0, disks=0, disk_capacity=0, architecture='amd64'), - 'vt1.6xlarge': InstanceType(name='vt1.6xlarge', cores=24, memory=48.0, disks=0, disk_capacity=0, architecture='amd64'), - 'x1.16xlarge': InstanceType(name='x1.16xlarge', cores=64, memory=976.0, disks=1.0, disk_capacity=1920.0, architecture='amd64'), - 'x1.32xlarge': InstanceType(name='x1.32xlarge', cores=128, memory=1952.0, disks=2.0, disk_capacity=1920.0, architecture='amd64'), - 'x1e.16xlarge': InstanceType(name='x1e.16xlarge', cores=64, memory=1952.0, disks=1.0, disk_capacity=1920.0, architecture='amd64'), - 'x1e.2xlarge': InstanceType(name='x1e.2xlarge', cores=8, memory=244.0, disks=1.0, disk_capacity=240.0, architecture='amd64'), - 'x1e.32xlarge': InstanceType(name='x1e.32xlarge', cores=128, memory=3904.0, disks=2.0, disk_capacity=1920.0, architecture='amd64'), - 'x1e.4xlarge': InstanceType(name='x1e.4xlarge', cores=16, memory=488.0, disks=1.0, disk_capacity=480.0, architecture='amd64'), - 'x1e.8xlarge': InstanceType(name='x1e.8xlarge', cores=32, memory=976.0, disks=1.0, disk_capacity=960.0, architecture='amd64'), - 'x1e.xlarge': InstanceType(name='x1e.xlarge', cores=4, memory=122.0, disks=1.0, disk_capacity=120.0, architecture='amd64'), - 'x2gd.12xlarge': InstanceType(name='x2gd.12xlarge', cores=48, memory=768.0, disks=2.0, disk_capacity=1425.0, architecture='arm64'), - 'x2gd.16xlarge': InstanceType(name='x2gd.16xlarge', cores=64, memory=1024.0, disks=2.0, disk_capacity=1900.0, architecture='arm64'), - 'x2gd.2xlarge': InstanceType(name='x2gd.2xlarge', cores=8, memory=128.0, disks=1.0, disk_capacity=474.0, architecture='arm64'), - 'x2gd.4xlarge': InstanceType(name='x2gd.4xlarge', cores=16, memory=256.0, disks=1.0, disk_capacity=950.0, architecture='arm64'), - 'x2gd.8xlarge': InstanceType(name='x2gd.8xlarge', cores=32, memory=512.0, disks=1.0, disk_capacity=1900.0, architecture='arm64'), - 'x2gd.large': InstanceType(name='x2gd.large', cores=2, memory=32.0, disks=1.0, disk_capacity=118.0, architecture='arm64'), - 'x2gd.medium': InstanceType(name='x2gd.medium', cores=1, memory=16.0, disks=1.0, disk_capacity=59.0, architecture='arm64'), - 'x2gd.metal': InstanceType(name='x2gd.metal', cores=64, memory=1024.0, disks=2.0, disk_capacity=1900.0, architecture='arm64'), - 'x2gd.xlarge': InstanceType(name='x2gd.xlarge', cores=4, memory=64.0, disks=1.0, disk_capacity=237.0, architecture='arm64'), - 'x2idn.16xlarge': InstanceType(name='x2idn.16xlarge', cores=64, memory=1024.0, disks=1.0, disk_capacity=1900.0, architecture='amd64'), - 'x2idn.24xlarge': InstanceType(name='x2idn.24xlarge', cores=96, memory=1536.0, disks=2.0, disk_capacity=1425.0, architecture='amd64'), - 'x2idn.32xlarge': InstanceType(name='x2idn.32xlarge', cores=128, memory=2048.0, disks=2.0, disk_capacity=1900.0, architecture='amd64'), - 'x2idn.metal': InstanceType(name='x2idn.metal', cores=128, memory=2048.0, disks=2.0, disk_capacity=1900.0, architecture='amd64'), - 'x2iedn.16xlarge': InstanceType(name='x2iedn.16xlarge', cores=64, memory=2048.0, disks=1.0, disk_capacity=1900.0, architecture='amd64'), - 'x2iedn.24xlarge': InstanceType(name='x2iedn.24xlarge', cores=96, memory=3072.0, disks=2.0, disk_capacity=1425.0, architecture='amd64'), - 'x2iedn.2xlarge': InstanceType(name='x2iedn.2xlarge', cores=8, memory=256.0, disks=1.0, disk_capacity=237.0, architecture='amd64'), - 'x2iedn.32xlarge': InstanceType(name='x2iedn.32xlarge', cores=128, memory=4096.0, disks=2.0, disk_capacity=1900.0, architecture='amd64'), - 'x2iedn.4xlarge': InstanceType(name='x2iedn.4xlarge', cores=16, memory=512.0, disks=1.0, disk_capacity=475.0, architecture='amd64'), - 'x2iedn.8xlarge': InstanceType(name='x2iedn.8xlarge', cores=32, memory=1024.0, disks=1.0, disk_capacity=950.0, architecture='amd64'), - 'x2iedn.metal': InstanceType(name='x2iedn.metal', cores=128, memory=4096.0, disks=2.0, disk_capacity=1900.0, architecture='amd64'), - 'x2iedn.xlarge': InstanceType(name='x2iedn.xlarge', cores=4, memory=128.0, disks=1.0, disk_capacity=118.0, architecture='amd64'), - 'x2iezn.12xlarge': InstanceType(name='x2iezn.12xlarge', cores=48, memory=1536.0, disks=0, disk_capacity=0, architecture='amd64'), - 'x2iezn.2xlarge': InstanceType(name='x2iezn.2xlarge', cores=8, memory=256.0, disks=0, disk_capacity=0, architecture='amd64'), - 'x2iezn.4xlarge': InstanceType(name='x2iezn.4xlarge', cores=16, memory=512.0, disks=0, disk_capacity=0, architecture='amd64'), - 'x2iezn.6xlarge': InstanceType(name='x2iezn.6xlarge', cores=24, memory=768.0, disks=0, disk_capacity=0, architecture='amd64'), - 'x2iezn.8xlarge': InstanceType(name='x2iezn.8xlarge', cores=32, memory=1024.0, disks=0, disk_capacity=0, architecture='amd64'), - 'x2iezn.metal': InstanceType(name='x2iezn.metal', cores=48, memory=1536.0, disks=0, disk_capacity=0, architecture='amd64'), - 'z1d.12xlarge': InstanceType(name='z1d.12xlarge', cores=48, memory=384.0, disks=2.0, disk_capacity=900.0, architecture='amd64'), - 'z1d.2xlarge': InstanceType(name='z1d.2xlarge', cores=8, memory=64.0, disks=1.0, disk_capacity=300.0, architecture='amd64'), - 'z1d.3xlarge': InstanceType(name='z1d.3xlarge', cores=12, memory=96.0, disks=1.0, disk_capacity=450.0, architecture='amd64'), - 'z1d.6xlarge': InstanceType(name='z1d.6xlarge', cores=24, memory=192.0, disks=1.0, disk_capacity=900.0, architecture='amd64'), - 'z1d.large': InstanceType(name='z1d.large', cores=2, memory=16.0, disks=1.0, disk_capacity=75.0, architecture='amd64'), - 'z1d.metal': InstanceType(name='z1d.metal', cores=48, memory=384.0, disks=2.0, disk_capacity=900.0, architecture='amd64'), - 'z1d.xlarge': InstanceType(name='z1d.xlarge', cores=4, memory=32.0, disks=1.0, disk_capacity=150.0, architecture='amd64'), + "a1.2xlarge": InstanceType( + name="a1.2xlarge", + cores=8, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "a1.4xlarge": InstanceType( + name="a1.4xlarge", + cores=16, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "a1.large": InstanceType( + name="a1.large", + cores=2, + memory=4.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "a1.medium": InstanceType( + name="a1.medium", + cores=1, + memory=2.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "a1.metal": InstanceType( + name="a1.metal", + cores=16, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "a1.xlarge": InstanceType( + name="a1.xlarge", + cores=4, + memory=8.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c1.medium": InstanceType( + name="c1.medium", + cores=2, + memory=1.7, + disks=1.0, + disk_capacity=350.0, + architecture="amd64", + ), + "c1.xlarge": InstanceType( + name="c1.xlarge", + cores=8, + memory=7.0, + disks=4.0, + disk_capacity=420.0, + architecture="amd64", + ), + "c3.2xlarge": InstanceType( + name="c3.2xlarge", + cores=8, + memory=15.0, + disks=2.0, + disk_capacity=80.0, + architecture="amd64", + ), + "c3.4xlarge": InstanceType( + name="c3.4xlarge", + cores=16, + memory=30.0, + disks=2.0, + disk_capacity=160.0, + architecture="amd64", + ), + "c3.8xlarge": InstanceType( + name="c3.8xlarge", + cores=32, + memory=60.0, + disks=2.0, + disk_capacity=320.0, + architecture="amd64", + ), + "c3.large": InstanceType( + name="c3.large", + cores=2, + memory=3.75, + disks=2.0, + disk_capacity=16.0, + architecture="amd64", + ), + "c3.xlarge": InstanceType( + name="c3.xlarge", + cores=4, + memory=7.5, + disks=2.0, + disk_capacity=40.0, + architecture="amd64", + ), + "c4.2xlarge": InstanceType( + name="c4.2xlarge", + cores=8, + memory=15.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c4.4xlarge": InstanceType( + name="c4.4xlarge", + cores=16, + memory=30.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c4.8xlarge": InstanceType( + name="c4.8xlarge", + cores=36, + memory=60.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c4.large": InstanceType( + name="c4.large", + cores=2, + memory=3.75, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c4.xlarge": InstanceType( + name="c4.xlarge", + cores=4, + memory=7.5, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c5.12xlarge": InstanceType( + name="c5.12xlarge", + cores=48, + memory=96.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c5.18xlarge": InstanceType( + name="c5.18xlarge", + cores=72, + memory=144.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c5.24xlarge": InstanceType( + name="c5.24xlarge", + cores=96, + memory=192.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c5.2xlarge": InstanceType( + name="c5.2xlarge", + cores=8, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c5.4xlarge": InstanceType( + name="c5.4xlarge", + cores=16, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c5.9xlarge": InstanceType( + name="c5.9xlarge", + cores=36, + memory=72.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c5.large": InstanceType( + name="c5.large", + cores=2, + memory=4.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c5.metal": InstanceType( + name="c5.metal", + cores=96, + memory=192.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c5.xlarge": InstanceType( + name="c5.xlarge", + cores=4, + memory=8.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c5a.12xlarge": InstanceType( + name="c5a.12xlarge", + cores=48, + memory=96.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c5a.16xlarge": InstanceType( + name="c5a.16xlarge", + cores=64, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c5a.24xlarge": InstanceType( + name="c5a.24xlarge", + cores=96, + memory=192.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c5a.2xlarge": InstanceType( + name="c5a.2xlarge", + cores=8, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c5a.4xlarge": InstanceType( + name="c5a.4xlarge", + cores=16, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c5a.8xlarge": InstanceType( + name="c5a.8xlarge", + cores=32, + memory=64.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c5a.large": InstanceType( + name="c5a.large", + cores=2, + memory=4.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c5a.xlarge": InstanceType( + name="c5a.xlarge", + cores=4, + memory=8.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c5ad.12xlarge": InstanceType( + name="c5ad.12xlarge", + cores=48, + memory=96.0, + disks=2.0, + disk_capacity=900.0, + architecture="amd64", + ), + "c5ad.16xlarge": InstanceType( + name="c5ad.16xlarge", + cores=64, + memory=128.0, + disks=2.0, + disk_capacity=1200.0, + architecture="amd64", + ), + "c5ad.24xlarge": InstanceType( + name="c5ad.24xlarge", + cores=96, + memory=192.0, + disks=2.0, + disk_capacity=1900.0, + architecture="amd64", + ), + "c5ad.2xlarge": InstanceType( + name="c5ad.2xlarge", + cores=8, + memory=16.0, + disks=1.0, + disk_capacity=300.0, + architecture="amd64", + ), + "c5ad.4xlarge": InstanceType( + name="c5ad.4xlarge", + cores=16, + memory=32.0, + disks=2.0, + disk_capacity=300.0, + architecture="amd64", + ), + "c5ad.8xlarge": InstanceType( + name="c5ad.8xlarge", + cores=32, + memory=64.0, + disks=2.0, + disk_capacity=600.0, + architecture="amd64", + ), + "c5ad.large": InstanceType( + name="c5ad.large", + cores=2, + memory=4.0, + disks=1.0, + disk_capacity=75.0, + architecture="amd64", + ), + "c5ad.xlarge": InstanceType( + name="c5ad.xlarge", + cores=4, + memory=8.0, + disks=1.0, + disk_capacity=150.0, + architecture="amd64", + ), + "c5d.12xlarge": InstanceType( + name="c5d.12xlarge", + cores=48, + memory=96.0, + disks=2.0, + disk_capacity=900.0, + architecture="amd64", + ), + "c5d.18xlarge": InstanceType( + name="c5d.18xlarge", + cores=72, + memory=144.0, + disks=2.0, + disk_capacity=900.0, + architecture="amd64", + ), + "c5d.24xlarge": InstanceType( + name="c5d.24xlarge", + cores=96, + memory=192.0, + disks=4.0, + disk_capacity=900.0, + architecture="amd64", + ), + "c5d.2xlarge": InstanceType( + name="c5d.2xlarge", + cores=8, + memory=16.0, + disks=1.0, + disk_capacity=200.0, + architecture="amd64", + ), + "c5d.4xlarge": InstanceType( + name="c5d.4xlarge", + cores=16, + memory=32.0, + disks=1.0, + disk_capacity=400.0, + architecture="amd64", + ), + "c5d.9xlarge": InstanceType( + name="c5d.9xlarge", + cores=36, + memory=72.0, + disks=1.0, + disk_capacity=900.0, + architecture="amd64", + ), + "c5d.large": InstanceType( + name="c5d.large", + cores=2, + memory=4.0, + disks=1.0, + disk_capacity=50.0, + architecture="amd64", + ), + "c5d.metal": InstanceType( + name="c5d.metal", + cores=96, + memory=192.0, + disks=4.0, + disk_capacity=900.0, + architecture="amd64", + ), + "c5d.xlarge": InstanceType( + name="c5d.xlarge", + cores=4, + memory=8.0, + disks=1.0, + disk_capacity=100.0, + architecture="amd64", + ), + "c5n.18xlarge": InstanceType( + name="c5n.18xlarge", + cores=72, + memory=192.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c5n.2xlarge": InstanceType( + name="c5n.2xlarge", + cores=8, + memory=21.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c5n.4xlarge": InstanceType( + name="c5n.4xlarge", + cores=16, + memory=42.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c5n.9xlarge": InstanceType( + name="c5n.9xlarge", + cores=36, + memory=96.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c5n.large": InstanceType( + name="c5n.large", + cores=2, + memory=5.25, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c5n.metal": InstanceType( + name="c5n.metal", + cores=72, + memory=192.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c5n.xlarge": InstanceType( + name="c5n.xlarge", + cores=4, + memory=10.5, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c6a.12xlarge": InstanceType( + name="c6a.12xlarge", + cores=48, + memory=96.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c6a.16xlarge": InstanceType( + name="c6a.16xlarge", + cores=64, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c6a.24xlarge": InstanceType( + name="c6a.24xlarge", + cores=96, + memory=192.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c6a.2xlarge": InstanceType( + name="c6a.2xlarge", + cores=8, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c6a.32xlarge": InstanceType( + name="c6a.32xlarge", + cores=128, + memory=256.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c6a.48xlarge": InstanceType( + name="c6a.48xlarge", + cores=192, + memory=384.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c6a.4xlarge": InstanceType( + name="c6a.4xlarge", + cores=16, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c6a.8xlarge": InstanceType( + name="c6a.8xlarge", + cores=32, + memory=64.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c6a.large": InstanceType( + name="c6a.large", + cores=2, + memory=4.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c6a.metal": InstanceType( + name="c6a.metal", + cores=192, + memory=384.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c6a.xlarge": InstanceType( + name="c6a.xlarge", + cores=4, + memory=8.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c6g.12xlarge": InstanceType( + name="c6g.12xlarge", + cores=48, + memory=96.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "c6g.16xlarge": InstanceType( + name="c6g.16xlarge", + cores=64, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "c6g.2xlarge": InstanceType( + name="c6g.2xlarge", + cores=8, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "c6g.4xlarge": InstanceType( + name="c6g.4xlarge", + cores=16, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "c6g.8xlarge": InstanceType( + name="c6g.8xlarge", + cores=32, + memory=64.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "c6g.large": InstanceType( + name="c6g.large", + cores=2, + memory=4.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "c6g.medium": InstanceType( + name="c6g.medium", + cores=1, + memory=2.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "c6g.metal": InstanceType( + name="c6g.metal", + cores=64, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "c6g.xlarge": InstanceType( + name="c6g.xlarge", + cores=4, + memory=8.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "c6gd.12xlarge": InstanceType( + name="c6gd.12xlarge", + cores=48, + memory=96.0, + disks=2.0, + disk_capacity=1425.0, + architecture="arm64", + ), + "c6gd.16xlarge": InstanceType( + name="c6gd.16xlarge", + cores=64, + memory=128.0, + disks=2.0, + disk_capacity=1900.0, + architecture="arm64", + ), + "c6gd.2xlarge": InstanceType( + name="c6gd.2xlarge", + cores=8, + memory=16.0, + disks=1.0, + disk_capacity=475.0, + architecture="arm64", + ), + "c6gd.4xlarge": InstanceType( + name="c6gd.4xlarge", + cores=16, + memory=32.0, + disks=1.0, + disk_capacity=950.0, + architecture="arm64", + ), + "c6gd.8xlarge": InstanceType( + name="c6gd.8xlarge", + cores=32, + memory=64.0, + disks=1.0, + disk_capacity=1900.0, + architecture="arm64", + ), + "c6gd.large": InstanceType( + name="c6gd.large", + cores=2, + memory=4.0, + disks=1.0, + disk_capacity=118.0, + architecture="arm64", + ), + "c6gd.medium": InstanceType( + name="c6gd.medium", + cores=1, + memory=2.0, + disks=1.0, + disk_capacity=59.0, + architecture="arm64", + ), + "c6gd.metal": InstanceType( + name="c6gd.metal", + cores=64, + memory=128.0, + disks=2.0, + disk_capacity=1900.0, + architecture="arm64", + ), + "c6gd.xlarge": InstanceType( + name="c6gd.xlarge", + cores=4, + memory=8.0, + disks=1.0, + disk_capacity=237.0, + architecture="arm64", + ), + "c6gn.12xlarge": InstanceType( + name="c6gn.12xlarge", + cores=48, + memory=96.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "c6gn.16xlarge": InstanceType( + name="c6gn.16xlarge", + cores=64, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "c6gn.2xlarge": InstanceType( + name="c6gn.2xlarge", + cores=8, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "c6gn.4xlarge": InstanceType( + name="c6gn.4xlarge", + cores=16, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "c6gn.8xlarge": InstanceType( + name="c6gn.8xlarge", + cores=32, + memory=64.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "c6gn.large": InstanceType( + name="c6gn.large", + cores=2, + memory=4.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "c6gn.medium": InstanceType( + name="c6gn.medium", + cores=1, + memory=2.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "c6gn.metal": InstanceType( + name="c6gn.metal", + cores=64, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "c6gn.xlarge": InstanceType( + name="c6gn.xlarge", + cores=4, + memory=8.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "c6i.12xlarge": InstanceType( + name="c6i.12xlarge", + cores=48, + memory=96.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c6i.16xlarge": InstanceType( + name="c6i.16xlarge", + cores=64, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c6i.24xlarge": InstanceType( + name="c6i.24xlarge", + cores=96, + memory=192.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c6i.2xlarge": InstanceType( + name="c6i.2xlarge", + cores=8, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c6i.32xlarge": InstanceType( + name="c6i.32xlarge", + cores=128, + memory=256.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c6i.4xlarge": InstanceType( + name="c6i.4xlarge", + cores=16, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c6i.8xlarge": InstanceType( + name="c6i.8xlarge", + cores=32, + memory=64.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c6i.large": InstanceType( + name="c6i.large", + cores=2, + memory=4.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c6i.metal": InstanceType( + name="c6i.metal", + cores=128, + memory=256.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c6i.xlarge": InstanceType( + name="c6i.xlarge", + cores=4, + memory=8.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c6id.12xlarge": InstanceType( + name="c6id.12xlarge", + cores=48, + memory=96.0, + disks=2.0, + disk_capacity=1425.0, + architecture="amd64", + ), + "c6id.16xlarge": InstanceType( + name="c6id.16xlarge", + cores=64, + memory=128.0, + disks=2.0, + disk_capacity=1900.0, + architecture="amd64", + ), + "c6id.24xlarge": InstanceType( + name="c6id.24xlarge", + cores=96, + memory=192.0, + disks=4.0, + disk_capacity=1425.0, + architecture="amd64", + ), + "c6id.2xlarge": InstanceType( + name="c6id.2xlarge", + cores=8, + memory=16.0, + disks=1.0, + disk_capacity=474.0, + architecture="amd64", + ), + "c6id.32xlarge": InstanceType( + name="c6id.32xlarge", + cores=128, + memory=256.0, + disks=4.0, + disk_capacity=1900.0, + architecture="amd64", + ), + "c6id.4xlarge": InstanceType( + name="c6id.4xlarge", + cores=16, + memory=32.0, + disks=1.0, + disk_capacity=950.0, + architecture="amd64", + ), + "c6id.8xlarge": InstanceType( + name="c6id.8xlarge", + cores=32, + memory=64.0, + disks=1.0, + disk_capacity=1900.0, + architecture="amd64", + ), + "c6id.large": InstanceType( + name="c6id.large", + cores=2, + memory=4.0, + disks=1.0, + disk_capacity=118.0, + architecture="amd64", + ), + "c6id.metal": InstanceType( + name="c6id.metal", + cores=128, + memory=256.0, + disks=4.0, + disk_capacity=1900.0, + architecture="amd64", + ), + "c6id.xlarge": InstanceType( + name="c6id.xlarge", + cores=4, + memory=8.0, + disks=1.0, + disk_capacity=237.0, + architecture="amd64", + ), + "c6in.12xlarge": InstanceType( + name="c6in.12xlarge", + cores=48, + memory=96.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c6in.16xlarge": InstanceType( + name="c6in.16xlarge", + cores=64, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c6in.24xlarge": InstanceType( + name="c6in.24xlarge", + cores=96, + memory=192.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c6in.2xlarge": InstanceType( + name="c6in.2xlarge", + cores=8, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c6in.32xlarge": InstanceType( + name="c6in.32xlarge", + cores=128, + memory=256.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c6in.4xlarge": InstanceType( + name="c6in.4xlarge", + cores=16, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c6in.8xlarge": InstanceType( + name="c6in.8xlarge", + cores=32, + memory=64.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c6in.large": InstanceType( + name="c6in.large", + cores=2, + memory=4.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c6in.metal": InstanceType( + name="c6in.metal", + cores=128, + memory=256.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c6in.xlarge": InstanceType( + name="c6in.xlarge", + cores=4, + memory=8.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c7a.12xlarge": InstanceType( + name="c7a.12xlarge", + cores=48, + memory=96.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c7a.16xlarge": InstanceType( + name="c7a.16xlarge", + cores=64, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c7a.24xlarge": InstanceType( + name="c7a.24xlarge", + cores=96, + memory=192.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c7a.2xlarge": InstanceType( + name="c7a.2xlarge", + cores=8, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c7a.32xlarge": InstanceType( + name="c7a.32xlarge", + cores=128, + memory=256.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c7a.48xlarge": InstanceType( + name="c7a.48xlarge", + cores=192, + memory=384.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c7a.4xlarge": InstanceType( + name="c7a.4xlarge", + cores=16, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c7a.8xlarge": InstanceType( + name="c7a.8xlarge", + cores=32, + memory=64.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c7a.large": InstanceType( + name="c7a.large", + cores=2, + memory=4.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c7a.medium": InstanceType( + name="c7a.medium", + cores=1, + memory=2.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c7a.metal-48xl": InstanceType( + name="c7a.metal-48xl", + cores=192, + memory=384.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c7a.xlarge": InstanceType( + name="c7a.xlarge", + cores=4, + memory=8.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c7g.12xlarge": InstanceType( + name="c7g.12xlarge", + cores=48, + memory=96.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "c7g.16xlarge": InstanceType( + name="c7g.16xlarge", + cores=64, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "c7g.2xlarge": InstanceType( + name="c7g.2xlarge", + cores=8, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "c7g.4xlarge": InstanceType( + name="c7g.4xlarge", + cores=16, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "c7g.8xlarge": InstanceType( + name="c7g.8xlarge", + cores=32, + memory=64.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "c7g.large": InstanceType( + name="c7g.large", + cores=2, + memory=4.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "c7g.medium": InstanceType( + name="c7g.medium", + cores=1, + memory=2.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "c7g.metal": InstanceType( + name="c7g.metal", + cores=64, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "c7g.xlarge": InstanceType( + name="c7g.xlarge", + cores=4, + memory=8.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "c7gd.12xlarge": InstanceType( + name="c7gd.12xlarge", + cores=48, + memory=96.0, + disks=2.0, + disk_capacity=1425.0, + architecture="arm64", + ), + "c7gd.16xlarge": InstanceType( + name="c7gd.16xlarge", + cores=64, + memory=128.0, + disks=2.0, + disk_capacity=1900.0, + architecture="arm64", + ), + "c7gd.2xlarge": InstanceType( + name="c7gd.2xlarge", + cores=8, + memory=16.0, + disks=1.0, + disk_capacity=475.0, + architecture="arm64", + ), + "c7gd.4xlarge": InstanceType( + name="c7gd.4xlarge", + cores=16, + memory=32.0, + disks=1.0, + disk_capacity=950.0, + architecture="arm64", + ), + "c7gd.8xlarge": InstanceType( + name="c7gd.8xlarge", + cores=32, + memory=64.0, + disks=1.0, + disk_capacity=1900.0, + architecture="arm64", + ), + "c7gd.large": InstanceType( + name="c7gd.large", + cores=2, + memory=4.0, + disks=1.0, + disk_capacity=118.0, + architecture="arm64", + ), + "c7gd.medium": InstanceType( + name="c7gd.medium", + cores=1, + memory=2.0, + disks=1.0, + disk_capacity=59.0, + architecture="arm64", + ), + "c7gd.xlarge": InstanceType( + name="c7gd.xlarge", + cores=4, + memory=8.0, + disks=1.0, + disk_capacity=237.0, + architecture="arm64", + ), + "c7gn.12xlarge": InstanceType( + name="c7gn.12xlarge", + cores=48, + memory=96.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "c7gn.16xlarge": InstanceType( + name="c7gn.16xlarge", + cores=64, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "c7gn.2xlarge": InstanceType( + name="c7gn.2xlarge", + cores=8, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "c7gn.4xlarge": InstanceType( + name="c7gn.4xlarge", + cores=16, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "c7gn.8xlarge": InstanceType( + name="c7gn.8xlarge", + cores=32, + memory=64.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "c7gn.large": InstanceType( + name="c7gn.large", + cores=2, + memory=4.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "c7gn.medium": InstanceType( + name="c7gn.medium", + cores=1, + memory=2.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "c7gn.xlarge": InstanceType( + name="c7gn.xlarge", + cores=4, + memory=8.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "c7i.12xlarge": InstanceType( + name="c7i.12xlarge", + cores=48, + memory=96.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c7i.16xlarge": InstanceType( + name="c7i.16xlarge", + cores=64, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c7i.24xlarge": InstanceType( + name="c7i.24xlarge", + cores=96, + memory=192.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c7i.2xlarge": InstanceType( + name="c7i.2xlarge", + cores=8, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c7i.48xlarge": InstanceType( + name="c7i.48xlarge", + cores=192, + memory=384.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c7i.4xlarge": InstanceType( + name="c7i.4xlarge", + cores=16, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c7i.8xlarge": InstanceType( + name="c7i.8xlarge", + cores=32, + memory=64.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c7i.large": InstanceType( + name="c7i.large", + cores=2, + memory=4.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c7i.metal-24xl": InstanceType( + name="c7i.metal-24xl", + cores=96, + memory=192.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c7i.metal-48xl": InstanceType( + name="c7i.metal-48xl", + cores=192, + memory=384.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "c7i.xlarge": InstanceType( + name="c7i.xlarge", + cores=4, + memory=8.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "cc2.8xlarge": InstanceType( + name="cc2.8xlarge", + cores=32, + memory=60.5, + disks=4.0, + disk_capacity=840.0, + architecture="amd64", + ), + "cr1.8xlarge": InstanceType( + name="cr1.8xlarge", + cores=32, + memory=244.0, + disks=2.0, + disk_capacity=120.0, + architecture="amd64", + ), + "d2.2xlarge": InstanceType( + name="d2.2xlarge", + cores=8, + memory=61.0, + disks=6.0, + disk_capacity=2000.0, + architecture="amd64", + ), + "d2.4xlarge": InstanceType( + name="d2.4xlarge", + cores=16, + memory=122.0, + disks=12.0, + disk_capacity=2000.0, + architecture="amd64", + ), + "d2.8xlarge": InstanceType( + name="d2.8xlarge", + cores=36, + memory=244.0, + disks=24.0, + disk_capacity=2000.0, + architecture="amd64", + ), + "d2.xlarge": InstanceType( + name="d2.xlarge", + cores=4, + memory=30.5, + disks=3.0, + disk_capacity=2000.0, + architecture="amd64", + ), + "d3.2xlarge": InstanceType( + name="d3.2xlarge", + cores=8, + memory=64.0, + disks=6.0, + disk_capacity=2000.0, + architecture="amd64", + ), + "d3.4xlarge": InstanceType( + name="d3.4xlarge", + cores=16, + memory=128.0, + disks=12.0, + disk_capacity=2000.0, + architecture="amd64", + ), + "d3.8xlarge": InstanceType( + name="d3.8xlarge", + cores=32, + memory=256.0, + disks=24.0, + disk_capacity=2000.0, + architecture="amd64", + ), + "d3.xlarge": InstanceType( + name="d3.xlarge", + cores=4, + memory=32.0, + disks=3.0, + disk_capacity=2000.0, + architecture="amd64", + ), + "d3en.12xlarge": InstanceType( + name="d3en.12xlarge", + cores=48, + memory=192.0, + disks=24.0, + disk_capacity=14000.0, + architecture="amd64", + ), + "d3en.2xlarge": InstanceType( + name="d3en.2xlarge", + cores=8, + memory=32.0, + disks=4.0, + disk_capacity=14000.0, + architecture="amd64", + ), + "d3en.4xlarge": InstanceType( + name="d3en.4xlarge", + cores=16, + memory=64.0, + disks=8.0, + disk_capacity=14000.0, + architecture="amd64", + ), + "d3en.6xlarge": InstanceType( + name="d3en.6xlarge", + cores=24, + memory=96.0, + disks=12.0, + disk_capacity=14000.0, + architecture="amd64", + ), + "d3en.8xlarge": InstanceType( + name="d3en.8xlarge", + cores=32, + memory=128.0, + disks=16.0, + disk_capacity=14000.0, + architecture="amd64", + ), + "d3en.xlarge": InstanceType( + name="d3en.xlarge", + cores=4, + memory=16.0, + disks=2.0, + disk_capacity=14000.0, + architecture="amd64", + ), + "dl1.24xlarge": InstanceType( + name="dl1.24xlarge", + cores=96, + memory=768.0, + disks=4.0, + disk_capacity=1000.0, + architecture="amd64", + ), + "dl2q.24xlarge": InstanceType( + name="dl2q.24xlarge", + cores=96, + memory=768.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "f1.16xlarge": InstanceType( + name="f1.16xlarge", + cores=64, + memory=976.0, + disks=4.0, + disk_capacity=940.0, + architecture="amd64", + ), + "f1.2xlarge": InstanceType( + name="f1.2xlarge", + cores=8, + memory=122.0, + disks=1.0, + disk_capacity=470.0, + architecture="amd64", + ), + "f1.4xlarge": InstanceType( + name="f1.4xlarge", + cores=16, + memory=244.0, + disks=1.0, + disk_capacity=940.0, + architecture="amd64", + ), + "g2.2xlarge": InstanceType( + name="g2.2xlarge", + cores=8, + memory=15.0, + disks=1.0, + disk_capacity=60.0, + architecture="amd64", + ), + "g2.8xlarge": InstanceType( + name="g2.8xlarge", + cores=32, + memory=60.0, + disks=2.0, + disk_capacity=120.0, + architecture="amd64", + ), + "g3.16xlarge": InstanceType( + name="g3.16xlarge", + cores=64, + memory=488.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "g3.4xlarge": InstanceType( + name="g3.4xlarge", + cores=16, + memory=122.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "g3.8xlarge": InstanceType( + name="g3.8xlarge", + cores=32, + memory=244.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "g3s.xlarge": InstanceType( + name="g3s.xlarge", + cores=4, + memory=30.5, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "g4ad.16xlarge": InstanceType( + name="g4ad.16xlarge", + cores=64, + memory=256.0, + disks=1, + disk_capacity=2400.0, + architecture="amd64", + ), + "g4ad.2xlarge": InstanceType( + name="g4ad.2xlarge", + cores=8, + memory=32.0, + disks=1, + disk_capacity=300.0, + architecture="amd64", + ), + "g4ad.4xlarge": InstanceType( + name="g4ad.4xlarge", + cores=16, + memory=64.0, + disks=1, + disk_capacity=600.0, + architecture="amd64", + ), + "g4ad.8xlarge": InstanceType( + name="g4ad.8xlarge", + cores=32, + memory=128.0, + disks=1, + disk_capacity=1200.0, + architecture="amd64", + ), + "g4ad.xlarge": InstanceType( + name="g4ad.xlarge", + cores=4, + memory=16.0, + disks=1, + disk_capacity=150.0, + architecture="amd64", + ), + "g4dn.12xlarge": InstanceType( + name="g4dn.12xlarge", + cores=48, + memory=192.0, + disks=1, + disk_capacity=900.0, + architecture="amd64", + ), + "g4dn.16xlarge": InstanceType( + name="g4dn.16xlarge", + cores=64, + memory=256.0, + disks=1, + disk_capacity=900.0, + architecture="amd64", + ), + "g4dn.2xlarge": InstanceType( + name="g4dn.2xlarge", + cores=8, + memory=32.0, + disks=1, + disk_capacity=225.0, + architecture="amd64", + ), + "g4dn.4xlarge": InstanceType( + name="g4dn.4xlarge", + cores=16, + memory=64.0, + disks=1, + disk_capacity=225.0, + architecture="amd64", + ), + "g4dn.8xlarge": InstanceType( + name="g4dn.8xlarge", + cores=32, + memory=128.0, + disks=1, + disk_capacity=900.0, + architecture="amd64", + ), + "g4dn.metal": InstanceType( + name="g4dn.metal", + cores=96, + memory=384.0, + disks=2.0, + disk_capacity=900.0, + architecture="amd64", + ), + "g4dn.xlarge": InstanceType( + name="g4dn.xlarge", + cores=4, + memory=16.0, + disks=1, + disk_capacity=125.0, + architecture="amd64", + ), + "g5.12xlarge": InstanceType( + name="g5.12xlarge", + cores=48, + memory=192.0, + disks=1.0, + disk_capacity=3800.0, + architecture="amd64", + ), + "g5.16xlarge": InstanceType( + name="g5.16xlarge", + cores=64, + memory=256.0, + disks=1.0, + disk_capacity=1900.0, + architecture="amd64", + ), + "g5.24xlarge": InstanceType( + name="g5.24xlarge", + cores=96, + memory=384.0, + disks=1.0, + disk_capacity=3800.0, + architecture="amd64", + ), + "g5.2xlarge": InstanceType( + name="g5.2xlarge", + cores=8, + memory=32.0, + disks=1.0, + disk_capacity=450.0, + architecture="amd64", + ), + "g5.48xlarge": InstanceType( + name="g5.48xlarge", + cores=192, + memory=768.0, + disks=2.0, + disk_capacity=3800.0, + architecture="amd64", + ), + "g5.4xlarge": InstanceType( + name="g5.4xlarge", + cores=16, + memory=64.0, + disks=1.0, + disk_capacity=600.0, + architecture="amd64", + ), + "g5.8xlarge": InstanceType( + name="g5.8xlarge", + cores=32, + memory=128.0, + disks=1.0, + disk_capacity=900.0, + architecture="amd64", + ), + "g5.xlarge": InstanceType( + name="g5.xlarge", + cores=4, + memory=16.0, + disks=1.0, + disk_capacity=250.0, + architecture="amd64", + ), + "g5g.16xlarge": InstanceType( + name="g5g.16xlarge", + cores=64, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "g5g.2xlarge": InstanceType( + name="g5g.2xlarge", + cores=8, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "g5g.4xlarge": InstanceType( + name="g5g.4xlarge", + cores=16, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "g5g.8xlarge": InstanceType( + name="g5g.8xlarge", + cores=32, + memory=64.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "g5g.metal": InstanceType( + name="g5g.metal", + cores=64, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "g5g.xlarge": InstanceType( + name="g5g.xlarge", + cores=4, + memory=8.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "h1.16xlarge": InstanceType( + name="h1.16xlarge", + cores=64, + memory=256.0, + disks=8.0, + disk_capacity=2000.0, + architecture="amd64", + ), + "h1.2xlarge": InstanceType( + name="h1.2xlarge", + cores=8, + memory=32.0, + disks=1.0, + disk_capacity=2000.0, + architecture="amd64", + ), + "h1.4xlarge": InstanceType( + name="h1.4xlarge", + cores=16, + memory=64.0, + disks=2.0, + disk_capacity=2000.0, + architecture="amd64", + ), + "h1.8xlarge": InstanceType( + name="h1.8xlarge", + cores=32, + memory=128.0, + disks=4.0, + disk_capacity=2000.0, + architecture="amd64", + ), + "hpc6a.48xlarge": InstanceType( + name="hpc6a.48xlarge", + cores=96, + memory=384.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "hpc6id.32xlarge": InstanceType( + name="hpc6id.32xlarge", + cores=64, + memory=1024.0, + disks=4.0, + disk_capacity=3800.0, + architecture="amd64", + ), + "hpc7a.12xlarge": InstanceType( + name="hpc7a.12xlarge", + cores=24, + memory=768.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "hpc7a.24xlarge": InstanceType( + name="hpc7a.24xlarge", + cores=48, + memory=768.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "hpc7a.48xlarge": InstanceType( + name="hpc7a.48xlarge", + cores=96, + memory=768.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "hpc7a.96xlarge": InstanceType( + name="hpc7a.96xlarge", + cores=192, + memory=768.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "hpc7g.16xlarge": InstanceType( + name="hpc7g.16xlarge", + cores=64, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "hpc7g.4xlarge": InstanceType( + name="hpc7g.4xlarge", + cores=16, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "hpc7g.8xlarge": InstanceType( + name="hpc7g.8xlarge", + cores=32, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "hs1.8xlarge": InstanceType( + name="hs1.8xlarge", + cores=16, + memory=117.0, + disks=24.0, + disk_capacity=2000.0, + architecture="amd64", + ), + "i2.2xlarge": InstanceType( + name="i2.2xlarge", + cores=8, + memory=61.0, + disks=2.0, + disk_capacity=800.0, + architecture="amd64", + ), + "i2.4xlarge": InstanceType( + name="i2.4xlarge", + cores=16, + memory=122.0, + disks=4.0, + disk_capacity=800.0, + architecture="amd64", + ), + "i2.8xlarge": InstanceType( + name="i2.8xlarge", + cores=32, + memory=244.0, + disks=8.0, + disk_capacity=800.0, + architecture="amd64", + ), + "i2.xlarge": InstanceType( + name="i2.xlarge", + cores=4, + memory=30.5, + disks=1.0, + disk_capacity=800.0, + architecture="amd64", + ), + "i3.16xlarge": InstanceType( + name="i3.16xlarge", + cores=64, + memory=488.0, + disks=8.0, + disk_capacity=1900.0, + architecture="amd64", + ), + "i3.2xlarge": InstanceType( + name="i3.2xlarge", + cores=8, + memory=61.0, + disks=1.0, + disk_capacity=1900.0, + architecture="amd64", + ), + "i3.4xlarge": InstanceType( + name="i3.4xlarge", + cores=16, + memory=122.0, + disks=2.0, + disk_capacity=1900.0, + architecture="amd64", + ), + "i3.8xlarge": InstanceType( + name="i3.8xlarge", + cores=32, + memory=244.0, + disks=4.0, + disk_capacity=1900.0, + architecture="amd64", + ), + "i3.large": InstanceType( + name="i3.large", + cores=2, + memory=15.25, + disks=1.0, + disk_capacity=475.0, + architecture="amd64", + ), + "i3.metal": InstanceType( + name="i3.metal", + cores=72, + memory=512.0, + disks=8.0, + disk_capacity=1900.0, + architecture="amd64", + ), + "i3.xlarge": InstanceType( + name="i3.xlarge", + cores=4, + memory=30.5, + disks=1.0, + disk_capacity=950.0, + architecture="amd64", + ), + "i3en.12xlarge": InstanceType( + name="i3en.12xlarge", + cores=48, + memory=384.0, + disks=4.0, + disk_capacity=7500.0, + architecture="amd64", + ), + "i3en.24xlarge": InstanceType( + name="i3en.24xlarge", + cores=96, + memory=768.0, + disks=8.0, + disk_capacity=7500.0, + architecture="amd64", + ), + "i3en.2xlarge": InstanceType( + name="i3en.2xlarge", + cores=8, + memory=64.0, + disks=2.0, + disk_capacity=2500.0, + architecture="amd64", + ), + "i3en.3xlarge": InstanceType( + name="i3en.3xlarge", + cores=12, + memory=96.0, + disks=1.0, + disk_capacity=7500.0, + architecture="amd64", + ), + "i3en.6xlarge": InstanceType( + name="i3en.6xlarge", + cores=24, + memory=192.0, + disks=2.0, + disk_capacity=7500.0, + architecture="amd64", + ), + "i3en.large": InstanceType( + name="i3en.large", + cores=2, + memory=16.0, + disks=1.0, + disk_capacity=1250.0, + architecture="amd64", + ), + "i3en.metal": InstanceType( + name="i3en.metal", + cores=96, + memory=768.0, + disks=8.0, + disk_capacity=7500.0, + architecture="amd64", + ), + "i3en.xlarge": InstanceType( + name="i3en.xlarge", + cores=4, + memory=32.0, + disks=1.0, + disk_capacity=2500.0, + architecture="amd64", + ), + "i3p.16xlarge": InstanceType( + name="i3p.16xlarge", + cores=64, + memory=488.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "i4g.16xlarge": InstanceType( + name="i4g.16xlarge", + cores=64, + memory=512.0, + disks=4.0, + disk_capacity=3750.0, + architecture="arm64", + ), + "i4g.2xlarge": InstanceType( + name="i4g.2xlarge", + cores=8, + memory=64.0, + disks=1.0, + disk_capacity=1875.0, + architecture="arm64", + ), + "i4g.4xlarge": InstanceType( + name="i4g.4xlarge", + cores=16, + memory=128.0, + disks=1.0, + disk_capacity=3750.0, + architecture="arm64", + ), + "i4g.8xlarge": InstanceType( + name="i4g.8xlarge", + cores=32, + memory=256.0, + disks=2.0, + disk_capacity=3750.0, + architecture="arm64", + ), + "i4g.large": InstanceType( + name="i4g.large", + cores=2, + memory=16.0, + disks=1.0, + disk_capacity=468.0, + architecture="arm64", + ), + "i4g.xlarge": InstanceType( + name="i4g.xlarge", + cores=4, + memory=32.0, + disks=1.0, + disk_capacity=937.0, + architecture="arm64", + ), + "i4i.12xlarge": InstanceType( + name="i4i.12xlarge", + cores=48, + memory=384.0, + disks=3.0, + disk_capacity=3750.0, + architecture="amd64", + ), + "i4i.16xlarge": InstanceType( + name="i4i.16xlarge", + cores=64, + memory=512.0, + disks=4.0, + disk_capacity=3750.0, + architecture="amd64", + ), + "i4i.24xlarge": InstanceType( + name="i4i.24xlarge", + cores=96, + memory=768.0, + disks=6.0, + disk_capacity=3750.0, + architecture="amd64", + ), + "i4i.2xlarge": InstanceType( + name="i4i.2xlarge", + cores=8, + memory=64.0, + disks=1.0, + disk_capacity=1875.0, + architecture="amd64", + ), + "i4i.32xlarge": InstanceType( + name="i4i.32xlarge", + cores=128, + memory=1024.0, + disks=8.0, + disk_capacity=3750.0, + architecture="amd64", + ), + "i4i.4xlarge": InstanceType( + name="i4i.4xlarge", + cores=16, + memory=128.0, + disks=1.0, + disk_capacity=3750.0, + architecture="amd64", + ), + "i4i.8xlarge": InstanceType( + name="i4i.8xlarge", + cores=32, + memory=256.0, + disks=2.0, + disk_capacity=3750.0, + architecture="amd64", + ), + "i4i.large": InstanceType( + name="i4i.large", + cores=2, + memory=16.0, + disks=1.0, + disk_capacity=468.0, + architecture="amd64", + ), + "i4i.metal": InstanceType( + name="i4i.metal", + cores=128, + memory=1024.0, + disks=8.0, + disk_capacity=3750.0, + architecture="amd64", + ), + "i4i.xlarge": InstanceType( + name="i4i.xlarge", + cores=4, + memory=32.0, + disks=1.0, + disk_capacity=937.0, + architecture="amd64", + ), + "im4gn.16xlarge": InstanceType( + name="im4gn.16xlarge", + cores=64, + memory=256.0, + disks=4.0, + disk_capacity=7500.0, + architecture="arm64", + ), + "im4gn.2xlarge": InstanceType( + name="im4gn.2xlarge", + cores=8, + memory=32.0, + disks=1.0, + disk_capacity=3750.0, + architecture="arm64", + ), + "im4gn.4xlarge": InstanceType( + name="im4gn.4xlarge", + cores=16, + memory=64.0, + disks=1.0, + disk_capacity=7500.0, + architecture="arm64", + ), + "im4gn.8xlarge": InstanceType( + name="im4gn.8xlarge", + cores=32, + memory=128.0, + disks=2.0, + disk_capacity=7500.0, + architecture="arm64", + ), + "im4gn.large": InstanceType( + name="im4gn.large", + cores=2, + memory=8.0, + disks=1.0, + disk_capacity=937.0, + architecture="arm64", + ), + "im4gn.xlarge": InstanceType( + name="im4gn.xlarge", + cores=4, + memory=16.0, + disks=1.0, + disk_capacity=1875.0, + architecture="arm64", + ), + "inf1.24xlarge": InstanceType( + name="inf1.24xlarge", + cores=96, + memory=192.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "inf1.2xlarge": InstanceType( + name="inf1.2xlarge", + cores=8, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "inf1.6xlarge": InstanceType( + name="inf1.6xlarge", + cores=24, + memory=48.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "inf1.xlarge": InstanceType( + name="inf1.xlarge", + cores=4, + memory=8.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "inf2.24xlarge": InstanceType( + name="inf2.24xlarge", + cores=96, + memory=384.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "inf2.48xlarge": InstanceType( + name="inf2.48xlarge", + cores=192, + memory=768.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "inf2.8xlarge": InstanceType( + name="inf2.8xlarge", + cores=32, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "inf2.xlarge": InstanceType( + name="inf2.xlarge", + cores=4, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "is4gen.2xlarge": InstanceType( + name="is4gen.2xlarge", + cores=8, + memory=48.0, + disks=1.0, + disk_capacity=7500.0, + architecture="arm64", + ), + "is4gen.4xlarge": InstanceType( + name="is4gen.4xlarge", + cores=16, + memory=96.0, + disks=2.0, + disk_capacity=7500.0, + architecture="arm64", + ), + "is4gen.8xlarge": InstanceType( + name="is4gen.8xlarge", + cores=32, + memory=192.0, + disks=4.0, + disk_capacity=7500.0, + architecture="arm64", + ), + "is4gen.large": InstanceType( + name="is4gen.large", + cores=2, + memory=12.0, + disks=1.0, + disk_capacity=1875.0, + architecture="arm64", + ), + "is4gen.medium": InstanceType( + name="is4gen.medium", + cores=1, + memory=6.0, + disks=1.0, + disk_capacity=937.0, + architecture="arm64", + ), + "is4gen.xlarge": InstanceType( + name="is4gen.xlarge", + cores=4, + memory=24.0, + disks=1.0, + disk_capacity=3750.0, + architecture="arm64", + ), + "m1.large": InstanceType( + name="m1.large", + cores=2, + memory=7.5, + disks=2.0, + disk_capacity=420.0, + architecture="amd64", + ), + "m1.medium": InstanceType( + name="m1.medium", + cores=1, + memory=3.75, + disks=1.0, + disk_capacity=410.0, + architecture="amd64", + ), + "m1.xlarge": InstanceType( + name="m1.xlarge", + cores=4, + memory=15.0, + disks=4.0, + disk_capacity=420.0, + architecture="amd64", + ), + "m2.2xlarge": InstanceType( + name="m2.2xlarge", + cores=4, + memory=34.2, + disks=1.0, + disk_capacity=850.0, + architecture="amd64", + ), + "m2.4xlarge": InstanceType( + name="m2.4xlarge", + cores=8, + memory=68.4, + disks=2.0, + disk_capacity=840.0, + architecture="amd64", + ), + "m2.xlarge": InstanceType( + name="m2.xlarge", + cores=2, + memory=17.1, + disks=1.0, + disk_capacity=420.0, + architecture="amd64", + ), + "m3.2xlarge": InstanceType( + name="m3.2xlarge", + cores=8, + memory=30.0, + disks=2.0, + disk_capacity=80.0, + architecture="amd64", + ), + "m3.large": InstanceType( + name="m3.large", + cores=2, + memory=7.5, + disks=1.0, + disk_capacity=32.0, + architecture="amd64", + ), + "m3.medium": InstanceType( + name="m3.medium", + cores=1, + memory=3.75, + disks=1.0, + disk_capacity=4.0, + architecture="amd64", + ), + "m3.xlarge": InstanceType( + name="m3.xlarge", + cores=4, + memory=15.0, + disks=2.0, + disk_capacity=40.0, + architecture="amd64", + ), + "m4.10xlarge": InstanceType( + name="m4.10xlarge", + cores=40, + memory=160.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m4.16xlarge": InstanceType( + name="m4.16xlarge", + cores=64, + memory=256.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m4.2xlarge": InstanceType( + name="m4.2xlarge", + cores=8, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m4.4xlarge": InstanceType( + name="m4.4xlarge", + cores=16, + memory=64.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m4.large": InstanceType( + name="m4.large", + cores=2, + memory=8.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m4.xlarge": InstanceType( + name="m4.xlarge", + cores=4, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m5.12xlarge": InstanceType( + name="m5.12xlarge", + cores=48, + memory=192.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m5.16xlarge": InstanceType( + name="m5.16xlarge", + cores=64, + memory=256.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m5.24xlarge": InstanceType( + name="m5.24xlarge", + cores=96, + memory=384.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m5.2xlarge": InstanceType( + name="m5.2xlarge", + cores=8, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m5.4xlarge": InstanceType( + name="m5.4xlarge", + cores=16, + memory=64.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m5.8xlarge": InstanceType( + name="m5.8xlarge", + cores=32, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m5.large": InstanceType( + name="m5.large", + cores=2, + memory=8.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m5.metal": InstanceType( + name="m5.metal", + cores=96, + memory=384.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m5.xlarge": InstanceType( + name="m5.xlarge", + cores=4, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m5a.12xlarge": InstanceType( + name="m5a.12xlarge", + cores=48, + memory=192.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m5a.16xlarge": InstanceType( + name="m5a.16xlarge", + cores=64, + memory=256.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m5a.24xlarge": InstanceType( + name="m5a.24xlarge", + cores=96, + memory=384.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m5a.2xlarge": InstanceType( + name="m5a.2xlarge", + cores=8, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m5a.4xlarge": InstanceType( + name="m5a.4xlarge", + cores=16, + memory=64.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m5a.8xlarge": InstanceType( + name="m5a.8xlarge", + cores=32, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m5a.large": InstanceType( + name="m5a.large", + cores=2, + memory=8.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m5a.xlarge": InstanceType( + name="m5a.xlarge", + cores=4, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m5ad.12xlarge": InstanceType( + name="m5ad.12xlarge", + cores=48, + memory=192.0, + disks=2.0, + disk_capacity=900.0, + architecture="amd64", + ), + "m5ad.16xlarge": InstanceType( + name="m5ad.16xlarge", + cores=64, + memory=256.0, + disks=4.0, + disk_capacity=600.0, + architecture="amd64", + ), + "m5ad.24xlarge": InstanceType( + name="m5ad.24xlarge", + cores=96, + memory=384.0, + disks=4.0, + disk_capacity=900.0, + architecture="amd64", + ), + "m5ad.2xlarge": InstanceType( + name="m5ad.2xlarge", + cores=8, + memory=32.0, + disks=1.0, + disk_capacity=300.0, + architecture="amd64", + ), + "m5ad.4xlarge": InstanceType( + name="m5ad.4xlarge", + cores=16, + memory=64.0, + disks=2.0, + disk_capacity=300.0, + architecture="amd64", + ), + "m5ad.8xlarge": InstanceType( + name="m5ad.8xlarge", + cores=32, + memory=128.0, + disks=2.0, + disk_capacity=600.0, + architecture="amd64", + ), + "m5ad.large": InstanceType( + name="m5ad.large", + cores=2, + memory=8.0, + disks=1.0, + disk_capacity=75.0, + architecture="amd64", + ), + "m5ad.xlarge": InstanceType( + name="m5ad.xlarge", + cores=4, + memory=16.0, + disks=1.0, + disk_capacity=150.0, + architecture="amd64", + ), + "m5d.12xlarge": InstanceType( + name="m5d.12xlarge", + cores=48, + memory=192.0, + disks=2.0, + disk_capacity=900.0, + architecture="amd64", + ), + "m5d.16xlarge": InstanceType( + name="m5d.16xlarge", + cores=64, + memory=256.0, + disks=4.0, + disk_capacity=600.0, + architecture="amd64", + ), + "m5d.24xlarge": InstanceType( + name="m5d.24xlarge", + cores=96, + memory=384.0, + disks=4.0, + disk_capacity=900.0, + architecture="amd64", + ), + "m5d.2xlarge": InstanceType( + name="m5d.2xlarge", + cores=8, + memory=32.0, + disks=1.0, + disk_capacity=300.0, + architecture="amd64", + ), + "m5d.4xlarge": InstanceType( + name="m5d.4xlarge", + cores=16, + memory=64.0, + disks=2.0, + disk_capacity=300.0, + architecture="amd64", + ), + "m5d.8xlarge": InstanceType( + name="m5d.8xlarge", + cores=32, + memory=128.0, + disks=2.0, + disk_capacity=600.0, + architecture="amd64", + ), + "m5d.large": InstanceType( + name="m5d.large", + cores=2, + memory=8.0, + disks=1.0, + disk_capacity=75.0, + architecture="amd64", + ), + "m5d.metal": InstanceType( + name="m5d.metal", + cores=96, + memory=384.0, + disks=4.0, + disk_capacity=900.0, + architecture="amd64", + ), + "m5d.xlarge": InstanceType( + name="m5d.xlarge", + cores=4, + memory=16.0, + disks=1.0, + disk_capacity=150.0, + architecture="amd64", + ), + "m5dn.12xlarge": InstanceType( + name="m5dn.12xlarge", + cores=48, + memory=192.0, + disks=2.0, + disk_capacity=900.0, + architecture="amd64", + ), + "m5dn.16xlarge": InstanceType( + name="m5dn.16xlarge", + cores=64, + memory=256.0, + disks=4.0, + disk_capacity=600.0, + architecture="amd64", + ), + "m5dn.24xlarge": InstanceType( + name="m5dn.24xlarge", + cores=96, + memory=384.0, + disks=4.0, + disk_capacity=900.0, + architecture="amd64", + ), + "m5dn.2xlarge": InstanceType( + name="m5dn.2xlarge", + cores=8, + memory=32.0, + disks=1.0, + disk_capacity=300.0, + architecture="amd64", + ), + "m5dn.4xlarge": InstanceType( + name="m5dn.4xlarge", + cores=16, + memory=64.0, + disks=2.0, + disk_capacity=300.0, + architecture="amd64", + ), + "m5dn.8xlarge": InstanceType( + name="m5dn.8xlarge", + cores=32, + memory=128.0, + disks=2.0, + disk_capacity=600.0, + architecture="amd64", + ), + "m5dn.large": InstanceType( + name="m5dn.large", + cores=2, + memory=8.0, + disks=1.0, + disk_capacity=75.0, + architecture="amd64", + ), + "m5dn.metal": InstanceType( + name="m5dn.metal", + cores=96, + memory=384.0, + disks=4.0, + disk_capacity=900.0, + architecture="amd64", + ), + "m5dn.xlarge": InstanceType( + name="m5dn.xlarge", + cores=4, + memory=16.0, + disks=1.0, + disk_capacity=150.0, + architecture="amd64", + ), + "m5n.12xlarge": InstanceType( + name="m5n.12xlarge", + cores=48, + memory=192.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m5n.16xlarge": InstanceType( + name="m5n.16xlarge", + cores=64, + memory=256.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m5n.24xlarge": InstanceType( + name="m5n.24xlarge", + cores=96, + memory=384.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m5n.2xlarge": InstanceType( + name="m5n.2xlarge", + cores=8, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m5n.4xlarge": InstanceType( + name="m5n.4xlarge", + cores=16, + memory=64.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m5n.8xlarge": InstanceType( + name="m5n.8xlarge", + cores=32, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m5n.large": InstanceType( + name="m5n.large", + cores=2, + memory=8.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m5n.metal": InstanceType( + name="m5n.metal", + cores=96, + memory=384.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m5n.xlarge": InstanceType( + name="m5n.xlarge", + cores=4, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m5zn.12xlarge": InstanceType( + name="m5zn.12xlarge", + cores=48, + memory=192.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m5zn.2xlarge": InstanceType( + name="m5zn.2xlarge", + cores=8, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m5zn.3xlarge": InstanceType( + name="m5zn.3xlarge", + cores=12, + memory=48.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m5zn.6xlarge": InstanceType( + name="m5zn.6xlarge", + cores=24, + memory=96.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m5zn.large": InstanceType( + name="m5zn.large", + cores=2, + memory=8.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m5zn.metal": InstanceType( + name="m5zn.metal", + cores=48, + memory=192.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m5zn.xlarge": InstanceType( + name="m5zn.xlarge", + cores=4, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m6a.12xlarge": InstanceType( + name="m6a.12xlarge", + cores=48, + memory=192.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m6a.16xlarge": InstanceType( + name="m6a.16xlarge", + cores=64, + memory=256.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m6a.24xlarge": InstanceType( + name="m6a.24xlarge", + cores=96, + memory=384.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m6a.2xlarge": InstanceType( + name="m6a.2xlarge", + cores=8, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m6a.32xlarge": InstanceType( + name="m6a.32xlarge", + cores=128, + memory=512.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m6a.48xlarge": InstanceType( + name="m6a.48xlarge", + cores=192, + memory=768.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m6a.4xlarge": InstanceType( + name="m6a.4xlarge", + cores=16, + memory=64.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m6a.8xlarge": InstanceType( + name="m6a.8xlarge", + cores=32, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m6a.large": InstanceType( + name="m6a.large", + cores=2, + memory=8.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m6a.metal": InstanceType( + name="m6a.metal", + cores=192, + memory=768.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m6a.xlarge": InstanceType( + name="m6a.xlarge", + cores=4, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m6g.12xlarge": InstanceType( + name="m6g.12xlarge", + cores=48, + memory=192.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "m6g.16xlarge": InstanceType( + name="m6g.16xlarge", + cores=64, + memory=256.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "m6g.2xlarge": InstanceType( + name="m6g.2xlarge", + cores=8, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "m6g.4xlarge": InstanceType( + name="m6g.4xlarge", + cores=16, + memory=64.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "m6g.8xlarge": InstanceType( + name="m6g.8xlarge", + cores=32, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "m6g.large": InstanceType( + name="m6g.large", + cores=2, + memory=8.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "m6g.medium": InstanceType( + name="m6g.medium", + cores=1, + memory=4.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "m6g.metal": InstanceType( + name="m6g.metal", + cores=64, + memory=256.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "m6g.xlarge": InstanceType( + name="m6g.xlarge", + cores=4, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "m6gd.12xlarge": InstanceType( + name="m6gd.12xlarge", + cores=48, + memory=192.0, + disks=2.0, + disk_capacity=1425.0, + architecture="arm64", + ), + "m6gd.16xlarge": InstanceType( + name="m6gd.16xlarge", + cores=64, + memory=256.0, + disks=2.0, + disk_capacity=1900.0, + architecture="arm64", + ), + "m6gd.2xlarge": InstanceType( + name="m6gd.2xlarge", + cores=8, + memory=32.0, + disks=1.0, + disk_capacity=475.0, + architecture="arm64", + ), + "m6gd.4xlarge": InstanceType( + name="m6gd.4xlarge", + cores=16, + memory=64.0, + disks=1.0, + disk_capacity=950.0, + architecture="arm64", + ), + "m6gd.8xlarge": InstanceType( + name="m6gd.8xlarge", + cores=32, + memory=128.0, + disks=1.0, + disk_capacity=1900.0, + architecture="arm64", + ), + "m6gd.large": InstanceType( + name="m6gd.large", + cores=2, + memory=8.0, + disks=1.0, + disk_capacity=118.0, + architecture="arm64", + ), + "m6gd.medium": InstanceType( + name="m6gd.medium", + cores=1, + memory=4.0, + disks=1.0, + disk_capacity=59.0, + architecture="arm64", + ), + "m6gd.metal": InstanceType( + name="m6gd.metal", + cores=64, + memory=256.0, + disks=2.0, + disk_capacity=1900.0, + architecture="arm64", + ), + "m6gd.xlarge": InstanceType( + name="m6gd.xlarge", + cores=4, + memory=16.0, + disks=1.0, + disk_capacity=237.0, + architecture="arm64", + ), + "m6i.12xlarge": InstanceType( + name="m6i.12xlarge", + cores=48, + memory=192.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m6i.16xlarge": InstanceType( + name="m6i.16xlarge", + cores=64, + memory=256.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m6i.24xlarge": InstanceType( + name="m6i.24xlarge", + cores=96, + memory=384.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m6i.2xlarge": InstanceType( + name="m6i.2xlarge", + cores=8, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m6i.32xlarge": InstanceType( + name="m6i.32xlarge", + cores=128, + memory=512.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m6i.4xlarge": InstanceType( + name="m6i.4xlarge", + cores=16, + memory=64.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m6i.8xlarge": InstanceType( + name="m6i.8xlarge", + cores=32, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m6i.large": InstanceType( + name="m6i.large", + cores=2, + memory=8.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m6i.metal": InstanceType( + name="m6i.metal", + cores=128, + memory=512.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m6i.xlarge": InstanceType( + name="m6i.xlarge", + cores=4, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m6id.12xlarge": InstanceType( + name="m6id.12xlarge", + cores=48, + memory=192.0, + disks=2.0, + disk_capacity=1425.0, + architecture="amd64", + ), + "m6id.16xlarge": InstanceType( + name="m6id.16xlarge", + cores=64, + memory=256.0, + disks=2.0, + disk_capacity=1900.0, + architecture="amd64", + ), + "m6id.24xlarge": InstanceType( + name="m6id.24xlarge", + cores=96, + memory=384.0, + disks=4.0, + disk_capacity=1425.0, + architecture="amd64", + ), + "m6id.2xlarge": InstanceType( + name="m6id.2xlarge", + cores=8, + memory=32.0, + disks=1.0, + disk_capacity=474.0, + architecture="amd64", + ), + "m6id.32xlarge": InstanceType( + name="m6id.32xlarge", + cores=128, + memory=512.0, + disks=4.0, + disk_capacity=1900.0, + architecture="amd64", + ), + "m6id.4xlarge": InstanceType( + name="m6id.4xlarge", + cores=16, + memory=64.0, + disks=1.0, + disk_capacity=950.0, + architecture="amd64", + ), + "m6id.8xlarge": InstanceType( + name="m6id.8xlarge", + cores=32, + memory=128.0, + disks=1.0, + disk_capacity=1900.0, + architecture="amd64", + ), + "m6id.large": InstanceType( + name="m6id.large", + cores=2, + memory=8.0, + disks=1.0, + disk_capacity=118.0, + architecture="amd64", + ), + "m6id.metal": InstanceType( + name="m6id.metal", + cores=128, + memory=512.0, + disks=4.0, + disk_capacity=1900.0, + architecture="amd64", + ), + "m6id.xlarge": InstanceType( + name="m6id.xlarge", + cores=4, + memory=16.0, + disks=1.0, + disk_capacity=237.0, + architecture="amd64", + ), + "m6idn.12xlarge": InstanceType( + name="m6idn.12xlarge", + cores=48, + memory=192.0, + disks=2.0, + disk_capacity=1425.0, + architecture="amd64", + ), + "m6idn.16xlarge": InstanceType( + name="m6idn.16xlarge", + cores=64, + memory=256.0, + disks=2.0, + disk_capacity=1900.0, + architecture="amd64", + ), + "m6idn.24xlarge": InstanceType( + name="m6idn.24xlarge", + cores=96, + memory=384.0, + disks=4.0, + disk_capacity=1425.0, + architecture="amd64", + ), + "m6idn.2xlarge": InstanceType( + name="m6idn.2xlarge", + cores=8, + memory=32.0, + disks=1.0, + disk_capacity=474.0, + architecture="amd64", + ), + "m6idn.32xlarge": InstanceType( + name="m6idn.32xlarge", + cores=128, + memory=512.0, + disks=4.0, + disk_capacity=1900.0, + architecture="amd64", + ), + "m6idn.4xlarge": InstanceType( + name="m6idn.4xlarge", + cores=16, + memory=64.0, + disks=1.0, + disk_capacity=950.0, + architecture="amd64", + ), + "m6idn.8xlarge": InstanceType( + name="m6idn.8xlarge", + cores=32, + memory=128.0, + disks=1.0, + disk_capacity=1900.0, + architecture="amd64", + ), + "m6idn.large": InstanceType( + name="m6idn.large", + cores=2, + memory=8.0, + disks=1.0, + disk_capacity=118.0, + architecture="amd64", + ), + "m6idn.metal": InstanceType( + name="m6idn.metal", + cores=128, + memory=512.0, + disks=4.0, + disk_capacity=1900.0, + architecture="amd64", + ), + "m6idn.xlarge": InstanceType( + name="m6idn.xlarge", + cores=4, + memory=16.0, + disks=1.0, + disk_capacity=237.0, + architecture="amd64", + ), + "m6in.12xlarge": InstanceType( + name="m6in.12xlarge", + cores=48, + memory=192.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m6in.16xlarge": InstanceType( + name="m6in.16xlarge", + cores=64, + memory=256.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m6in.24xlarge": InstanceType( + name="m6in.24xlarge", + cores=96, + memory=384.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m6in.2xlarge": InstanceType( + name="m6in.2xlarge", + cores=8, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m6in.32xlarge": InstanceType( + name="m6in.32xlarge", + cores=128, + memory=512.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m6in.4xlarge": InstanceType( + name="m6in.4xlarge", + cores=16, + memory=64.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m6in.8xlarge": InstanceType( + name="m6in.8xlarge", + cores=32, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m6in.large": InstanceType( + name="m6in.large", + cores=2, + memory=8.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m6in.metal": InstanceType( + name="m6in.metal", + cores=128, + memory=512.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m6in.xlarge": InstanceType( + name="m6in.xlarge", + cores=4, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m7a.12xlarge": InstanceType( + name="m7a.12xlarge", + cores=48, + memory=192.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m7a.16xlarge": InstanceType( + name="m7a.16xlarge", + cores=64, + memory=256.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m7a.24xlarge": InstanceType( + name="m7a.24xlarge", + cores=96, + memory=384.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m7a.2xlarge": InstanceType( + name="m7a.2xlarge", + cores=8, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m7a.32xlarge": InstanceType( + name="m7a.32xlarge", + cores=128, + memory=512.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m7a.48xlarge": InstanceType( + name="m7a.48xlarge", + cores=192, + memory=768.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m7a.4xlarge": InstanceType( + name="m7a.4xlarge", + cores=16, + memory=64.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m7a.8xlarge": InstanceType( + name="m7a.8xlarge", + cores=32, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m7a.large": InstanceType( + name="m7a.large", + cores=2, + memory=8.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m7a.medium": InstanceType( + name="m7a.medium", + cores=1, + memory=4.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m7a.metal-48xl": InstanceType( + name="m7a.metal-48xl", + cores=192, + memory=768.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m7a.xlarge": InstanceType( + name="m7a.xlarge", + cores=4, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m7g.12xlarge": InstanceType( + name="m7g.12xlarge", + cores=48, + memory=192.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "m7g.16xlarge": InstanceType( + name="m7g.16xlarge", + cores=64, + memory=256.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "m7g.2xlarge": InstanceType( + name="m7g.2xlarge", + cores=8, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "m7g.4xlarge": InstanceType( + name="m7g.4xlarge", + cores=16, + memory=64.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "m7g.8xlarge": InstanceType( + name="m7g.8xlarge", + cores=32, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "m7g.large": InstanceType( + name="m7g.large", + cores=2, + memory=8.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "m7g.medium": InstanceType( + name="m7g.medium", + cores=1, + memory=4.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "m7g.metal": InstanceType( + name="m7g.metal", + cores=64, + memory=256.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "m7g.xlarge": InstanceType( + name="m7g.xlarge", + cores=4, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "m7gd.12xlarge": InstanceType( + name="m7gd.12xlarge", + cores=48, + memory=192.0, + disks=2.0, + disk_capacity=1425.0, + architecture="arm64", + ), + "m7gd.16xlarge": InstanceType( + name="m7gd.16xlarge", + cores=64, + memory=256.0, + disks=2.0, + disk_capacity=1900.0, + architecture="arm64", + ), + "m7gd.2xlarge": InstanceType( + name="m7gd.2xlarge", + cores=8, + memory=32.0, + disks=1.0, + disk_capacity=475.0, + architecture="arm64", + ), + "m7gd.4xlarge": InstanceType( + name="m7gd.4xlarge", + cores=16, + memory=64.0, + disks=1.0, + disk_capacity=950.0, + architecture="arm64", + ), + "m7gd.8xlarge": InstanceType( + name="m7gd.8xlarge", + cores=32, + memory=128.0, + disks=1.0, + disk_capacity=1900.0, + architecture="arm64", + ), + "m7gd.large": InstanceType( + name="m7gd.large", + cores=2, + memory=8.0, + disks=1.0, + disk_capacity=118.0, + architecture="arm64", + ), + "m7gd.medium": InstanceType( + name="m7gd.medium", + cores=1, + memory=4.0, + disks=1.0, + disk_capacity=59.0, + architecture="arm64", + ), + "m7gd.xlarge": InstanceType( + name="m7gd.xlarge", + cores=4, + memory=16.0, + disks=1.0, + disk_capacity=237.0, + architecture="arm64", + ), + "m7i-flex.2xlarge": InstanceType( + name="m7i-flex.2xlarge", + cores=8, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m7i-flex.4xlarge": InstanceType( + name="m7i-flex.4xlarge", + cores=16, + memory=64.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m7i-flex.8xlarge": InstanceType( + name="m7i-flex.8xlarge", + cores=32, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m7i-flex.large": InstanceType( + name="m7i-flex.large", + cores=2, + memory=8.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m7i-flex.xlarge": InstanceType( + name="m7i-flex.xlarge", + cores=4, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m7i.12xlarge": InstanceType( + name="m7i.12xlarge", + cores=48, + memory=192.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m7i.16xlarge": InstanceType( + name="m7i.16xlarge", + cores=64, + memory=256.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m7i.24xlarge": InstanceType( + name="m7i.24xlarge", + cores=96, + memory=384.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m7i.2xlarge": InstanceType( + name="m7i.2xlarge", + cores=8, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m7i.48xlarge": InstanceType( + name="m7i.48xlarge", + cores=192, + memory=768.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m7i.4xlarge": InstanceType( + name="m7i.4xlarge", + cores=16, + memory=64.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m7i.8xlarge": InstanceType( + name="m7i.8xlarge", + cores=32, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m7i.large": InstanceType( + name="m7i.large", + cores=2, + memory=8.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m7i.metal-24xl": InstanceType( + name="m7i.metal-24xl", + cores=96, + memory=384.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m7i.metal-48xl": InstanceType( + name="m7i.metal-48xl", + cores=192, + memory=768.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "m7i.xlarge": InstanceType( + name="m7i.xlarge", + cores=4, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "p2.16xlarge": InstanceType( + name="p2.16xlarge", + cores=64, + memory=732.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "p2.8xlarge": InstanceType( + name="p2.8xlarge", + cores=32, + memory=488.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "p2.xlarge": InstanceType( + name="p2.xlarge", + cores=4, + memory=61.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "p3.16xlarge": InstanceType( + name="p3.16xlarge", + cores=64, + memory=488.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "p3.2xlarge": InstanceType( + name="p3.2xlarge", + cores=8, + memory=61.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "p3.8xlarge": InstanceType( + name="p3.8xlarge", + cores=32, + memory=244.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "p3dn.24xlarge": InstanceType( + name="p3dn.24xlarge", + cores=96, + memory=768.0, + disks=2.0, + disk_capacity=900.0, + architecture="amd64", + ), + "p4d.24xlarge": InstanceType( + name="p4d.24xlarge", + cores=96, + memory=1152.0, + disks=8.0, + disk_capacity=1000.0, + architecture="amd64", + ), + "p4de.24xlarge": InstanceType( + name="p4de.24xlarge", + cores=96, + memory=1152.0, + disks=8.0, + disk_capacity=1000.0, + architecture="amd64", + ), + "p5.48xlarge": InstanceType( + name="p5.48xlarge", + cores=192, + memory=2048.0, + disks=8.0, + disk_capacity=3840.0, + architecture="amd64", + ), + "r3.2xlarge": InstanceType( + name="r3.2xlarge", + cores=8, + memory=61.0, + disks=1.0, + disk_capacity=160.0, + architecture="amd64", + ), + "r3.4xlarge": InstanceType( + name="r3.4xlarge", + cores=16, + memory=122.0, + disks=1.0, + disk_capacity=320.0, + architecture="amd64", + ), + "r3.8xlarge": InstanceType( + name="r3.8xlarge", + cores=32, + memory=244.0, + disks=2.0, + disk_capacity=320.0, + architecture="amd64", + ), + "r3.large": InstanceType( + name="r3.large", + cores=2, + memory=15.25, + disks=1.0, + disk_capacity=32.0, + architecture="amd64", + ), + "r3.xlarge": InstanceType( + name="r3.xlarge", + cores=4, + memory=30.5, + disks=1.0, + disk_capacity=80.0, + architecture="amd64", + ), + "r4.16xlarge": InstanceType( + name="r4.16xlarge", + cores=64, + memory=488.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r4.2xlarge": InstanceType( + name="r4.2xlarge", + cores=8, + memory=61.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r4.4xlarge": InstanceType( + name="r4.4xlarge", + cores=16, + memory=122.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r4.8xlarge": InstanceType( + name="r4.8xlarge", + cores=32, + memory=244.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r4.large": InstanceType( + name="r4.large", + cores=2, + memory=15.25, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r4.xlarge": InstanceType( + name="r4.xlarge", + cores=4, + memory=30.5, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r5.12xlarge": InstanceType( + name="r5.12xlarge", + cores=48, + memory=384.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r5.16xlarge": InstanceType( + name="r5.16xlarge", + cores=64, + memory=512.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r5.24xlarge": InstanceType( + name="r5.24xlarge", + cores=96, + memory=768.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r5.2xlarge": InstanceType( + name="r5.2xlarge", + cores=8, + memory=64.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r5.4xlarge": InstanceType( + name="r5.4xlarge", + cores=16, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r5.8xlarge": InstanceType( + name="r5.8xlarge", + cores=32, + memory=256.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r5.large": InstanceType( + name="r5.large", + cores=2, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r5.metal": InstanceType( + name="r5.metal", + cores=96, + memory=768.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r5.xlarge": InstanceType( + name="r5.xlarge", + cores=4, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r5a.12xlarge": InstanceType( + name="r5a.12xlarge", + cores=48, + memory=384.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r5a.16xlarge": InstanceType( + name="r5a.16xlarge", + cores=64, + memory=512.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r5a.24xlarge": InstanceType( + name="r5a.24xlarge", + cores=96, + memory=768.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r5a.2xlarge": InstanceType( + name="r5a.2xlarge", + cores=8, + memory=64.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r5a.4xlarge": InstanceType( + name="r5a.4xlarge", + cores=16, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r5a.8xlarge": InstanceType( + name="r5a.8xlarge", + cores=32, + memory=256.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r5a.large": InstanceType( + name="r5a.large", + cores=2, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r5a.xlarge": InstanceType( + name="r5a.xlarge", + cores=4, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r5ad.12xlarge": InstanceType( + name="r5ad.12xlarge", + cores=48, + memory=384.0, + disks=2.0, + disk_capacity=900.0, + architecture="amd64", + ), + "r5ad.16xlarge": InstanceType( + name="r5ad.16xlarge", + cores=64, + memory=512.0, + disks=4.0, + disk_capacity=600.0, + architecture="amd64", + ), + "r5ad.24xlarge": InstanceType( + name="r5ad.24xlarge", + cores=96, + memory=768.0, + disks=4.0, + disk_capacity=900.0, + architecture="amd64", + ), + "r5ad.2xlarge": InstanceType( + name="r5ad.2xlarge", + cores=8, + memory=64.0, + disks=1.0, + disk_capacity=300.0, + architecture="amd64", + ), + "r5ad.4xlarge": InstanceType( + name="r5ad.4xlarge", + cores=16, + memory=128.0, + disks=2.0, + disk_capacity=300.0, + architecture="amd64", + ), + "r5ad.8xlarge": InstanceType( + name="r5ad.8xlarge", + cores=32, + memory=256.0, + disks=2.0, + disk_capacity=600.0, + architecture="amd64", + ), + "r5ad.large": InstanceType( + name="r5ad.large", + cores=2, + memory=16.0, + disks=1.0, + disk_capacity=75.0, + architecture="amd64", + ), + "r5ad.xlarge": InstanceType( + name="r5ad.xlarge", + cores=4, + memory=32.0, + disks=1.0, + disk_capacity=150.0, + architecture="amd64", + ), + "r5b.12xlarge": InstanceType( + name="r5b.12xlarge", + cores=48, + memory=384.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r5b.16xlarge": InstanceType( + name="r5b.16xlarge", + cores=64, + memory=512.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r5b.24xlarge": InstanceType( + name="r5b.24xlarge", + cores=96, + memory=768.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r5b.2xlarge": InstanceType( + name="r5b.2xlarge", + cores=8, + memory=64.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r5b.4xlarge": InstanceType( + name="r5b.4xlarge", + cores=16, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r5b.8xlarge": InstanceType( + name="r5b.8xlarge", + cores=32, + memory=256.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r5b.large": InstanceType( + name="r5b.large", + cores=2, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r5b.metal": InstanceType( + name="r5b.metal", + cores=96, + memory=768.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r5b.xlarge": InstanceType( + name="r5b.xlarge", + cores=4, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r5d.12xlarge": InstanceType( + name="r5d.12xlarge", + cores=48, + memory=384.0, + disks=2.0, + disk_capacity=900.0, + architecture="amd64", + ), + "r5d.16xlarge": InstanceType( + name="r5d.16xlarge", + cores=64, + memory=512.0, + disks=4.0, + disk_capacity=600.0, + architecture="amd64", + ), + "r5d.24xlarge": InstanceType( + name="r5d.24xlarge", + cores=96, + memory=768.0, + disks=4.0, + disk_capacity=900.0, + architecture="amd64", + ), + "r5d.2xlarge": InstanceType( + name="r5d.2xlarge", + cores=8, + memory=64.0, + disks=1.0, + disk_capacity=300.0, + architecture="amd64", + ), + "r5d.4xlarge": InstanceType( + name="r5d.4xlarge", + cores=16, + memory=128.0, + disks=2.0, + disk_capacity=300.0, + architecture="amd64", + ), + "r5d.8xlarge": InstanceType( + name="r5d.8xlarge", + cores=32, + memory=256.0, + disks=2.0, + disk_capacity=600.0, + architecture="amd64", + ), + "r5d.large": InstanceType( + name="r5d.large", + cores=2, + memory=16.0, + disks=1.0, + disk_capacity=75.0, + architecture="amd64", + ), + "r5d.metal": InstanceType( + name="r5d.metal", + cores=96, + memory=768.0, + disks=4.0, + disk_capacity=900.0, + architecture="amd64", + ), + "r5d.xlarge": InstanceType( + name="r5d.xlarge", + cores=4, + memory=32.0, + disks=1.0, + disk_capacity=150.0, + architecture="amd64", + ), + "r5dn.12xlarge": InstanceType( + name="r5dn.12xlarge", + cores=48, + memory=384.0, + disks=2.0, + disk_capacity=900.0, + architecture="amd64", + ), + "r5dn.16xlarge": InstanceType( + name="r5dn.16xlarge", + cores=64, + memory=512.0, + disks=4.0, + disk_capacity=600.0, + architecture="amd64", + ), + "r5dn.24xlarge": InstanceType( + name="r5dn.24xlarge", + cores=96, + memory=768.0, + disks=4.0, + disk_capacity=900.0, + architecture="amd64", + ), + "r5dn.2xlarge": InstanceType( + name="r5dn.2xlarge", + cores=8, + memory=64.0, + disks=1.0, + disk_capacity=300.0, + architecture="amd64", + ), + "r5dn.4xlarge": InstanceType( + name="r5dn.4xlarge", + cores=16, + memory=128.0, + disks=2.0, + disk_capacity=300.0, + architecture="amd64", + ), + "r5dn.8xlarge": InstanceType( + name="r5dn.8xlarge", + cores=32, + memory=256.0, + disks=2.0, + disk_capacity=600.0, + architecture="amd64", + ), + "r5dn.large": InstanceType( + name="r5dn.large", + cores=2, + memory=16.0, + disks=1.0, + disk_capacity=75.0, + architecture="amd64", + ), + "r5dn.metal": InstanceType( + name="r5dn.metal", + cores=96, + memory=768.0, + disks=4.0, + disk_capacity=900.0, + architecture="amd64", + ), + "r5dn.xlarge": InstanceType( + name="r5dn.xlarge", + cores=4, + memory=32.0, + disks=1.0, + disk_capacity=150.0, + architecture="amd64", + ), + "r5n.12xlarge": InstanceType( + name="r5n.12xlarge", + cores=48, + memory=384.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r5n.16xlarge": InstanceType( + name="r5n.16xlarge", + cores=64, + memory=512.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r5n.24xlarge": InstanceType( + name="r5n.24xlarge", + cores=96, + memory=768.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r5n.2xlarge": InstanceType( + name="r5n.2xlarge", + cores=8, + memory=64.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r5n.4xlarge": InstanceType( + name="r5n.4xlarge", + cores=16, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r5n.8xlarge": InstanceType( + name="r5n.8xlarge", + cores=32, + memory=256.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r5n.large": InstanceType( + name="r5n.large", + cores=2, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r5n.metal": InstanceType( + name="r5n.metal", + cores=96, + memory=768.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r5n.xlarge": InstanceType( + name="r5n.xlarge", + cores=4, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r6a.12xlarge": InstanceType( + name="r6a.12xlarge", + cores=48, + memory=384.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r6a.16xlarge": InstanceType( + name="r6a.16xlarge", + cores=64, + memory=512.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r6a.24xlarge": InstanceType( + name="r6a.24xlarge", + cores=96, + memory=768.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r6a.2xlarge": InstanceType( + name="r6a.2xlarge", + cores=8, + memory=64.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r6a.32xlarge": InstanceType( + name="r6a.32xlarge", + cores=128, + memory=1024.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r6a.48xlarge": InstanceType( + name="r6a.48xlarge", + cores=192, + memory=1536.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r6a.4xlarge": InstanceType( + name="r6a.4xlarge", + cores=16, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r6a.8xlarge": InstanceType( + name="r6a.8xlarge", + cores=32, + memory=256.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r6a.large": InstanceType( + name="r6a.large", + cores=2, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r6a.metal": InstanceType( + name="r6a.metal", + cores=192, + memory=1536.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r6a.xlarge": InstanceType( + name="r6a.xlarge", + cores=4, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r6g.12xlarge": InstanceType( + name="r6g.12xlarge", + cores=48, + memory=384.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "r6g.16xlarge": InstanceType( + name="r6g.16xlarge", + cores=64, + memory=512.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "r6g.2xlarge": InstanceType( + name="r6g.2xlarge", + cores=8, + memory=64.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "r6g.4xlarge": InstanceType( + name="r6g.4xlarge", + cores=16, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "r6g.8xlarge": InstanceType( + name="r6g.8xlarge", + cores=32, + memory=256.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "r6g.large": InstanceType( + name="r6g.large", + cores=2, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "r6g.medium": InstanceType( + name="r6g.medium", + cores=1, + memory=8.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "r6g.metal": InstanceType( + name="r6g.metal", + cores=64, + memory=512.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "r6g.xlarge": InstanceType( + name="r6g.xlarge", + cores=4, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "r6gd.12xlarge": InstanceType( + name="r6gd.12xlarge", + cores=48, + memory=384.0, + disks=2.0, + disk_capacity=1425.0, + architecture="arm64", + ), + "r6gd.16xlarge": InstanceType( + name="r6gd.16xlarge", + cores=64, + memory=512.0, + disks=2.0, + disk_capacity=1900.0, + architecture="arm64", + ), + "r6gd.2xlarge": InstanceType( + name="r6gd.2xlarge", + cores=8, + memory=64.0, + disks=1.0, + disk_capacity=475.0, + architecture="arm64", + ), + "r6gd.4xlarge": InstanceType( + name="r6gd.4xlarge", + cores=16, + memory=128.0, + disks=1.0, + disk_capacity=950.0, + architecture="arm64", + ), + "r6gd.8xlarge": InstanceType( + name="r6gd.8xlarge", + cores=32, + memory=256.0, + disks=1.0, + disk_capacity=1900.0, + architecture="arm64", + ), + "r6gd.large": InstanceType( + name="r6gd.large", + cores=2, + memory=16.0, + disks=1.0, + disk_capacity=118.0, + architecture="arm64", + ), + "r6gd.medium": InstanceType( + name="r6gd.medium", + cores=1, + memory=8.0, + disks=1.0, + disk_capacity=59.0, + architecture="arm64", + ), + "r6gd.metal": InstanceType( + name="r6gd.metal", + cores=64, + memory=512.0, + disks=2.0, + disk_capacity=1900.0, + architecture="arm64", + ), + "r6gd.xlarge": InstanceType( + name="r6gd.xlarge", + cores=4, + memory=32.0, + disks=1.0, + disk_capacity=237.0, + architecture="arm64", + ), + "r6i.12xlarge": InstanceType( + name="r6i.12xlarge", + cores=48, + memory=384.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r6i.16xlarge": InstanceType( + name="r6i.16xlarge", + cores=64, + memory=512.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r6i.24xlarge": InstanceType( + name="r6i.24xlarge", + cores=96, + memory=768.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r6i.2xlarge": InstanceType( + name="r6i.2xlarge", + cores=8, + memory=64.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r6i.32xlarge": InstanceType( + name="r6i.32xlarge", + cores=128, + memory=1024.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r6i.4xlarge": InstanceType( + name="r6i.4xlarge", + cores=16, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r6i.8xlarge": InstanceType( + name="r6i.8xlarge", + cores=32, + memory=256.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r6i.large": InstanceType( + name="r6i.large", + cores=2, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r6i.metal": InstanceType( + name="r6i.metal", + cores=128, + memory=1024.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r6i.xlarge": InstanceType( + name="r6i.xlarge", + cores=4, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r6id.12xlarge": InstanceType( + name="r6id.12xlarge", + cores=48, + memory=384.0, + disks=2.0, + disk_capacity=1425.0, + architecture="amd64", + ), + "r6id.16xlarge": InstanceType( + name="r6id.16xlarge", + cores=64, + memory=512.0, + disks=2.0, + disk_capacity=1900.0, + architecture="amd64", + ), + "r6id.24xlarge": InstanceType( + name="r6id.24xlarge", + cores=96, + memory=768.0, + disks=4.0, + disk_capacity=1425.0, + architecture="amd64", + ), + "r6id.2xlarge": InstanceType( + name="r6id.2xlarge", + cores=8, + memory=64.0, + disks=1.0, + disk_capacity=474.0, + architecture="amd64", + ), + "r6id.32xlarge": InstanceType( + name="r6id.32xlarge", + cores=128, + memory=1024.0, + disks=4.0, + disk_capacity=1900.0, + architecture="amd64", + ), + "r6id.4xlarge": InstanceType( + name="r6id.4xlarge", + cores=16, + memory=128.0, + disks=1.0, + disk_capacity=950.0, + architecture="amd64", + ), + "r6id.8xlarge": InstanceType( + name="r6id.8xlarge", + cores=32, + memory=256.0, + disks=1.0, + disk_capacity=1900.0, + architecture="amd64", + ), + "r6id.large": InstanceType( + name="r6id.large", + cores=2, + memory=16.0, + disks=1.0, + disk_capacity=118.0, + architecture="amd64", + ), + "r6id.metal": InstanceType( + name="r6id.metal", + cores=128, + memory=1024.0, + disks=4.0, + disk_capacity=1900.0, + architecture="amd64", + ), + "r6id.xlarge": InstanceType( + name="r6id.xlarge", + cores=4, + memory=32.0, + disks=1.0, + disk_capacity=237.0, + architecture="amd64", + ), + "r6idn.12xlarge": InstanceType( + name="r6idn.12xlarge", + cores=48, + memory=384.0, + disks=2.0, + disk_capacity=1425.0, + architecture="amd64", + ), + "r6idn.16xlarge": InstanceType( + name="r6idn.16xlarge", + cores=64, + memory=512.0, + disks=2.0, + disk_capacity=1900.0, + architecture="amd64", + ), + "r6idn.24xlarge": InstanceType( + name="r6idn.24xlarge", + cores=96, + memory=768.0, + disks=4.0, + disk_capacity=1425.0, + architecture="amd64", + ), + "r6idn.2xlarge": InstanceType( + name="r6idn.2xlarge", + cores=8, + memory=64.0, + disks=1.0, + disk_capacity=474.0, + architecture="amd64", + ), + "r6idn.32xlarge": InstanceType( + name="r6idn.32xlarge", + cores=128, + memory=1024.0, + disks=4.0, + disk_capacity=1900.0, + architecture="amd64", + ), + "r6idn.4xlarge": InstanceType( + name="r6idn.4xlarge", + cores=16, + memory=128.0, + disks=1.0, + disk_capacity=950.0, + architecture="amd64", + ), + "r6idn.8xlarge": InstanceType( + name="r6idn.8xlarge", + cores=32, + memory=256.0, + disks=1.0, + disk_capacity=1900.0, + architecture="amd64", + ), + "r6idn.large": InstanceType( + name="r6idn.large", + cores=2, + memory=16.0, + disks=1.0, + disk_capacity=118.0, + architecture="amd64", + ), + "r6idn.metal": InstanceType( + name="r6idn.metal", + cores=128, + memory=1024.0, + disks=4.0, + disk_capacity=1900.0, + architecture="amd64", + ), + "r6idn.xlarge": InstanceType( + name="r6idn.xlarge", + cores=4, + memory=32.0, + disks=1.0, + disk_capacity=237.0, + architecture="amd64", + ), + "r6in.12xlarge": InstanceType( + name="r6in.12xlarge", + cores=48, + memory=384.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r6in.16xlarge": InstanceType( + name="r6in.16xlarge", + cores=64, + memory=512.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r6in.24xlarge": InstanceType( + name="r6in.24xlarge", + cores=96, + memory=768.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r6in.2xlarge": InstanceType( + name="r6in.2xlarge", + cores=8, + memory=64.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r6in.32xlarge": InstanceType( + name="r6in.32xlarge", + cores=128, + memory=1024.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r6in.4xlarge": InstanceType( + name="r6in.4xlarge", + cores=16, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r6in.8xlarge": InstanceType( + name="r6in.8xlarge", + cores=32, + memory=256.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r6in.large": InstanceType( + name="r6in.large", + cores=2, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r6in.metal": InstanceType( + name="r6in.metal", + cores=128, + memory=1024.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r6in.xlarge": InstanceType( + name="r6in.xlarge", + cores=4, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r7a.12xlarge": InstanceType( + name="r7a.12xlarge", + cores=48, + memory=384.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r7a.16xlarge": InstanceType( + name="r7a.16xlarge", + cores=64, + memory=512.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r7a.24xlarge": InstanceType( + name="r7a.24xlarge", + cores=96, + memory=768.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r7a.2xlarge": InstanceType( + name="r7a.2xlarge", + cores=8, + memory=64.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r7a.32xlarge": InstanceType( + name="r7a.32xlarge", + cores=128, + memory=1024.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r7a.48xlarge": InstanceType( + name="r7a.48xlarge", + cores=192, + memory=1536.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r7a.4xlarge": InstanceType( + name="r7a.4xlarge", + cores=16, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r7a.8xlarge": InstanceType( + name="r7a.8xlarge", + cores=32, + memory=256.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r7a.large": InstanceType( + name="r7a.large", + cores=2, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r7a.medium": InstanceType( + name="r7a.medium", + cores=1, + memory=8.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r7a.metal-48xl": InstanceType( + name="r7a.metal-48xl", + cores=192, + memory=1536.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r7a.xlarge": InstanceType( + name="r7a.xlarge", + cores=4, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r7g.12xlarge": InstanceType( + name="r7g.12xlarge", + cores=48, + memory=384.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "r7g.16xlarge": InstanceType( + name="r7g.16xlarge", + cores=64, + memory=512.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "r7g.2xlarge": InstanceType( + name="r7g.2xlarge", + cores=8, + memory=64.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "r7g.4xlarge": InstanceType( + name="r7g.4xlarge", + cores=16, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "r7g.8xlarge": InstanceType( + name="r7g.8xlarge", + cores=32, + memory=256.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "r7g.large": InstanceType( + name="r7g.large", + cores=2, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "r7g.medium": InstanceType( + name="r7g.medium", + cores=1, + memory=8.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "r7g.metal": InstanceType( + name="r7g.metal", + cores=64, + memory=512.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "r7g.xlarge": InstanceType( + name="r7g.xlarge", + cores=4, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "r7gd.12xlarge": InstanceType( + name="r7gd.12xlarge", + cores=48, + memory=384.0, + disks=2.0, + disk_capacity=1425.0, + architecture="arm64", + ), + "r7gd.16xlarge": InstanceType( + name="r7gd.16xlarge", + cores=64, + memory=512.0, + disks=2.0, + disk_capacity=1900.0, + architecture="arm64", + ), + "r7gd.2xlarge": InstanceType( + name="r7gd.2xlarge", + cores=8, + memory=64.0, + disks=1.0, + disk_capacity=475.0, + architecture="arm64", + ), + "r7gd.4xlarge": InstanceType( + name="r7gd.4xlarge", + cores=16, + memory=128.0, + disks=1.0, + disk_capacity=950.0, + architecture="arm64", + ), + "r7gd.8xlarge": InstanceType( + name="r7gd.8xlarge", + cores=32, + memory=256.0, + disks=1.0, + disk_capacity=1900.0, + architecture="arm64", + ), + "r7gd.large": InstanceType( + name="r7gd.large", + cores=2, + memory=16.0, + disks=1.0, + disk_capacity=118.0, + architecture="arm64", + ), + "r7gd.medium": InstanceType( + name="r7gd.medium", + cores=1, + memory=8.0, + disks=1.0, + disk_capacity=59.0, + architecture="arm64", + ), + "r7gd.xlarge": InstanceType( + name="r7gd.xlarge", + cores=4, + memory=32.0, + disks=1.0, + disk_capacity=237.0, + architecture="arm64", + ), + "r7i.12xlarge": InstanceType( + name="r7i.12xlarge", + cores=48, + memory=384.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r7i.16xlarge": InstanceType( + name="r7i.16xlarge", + cores=64, + memory=512.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r7i.24xlarge": InstanceType( + name="r7i.24xlarge", + cores=96, + memory=768.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r7i.2xlarge": InstanceType( + name="r7i.2xlarge", + cores=8, + memory=64.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r7i.48xlarge": InstanceType( + name="r7i.48xlarge", + cores=192, + memory=1536.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r7i.4xlarge": InstanceType( + name="r7i.4xlarge", + cores=16, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r7i.8xlarge": InstanceType( + name="r7i.8xlarge", + cores=32, + memory=256.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r7i.large": InstanceType( + name="r7i.large", + cores=2, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r7i.metal-24xl": InstanceType( + name="r7i.metal-24xl", + cores=96, + memory=768.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r7i.metal-48xl": InstanceType( + name="r7i.metal-48xl", + cores=192, + memory=1536.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r7i.xlarge": InstanceType( + name="r7i.xlarge", + cores=4, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r7iz.12xlarge": InstanceType( + name="r7iz.12xlarge", + cores=48, + memory=384.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r7iz.16xlarge": InstanceType( + name="r7iz.16xlarge", + cores=64, + memory=512.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r7iz.2xlarge": InstanceType( + name="r7iz.2xlarge", + cores=8, + memory=64.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r7iz.32xlarge": InstanceType( + name="r7iz.32xlarge", + cores=128, + memory=1024.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r7iz.4xlarge": InstanceType( + name="r7iz.4xlarge", + cores=16, + memory=128.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r7iz.8xlarge": InstanceType( + name="r7iz.8xlarge", + cores=32, + memory=256.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r7iz.large": InstanceType( + name="r7iz.large", + cores=2, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r7iz.metal-16xl": InstanceType( + name="r7iz.metal-16xl", + cores=64, + memory=512.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r7iz.metal-32xl": InstanceType( + name="r7iz.metal-32xl", + cores=128, + memory=1024.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "r7iz.xlarge": InstanceType( + name="r7iz.xlarge", + cores=4, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "t1.micro": InstanceType( + name="t1.micro", + cores=1, + memory=0.613, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "t2.2xlarge": InstanceType( + name="t2.2xlarge", + cores=8, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "t2.large": InstanceType( + name="t2.large", + cores=2, + memory=8.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "t2.medium": InstanceType( + name="t2.medium", + cores=2, + memory=4.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "t2.micro": InstanceType( + name="t2.micro", + cores=1, + memory=1.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "t2.nano": InstanceType( + name="t2.nano", + cores=1, + memory=0.5, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "t2.small": InstanceType( + name="t2.small", + cores=1, + memory=2.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "t2.xlarge": InstanceType( + name="t2.xlarge", + cores=4, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "t3.2xlarge": InstanceType( + name="t3.2xlarge", + cores=8, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "t3.large": InstanceType( + name="t3.large", + cores=2, + memory=8.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "t3.medium": InstanceType( + name="t3.medium", + cores=2, + memory=4.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "t3.micro": InstanceType( + name="t3.micro", + cores=2, + memory=1.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "t3.nano": InstanceType( + name="t3.nano", + cores=2, + memory=0.5, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "t3.small": InstanceType( + name="t3.small", + cores=2, + memory=2.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "t3.xlarge": InstanceType( + name="t3.xlarge", + cores=4, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "t3a.2xlarge": InstanceType( + name="t3a.2xlarge", + cores=8, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "t3a.large": InstanceType( + name="t3a.large", + cores=2, + memory=8.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "t3a.medium": InstanceType( + name="t3a.medium", + cores=2, + memory=4.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "t3a.micro": InstanceType( + name="t3a.micro", + cores=2, + memory=1.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "t3a.nano": InstanceType( + name="t3a.nano", + cores=2, + memory=0.5, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "t3a.small": InstanceType( + name="t3a.small", + cores=2, + memory=2.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "t3a.xlarge": InstanceType( + name="t3a.xlarge", + cores=4, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "t4g.2xlarge": InstanceType( + name="t4g.2xlarge", + cores=8, + memory=32.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "t4g.large": InstanceType( + name="t4g.large", + cores=2, + memory=8.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "t4g.medium": InstanceType( + name="t4g.medium", + cores=2, + memory=4.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "t4g.micro": InstanceType( + name="t4g.micro", + cores=2, + memory=1.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "t4g.nano": InstanceType( + name="t4g.nano", + cores=2, + memory=0.5, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "t4g.small": InstanceType( + name="t4g.small", + cores=2, + memory=2.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "t4g.xlarge": InstanceType( + name="t4g.xlarge", + cores=4, + memory=16.0, + disks=0, + disk_capacity=0, + architecture="arm64", + ), + "trn1.2xlarge": InstanceType( + name="trn1.2xlarge", + cores=8, + memory=32.0, + disks=1.0, + disk_capacity=475.0, + architecture="amd64", + ), + "trn1.32xlarge": InstanceType( + name="trn1.32xlarge", + cores=128, + memory=512.0, + disks=4.0, + disk_capacity=1900.0, + architecture="amd64", + ), + "trn1n.32xlarge": InstanceType( + name="trn1n.32xlarge", + cores=128, + memory=512.0, + disks=4.0, + disk_capacity=1900.0, + architecture="amd64", + ), + "u-12tb1.112xlarge": InstanceType( + name="u-12tb1.112xlarge", + cores=448, + memory=12288.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "u-18tb1.112xlarge": InstanceType( + name="u-18tb1.112xlarge", + cores=448, + memory=18432.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "u-24tb1.112xlarge": InstanceType( + name="u-24tb1.112xlarge", + cores=448, + memory=24576.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "u-3tb1.56xlarge": InstanceType( + name="u-3tb1.56xlarge", + cores=224, + memory=3072.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "u-6tb1.112xlarge": InstanceType( + name="u-6tb1.112xlarge", + cores=448, + memory=6144.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "u-6tb1.56xlarge": InstanceType( + name="u-6tb1.56xlarge", + cores=224, + memory=6144.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "u-9tb1.112xlarge": InstanceType( + name="u-9tb1.112xlarge", + cores=448, + memory=9216.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "vt1.24xlarge": InstanceType( + name="vt1.24xlarge", + cores=96, + memory=192.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "vt1.3xlarge": InstanceType( + name="vt1.3xlarge", + cores=12, + memory=24.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "vt1.6xlarge": InstanceType( + name="vt1.6xlarge", + cores=24, + memory=48.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "x1.16xlarge": InstanceType( + name="x1.16xlarge", + cores=64, + memory=976.0, + disks=1.0, + disk_capacity=1920.0, + architecture="amd64", + ), + "x1.32xlarge": InstanceType( + name="x1.32xlarge", + cores=128, + memory=1952.0, + disks=2.0, + disk_capacity=1920.0, + architecture="amd64", + ), + "x1e.16xlarge": InstanceType( + name="x1e.16xlarge", + cores=64, + memory=1952.0, + disks=1.0, + disk_capacity=1920.0, + architecture="amd64", + ), + "x1e.2xlarge": InstanceType( + name="x1e.2xlarge", + cores=8, + memory=244.0, + disks=1.0, + disk_capacity=240.0, + architecture="amd64", + ), + "x1e.32xlarge": InstanceType( + name="x1e.32xlarge", + cores=128, + memory=3904.0, + disks=2.0, + disk_capacity=1920.0, + architecture="amd64", + ), + "x1e.4xlarge": InstanceType( + name="x1e.4xlarge", + cores=16, + memory=488.0, + disks=1.0, + disk_capacity=480.0, + architecture="amd64", + ), + "x1e.8xlarge": InstanceType( + name="x1e.8xlarge", + cores=32, + memory=976.0, + disks=1.0, + disk_capacity=960.0, + architecture="amd64", + ), + "x1e.xlarge": InstanceType( + name="x1e.xlarge", + cores=4, + memory=122.0, + disks=1.0, + disk_capacity=120.0, + architecture="amd64", + ), + "x2gd.12xlarge": InstanceType( + name="x2gd.12xlarge", + cores=48, + memory=768.0, + disks=2.0, + disk_capacity=1425.0, + architecture="arm64", + ), + "x2gd.16xlarge": InstanceType( + name="x2gd.16xlarge", + cores=64, + memory=1024.0, + disks=2.0, + disk_capacity=1900.0, + architecture="arm64", + ), + "x2gd.2xlarge": InstanceType( + name="x2gd.2xlarge", + cores=8, + memory=128.0, + disks=1.0, + disk_capacity=474.0, + architecture="arm64", + ), + "x2gd.4xlarge": InstanceType( + name="x2gd.4xlarge", + cores=16, + memory=256.0, + disks=1.0, + disk_capacity=950.0, + architecture="arm64", + ), + "x2gd.8xlarge": InstanceType( + name="x2gd.8xlarge", + cores=32, + memory=512.0, + disks=1.0, + disk_capacity=1900.0, + architecture="arm64", + ), + "x2gd.large": InstanceType( + name="x2gd.large", + cores=2, + memory=32.0, + disks=1.0, + disk_capacity=118.0, + architecture="arm64", + ), + "x2gd.medium": InstanceType( + name="x2gd.medium", + cores=1, + memory=16.0, + disks=1.0, + disk_capacity=59.0, + architecture="arm64", + ), + "x2gd.metal": InstanceType( + name="x2gd.metal", + cores=64, + memory=1024.0, + disks=2.0, + disk_capacity=1900.0, + architecture="arm64", + ), + "x2gd.xlarge": InstanceType( + name="x2gd.xlarge", + cores=4, + memory=64.0, + disks=1.0, + disk_capacity=237.0, + architecture="arm64", + ), + "x2idn.16xlarge": InstanceType( + name="x2idn.16xlarge", + cores=64, + memory=1024.0, + disks=1.0, + disk_capacity=1900.0, + architecture="amd64", + ), + "x2idn.24xlarge": InstanceType( + name="x2idn.24xlarge", + cores=96, + memory=1536.0, + disks=2.0, + disk_capacity=1425.0, + architecture="amd64", + ), + "x2idn.32xlarge": InstanceType( + name="x2idn.32xlarge", + cores=128, + memory=2048.0, + disks=2.0, + disk_capacity=1900.0, + architecture="amd64", + ), + "x2idn.metal": InstanceType( + name="x2idn.metal", + cores=128, + memory=2048.0, + disks=2.0, + disk_capacity=1900.0, + architecture="amd64", + ), + "x2iedn.16xlarge": InstanceType( + name="x2iedn.16xlarge", + cores=64, + memory=2048.0, + disks=1.0, + disk_capacity=1900.0, + architecture="amd64", + ), + "x2iedn.24xlarge": InstanceType( + name="x2iedn.24xlarge", + cores=96, + memory=3072.0, + disks=2.0, + disk_capacity=1425.0, + architecture="amd64", + ), + "x2iedn.2xlarge": InstanceType( + name="x2iedn.2xlarge", + cores=8, + memory=256.0, + disks=1.0, + disk_capacity=237.0, + architecture="amd64", + ), + "x2iedn.32xlarge": InstanceType( + name="x2iedn.32xlarge", + cores=128, + memory=4096.0, + disks=2.0, + disk_capacity=1900.0, + architecture="amd64", + ), + "x2iedn.4xlarge": InstanceType( + name="x2iedn.4xlarge", + cores=16, + memory=512.0, + disks=1.0, + disk_capacity=475.0, + architecture="amd64", + ), + "x2iedn.8xlarge": InstanceType( + name="x2iedn.8xlarge", + cores=32, + memory=1024.0, + disks=1.0, + disk_capacity=950.0, + architecture="amd64", + ), + "x2iedn.metal": InstanceType( + name="x2iedn.metal", + cores=128, + memory=4096.0, + disks=2.0, + disk_capacity=1900.0, + architecture="amd64", + ), + "x2iedn.xlarge": InstanceType( + name="x2iedn.xlarge", + cores=4, + memory=128.0, + disks=1.0, + disk_capacity=118.0, + architecture="amd64", + ), + "x2iezn.12xlarge": InstanceType( + name="x2iezn.12xlarge", + cores=48, + memory=1536.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "x2iezn.2xlarge": InstanceType( + name="x2iezn.2xlarge", + cores=8, + memory=256.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "x2iezn.4xlarge": InstanceType( + name="x2iezn.4xlarge", + cores=16, + memory=512.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "x2iezn.6xlarge": InstanceType( + name="x2iezn.6xlarge", + cores=24, + memory=768.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "x2iezn.8xlarge": InstanceType( + name="x2iezn.8xlarge", + cores=32, + memory=1024.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "x2iezn.metal": InstanceType( + name="x2iezn.metal", + cores=48, + memory=1536.0, + disks=0, + disk_capacity=0, + architecture="amd64", + ), + "z1d.12xlarge": InstanceType( + name="z1d.12xlarge", + cores=48, + memory=384.0, + disks=2.0, + disk_capacity=900.0, + architecture="amd64", + ), + "z1d.2xlarge": InstanceType( + name="z1d.2xlarge", + cores=8, + memory=64.0, + disks=1.0, + disk_capacity=300.0, + architecture="amd64", + ), + "z1d.3xlarge": InstanceType( + name="z1d.3xlarge", + cores=12, + memory=96.0, + disks=1.0, + disk_capacity=450.0, + architecture="amd64", + ), + "z1d.6xlarge": InstanceType( + name="z1d.6xlarge", + cores=24, + memory=192.0, + disks=1.0, + disk_capacity=900.0, + architecture="amd64", + ), + "z1d.large": InstanceType( + name="z1d.large", + cores=2, + memory=16.0, + disks=1.0, + disk_capacity=75.0, + architecture="amd64", + ), + "z1d.metal": InstanceType( + name="z1d.metal", + cores=48, + memory=384.0, + disks=2.0, + disk_capacity=900.0, + architecture="amd64", + ), + "z1d.xlarge": InstanceType( + name="z1d.xlarge", + cores=4, + memory=32.0, + disks=1.0, + disk_capacity=150.0, + architecture="amd64", + ), } regionDict = { - 'us-west-1': ['c1.medium', 'c1.xlarge', 'c3.2xlarge', 'c3.4xlarge', 'c3.8xlarge', 'c3.large', 'c3.xlarge', 'c4.2xlarge', 'c4.4xlarge', 'c4.8xlarge', 'c4.large', 'c4.xlarge', 'c5.12xlarge', 'c5.18xlarge', 'c5.24xlarge', 'c5.2xlarge', 'c5.4xlarge', 'c5.9xlarge', 'c5.large', 'c5.metal', 'c5.xlarge', 'c5a.12xlarge', 'c5a.16xlarge', 'c5a.24xlarge', 'c5a.2xlarge', 'c5a.4xlarge', 'c5a.8xlarge', 'c5a.large', 'c5a.xlarge', 'c5d.12xlarge', 'c5d.18xlarge', 'c5d.24xlarge', 'c5d.2xlarge', 'c5d.4xlarge', 'c5d.9xlarge', 'c5d.large', 'c5d.metal', 'c5d.xlarge', 'c5n.18xlarge', 'c5n.2xlarge', 'c5n.4xlarge', 'c5n.9xlarge', 'c5n.large', 'c5n.metal', 'c5n.xlarge', 'c6a.12xlarge', 'c6a.16xlarge', 'c6a.24xlarge', 'c6a.2xlarge', 'c6a.32xlarge', 'c6a.48xlarge', 'c6a.4xlarge', 'c6a.8xlarge', 'c6a.large', 'c6a.metal', 'c6a.xlarge', 'c6g.12xlarge', 'c6g.16xlarge', 'c6g.2xlarge', 'c6g.4xlarge', 'c6g.8xlarge', 'c6g.large', 'c6g.medium', 'c6g.metal', 'c6g.xlarge', 'c6gd.12xlarge', 'c6gd.16xlarge', 'c6gd.2xlarge', 'c6gd.4xlarge', 'c6gd.8xlarge', 'c6gd.large', 'c6gd.medium', 'c6gd.metal', 'c6gd.xlarge', 'c6gn.12xlarge', 'c6gn.16xlarge', 'c6gn.2xlarge', 'c6gn.4xlarge', 'c6gn.8xlarge', 'c6gn.large', 'c6gn.medium', 'c6gn.xlarge', 'c6i.12xlarge', 'c6i.16xlarge', 'c6i.24xlarge', 'c6i.2xlarge', 'c6i.32xlarge', 'c6i.4xlarge', 'c6i.8xlarge', 'c6i.large', 'c6i.metal', 'c6i.xlarge', 'c6in.12xlarge', 'c6in.16xlarge', 'c6in.24xlarge', 'c6in.2xlarge', 'c6in.32xlarge', 'c6in.4xlarge', 'c6in.8xlarge', 'c6in.large', 'c6in.metal', 'c6in.xlarge', 'c7g.12xlarge', 'c7g.16xlarge', 'c7g.2xlarge', 'c7g.4xlarge', 'c7g.8xlarge', 'c7g.large', 'c7g.medium', 'c7g.metal', 'c7g.xlarge', 'c7gd.12xlarge', 'c7gd.16xlarge', 'c7gd.2xlarge', 'c7gd.4xlarge', 'c7gd.8xlarge', 'c7gd.large', 'c7gd.medium', 'c7gd.xlarge', 'c7i.12xlarge', 'c7i.16xlarge', 'c7i.24xlarge', 'c7i.2xlarge', 'c7i.48xlarge', 'c7i.4xlarge', 'c7i.8xlarge', 'c7i.large', 'c7i.metal-24xl', 'c7i.metal-48xl', 'c7i.xlarge', 'd2.2xlarge', 'd2.4xlarge', 'd2.8xlarge', 'd2.xlarge', 'g2.2xlarge', 'g2.8xlarge', 'g3.16xlarge', 'g3.4xlarge', 'g3.8xlarge', 'g4dn.12xlarge', 'g4dn.16xlarge', 'g4dn.2xlarge', 'g4dn.4xlarge', 'g4dn.8xlarge', 'g4dn.metal', 'g4dn.xlarge', 'i2.2xlarge', 'i2.4xlarge', 'i2.8xlarge', 'i2.xlarge', 'i3.16xlarge', 'i3.2xlarge', 'i3.4xlarge', 'i3.8xlarge', 'i3.large', 'i3.metal', 'i3.xlarge', 'i3en.12xlarge', 'i3en.24xlarge', 'i3en.2xlarge', 'i3en.3xlarge', 'i3en.6xlarge', 'i3en.large', 'i3en.metal', 'i3en.xlarge', 'i4i.12xlarge', 'i4i.16xlarge', 'i4i.24xlarge', 'i4i.2xlarge', 'i4i.32xlarge', 'i4i.4xlarge', 'i4i.8xlarge', 'i4i.large', 'i4i.metal', 'i4i.xlarge', 'inf1.24xlarge', 'inf1.2xlarge', 'inf1.6xlarge', 'inf1.xlarge', 'm1.large', 'm1.medium', 'm1.xlarge', 'm2.2xlarge', 'm2.4xlarge', 'm2.xlarge', 'm3.2xlarge', 'm3.large', 'm3.medium', 'm3.xlarge', 'm4.10xlarge', 'm4.16xlarge', 'm4.2xlarge', 'm4.4xlarge', 'm4.large', 'm4.xlarge', 'm5.12xlarge', 'm5.16xlarge', 'm5.24xlarge', 'm5.2xlarge', 'm5.4xlarge', 'm5.8xlarge', 'm5.large', 'm5.metal', 'm5.xlarge', 'm5a.12xlarge', 'm5a.16xlarge', 'm5a.24xlarge', 'm5a.2xlarge', 'm5a.4xlarge', 'm5a.8xlarge', 'm5a.large', 'm5a.xlarge', 'm5ad.12xlarge', 'm5ad.16xlarge', 'm5ad.24xlarge', 'm5ad.2xlarge', 'm5ad.4xlarge', 'm5ad.8xlarge', 'm5ad.large', 'm5ad.xlarge', 'm5d.12xlarge', 'm5d.16xlarge', 'm5d.24xlarge', 'm5d.2xlarge', 'm5d.4xlarge', 'm5d.8xlarge', 'm5d.large', 'm5d.metal', 'm5d.xlarge', 'm5zn.12xlarge', 'm5zn.2xlarge', 'm5zn.3xlarge', 'm5zn.6xlarge', 'm5zn.large', 'm5zn.metal', 'm5zn.xlarge', 'm6a.12xlarge', 'm6a.16xlarge', 'm6a.24xlarge', 'm6a.2xlarge', 'm6a.32xlarge', 'm6a.48xlarge', 'm6a.4xlarge', 'm6a.8xlarge', 'm6a.large', 'm6a.metal', 'm6a.xlarge', 'm6g.12xlarge', 'm6g.16xlarge', 'm6g.2xlarge', 'm6g.4xlarge', 'm6g.8xlarge', 'm6g.large', 'm6g.medium', 'm6g.metal', 'm6g.xlarge', 'm6gd.12xlarge', 'm6gd.16xlarge', 'm6gd.2xlarge', 'm6gd.4xlarge', 'm6gd.8xlarge', 'm6gd.large', 'm6gd.medium', 'm6gd.metal', 'm6gd.xlarge', 'm6i.12xlarge', 'm6i.16xlarge', 'm6i.24xlarge', 'm6i.2xlarge', 'm6i.32xlarge', 'm6i.4xlarge', 'm6i.8xlarge', 'm6i.large', 'm6i.metal', 'm6i.xlarge', 'm6idn.12xlarge', 'm6idn.16xlarge', 'm6idn.24xlarge', 'm6idn.2xlarge', 'm6idn.32xlarge', 'm6idn.4xlarge', 'm6idn.8xlarge', 'm6idn.large', 'm6idn.metal', 'm6idn.xlarge', 'm6in.12xlarge', 'm6in.16xlarge', 'm6in.24xlarge', 'm6in.2xlarge', 'm6in.32xlarge', 'm6in.4xlarge', 'm6in.8xlarge', 'm6in.large', 'm6in.metal', 'm6in.xlarge', 'm7g.12xlarge', 'm7g.16xlarge', 'm7g.2xlarge', 'm7g.4xlarge', 'm7g.8xlarge', 'm7g.large', 'm7g.medium', 'm7g.metal', 'm7g.xlarge', 'm7gd.12xlarge', 'm7gd.16xlarge', 'm7gd.2xlarge', 'm7gd.4xlarge', 'm7gd.8xlarge', 'm7gd.large', 'm7gd.medium', 'm7gd.xlarge', 'm7i-flex.2xlarge', 'm7i-flex.4xlarge', 'm7i-flex.8xlarge', 'm7i-flex.large', 'm7i-flex.xlarge', 'm7i.12xlarge', 'm7i.16xlarge', 'm7i.24xlarge', 'm7i.2xlarge', 'm7i.48xlarge', 'm7i.4xlarge', 'm7i.8xlarge', 'm7i.large', 'm7i.metal-24xl', 'm7i.metal-48xl', 'm7i.xlarge', 'r3.2xlarge', 'r3.4xlarge', 'r3.8xlarge', 'r3.large', 'r3.xlarge', 'r4.16xlarge', 'r4.2xlarge', 'r4.4xlarge', 'r4.8xlarge', 'r4.large', 'r4.xlarge', 'r5.12xlarge', 'r5.16xlarge', 'r5.24xlarge', 'r5.2xlarge', 'r5.4xlarge', 'r5.8xlarge', 'r5.large', 'r5.metal', 'r5.xlarge', 'r5a.12xlarge', 'r5a.16xlarge', 'r5a.24xlarge', 'r5a.2xlarge', 'r5a.4xlarge', 'r5a.8xlarge', 'r5a.large', 'r5a.xlarge', 'r5ad.12xlarge', 'r5ad.16xlarge', 'r5ad.24xlarge', 'r5ad.2xlarge', 'r5ad.4xlarge', 'r5ad.8xlarge', 'r5ad.large', 'r5ad.xlarge', 'r5d.12xlarge', 'r5d.16xlarge', 'r5d.24xlarge', 'r5d.2xlarge', 'r5d.4xlarge', 'r5d.8xlarge', 'r5d.large', 'r5d.metal', 'r5d.xlarge', 'r5n.12xlarge', 'r5n.16xlarge', 'r5n.24xlarge', 'r5n.2xlarge', 'r5n.4xlarge', 'r5n.8xlarge', 'r5n.large', 'r5n.metal', 'r5n.xlarge', 'r6a.12xlarge', 'r6a.16xlarge', 'r6a.24xlarge', 'r6a.2xlarge', 'r6a.32xlarge', 'r6a.48xlarge', 'r6a.4xlarge', 'r6a.8xlarge', 'r6a.large', 'r6a.metal', 'r6a.xlarge', 'r6g.12xlarge', 'r6g.16xlarge', 'r6g.2xlarge', 'r6g.4xlarge', 'r6g.8xlarge', 'r6g.large', 'r6g.medium', 'r6g.metal', 'r6g.xlarge', 'r6gd.12xlarge', 'r6gd.16xlarge', 'r6gd.2xlarge', 'r6gd.4xlarge', 'r6gd.8xlarge', 'r6gd.large', 'r6gd.medium', 'r6gd.metal', 'r6gd.xlarge', 'r6i.12xlarge', 'r6i.16xlarge', 'r6i.24xlarge', 'r6i.2xlarge', 'r6i.32xlarge', 'r6i.4xlarge', 'r6i.8xlarge', 'r6i.large', 'r6i.metal', 'r6i.xlarge', 'r7g.12xlarge', 'r7g.16xlarge', 'r7g.2xlarge', 'r7g.4xlarge', 'r7g.8xlarge', 'r7g.large', 'r7g.medium', 'r7g.metal', 'r7g.xlarge', 'r7gd.12xlarge', 'r7gd.16xlarge', 'r7gd.2xlarge', 'r7gd.4xlarge', 'r7gd.8xlarge', 'r7gd.large', 'r7gd.medium', 'r7gd.xlarge', 'r7i.12xlarge', 'r7i.16xlarge', 'r7i.24xlarge', 'r7i.2xlarge', 'r7i.48xlarge', 'r7i.4xlarge', 'r7i.8xlarge', 'r7i.large', 'r7i.metal-24xl', 'r7i.metal-48xl', 'r7i.xlarge', 't1.micro', 't2.2xlarge', 't2.large', 't2.medium', 't2.micro', 't2.nano', 't2.small', 't2.xlarge', 't3.2xlarge', 't3.large', 't3.medium', 't3.micro', 't3.nano', 't3.small', 't3.xlarge', 't3a.2xlarge', 't3a.large', 't3a.medium', 't3a.micro', 't3a.nano', 't3a.small', 't3a.xlarge', 't4g.2xlarge', 't4g.large', 't4g.medium', 't4g.micro', 't4g.nano', 't4g.small', 't4g.xlarge', 'x2idn.16xlarge', 'x2idn.24xlarge', 'x2idn.32xlarge', 'x2idn.metal', 'x2iedn.16xlarge', 'x2iedn.24xlarge', 'x2iedn.2xlarge', 'x2iedn.32xlarge', 'x2iedn.4xlarge', 'x2iedn.8xlarge', 'x2iedn.metal', 'x2iedn.xlarge', 'z1d.12xlarge', 'z1d.2xlarge', 'z1d.3xlarge', 'z1d.6xlarge', 'z1d.large', 'z1d.metal', 'z1d.xlarge'], - 'us-west-2': ['a1.2xlarge', 'a1.4xlarge', 'a1.large', 'a1.medium', 'a1.metal', 'a1.xlarge', 'c1.medium', 'c1.xlarge', 'c3.2xlarge', 'c3.4xlarge', 'c3.8xlarge', 'c3.large', 'c3.xlarge', 'c4.2xlarge', 'c4.4xlarge', 'c4.8xlarge', 'c4.large', 'c4.xlarge', 'c5.12xlarge', 'c5.18xlarge', 'c5.24xlarge', 'c5.2xlarge', 'c5.4xlarge', 'c5.9xlarge', 'c5.large', 'c5.metal', 'c5.xlarge', 'c5a.12xlarge', 'c5a.16xlarge', 'c5a.24xlarge', 'c5a.2xlarge', 'c5a.4xlarge', 'c5a.8xlarge', 'c5a.large', 'c5a.xlarge', 'c5ad.12xlarge', 'c5ad.16xlarge', 'c5ad.24xlarge', 'c5ad.2xlarge', 'c5ad.4xlarge', 'c5ad.8xlarge', 'c5ad.large', 'c5ad.xlarge', 'c5d.12xlarge', 'c5d.18xlarge', 'c5d.24xlarge', 'c5d.2xlarge', 'c5d.4xlarge', 'c5d.9xlarge', 'c5d.large', 'c5d.metal', 'c5d.xlarge', 'c5n.18xlarge', 'c5n.2xlarge', 'c5n.4xlarge', 'c5n.9xlarge', 'c5n.large', 'c5n.metal', 'c5n.xlarge', 'c6a.12xlarge', 'c6a.16xlarge', 'c6a.24xlarge', 'c6a.2xlarge', 'c6a.32xlarge', 'c6a.48xlarge', 'c6a.4xlarge', 'c6a.8xlarge', 'c6a.large', 'c6a.metal', 'c6a.xlarge', 'c6g.12xlarge', 'c6g.16xlarge', 'c6g.2xlarge', 'c6g.4xlarge', 'c6g.8xlarge', 'c6g.large', 'c6g.medium', 'c6g.metal', 'c6g.xlarge', 'c6gd.12xlarge', 'c6gd.16xlarge', 'c6gd.2xlarge', 'c6gd.4xlarge', 'c6gd.8xlarge', 'c6gd.large', 'c6gd.medium', 'c6gd.metal', 'c6gd.xlarge', 'c6gn.12xlarge', 'c6gn.16xlarge', 'c6gn.2xlarge', 'c6gn.4xlarge', 'c6gn.8xlarge', 'c6gn.large', 'c6gn.medium', 'c6gn.xlarge', 'c6i.12xlarge', 'c6i.16xlarge', 'c6i.24xlarge', 'c6i.2xlarge', 'c6i.32xlarge', 'c6i.4xlarge', 'c6i.8xlarge', 'c6i.large', 'c6i.metal', 'c6i.xlarge', 'c6id.12xlarge', 'c6id.16xlarge', 'c6id.24xlarge', 'c6id.2xlarge', 'c6id.32xlarge', 'c6id.4xlarge', 'c6id.8xlarge', 'c6id.large', 'c6id.metal', 'c6id.xlarge', 'c6in.12xlarge', 'c6in.16xlarge', 'c6in.24xlarge', 'c6in.2xlarge', 'c6in.32xlarge', 'c6in.4xlarge', 'c6in.8xlarge', 'c6in.large', 'c6in.metal', 'c6in.xlarge', 'c7a.12xlarge', 'c7a.16xlarge', 'c7a.24xlarge', 'c7a.2xlarge', 'c7a.32xlarge', 'c7a.48xlarge', 'c7a.4xlarge', 'c7a.8xlarge', 'c7a.large', 'c7a.medium', 'c7a.metal-48xl', 'c7a.xlarge', 'c7g.12xlarge', 'c7g.16xlarge', 'c7g.2xlarge', 'c7g.4xlarge', 'c7g.8xlarge', 'c7g.large', 'c7g.medium', 'c7g.metal', 'c7g.xlarge', 'c7gd.12xlarge', 'c7gd.16xlarge', 'c7gd.2xlarge', 'c7gd.4xlarge', 'c7gd.8xlarge', 'c7gd.large', 'c7gd.medium', 'c7gd.xlarge', 'c7gn.12xlarge', 'c7gn.16xlarge', 'c7gn.2xlarge', 'c7gn.4xlarge', 'c7gn.8xlarge', 'c7gn.large', 'c7gn.medium', 'c7gn.xlarge', 'c7i.12xlarge', 'c7i.16xlarge', 'c7i.24xlarge', 'c7i.2xlarge', 'c7i.48xlarge', 'c7i.4xlarge', 'c7i.8xlarge', 'c7i.large', 'c7i.metal-24xl', 'c7i.metal-48xl', 'c7i.xlarge', 'cr1.8xlarge', 'd2.2xlarge', 'd2.4xlarge', 'd2.8xlarge', 'd2.xlarge', 'd3.2xlarge', 'd3.4xlarge', 'd3.8xlarge', 'd3.xlarge', 'd3en.12xlarge', 'd3en.2xlarge', 'd3en.4xlarge', 'd3en.6xlarge', 'd3en.8xlarge', 'd3en.xlarge', 'dl1.24xlarge', 'dl2q.24xlarge', 'f1.16xlarge', 'f1.2xlarge', 'f1.4xlarge', 'g2.2xlarge', 'g2.8xlarge', 'g3.16xlarge', 'g3.4xlarge', 'g3.8xlarge', 'g3s.xlarge', 'g4ad.16xlarge', 'g4ad.2xlarge', 'g4ad.4xlarge', 'g4ad.8xlarge', 'g4ad.xlarge', 'g4dn.12xlarge', 'g4dn.16xlarge', 'g4dn.2xlarge', 'g4dn.4xlarge', 'g4dn.8xlarge', 'g4dn.metal', 'g4dn.xlarge', 'g5.12xlarge', 'g5.16xlarge', 'g5.24xlarge', 'g5.2xlarge', 'g5.48xlarge', 'g5.4xlarge', 'g5.8xlarge', 'g5.xlarge', 'g5g.16xlarge', 'g5g.2xlarge', 'g5g.4xlarge', 'g5g.8xlarge', 'g5g.metal', 'g5g.xlarge', 'h1.16xlarge', 'h1.2xlarge', 'h1.4xlarge', 'h1.8xlarge', 'i2.2xlarge', 'i2.4xlarge', 'i2.8xlarge', 'i2.xlarge', 'i3.16xlarge', 'i3.2xlarge', 'i3.4xlarge', 'i3.8xlarge', 'i3.large', 'i3.metal', 'i3.xlarge', 'i3en.12xlarge', 'i3en.24xlarge', 'i3en.2xlarge', 'i3en.3xlarge', 'i3en.6xlarge', 'i3en.large', 'i3en.metal', 'i3en.xlarge', 'i4g.16xlarge', 'i4g.2xlarge', 'i4g.4xlarge', 'i4g.8xlarge', 'i4g.large', 'i4g.xlarge', 'i4i.12xlarge', 'i4i.16xlarge', 'i4i.24xlarge', 'i4i.2xlarge', 'i4i.32xlarge', 'i4i.4xlarge', 'i4i.8xlarge', 'i4i.large', 'i4i.metal', 'i4i.xlarge', 'im4gn.16xlarge', 'im4gn.2xlarge', 'im4gn.4xlarge', 'im4gn.8xlarge', 'im4gn.large', 'im4gn.xlarge', 'inf1.24xlarge', 'inf1.2xlarge', 'inf1.6xlarge', 'inf1.xlarge', 'inf2.24xlarge', 'inf2.48xlarge', 'inf2.8xlarge', 'inf2.xlarge', 'is4gen.2xlarge', 'is4gen.4xlarge', 'is4gen.8xlarge', 'is4gen.large', 'is4gen.medium', 'is4gen.xlarge', 'm1.large', 'm1.medium', 'm1.xlarge', 'm2.2xlarge', 'm2.4xlarge', 'm2.xlarge', 'm3.2xlarge', 'm3.large', 'm3.medium', 'm3.xlarge', 'm4.10xlarge', 'm4.16xlarge', 'm4.2xlarge', 'm4.4xlarge', 'm4.large', 'm4.xlarge', 'm5.12xlarge', 'm5.16xlarge', 'm5.24xlarge', 'm5.2xlarge', 'm5.4xlarge', 'm5.8xlarge', 'm5.large', 'm5.metal', 'm5.xlarge', 'm5a.12xlarge', 'm5a.16xlarge', 'm5a.24xlarge', 'm5a.2xlarge', 'm5a.4xlarge', 'm5a.8xlarge', 'm5a.large', 'm5a.xlarge', 'm5ad.12xlarge', 'm5ad.16xlarge', 'm5ad.24xlarge', 'm5ad.2xlarge', 'm5ad.4xlarge', 'm5ad.8xlarge', 'm5ad.large', 'm5ad.xlarge', 'm5d.12xlarge', 'm5d.16xlarge', 'm5d.24xlarge', 'm5d.2xlarge', 'm5d.4xlarge', 'm5d.8xlarge', 'm5d.large', 'm5d.metal', 'm5d.xlarge', 'm5dn.12xlarge', 'm5dn.16xlarge', 'm5dn.24xlarge', 'm5dn.2xlarge', 'm5dn.4xlarge', 'm5dn.8xlarge', 'm5dn.large', 'm5dn.metal', 'm5dn.xlarge', 'm5n.12xlarge', 'm5n.16xlarge', 'm5n.24xlarge', 'm5n.2xlarge', 'm5n.4xlarge', 'm5n.8xlarge', 'm5n.large', 'm5n.metal', 'm5n.xlarge', 'm5zn.12xlarge', 'm5zn.2xlarge', 'm5zn.3xlarge', 'm5zn.6xlarge', 'm5zn.large', 'm5zn.metal', 'm5zn.xlarge', 'm6a.12xlarge', 'm6a.16xlarge', 'm6a.24xlarge', 'm6a.2xlarge', 'm6a.32xlarge', 'm6a.48xlarge', 'm6a.4xlarge', 'm6a.8xlarge', 'm6a.large', 'm6a.metal', 'm6a.xlarge', 'm6g.12xlarge', 'm6g.16xlarge', 'm6g.2xlarge', 'm6g.4xlarge', 'm6g.8xlarge', 'm6g.large', 'm6g.medium', 'm6g.metal', 'm6g.xlarge', 'm6gd.12xlarge', 'm6gd.16xlarge', 'm6gd.2xlarge', 'm6gd.4xlarge', 'm6gd.8xlarge', 'm6gd.large', 'm6gd.medium', 'm6gd.metal', 'm6gd.xlarge', 'm6i.12xlarge', 'm6i.16xlarge', 'm6i.24xlarge', 'm6i.2xlarge', 'm6i.32xlarge', 'm6i.4xlarge', 'm6i.8xlarge', 'm6i.large', 'm6i.metal', 'm6i.xlarge', 'm6id.12xlarge', 'm6id.16xlarge', 'm6id.24xlarge', 'm6id.2xlarge', 'm6id.32xlarge', 'm6id.4xlarge', 'm6id.8xlarge', 'm6id.large', 'm6id.metal', 'm6id.xlarge', 'm6idn.12xlarge', 'm6idn.16xlarge', 'm6idn.24xlarge', 'm6idn.2xlarge', 'm6idn.32xlarge', 'm6idn.4xlarge', 'm6idn.8xlarge', 'm6idn.large', 'm6idn.metal', 'm6idn.xlarge', 'm6in.12xlarge', 'm6in.16xlarge', 'm6in.24xlarge', 'm6in.2xlarge', 'm6in.32xlarge', 'm6in.4xlarge', 'm6in.8xlarge', 'm6in.large', 'm6in.metal', 'm6in.xlarge', 'm7a.12xlarge', 'm7a.16xlarge', 'm7a.24xlarge', 'm7a.2xlarge', 'm7a.32xlarge', 'm7a.48xlarge', 'm7a.4xlarge', 'm7a.8xlarge', 'm7a.large', 'm7a.medium', 'm7a.metal-48xl', 'm7a.xlarge', 'm7g.12xlarge', 'm7g.16xlarge', 'm7g.2xlarge', 'm7g.4xlarge', 'm7g.8xlarge', 'm7g.large', 'm7g.medium', 'm7g.metal', 'm7g.xlarge', 'm7gd.12xlarge', 'm7gd.16xlarge', 'm7gd.2xlarge', 'm7gd.4xlarge', 'm7gd.8xlarge', 'm7gd.large', 'm7gd.medium', 'm7gd.xlarge', 'm7i-flex.2xlarge', 'm7i-flex.4xlarge', 'm7i-flex.8xlarge', 'm7i-flex.large', 'm7i-flex.xlarge', 'm7i.12xlarge', 'm7i.16xlarge', 'm7i.24xlarge', 'm7i.2xlarge', 'm7i.48xlarge', 'm7i.4xlarge', 'm7i.8xlarge', 'm7i.large', 'm7i.metal-24xl', 'm7i.metal-48xl', 'm7i.xlarge', 'p2.16xlarge', 'p2.8xlarge', 'p2.xlarge', 'p3.16xlarge', 'p3.2xlarge', 'p3.8xlarge', 'p3dn.24xlarge', 'p4d.24xlarge', 'p4de.24xlarge', 'p5.48xlarge', 'r3.2xlarge', 'r3.4xlarge', 'r3.8xlarge', 'r3.large', 'r3.xlarge', 'r4.16xlarge', 'r4.2xlarge', 'r4.4xlarge', 'r4.8xlarge', 'r4.large', 'r4.xlarge', 'r5.12xlarge', 'r5.16xlarge', 'r5.24xlarge', 'r5.2xlarge', 'r5.4xlarge', 'r5.8xlarge', 'r5.large', 'r5.metal', 'r5.xlarge', 'r5a.12xlarge', 'r5a.16xlarge', 'r5a.24xlarge', 'r5a.2xlarge', 'r5a.4xlarge', 'r5a.8xlarge', 'r5a.large', 'r5a.xlarge', 'r5ad.12xlarge', 'r5ad.16xlarge', 'r5ad.24xlarge', 'r5ad.2xlarge', 'r5ad.4xlarge', 'r5ad.8xlarge', 'r5ad.large', 'r5ad.xlarge', 'r5b.12xlarge', 'r5b.16xlarge', 'r5b.24xlarge', 'r5b.2xlarge', 'r5b.4xlarge', 'r5b.8xlarge', 'r5b.large', 'r5b.metal', 'r5b.xlarge', 'r5d.12xlarge', 'r5d.16xlarge', 'r5d.24xlarge', 'r5d.2xlarge', 'r5d.4xlarge', 'r5d.8xlarge', 'r5d.large', 'r5d.metal', 'r5d.xlarge', 'r5dn.12xlarge', 'r5dn.16xlarge', 'r5dn.24xlarge', 'r5dn.2xlarge', 'r5dn.4xlarge', 'r5dn.8xlarge', 'r5dn.large', 'r5dn.metal', 'r5dn.xlarge', 'r5n.12xlarge', 'r5n.16xlarge', 'r5n.24xlarge', 'r5n.2xlarge', 'r5n.4xlarge', 'r5n.8xlarge', 'r5n.large', 'r5n.metal', 'r5n.xlarge', 'r6a.12xlarge', 'r6a.16xlarge', 'r6a.24xlarge', 'r6a.2xlarge', 'r6a.32xlarge', 'r6a.48xlarge', 'r6a.4xlarge', 'r6a.8xlarge', 'r6a.large', 'r6a.metal', 'r6a.xlarge', 'r6g.12xlarge', 'r6g.16xlarge', 'r6g.2xlarge', 'r6g.4xlarge', 'r6g.8xlarge', 'r6g.large', 'r6g.medium', 'r6g.metal', 'r6g.xlarge', 'r6gd.12xlarge', 'r6gd.16xlarge', 'r6gd.2xlarge', 'r6gd.4xlarge', 'r6gd.8xlarge', 'r6gd.large', 'r6gd.medium', 'r6gd.metal', 'r6gd.xlarge', 'r6i.12xlarge', 'r6i.16xlarge', 'r6i.24xlarge', 'r6i.2xlarge', 'r6i.32xlarge', 'r6i.4xlarge', 'r6i.8xlarge', 'r6i.large', 'r6i.metal', 'r6i.xlarge', 'r6id.12xlarge', 'r6id.16xlarge', 'r6id.24xlarge', 'r6id.2xlarge', 'r6id.32xlarge', 'r6id.4xlarge', 'r6id.8xlarge', 'r6id.large', 'r6id.metal', 'r6id.xlarge', 'r6idn.12xlarge', 'r6idn.16xlarge', 'r6idn.24xlarge', 'r6idn.2xlarge', 'r6idn.32xlarge', 'r6idn.4xlarge', 'r6idn.8xlarge', 'r6idn.large', 'r6idn.metal', 'r6idn.xlarge', 'r6in.12xlarge', 'r6in.16xlarge', 'r6in.24xlarge', 'r6in.2xlarge', 'r6in.32xlarge', 'r6in.4xlarge', 'r6in.8xlarge', 'r6in.large', 'r6in.metal', 'r6in.xlarge', 'r7a.12xlarge', 'r7a.16xlarge', 'r7a.24xlarge', 'r7a.2xlarge', 'r7a.32xlarge', 'r7a.48xlarge', 'r7a.4xlarge', 'r7a.8xlarge', 'r7a.large', 'r7a.medium', 'r7a.metal-48xl', 'r7a.xlarge', 'r7g.12xlarge', 'r7g.16xlarge', 'r7g.2xlarge', 'r7g.4xlarge', 'r7g.8xlarge', 'r7g.large', 'r7g.medium', 'r7g.metal', 'r7g.xlarge', 'r7gd.12xlarge', 'r7gd.16xlarge', 'r7gd.2xlarge', 'r7gd.4xlarge', 'r7gd.8xlarge', 'r7gd.large', 'r7gd.medium', 'r7gd.xlarge', 'r7i.12xlarge', 'r7i.16xlarge', 'r7i.24xlarge', 'r7i.2xlarge', 'r7i.48xlarge', 'r7i.4xlarge', 'r7i.8xlarge', 'r7i.large', 'r7i.metal-24xl', 'r7i.metal-48xl', 'r7i.xlarge', 'r7iz.12xlarge', 'r7iz.16xlarge', 'r7iz.2xlarge', 'r7iz.32xlarge', 'r7iz.4xlarge', 'r7iz.8xlarge', 'r7iz.large', 'r7iz.metal-16xl', 'r7iz.metal-32xl', 'r7iz.xlarge', 't1.micro', 't2.2xlarge', 't2.large', 't2.medium', 't2.micro', 't2.nano', 't2.small', 't2.xlarge', 't3.2xlarge', 't3.large', 't3.medium', 't3.micro', 't3.nano', 't3.small', 't3.xlarge', 't3a.2xlarge', 't3a.large', 't3a.medium', 't3a.micro', 't3a.nano', 't3a.small', 't3a.xlarge', 't4g.2xlarge', 't4g.large', 't4g.medium', 't4g.micro', 't4g.nano', 't4g.small', 't4g.xlarge', 'trn1.2xlarge', 'trn1.32xlarge', 'trn1n.32xlarge', 'u-12tb1.112xlarge', 'u-18tb1.112xlarge', 'u-24tb1.112xlarge', 'u-3tb1.56xlarge', 'u-6tb1.112xlarge', 'u-6tb1.56xlarge', 'u-9tb1.112xlarge', 'vt1.24xlarge', 'vt1.3xlarge', 'vt1.6xlarge', 'x1.16xlarge', 'x1.32xlarge', 'x1e.16xlarge', 'x1e.2xlarge', 'x1e.32xlarge', 'x1e.4xlarge', 'x1e.8xlarge', 'x1e.xlarge', 'x2gd.12xlarge', 'x2gd.16xlarge', 'x2gd.2xlarge', 'x2gd.4xlarge', 'x2gd.8xlarge', 'x2gd.large', 'x2gd.medium', 'x2gd.metal', 'x2gd.xlarge', 'x2idn.16xlarge', 'x2idn.24xlarge', 'x2idn.32xlarge', 'x2idn.metal', 'x2iedn.16xlarge', 'x2iedn.24xlarge', 'x2iedn.2xlarge', 'x2iedn.32xlarge', 'x2iedn.4xlarge', 'x2iedn.8xlarge', 'x2iedn.metal', 'x2iedn.xlarge', 'x2iezn.12xlarge', 'x2iezn.2xlarge', 'x2iezn.4xlarge', 'x2iezn.6xlarge', 'x2iezn.8xlarge', 'x2iezn.metal', 'z1d.12xlarge', 'z1d.2xlarge', 'z1d.3xlarge', 'z1d.6xlarge', 'z1d.large', 'z1d.metal', 'z1d.xlarge'], - 'us-east-1': ['a1.2xlarge', 'a1.4xlarge', 'a1.large', 'a1.medium', 'a1.metal', 'a1.xlarge', 'c1.medium', 'c1.xlarge', 'c3.2xlarge', 'c3.4xlarge', 'c3.8xlarge', 'c3.large', 'c3.xlarge', 'c4.2xlarge', 'c4.4xlarge', 'c4.8xlarge', 'c4.large', 'c4.xlarge', 'c5.12xlarge', 'c5.18xlarge', 'c5.24xlarge', 'c5.2xlarge', 'c5.4xlarge', 'c5.9xlarge', 'c5.large', 'c5.metal', 'c5.xlarge', 'c5a.12xlarge', 'c5a.16xlarge', 'c5a.24xlarge', 'c5a.2xlarge', 'c5a.4xlarge', 'c5a.8xlarge', 'c5a.large', 'c5a.xlarge', 'c5ad.12xlarge', 'c5ad.16xlarge', 'c5ad.24xlarge', 'c5ad.2xlarge', 'c5ad.4xlarge', 'c5ad.8xlarge', 'c5ad.large', 'c5ad.xlarge', 'c5d.12xlarge', 'c5d.18xlarge', 'c5d.24xlarge', 'c5d.2xlarge', 'c5d.4xlarge', 'c5d.9xlarge', 'c5d.large', 'c5d.metal', 'c5d.xlarge', 'c5n.18xlarge', 'c5n.2xlarge', 'c5n.4xlarge', 'c5n.9xlarge', 'c5n.large', 'c5n.metal', 'c5n.xlarge', 'c6a.12xlarge', 'c6a.16xlarge', 'c6a.24xlarge', 'c6a.2xlarge', 'c6a.32xlarge', 'c6a.48xlarge', 'c6a.4xlarge', 'c6a.8xlarge', 'c6a.large', 'c6a.metal', 'c6a.xlarge', 'c6g.12xlarge', 'c6g.16xlarge', 'c6g.2xlarge', 'c6g.4xlarge', 'c6g.8xlarge', 'c6g.large', 'c6g.medium', 'c6g.metal', 'c6g.xlarge', 'c6gd.12xlarge', 'c6gd.16xlarge', 'c6gd.2xlarge', 'c6gd.4xlarge', 'c6gd.8xlarge', 'c6gd.large', 'c6gd.medium', 'c6gd.metal', 'c6gd.xlarge', 'c6gn.12xlarge', 'c6gn.16xlarge', 'c6gn.2xlarge', 'c6gn.4xlarge', 'c6gn.8xlarge', 'c6gn.large', 'c6gn.medium', 'c6gn.xlarge', 'c6i.12xlarge', 'c6i.16xlarge', 'c6i.24xlarge', 'c6i.2xlarge', 'c6i.32xlarge', 'c6i.4xlarge', 'c6i.8xlarge', 'c6i.large', 'c6i.metal', 'c6i.xlarge', 'c6id.12xlarge', 'c6id.16xlarge', 'c6id.24xlarge', 'c6id.2xlarge', 'c6id.32xlarge', 'c6id.4xlarge', 'c6id.8xlarge', 'c6id.large', 'c6id.metal', 'c6id.xlarge', 'c6in.12xlarge', 'c6in.16xlarge', 'c6in.24xlarge', 'c6in.2xlarge', 'c6in.32xlarge', 'c6in.4xlarge', 'c6in.8xlarge', 'c6in.large', 'c6in.metal', 'c6in.xlarge', 'c7a.12xlarge', 'c7a.16xlarge', 'c7a.24xlarge', 'c7a.2xlarge', 'c7a.32xlarge', 'c7a.48xlarge', 'c7a.4xlarge', 'c7a.8xlarge', 'c7a.large', 'c7a.medium', 'c7a.metal-48xl', 'c7a.xlarge', 'c7g.12xlarge', 'c7g.16xlarge', 'c7g.2xlarge', 'c7g.4xlarge', 'c7g.8xlarge', 'c7g.large', 'c7g.medium', 'c7g.metal', 'c7g.xlarge', 'c7gd.12xlarge', 'c7gd.16xlarge', 'c7gd.2xlarge', 'c7gd.4xlarge', 'c7gd.8xlarge', 'c7gd.large', 'c7gd.medium', 'c7gd.xlarge', 'c7gn.12xlarge', 'c7gn.16xlarge', 'c7gn.2xlarge', 'c7gn.4xlarge', 'c7gn.8xlarge', 'c7gn.large', 'c7gn.medium', 'c7gn.xlarge', 'c7i.12xlarge', 'c7i.16xlarge', 'c7i.24xlarge', 'c7i.2xlarge', 'c7i.48xlarge', 'c7i.4xlarge', 'c7i.8xlarge', 'c7i.large', 'c7i.metal-24xl', 'c7i.metal-48xl', 'c7i.xlarge', 'cr1.8xlarge', 'd2.2xlarge', 'd2.4xlarge', 'd2.8xlarge', 'd2.xlarge', 'd3.2xlarge', 'd3.4xlarge', 'd3.8xlarge', 'd3.xlarge', 'd3en.12xlarge', 'd3en.2xlarge', 'd3en.4xlarge', 'd3en.6xlarge', 'd3en.8xlarge', 'd3en.xlarge', 'dl1.24xlarge', 'f1.16xlarge', 'f1.2xlarge', 'f1.4xlarge', 'g2.2xlarge', 'g2.8xlarge', 'g3.16xlarge', 'g3.4xlarge', 'g3.8xlarge', 'g3s.xlarge', 'g4ad.16xlarge', 'g4ad.2xlarge', 'g4ad.4xlarge', 'g4ad.8xlarge', 'g4ad.xlarge', 'g4dn.12xlarge', 'g4dn.16xlarge', 'g4dn.2xlarge', 'g4dn.4xlarge', 'g4dn.8xlarge', 'g4dn.metal', 'g4dn.xlarge', 'g5.12xlarge', 'g5.16xlarge', 'g5.24xlarge', 'g5.2xlarge', 'g5.48xlarge', 'g5.4xlarge', 'g5.8xlarge', 'g5.xlarge', 'g5g.16xlarge', 'g5g.2xlarge', 'g5g.4xlarge', 'g5g.8xlarge', 'g5g.metal', 'g5g.xlarge', 'h1.16xlarge', 'h1.2xlarge', 'h1.4xlarge', 'h1.8xlarge', 'hpc7g.16xlarge', 'hpc7g.4xlarge', 'hpc7g.8xlarge', 'i2.2xlarge', 'i2.4xlarge', 'i2.8xlarge', 'i2.xlarge', 'i3.16xlarge', 'i3.2xlarge', 'i3.4xlarge', 'i3.8xlarge', 'i3.large', 'i3.metal', 'i3.xlarge', 'i3en.12xlarge', 'i3en.24xlarge', 'i3en.2xlarge', 'i3en.3xlarge', 'i3en.6xlarge', 'i3en.large', 'i3en.metal', 'i3en.xlarge', 'i4g.16xlarge', 'i4g.2xlarge', 'i4g.4xlarge', 'i4g.8xlarge', 'i4g.large', 'i4g.xlarge', 'i4i.12xlarge', 'i4i.16xlarge', 'i4i.24xlarge', 'i4i.2xlarge', 'i4i.32xlarge', 'i4i.4xlarge', 'i4i.8xlarge', 'i4i.large', 'i4i.metal', 'i4i.xlarge', 'im4gn.16xlarge', 'im4gn.2xlarge', 'im4gn.4xlarge', 'im4gn.8xlarge', 'im4gn.large', 'im4gn.xlarge', 'inf1.24xlarge', 'inf1.2xlarge', 'inf1.6xlarge', 'inf1.xlarge', 'inf2.24xlarge', 'inf2.48xlarge', 'inf2.8xlarge', 'inf2.xlarge', 'is4gen.2xlarge', 'is4gen.4xlarge', 'is4gen.8xlarge', 'is4gen.large', 'is4gen.medium', 'is4gen.xlarge', 'm1.large', 'm1.medium', 'm1.xlarge', 'm2.2xlarge', 'm2.4xlarge', 'm2.xlarge', 'm3.2xlarge', 'm3.large', 'm3.medium', 'm3.xlarge', 'm4.10xlarge', 'm4.16xlarge', 'm4.2xlarge', 'm4.4xlarge', 'm4.large', 'm4.xlarge', 'm5.12xlarge', 'm5.16xlarge', 'm5.24xlarge', 'm5.2xlarge', 'm5.4xlarge', 'm5.8xlarge', 'm5.large', 'm5.metal', 'm5.xlarge', 'm5a.12xlarge', 'm5a.16xlarge', 'm5a.24xlarge', 'm5a.2xlarge', 'm5a.4xlarge', 'm5a.8xlarge', 'm5a.large', 'm5a.xlarge', 'm5ad.12xlarge', 'm5ad.16xlarge', 'm5ad.24xlarge', 'm5ad.2xlarge', 'm5ad.4xlarge', 'm5ad.8xlarge', 'm5ad.large', 'm5ad.xlarge', 'm5d.12xlarge', 'm5d.16xlarge', 'm5d.24xlarge', 'm5d.2xlarge', 'm5d.4xlarge', 'm5d.8xlarge', 'm5d.large', 'm5d.metal', 'm5d.xlarge', 'm5dn.12xlarge', 'm5dn.16xlarge', 'm5dn.24xlarge', 'm5dn.2xlarge', 'm5dn.4xlarge', 'm5dn.8xlarge', 'm5dn.large', 'm5dn.metal', 'm5dn.xlarge', 'm5n.12xlarge', 'm5n.16xlarge', 'm5n.24xlarge', 'm5n.2xlarge', 'm5n.4xlarge', 'm5n.8xlarge', 'm5n.large', 'm5n.metal', 'm5n.xlarge', 'm5zn.12xlarge', 'm5zn.2xlarge', 'm5zn.3xlarge', 'm5zn.6xlarge', 'm5zn.large', 'm5zn.metal', 'm5zn.xlarge', 'm6a.12xlarge', 'm6a.16xlarge', 'm6a.24xlarge', 'm6a.2xlarge', 'm6a.32xlarge', 'm6a.48xlarge', 'm6a.4xlarge', 'm6a.8xlarge', 'm6a.large', 'm6a.metal', 'm6a.xlarge', 'm6g.12xlarge', 'm6g.16xlarge', 'm6g.2xlarge', 'm6g.4xlarge', 'm6g.8xlarge', 'm6g.large', 'm6g.medium', 'm6g.metal', 'm6g.xlarge', 'm6gd.12xlarge', 'm6gd.16xlarge', 'm6gd.2xlarge', 'm6gd.4xlarge', 'm6gd.8xlarge', 'm6gd.large', 'm6gd.medium', 'm6gd.metal', 'm6gd.xlarge', 'm6i.12xlarge', 'm6i.16xlarge', 'm6i.24xlarge', 'm6i.2xlarge', 'm6i.32xlarge', 'm6i.4xlarge', 'm6i.8xlarge', 'm6i.large', 'm6i.metal', 'm6i.xlarge', 'm6id.12xlarge', 'm6id.16xlarge', 'm6id.24xlarge', 'm6id.2xlarge', 'm6id.32xlarge', 'm6id.4xlarge', 'm6id.8xlarge', 'm6id.large', 'm6id.metal', 'm6id.xlarge', 'm6idn.12xlarge', 'm6idn.16xlarge', 'm6idn.24xlarge', 'm6idn.2xlarge', 'm6idn.32xlarge', 'm6idn.4xlarge', 'm6idn.8xlarge', 'm6idn.large', 'm6idn.metal', 'm6idn.xlarge', 'm6in.12xlarge', 'm6in.16xlarge', 'm6in.24xlarge', 'm6in.2xlarge', 'm6in.32xlarge', 'm6in.4xlarge', 'm6in.8xlarge', 'm6in.large', 'm6in.metal', 'm6in.xlarge', 'm7a.12xlarge', 'm7a.16xlarge', 'm7a.24xlarge', 'm7a.2xlarge', 'm7a.32xlarge', 'm7a.48xlarge', 'm7a.4xlarge', 'm7a.8xlarge', 'm7a.large', 'm7a.medium', 'm7a.metal-48xl', 'm7a.xlarge', 'm7g.12xlarge', 'm7g.16xlarge', 'm7g.2xlarge', 'm7g.4xlarge', 'm7g.8xlarge', 'm7g.large', 'm7g.medium', 'm7g.metal', 'm7g.xlarge', 'm7gd.12xlarge', 'm7gd.16xlarge', 'm7gd.2xlarge', 'm7gd.4xlarge', 'm7gd.8xlarge', 'm7gd.large', 'm7gd.medium', 'm7gd.xlarge', 'm7i-flex.2xlarge', 'm7i-flex.4xlarge', 'm7i-flex.8xlarge', 'm7i-flex.large', 'm7i-flex.xlarge', 'm7i.12xlarge', 'm7i.16xlarge', 'm7i.24xlarge', 'm7i.2xlarge', 'm7i.48xlarge', 'm7i.4xlarge', 'm7i.8xlarge', 'm7i.large', 'm7i.metal-24xl', 'm7i.metal-48xl', 'm7i.xlarge', 'p2.16xlarge', 'p2.8xlarge', 'p2.xlarge', 'p3.16xlarge', 'p3.2xlarge', 'p3.8xlarge', 'p3dn.24xlarge', 'p4d.24xlarge', 'p4de.24xlarge', 'p5.48xlarge', 'r3.2xlarge', 'r3.4xlarge', 'r3.8xlarge', 'r3.large', 'r3.xlarge', 'r4.16xlarge', 'r4.2xlarge', 'r4.4xlarge', 'r4.8xlarge', 'r4.large', 'r4.xlarge', 'r5.12xlarge', 'r5.16xlarge', 'r5.24xlarge', 'r5.2xlarge', 'r5.4xlarge', 'r5.8xlarge', 'r5.large', 'r5.metal', 'r5.xlarge', 'r5a.12xlarge', 'r5a.16xlarge', 'r5a.24xlarge', 'r5a.2xlarge', 'r5a.4xlarge', 'r5a.8xlarge', 'r5a.large', 'r5a.xlarge', 'r5ad.12xlarge', 'r5ad.16xlarge', 'r5ad.24xlarge', 'r5ad.2xlarge', 'r5ad.4xlarge', 'r5ad.8xlarge', 'r5ad.large', 'r5ad.xlarge', 'r5b.12xlarge', 'r5b.16xlarge', 'r5b.24xlarge', 'r5b.2xlarge', 'r5b.4xlarge', 'r5b.8xlarge', 'r5b.large', 'r5b.metal', 'r5b.xlarge', 'r5d.12xlarge', 'r5d.16xlarge', 'r5d.24xlarge', 'r5d.2xlarge', 'r5d.4xlarge', 'r5d.8xlarge', 'r5d.large', 'r5d.metal', 'r5d.xlarge', 'r5dn.12xlarge', 'r5dn.16xlarge', 'r5dn.24xlarge', 'r5dn.2xlarge', 'r5dn.4xlarge', 'r5dn.8xlarge', 'r5dn.large', 'r5dn.metal', 'r5dn.xlarge', 'r5n.12xlarge', 'r5n.16xlarge', 'r5n.24xlarge', 'r5n.2xlarge', 'r5n.4xlarge', 'r5n.8xlarge', 'r5n.large', 'r5n.metal', 'r5n.xlarge', 'r6a.12xlarge', 'r6a.16xlarge', 'r6a.24xlarge', 'r6a.2xlarge', 'r6a.32xlarge', 'r6a.48xlarge', 'r6a.4xlarge', 'r6a.8xlarge', 'r6a.large', 'r6a.metal', 'r6a.xlarge', 'r6g.12xlarge', 'r6g.16xlarge', 'r6g.2xlarge', 'r6g.4xlarge', 'r6g.8xlarge', 'r6g.large', 'r6g.medium', 'r6g.metal', 'r6g.xlarge', 'r6gd.12xlarge', 'r6gd.16xlarge', 'r6gd.2xlarge', 'r6gd.4xlarge', 'r6gd.8xlarge', 'r6gd.large', 'r6gd.medium', 'r6gd.metal', 'r6gd.xlarge', 'r6i.12xlarge', 'r6i.16xlarge', 'r6i.24xlarge', 'r6i.2xlarge', 'r6i.32xlarge', 'r6i.4xlarge', 'r6i.8xlarge', 'r6i.large', 'r6i.metal', 'r6i.xlarge', 'r6id.12xlarge', 'r6id.16xlarge', 'r6id.24xlarge', 'r6id.2xlarge', 'r6id.32xlarge', 'r6id.4xlarge', 'r6id.8xlarge', 'r6id.large', 'r6id.metal', 'r6id.xlarge', 'r6idn.12xlarge', 'r6idn.16xlarge', 'r6idn.24xlarge', 'r6idn.2xlarge', 'r6idn.32xlarge', 'r6idn.4xlarge', 'r6idn.8xlarge', 'r6idn.large', 'r6idn.metal', 'r6idn.xlarge', 'r6in.12xlarge', 'r6in.16xlarge', 'r6in.24xlarge', 'r6in.2xlarge', 'r6in.32xlarge', 'r6in.4xlarge', 'r6in.8xlarge', 'r6in.large', 'r6in.metal', 'r6in.xlarge', 'r7a.12xlarge', 'r7a.16xlarge', 'r7a.24xlarge', 'r7a.2xlarge', 'r7a.32xlarge', 'r7a.48xlarge', 'r7a.4xlarge', 'r7a.8xlarge', 'r7a.large', 'r7a.medium', 'r7a.metal-48xl', 'r7a.xlarge', 'r7g.12xlarge', 'r7g.16xlarge', 'r7g.2xlarge', 'r7g.4xlarge', 'r7g.8xlarge', 'r7g.large', 'r7g.medium', 'r7g.metal', 'r7g.xlarge', 'r7gd.12xlarge', 'r7gd.16xlarge', 'r7gd.2xlarge', 'r7gd.4xlarge', 'r7gd.8xlarge', 'r7gd.large', 'r7gd.medium', 'r7gd.xlarge', 'r7i.12xlarge', 'r7i.16xlarge', 'r7i.24xlarge', 'r7i.2xlarge', 'r7i.48xlarge', 'r7i.4xlarge', 'r7i.8xlarge', 'r7i.large', 'r7i.metal-24xl', 'r7i.metal-48xl', 'r7i.xlarge', 'r7iz.12xlarge', 'r7iz.16xlarge', 'r7iz.2xlarge', 'r7iz.32xlarge', 'r7iz.4xlarge', 'r7iz.8xlarge', 'r7iz.large', 'r7iz.metal-16xl', 'r7iz.metal-32xl', 'r7iz.xlarge', 't1.micro', 't2.2xlarge', 't2.large', 't2.medium', 't2.micro', 't2.nano', 't2.small', 't2.xlarge', 't3.2xlarge', 't3.large', 't3.medium', 't3.micro', 't3.nano', 't3.small', 't3.xlarge', 't3a.2xlarge', 't3a.large', 't3a.medium', 't3a.micro', 't3a.nano', 't3a.small', 't3a.xlarge', 't4g.2xlarge', 't4g.large', 't4g.medium', 't4g.micro', 't4g.nano', 't4g.small', 't4g.xlarge', 'trn1.2xlarge', 'trn1.32xlarge', 'trn1n.32xlarge', 'u-12tb1.112xlarge', 'u-18tb1.112xlarge', 'u-24tb1.112xlarge', 'u-3tb1.56xlarge', 'u-6tb1.112xlarge', 'u-6tb1.56xlarge', 'u-9tb1.112xlarge', 'vt1.24xlarge', 'vt1.3xlarge', 'vt1.6xlarge', 'x1.16xlarge', 'x1.32xlarge', 'x1e.16xlarge', 'x1e.2xlarge', 'x1e.32xlarge', 'x1e.4xlarge', 'x1e.8xlarge', 'x1e.xlarge', 'x2gd.12xlarge', 'x2gd.16xlarge', 'x2gd.2xlarge', 'x2gd.4xlarge', 'x2gd.8xlarge', 'x2gd.large', 'x2gd.medium', 'x2gd.metal', 'x2gd.xlarge', 'x2idn.16xlarge', 'x2idn.24xlarge', 'x2idn.32xlarge', 'x2idn.metal', 'x2iedn.16xlarge', 'x2iedn.24xlarge', 'x2iedn.2xlarge', 'x2iedn.32xlarge', 'x2iedn.4xlarge', 'x2iedn.8xlarge', 'x2iedn.metal', 'x2iedn.xlarge', 'x2iezn.12xlarge', 'x2iezn.2xlarge', 'x2iezn.4xlarge', 'x2iezn.6xlarge', 'x2iezn.8xlarge', 'x2iezn.metal', 'z1d.12xlarge', 'z1d.2xlarge', 'z1d.3xlarge', 'z1d.6xlarge', 'z1d.large', 'z1d.metal', 'z1d.xlarge'], - 'us-east-2': ['a1.2xlarge', 'a1.4xlarge', 'a1.large', 'a1.medium', 'a1.metal', 'a1.xlarge', 'c4.2xlarge', 'c4.4xlarge', 'c4.8xlarge', 'c4.large', 'c4.xlarge', 'c5.12xlarge', 'c5.18xlarge', 'c5.24xlarge', 'c5.2xlarge', 'c5.4xlarge', 'c5.9xlarge', 'c5.large', 'c5.metal', 'c5.xlarge', 'c5a.12xlarge', 'c5a.16xlarge', 'c5a.24xlarge', 'c5a.2xlarge', 'c5a.4xlarge', 'c5a.8xlarge', 'c5a.large', 'c5a.xlarge', 'c5ad.12xlarge', 'c5ad.16xlarge', 'c5ad.24xlarge', 'c5ad.2xlarge', 'c5ad.4xlarge', 'c5ad.8xlarge', 'c5ad.large', 'c5ad.xlarge', 'c5d.12xlarge', 'c5d.18xlarge', 'c5d.24xlarge', 'c5d.2xlarge', 'c5d.4xlarge', 'c5d.9xlarge', 'c5d.large', 'c5d.metal', 'c5d.xlarge', 'c5n.18xlarge', 'c5n.2xlarge', 'c5n.4xlarge', 'c5n.9xlarge', 'c5n.large', 'c5n.metal', 'c5n.xlarge', 'c6a.12xlarge', 'c6a.16xlarge', 'c6a.24xlarge', 'c6a.2xlarge', 'c6a.32xlarge', 'c6a.48xlarge', 'c6a.4xlarge', 'c6a.8xlarge', 'c6a.large', 'c6a.metal', 'c6a.xlarge', 'c6g.12xlarge', 'c6g.16xlarge', 'c6g.2xlarge', 'c6g.4xlarge', 'c6g.8xlarge', 'c6g.large', 'c6g.medium', 'c6g.metal', 'c6g.xlarge', 'c6gd.12xlarge', 'c6gd.16xlarge', 'c6gd.2xlarge', 'c6gd.4xlarge', 'c6gd.8xlarge', 'c6gd.large', 'c6gd.medium', 'c6gd.metal', 'c6gd.xlarge', 'c6gn.12xlarge', 'c6gn.16xlarge', 'c6gn.2xlarge', 'c6gn.4xlarge', 'c6gn.8xlarge', 'c6gn.large', 'c6gn.medium', 'c6gn.xlarge', 'c6i.12xlarge', 'c6i.16xlarge', 'c6i.24xlarge', 'c6i.2xlarge', 'c6i.32xlarge', 'c6i.4xlarge', 'c6i.8xlarge', 'c6i.large', 'c6i.metal', 'c6i.xlarge', 'c6id.12xlarge', 'c6id.16xlarge', 'c6id.24xlarge', 'c6id.2xlarge', 'c6id.32xlarge', 'c6id.4xlarge', 'c6id.8xlarge', 'c6id.large', 'c6id.metal', 'c6id.xlarge', 'c6in.12xlarge', 'c6in.16xlarge', 'c6in.24xlarge', 'c6in.2xlarge', 'c6in.32xlarge', 'c6in.4xlarge', 'c6in.8xlarge', 'c6in.large', 'c6in.metal', 'c6in.xlarge', 'c7a.12xlarge', 'c7a.16xlarge', 'c7a.24xlarge', 'c7a.2xlarge', 'c7a.32xlarge', 'c7a.48xlarge', 'c7a.4xlarge', 'c7a.8xlarge', 'c7a.large', 'c7a.medium', 'c7a.metal-48xl', 'c7a.xlarge', 'c7g.12xlarge', 'c7g.16xlarge', 'c7g.2xlarge', 'c7g.4xlarge', 'c7g.8xlarge', 'c7g.large', 'c7g.medium', 'c7g.metal', 'c7g.xlarge', 'c7gd.12xlarge', 'c7gd.16xlarge', 'c7gd.2xlarge', 'c7gd.4xlarge', 'c7gd.8xlarge', 'c7gd.large', 'c7gd.medium', 'c7gd.xlarge', 'c7gn.12xlarge', 'c7gn.16xlarge', 'c7gn.2xlarge', 'c7gn.4xlarge', 'c7gn.8xlarge', 'c7gn.large', 'c7gn.medium', 'c7gn.xlarge', 'c7i.12xlarge', 'c7i.16xlarge', 'c7i.24xlarge', 'c7i.2xlarge', 'c7i.48xlarge', 'c7i.4xlarge', 'c7i.8xlarge', 'c7i.large', 'c7i.metal-24xl', 'c7i.metal-48xl', 'c7i.xlarge', 'd2.2xlarge', 'd2.4xlarge', 'd2.8xlarge', 'd2.xlarge', 'd3.2xlarge', 'd3.4xlarge', 'd3.8xlarge', 'd3.xlarge', 'g3.16xlarge', 'g3.4xlarge', 'g3.8xlarge', 'g3s.xlarge', 'g4ad.16xlarge', 'g4ad.2xlarge', 'g4ad.4xlarge', 'g4ad.8xlarge', 'g4ad.xlarge', 'g4dn.12xlarge', 'g4dn.16xlarge', 'g4dn.2xlarge', 'g4dn.4xlarge', 'g4dn.8xlarge', 'g4dn.metal', 'g4dn.xlarge', 'g5.12xlarge', 'g5.16xlarge', 'g5.24xlarge', 'g5.2xlarge', 'g5.48xlarge', 'g5.4xlarge', 'g5.8xlarge', 'g5.xlarge', 'h1.16xlarge', 'h1.2xlarge', 'h1.4xlarge', 'h1.8xlarge', 'hpc6a.48xlarge', 'hpc6id.32xlarge', 'hpc7a.12xlarge', 'hpc7a.24xlarge', 'hpc7a.48xlarge', 'hpc7a.96xlarge', 'i2.2xlarge', 'i2.4xlarge', 'i2.8xlarge', 'i2.xlarge', 'i3.16xlarge', 'i3.2xlarge', 'i3.4xlarge', 'i3.8xlarge', 'i3.large', 'i3.metal', 'i3.xlarge', 'i3en.12xlarge', 'i3en.24xlarge', 'i3en.2xlarge', 'i3en.3xlarge', 'i3en.6xlarge', 'i3en.large', 'i3en.metal', 'i3en.xlarge', 'i4g.16xlarge', 'i4g.2xlarge', 'i4g.4xlarge', 'i4g.8xlarge', 'i4g.large', 'i4g.xlarge', 'i4i.12xlarge', 'i4i.16xlarge', 'i4i.24xlarge', 'i4i.2xlarge', 'i4i.32xlarge', 'i4i.4xlarge', 'i4i.8xlarge', 'i4i.large', 'i4i.metal', 'i4i.xlarge', 'im4gn.16xlarge', 'im4gn.2xlarge', 'im4gn.4xlarge', 'im4gn.8xlarge', 'im4gn.large', 'im4gn.xlarge', 'inf1.24xlarge', 'inf1.2xlarge', 'inf1.6xlarge', 'inf1.xlarge', 'inf2.24xlarge', 'inf2.48xlarge', 'inf2.8xlarge', 'inf2.xlarge', 'is4gen.2xlarge', 'is4gen.4xlarge', 'is4gen.8xlarge', 'is4gen.large', 'is4gen.medium', 'is4gen.xlarge', 'm4.10xlarge', 'm4.16xlarge', 'm4.2xlarge', 'm4.4xlarge', 'm4.large', 'm4.xlarge', 'm5.12xlarge', 'm5.16xlarge', 'm5.24xlarge', 'm5.2xlarge', 'm5.4xlarge', 'm5.8xlarge', 'm5.large', 'm5.metal', 'm5.xlarge', 'm5a.12xlarge', 'm5a.16xlarge', 'm5a.24xlarge', 'm5a.2xlarge', 'm5a.4xlarge', 'm5a.8xlarge', 'm5a.large', 'm5a.xlarge', 'm5ad.12xlarge', 'm5ad.16xlarge', 'm5ad.24xlarge', 'm5ad.2xlarge', 'm5ad.4xlarge', 'm5ad.8xlarge', 'm5ad.large', 'm5ad.xlarge', 'm5d.12xlarge', 'm5d.16xlarge', 'm5d.24xlarge', 'm5d.2xlarge', 'm5d.4xlarge', 'm5d.8xlarge', 'm5d.large', 'm5d.metal', 'm5d.xlarge', 'm5dn.12xlarge', 'm5dn.16xlarge', 'm5dn.24xlarge', 'm5dn.2xlarge', 'm5dn.4xlarge', 'm5dn.8xlarge', 'm5dn.large', 'm5dn.metal', 'm5dn.xlarge', 'm5n.12xlarge', 'm5n.16xlarge', 'm5n.24xlarge', 'm5n.2xlarge', 'm5n.4xlarge', 'm5n.8xlarge', 'm5n.large', 'm5n.metal', 'm5n.xlarge', 'm5zn.12xlarge', 'm5zn.2xlarge', 'm5zn.3xlarge', 'm5zn.6xlarge', 'm5zn.large', 'm5zn.metal', 'm5zn.xlarge', 'm6a.12xlarge', 'm6a.16xlarge', 'm6a.24xlarge', 'm6a.2xlarge', 'm6a.32xlarge', 'm6a.48xlarge', 'm6a.4xlarge', 'm6a.8xlarge', 'm6a.large', 'm6a.metal', 'm6a.xlarge', 'm6g.12xlarge', 'm6g.16xlarge', 'm6g.2xlarge', 'm6g.4xlarge', 'm6g.8xlarge', 'm6g.large', 'm6g.medium', 'm6g.metal', 'm6g.xlarge', 'm6gd.12xlarge', 'm6gd.16xlarge', 'm6gd.2xlarge', 'm6gd.4xlarge', 'm6gd.8xlarge', 'm6gd.large', 'm6gd.medium', 'm6gd.metal', 'm6gd.xlarge', 'm6i.12xlarge', 'm6i.16xlarge', 'm6i.24xlarge', 'm6i.2xlarge', 'm6i.32xlarge', 'm6i.4xlarge', 'm6i.8xlarge', 'm6i.large', 'm6i.metal', 'm6i.xlarge', 'm6id.12xlarge', 'm6id.16xlarge', 'm6id.24xlarge', 'm6id.2xlarge', 'm6id.32xlarge', 'm6id.4xlarge', 'm6id.8xlarge', 'm6id.large', 'm6id.metal', 'm6id.xlarge', 'm6idn.12xlarge', 'm6idn.16xlarge', 'm6idn.24xlarge', 'm6idn.2xlarge', 'm6idn.32xlarge', 'm6idn.4xlarge', 'm6idn.8xlarge', 'm6idn.large', 'm6idn.metal', 'm6idn.xlarge', 'm6in.12xlarge', 'm6in.16xlarge', 'm6in.24xlarge', 'm6in.2xlarge', 'm6in.32xlarge', 'm6in.4xlarge', 'm6in.8xlarge', 'm6in.large', 'm6in.metal', 'm6in.xlarge', 'm7a.12xlarge', 'm7a.16xlarge', 'm7a.24xlarge', 'm7a.2xlarge', 'm7a.32xlarge', 'm7a.48xlarge', 'm7a.4xlarge', 'm7a.8xlarge', 'm7a.large', 'm7a.medium', 'm7a.metal-48xl', 'm7a.xlarge', 'm7g.12xlarge', 'm7g.16xlarge', 'm7g.2xlarge', 'm7g.4xlarge', 'm7g.8xlarge', 'm7g.large', 'm7g.medium', 'm7g.metal', 'm7g.xlarge', 'm7gd.12xlarge', 'm7gd.16xlarge', 'm7gd.2xlarge', 'm7gd.4xlarge', 'm7gd.8xlarge', 'm7gd.large', 'm7gd.medium', 'm7gd.xlarge', 'm7i-flex.2xlarge', 'm7i-flex.4xlarge', 'm7i-flex.8xlarge', 'm7i-flex.large', 'm7i-flex.xlarge', 'm7i.12xlarge', 'm7i.16xlarge', 'm7i.24xlarge', 'm7i.2xlarge', 'm7i.48xlarge', 'm7i.4xlarge', 'm7i.8xlarge', 'm7i.large', 'm7i.metal-24xl', 'm7i.metal-48xl', 'm7i.xlarge', 'p2.16xlarge', 'p2.8xlarge', 'p2.xlarge', 'p3.16xlarge', 'p3.2xlarge', 'p3.8xlarge', 'p4d.24xlarge', 'p5.48xlarge', 'r3.2xlarge', 'r3.4xlarge', 'r3.8xlarge', 'r3.large', 'r3.xlarge', 'r4.16xlarge', 'r4.2xlarge', 'r4.4xlarge', 'r4.8xlarge', 'r4.large', 'r4.xlarge', 'r5.12xlarge', 'r5.16xlarge', 'r5.24xlarge', 'r5.2xlarge', 'r5.4xlarge', 'r5.8xlarge', 'r5.large', 'r5.metal', 'r5.xlarge', 'r5a.12xlarge', 'r5a.16xlarge', 'r5a.24xlarge', 'r5a.2xlarge', 'r5a.4xlarge', 'r5a.8xlarge', 'r5a.large', 'r5a.xlarge', 'r5ad.12xlarge', 'r5ad.16xlarge', 'r5ad.24xlarge', 'r5ad.2xlarge', 'r5ad.4xlarge', 'r5ad.8xlarge', 'r5ad.large', 'r5ad.xlarge', 'r5b.12xlarge', 'r5b.16xlarge', 'r5b.24xlarge', 'r5b.2xlarge', 'r5b.4xlarge', 'r5b.8xlarge', 'r5b.large', 'r5b.metal', 'r5b.xlarge', 'r5d.12xlarge', 'r5d.16xlarge', 'r5d.24xlarge', 'r5d.2xlarge', 'r5d.4xlarge', 'r5d.8xlarge', 'r5d.large', 'r5d.metal', 'r5d.xlarge', 'r5dn.12xlarge', 'r5dn.16xlarge', 'r5dn.24xlarge', 'r5dn.2xlarge', 'r5dn.4xlarge', 'r5dn.8xlarge', 'r5dn.large', 'r5dn.metal', 'r5dn.xlarge', 'r5n.12xlarge', 'r5n.16xlarge', 'r5n.24xlarge', 'r5n.2xlarge', 'r5n.4xlarge', 'r5n.8xlarge', 'r5n.large', 'r5n.metal', 'r5n.xlarge', 'r6a.12xlarge', 'r6a.16xlarge', 'r6a.24xlarge', 'r6a.2xlarge', 'r6a.32xlarge', 'r6a.48xlarge', 'r6a.4xlarge', 'r6a.8xlarge', 'r6a.large', 'r6a.metal', 'r6a.xlarge', 'r6g.12xlarge', 'r6g.16xlarge', 'r6g.2xlarge', 'r6g.4xlarge', 'r6g.8xlarge', 'r6g.large', 'r6g.medium', 'r6g.metal', 'r6g.xlarge', 'r6gd.12xlarge', 'r6gd.16xlarge', 'r6gd.2xlarge', 'r6gd.4xlarge', 'r6gd.8xlarge', 'r6gd.large', 'r6gd.medium', 'r6gd.metal', 'r6gd.xlarge', 'r6i.12xlarge', 'r6i.16xlarge', 'r6i.24xlarge', 'r6i.2xlarge', 'r6i.32xlarge', 'r6i.4xlarge', 'r6i.8xlarge', 'r6i.large', 'r6i.metal', 'r6i.xlarge', 'r6id.12xlarge', 'r6id.16xlarge', 'r6id.24xlarge', 'r6id.2xlarge', 'r6id.32xlarge', 'r6id.4xlarge', 'r6id.8xlarge', 'r6id.large', 'r6id.metal', 'r6id.xlarge', 'r6idn.12xlarge', 'r6idn.16xlarge', 'r6idn.24xlarge', 'r6idn.2xlarge', 'r6idn.32xlarge', 'r6idn.4xlarge', 'r6idn.8xlarge', 'r6idn.large', 'r6idn.metal', 'r6idn.xlarge', 'r6in.12xlarge', 'r6in.16xlarge', 'r6in.24xlarge', 'r6in.2xlarge', 'r6in.32xlarge', 'r6in.4xlarge', 'r6in.8xlarge', 'r6in.large', 'r6in.metal', 'r6in.xlarge', 'r7a.12xlarge', 'r7a.16xlarge', 'r7a.24xlarge', 'r7a.2xlarge', 'r7a.32xlarge', 'r7a.48xlarge', 'r7a.4xlarge', 'r7a.8xlarge', 'r7a.large', 'r7a.medium', 'r7a.metal-48xl', 'r7a.xlarge', 'r7g.12xlarge', 'r7g.16xlarge', 'r7g.2xlarge', 'r7g.4xlarge', 'r7g.8xlarge', 'r7g.large', 'r7g.medium', 'r7g.metal', 'r7g.xlarge', 'r7gd.12xlarge', 'r7gd.16xlarge', 'r7gd.2xlarge', 'r7gd.4xlarge', 'r7gd.8xlarge', 'r7gd.large', 'r7gd.medium', 'r7gd.xlarge', 'r7i.12xlarge', 'r7i.16xlarge', 'r7i.24xlarge', 'r7i.2xlarge', 'r7i.48xlarge', 'r7i.4xlarge', 'r7i.8xlarge', 'r7i.large', 'r7i.metal-24xl', 'r7i.metal-48xl', 'r7i.xlarge', 'r7iz.12xlarge', 'r7iz.16xlarge', 'r7iz.2xlarge', 'r7iz.32xlarge', 'r7iz.4xlarge', 'r7iz.8xlarge', 'r7iz.large', 'r7iz.metal-16xl', 'r7iz.metal-32xl', 'r7iz.xlarge', 't2.2xlarge', 't2.large', 't2.medium', 't2.micro', 't2.nano', 't2.small', 't2.xlarge', 't3.2xlarge', 't3.large', 't3.medium', 't3.micro', 't3.nano', 't3.small', 't3.xlarge', 't3a.2xlarge', 't3a.large', 't3a.medium', 't3a.micro', 't3a.nano', 't3a.small', 't3a.xlarge', 't4g.2xlarge', 't4g.large', 't4g.medium', 't4g.micro', 't4g.nano', 't4g.small', 't4g.xlarge', 'trn1.2xlarge', 'trn1.32xlarge', 'trn1n.32xlarge', 'u-12tb1.112xlarge', 'u-3tb1.56xlarge', 'u-6tb1.112xlarge', 'u-6tb1.56xlarge', 'u-9tb1.112xlarge', 'x1.16xlarge', 'x1.32xlarge', 'x1e.16xlarge', 'x1e.2xlarge', 'x1e.32xlarge', 'x1e.4xlarge', 'x1e.8xlarge', 'x1e.xlarge', 'x2gd.12xlarge', 'x2gd.16xlarge', 'x2gd.2xlarge', 'x2gd.4xlarge', 'x2gd.8xlarge', 'x2gd.large', 'x2gd.medium', 'x2gd.metal', 'x2gd.xlarge', 'x2idn.16xlarge', 'x2idn.24xlarge', 'x2idn.32xlarge', 'x2idn.metal', 'x2iedn.16xlarge', 'x2iedn.24xlarge', 'x2iedn.2xlarge', 'x2iedn.32xlarge', 'x2iedn.4xlarge', 'x2iedn.8xlarge', 'x2iedn.metal', 'x2iedn.xlarge', 'z1d.12xlarge', 'z1d.2xlarge', 'z1d.3xlarge', 'z1d.6xlarge', 'z1d.large', 'z1d.metal', 'z1d.xlarge'], - 'us-gov-west-1': ['c1.medium', 'c1.xlarge', 'c3.2xlarge', 'c3.4xlarge', 'c3.8xlarge', 'c3.large', 'c3.xlarge', 'c4.2xlarge', 'c4.4xlarge', 'c4.8xlarge', 'c4.large', 'c4.xlarge', 'c5.12xlarge', 'c5.18xlarge', 'c5.24xlarge', 'c5.2xlarge', 'c5.4xlarge', 'c5.9xlarge', 'c5.large', 'c5.metal', 'c5.xlarge', 'c5a.12xlarge', 'c5a.16xlarge', 'c5a.24xlarge', 'c5a.2xlarge', 'c5a.4xlarge', 'c5a.8xlarge', 'c5a.large', 'c5a.xlarge', 'c5d.12xlarge', 'c5d.18xlarge', 'c5d.24xlarge', 'c5d.2xlarge', 'c5d.4xlarge', 'c5d.9xlarge', 'c5d.large', 'c5d.metal', 'c5d.xlarge', 'c5n.18xlarge', 'c5n.2xlarge', 'c5n.4xlarge', 'c5n.9xlarge', 'c5n.large', 'c5n.metal', 'c5n.xlarge', 'c6g.12xlarge', 'c6g.16xlarge', 'c6g.2xlarge', 'c6g.4xlarge', 'c6g.8xlarge', 'c6g.large', 'c6g.medium', 'c6g.metal', 'c6g.xlarge', 'c6gd.12xlarge', 'c6gd.16xlarge', 'c6gd.2xlarge', 'c6gd.4xlarge', 'c6gd.8xlarge', 'c6gd.large', 'c6gd.medium', 'c6gd.metal', 'c6gd.xlarge', 'c6gn.12xlarge', 'c6gn.16xlarge', 'c6gn.2xlarge', 'c6gn.4xlarge', 'c6gn.8xlarge', 'c6gn.large', 'c6gn.medium', 'c6gn.xlarge', 'c6i.12xlarge', 'c6i.16xlarge', 'c6i.24xlarge', 'c6i.2xlarge', 'c6i.32xlarge', 'c6i.4xlarge', 'c6i.8xlarge', 'c6i.large', 'c6i.metal', 'c6i.xlarge', 'c6id.12xlarge', 'c6id.16xlarge', 'c6id.24xlarge', 'c6id.2xlarge', 'c6id.32xlarge', 'c6id.4xlarge', 'c6id.8xlarge', 'c6id.large', 'c6id.metal', 'c6id.xlarge', 'c6in.12xlarge', 'c6in.16xlarge', 'c6in.24xlarge', 'c6in.2xlarge', 'c6in.32xlarge', 'c6in.4xlarge', 'c6in.8xlarge', 'c6in.large', 'c6in.metal', 'c6in.xlarge', 'cc2.8xlarge', 'd2.2xlarge', 'd2.4xlarge', 'd2.8xlarge', 'd2.xlarge', 'd3.2xlarge', 'd3.4xlarge', 'd3.8xlarge', 'd3.xlarge', 'f1.16xlarge', 'f1.2xlarge', 'f1.4xlarge', 'g3.16xlarge', 'g3.4xlarge', 'g3.8xlarge', 'g3s.xlarge', 'g4dn.12xlarge', 'g4dn.16xlarge', 'g4dn.2xlarge', 'g4dn.4xlarge', 'g4dn.8xlarge', 'g4dn.metal', 'g4dn.xlarge', 'hpc6a.48xlarge', 'hpc6id.32xlarge', 'hpc7a.12xlarge', 'hpc7a.24xlarge', 'hpc7a.48xlarge', 'hpc7a.96xlarge', 'hpc7g.16xlarge', 'hpc7g.4xlarge', 'hpc7g.8xlarge', 'hs1.8xlarge', 'i2.2xlarge', 'i2.4xlarge', 'i2.8xlarge', 'i2.xlarge', 'i3.16xlarge', 'i3.2xlarge', 'i3.4xlarge', 'i3.8xlarge', 'i3.large', 'i3.metal', 'i3.xlarge', 'i3en.12xlarge', 'i3en.24xlarge', 'i3en.2xlarge', 'i3en.3xlarge', 'i3en.6xlarge', 'i3en.large', 'i3en.metal', 'i3en.xlarge', 'i3p.16xlarge', 'i4i.12xlarge', 'i4i.16xlarge', 'i4i.24xlarge', 'i4i.2xlarge', 'i4i.32xlarge', 'i4i.4xlarge', 'i4i.8xlarge', 'i4i.large', 'i4i.metal', 'i4i.xlarge', 'inf1.24xlarge', 'inf1.2xlarge', 'inf1.6xlarge', 'inf1.xlarge', 'm1.large', 'm1.medium', 'm1.xlarge', 'm2.2xlarge', 'm2.4xlarge', 'm2.xlarge', 'm3.2xlarge', 'm3.large', 'm3.medium', 'm3.xlarge', 'm4.10xlarge', 'm4.16xlarge', 'm4.2xlarge', 'm4.4xlarge', 'm4.large', 'm4.xlarge', 'm5.12xlarge', 'm5.16xlarge', 'm5.24xlarge', 'm5.2xlarge', 'm5.4xlarge', 'm5.8xlarge', 'm5.large', 'm5.metal', 'm5.xlarge', 'm5a.12xlarge', 'm5a.16xlarge', 'm5a.24xlarge', 'm5a.2xlarge', 'm5a.4xlarge', 'm5a.8xlarge', 'm5a.large', 'm5a.xlarge', 'm5ad.12xlarge', 'm5ad.16xlarge', 'm5ad.24xlarge', 'm5ad.2xlarge', 'm5ad.4xlarge', 'm5ad.8xlarge', 'm5ad.large', 'm5ad.xlarge', 'm5d.12xlarge', 'm5d.16xlarge', 'm5d.24xlarge', 'm5d.2xlarge', 'm5d.4xlarge', 'm5d.8xlarge', 'm5d.large', 'm5d.metal', 'm5d.xlarge', 'm5dn.12xlarge', 'm5dn.16xlarge', 'm5dn.24xlarge', 'm5dn.2xlarge', 'm5dn.4xlarge', 'm5dn.8xlarge', 'm5dn.large', 'm5dn.metal', 'm5dn.xlarge', 'm5n.12xlarge', 'm5n.16xlarge', 'm5n.24xlarge', 'm5n.2xlarge', 'm5n.4xlarge', 'm5n.8xlarge', 'm5n.large', 'm5n.metal', 'm5n.xlarge', 'm6g.12xlarge', 'm6g.16xlarge', 'm6g.2xlarge', 'm6g.4xlarge', 'm6g.8xlarge', 'm6g.large', 'm6g.medium', 'm6g.metal', 'm6g.xlarge', 'm6i.12xlarge', 'm6i.16xlarge', 'm6i.24xlarge', 'm6i.2xlarge', 'm6i.32xlarge', 'm6i.4xlarge', 'm6i.8xlarge', 'm6i.large', 'm6i.metal', 'm6i.xlarge', 'm6id.12xlarge', 'm6id.16xlarge', 'm6id.24xlarge', 'm6id.2xlarge', 'm6id.32xlarge', 'm6id.4xlarge', 'm6id.8xlarge', 'm6id.large', 'm6id.metal', 'm6id.xlarge', 'm6idn.12xlarge', 'm6idn.16xlarge', 'm6idn.24xlarge', 'm6idn.2xlarge', 'm6idn.32xlarge', 'm6idn.4xlarge', 'm6idn.8xlarge', 'm6idn.large', 'm6idn.metal', 'm6idn.xlarge', 'm6in.12xlarge', 'm6in.16xlarge', 'm6in.24xlarge', 'm6in.2xlarge', 'm6in.32xlarge', 'm6in.4xlarge', 'm6in.8xlarge', 'm6in.large', 'm6in.metal', 'm6in.xlarge', 'p2.16xlarge', 'p2.8xlarge', 'p2.xlarge', 'p3.16xlarge', 'p3.2xlarge', 'p3.8xlarge', 'p3dn.24xlarge', 'p4d.24xlarge', 'r3.2xlarge', 'r3.4xlarge', 'r3.8xlarge', 'r3.large', 'r3.xlarge', 'r4.16xlarge', 'r4.2xlarge', 'r4.4xlarge', 'r4.8xlarge', 'r4.large', 'r4.xlarge', 'r5.12xlarge', 'r5.16xlarge', 'r5.24xlarge', 'r5.2xlarge', 'r5.4xlarge', 'r5.8xlarge', 'r5.large', 'r5.metal', 'r5.xlarge', 'r5a.12xlarge', 'r5a.16xlarge', 'r5a.24xlarge', 'r5a.2xlarge', 'r5a.4xlarge', 'r5a.8xlarge', 'r5a.large', 'r5a.xlarge', 'r5ad.12xlarge', 'r5ad.16xlarge', 'r5ad.24xlarge', 'r5ad.2xlarge', 'r5ad.4xlarge', 'r5ad.8xlarge', 'r5ad.large', 'r5ad.xlarge', 'r5d.12xlarge', 'r5d.16xlarge', 'r5d.24xlarge', 'r5d.2xlarge', 'r5d.4xlarge', 'r5d.8xlarge', 'r5d.large', 'r5d.metal', 'r5d.xlarge', 'r5dn.12xlarge', 'r5dn.16xlarge', 'r5dn.24xlarge', 'r5dn.2xlarge', 'r5dn.4xlarge', 'r5dn.8xlarge', 'r5dn.large', 'r5dn.metal', 'r5dn.xlarge', 'r5n.12xlarge', 'r5n.16xlarge', 'r5n.24xlarge', 'r5n.2xlarge', 'r5n.4xlarge', 'r5n.8xlarge', 'r5n.large', 'r5n.metal', 'r5n.xlarge', 'r6g.12xlarge', 'r6g.16xlarge', 'r6g.2xlarge', 'r6g.4xlarge', 'r6g.8xlarge', 'r6g.large', 'r6g.medium', 'r6g.metal', 'r6g.xlarge', 'r6gd.12xlarge', 'r6gd.16xlarge', 'r6gd.2xlarge', 'r6gd.4xlarge', 'r6gd.8xlarge', 'r6gd.large', 'r6gd.medium', 'r6gd.metal', 'r6gd.xlarge', 'r6i.12xlarge', 'r6i.16xlarge', 'r6i.24xlarge', 'r6i.2xlarge', 'r6i.32xlarge', 'r6i.4xlarge', 'r6i.8xlarge', 'r6i.large', 'r6i.metal', 'r6i.xlarge', 'r6id.12xlarge', 'r6id.16xlarge', 'r6id.24xlarge', 'r6id.2xlarge', 'r6id.32xlarge', 'r6id.4xlarge', 'r6id.8xlarge', 'r6id.large', 'r6id.metal', 'r6id.xlarge', 'r6idn.12xlarge', 'r6idn.16xlarge', 'r6idn.24xlarge', 'r6idn.2xlarge', 'r6idn.32xlarge', 'r6idn.4xlarge', 'r6idn.8xlarge', 'r6idn.large', 'r6idn.metal', 'r6idn.xlarge', 'r6in.12xlarge', 'r6in.16xlarge', 'r6in.24xlarge', 'r6in.2xlarge', 'r6in.32xlarge', 'r6in.4xlarge', 'r6in.8xlarge', 'r6in.large', 'r6in.metal', 'r6in.xlarge', 't1.micro', 't2.2xlarge', 't2.large', 't2.medium', 't2.micro', 't2.nano', 't2.small', 't2.xlarge', 't3.2xlarge', 't3.large', 't3.medium', 't3.micro', 't3.nano', 't3.small', 't3.xlarge', 't3a.2xlarge', 't3a.large', 't3a.medium', 't3a.micro', 't3a.nano', 't3a.small', 't3a.xlarge', 't4g.2xlarge', 't4g.large', 't4g.medium', 't4g.micro', 't4g.nano', 't4g.small', 't4g.xlarge', 'u-12tb1.112xlarge', 'u-24tb1.112xlarge', 'u-3tb1.56xlarge', 'u-6tb1.112xlarge', 'u-6tb1.56xlarge', 'u-9tb1.112xlarge', 'x1.16xlarge', 'x1.32xlarge', 'x1e.16xlarge', 'x1e.2xlarge', 'x1e.32xlarge', 'x1e.4xlarge', 'x1e.8xlarge', 'x1e.xlarge', 'x2idn.16xlarge', 'x2idn.24xlarge', 'x2idn.32xlarge', 'x2idn.metal', 'x2iedn.16xlarge', 'x2iedn.24xlarge', 'x2iedn.2xlarge', 'x2iedn.32xlarge', 'x2iedn.4xlarge', 'x2iedn.8xlarge', 'x2iedn.metal', 'x2iedn.xlarge'], - 'ca-central-1': ['c4.2xlarge', 'c4.4xlarge', 'c4.8xlarge', 'c4.large', 'c4.xlarge', 'c5.12xlarge', 'c5.18xlarge', 'c5.24xlarge', 'c5.2xlarge', 'c5.4xlarge', 'c5.9xlarge', 'c5.large', 'c5.metal', 'c5.xlarge', 'c5a.12xlarge', 'c5a.16xlarge', 'c5a.24xlarge', 'c5a.2xlarge', 'c5a.4xlarge', 'c5a.8xlarge', 'c5a.large', 'c5a.xlarge', 'c5d.12xlarge', 'c5d.18xlarge', 'c5d.24xlarge', 'c5d.2xlarge', 'c5d.4xlarge', 'c5d.9xlarge', 'c5d.large', 'c5d.metal', 'c5d.xlarge', 'c5n.18xlarge', 'c5n.2xlarge', 'c5n.4xlarge', 'c5n.9xlarge', 'c5n.large', 'c5n.metal', 'c5n.xlarge', 'c6a.12xlarge', 'c6a.16xlarge', 'c6a.24xlarge', 'c6a.2xlarge', 'c6a.32xlarge', 'c6a.48xlarge', 'c6a.4xlarge', 'c6a.8xlarge', 'c6a.large', 'c6a.metal', 'c6a.xlarge', 'c6g.12xlarge', 'c6g.16xlarge', 'c6g.2xlarge', 'c6g.4xlarge', 'c6g.8xlarge', 'c6g.large', 'c6g.medium', 'c6g.metal', 'c6g.xlarge', 'c6gd.12xlarge', 'c6gd.16xlarge', 'c6gd.2xlarge', 'c6gd.4xlarge', 'c6gd.8xlarge', 'c6gd.large', 'c6gd.medium', 'c6gd.metal', 'c6gd.xlarge', 'c6gn.12xlarge', 'c6gn.16xlarge', 'c6gn.2xlarge', 'c6gn.4xlarge', 'c6gn.8xlarge', 'c6gn.large', 'c6gn.medium', 'c6gn.metal', 'c6gn.xlarge', 'c6i.12xlarge', 'c6i.16xlarge', 'c6i.24xlarge', 'c6i.2xlarge', 'c6i.32xlarge', 'c6i.4xlarge', 'c6i.8xlarge', 'c6i.large', 'c6i.metal', 'c6i.xlarge', 'c6in.12xlarge', 'c6in.16xlarge', 'c6in.24xlarge', 'c6in.2xlarge', 'c6in.32xlarge', 'c6in.4xlarge', 'c6in.8xlarge', 'c6in.large', 'c6in.metal', 'c6in.xlarge', 'c7g.12xlarge', 'c7g.16xlarge', 'c7g.2xlarge', 'c7g.4xlarge', 'c7g.8xlarge', 'c7g.large', 'c7g.medium', 'c7g.metal', 'c7g.xlarge', 'c7i.12xlarge', 'c7i.16xlarge', 'c7i.24xlarge', 'c7i.2xlarge', 'c7i.48xlarge', 'c7i.4xlarge', 'c7i.8xlarge', 'c7i.large', 'c7i.metal-24xl', 'c7i.metal-48xl', 'c7i.xlarge', 'd2.2xlarge', 'd2.4xlarge', 'd2.8xlarge', 'd2.xlarge', 'd3.2xlarge', 'd3.4xlarge', 'd3.8xlarge', 'd3.xlarge', 'g3.16xlarge', 'g3.4xlarge', 'g3.8xlarge', 'g4ad.16xlarge', 'g4ad.2xlarge', 'g4ad.4xlarge', 'g4ad.8xlarge', 'g4ad.xlarge', 'g4dn.12xlarge', 'g4dn.16xlarge', 'g4dn.2xlarge', 'g4dn.4xlarge', 'g4dn.8xlarge', 'g4dn.metal', 'g4dn.xlarge', 'g5.12xlarge', 'g5.16xlarge', 'g5.24xlarge', 'g5.2xlarge', 'g5.48xlarge', 'g5.4xlarge', 'g5.8xlarge', 'g5.xlarge', 'i3.16xlarge', 'i3.2xlarge', 'i3.4xlarge', 'i3.8xlarge', 'i3.large', 'i3.metal', 'i3.xlarge', 'i3en.12xlarge', 'i3en.24xlarge', 'i3en.2xlarge', 'i3en.3xlarge', 'i3en.6xlarge', 'i3en.large', 'i3en.metal', 'i3en.xlarge', 'i4g.16xlarge', 'i4g.2xlarge', 'i4g.4xlarge', 'i4g.8xlarge', 'i4g.large', 'i4g.xlarge', 'i4i.12xlarge', 'i4i.16xlarge', 'i4i.24xlarge', 'i4i.2xlarge', 'i4i.32xlarge', 'i4i.4xlarge', 'i4i.8xlarge', 'i4i.large', 'i4i.metal', 'i4i.xlarge', 'im4gn.16xlarge', 'im4gn.2xlarge', 'im4gn.4xlarge', 'im4gn.8xlarge', 'im4gn.large', 'im4gn.xlarge', 'inf1.24xlarge', 'inf1.2xlarge', 'inf1.6xlarge', 'inf1.xlarge', 'is4gen.2xlarge', 'is4gen.4xlarge', 'is4gen.8xlarge', 'is4gen.large', 'is4gen.medium', 'is4gen.xlarge', 'm4.10xlarge', 'm4.16xlarge', 'm4.2xlarge', 'm4.4xlarge', 'm4.large', 'm4.xlarge', 'm5.12xlarge', 'm5.16xlarge', 'm5.24xlarge', 'm5.2xlarge', 'm5.4xlarge', 'm5.8xlarge', 'm5.large', 'm5.metal', 'm5.xlarge', 'm5a.12xlarge', 'm5a.16xlarge', 'm5a.24xlarge', 'm5a.2xlarge', 'm5a.4xlarge', 'm5a.8xlarge', 'm5a.large', 'm5a.xlarge', 'm5ad.12xlarge', 'm5ad.16xlarge', 'm5ad.24xlarge', 'm5ad.2xlarge', 'm5ad.4xlarge', 'm5ad.8xlarge', 'm5ad.large', 'm5ad.xlarge', 'm5d.12xlarge', 'm5d.16xlarge', 'm5d.24xlarge', 'm5d.2xlarge', 'm5d.4xlarge', 'm5d.8xlarge', 'm5d.large', 'm5d.metal', 'm5d.xlarge', 'm6a.12xlarge', 'm6a.16xlarge', 'm6a.24xlarge', 'm6a.2xlarge', 'm6a.32xlarge', 'm6a.48xlarge', 'm6a.4xlarge', 'm6a.8xlarge', 'm6a.large', 'm6a.metal', 'm6a.xlarge', 'm6g.12xlarge', 'm6g.16xlarge', 'm6g.2xlarge', 'm6g.4xlarge', 'm6g.8xlarge', 'm6g.large', 'm6g.medium', 'm6g.metal', 'm6g.xlarge', 'm6gd.12xlarge', 'm6gd.16xlarge', 'm6gd.2xlarge', 'm6gd.4xlarge', 'm6gd.8xlarge', 'm6gd.large', 'm6gd.medium', 'm6gd.metal', 'm6gd.xlarge', 'm6i.12xlarge', 'm6i.16xlarge', 'm6i.24xlarge', 'm6i.2xlarge', 'm6i.32xlarge', 'm6i.4xlarge', 'm6i.8xlarge', 'm6i.large', 'm6i.metal', 'm6i.xlarge', 'm6id.12xlarge', 'm6id.16xlarge', 'm6id.24xlarge', 'm6id.2xlarge', 'm6id.32xlarge', 'm6id.4xlarge', 'm6id.8xlarge', 'm6id.large', 'm6id.metal', 'm6id.xlarge', 'm7g.12xlarge', 'm7g.16xlarge', 'm7g.2xlarge', 'm7g.4xlarge', 'm7g.8xlarge', 'm7g.large', 'm7g.medium', 'm7g.metal', 'm7g.xlarge', 'm7i-flex.2xlarge', 'm7i-flex.4xlarge', 'm7i-flex.8xlarge', 'm7i-flex.large', 'm7i-flex.xlarge', 'm7i.12xlarge', 'm7i.16xlarge', 'm7i.24xlarge', 'm7i.2xlarge', 'm7i.48xlarge', 'm7i.4xlarge', 'm7i.8xlarge', 'm7i.large', 'm7i.metal-24xl', 'm7i.metal-48xl', 'm7i.xlarge', 'p3.16xlarge', 'p3.2xlarge', 'p3.8xlarge', 'r4.16xlarge', 'r4.2xlarge', 'r4.4xlarge', 'r4.8xlarge', 'r4.large', 'r4.xlarge', 'r5.12xlarge', 'r5.16xlarge', 'r5.24xlarge', 'r5.2xlarge', 'r5.4xlarge', 'r5.8xlarge', 'r5.large', 'r5.metal', 'r5.xlarge', 'r5a.12xlarge', 'r5a.16xlarge', 'r5a.24xlarge', 'r5a.2xlarge', 'r5a.4xlarge', 'r5a.8xlarge', 'r5a.large', 'r5a.xlarge', 'r5ad.12xlarge', 'r5ad.16xlarge', 'r5ad.24xlarge', 'r5ad.2xlarge', 'r5ad.4xlarge', 'r5ad.8xlarge', 'r5ad.large', 'r5ad.xlarge', 'r5b.12xlarge', 'r5b.16xlarge', 'r5b.24xlarge', 'r5b.2xlarge', 'r5b.4xlarge', 'r5b.8xlarge', 'r5b.large', 'r5b.metal', 'r5b.xlarge', 'r5d.12xlarge', 'r5d.16xlarge', 'r5d.24xlarge', 'r5d.2xlarge', 'r5d.4xlarge', 'r5d.8xlarge', 'r5d.large', 'r5d.metal', 'r5d.xlarge', 'r5n.12xlarge', 'r5n.16xlarge', 'r5n.24xlarge', 'r5n.2xlarge', 'r5n.4xlarge', 'r5n.8xlarge', 'r5n.large', 'r5n.metal', 'r5n.xlarge', 'r6g.12xlarge', 'r6g.16xlarge', 'r6g.2xlarge', 'r6g.4xlarge', 'r6g.8xlarge', 'r6g.large', 'r6g.medium', 'r6g.metal', 'r6g.xlarge', 'r6gd.12xlarge', 'r6gd.16xlarge', 'r6gd.2xlarge', 'r6gd.4xlarge', 'r6gd.8xlarge', 'r6gd.large', 'r6gd.medium', 'r6gd.metal', 'r6gd.xlarge', 'r6i.12xlarge', 'r6i.16xlarge', 'r6i.24xlarge', 'r6i.2xlarge', 'r6i.32xlarge', 'r6i.4xlarge', 'r6i.8xlarge', 'r6i.large', 'r6i.metal', 'r6i.xlarge', 'r7g.12xlarge', 'r7g.16xlarge', 'r7g.2xlarge', 'r7g.4xlarge', 'r7g.8xlarge', 'r7g.large', 'r7g.medium', 'r7g.metal', 'r7g.xlarge', 'r7i.12xlarge', 'r7i.16xlarge', 'r7i.24xlarge', 'r7i.2xlarge', 'r7i.48xlarge', 'r7i.4xlarge', 'r7i.8xlarge', 'r7i.large', 'r7i.metal-24xl', 'r7i.metal-48xl', 'r7i.xlarge', 't2.2xlarge', 't2.large', 't2.medium', 't2.micro', 't2.nano', 't2.small', 't2.xlarge', 't3.2xlarge', 't3.large', 't3.medium', 't3.micro', 't3.nano', 't3.small', 't3.xlarge', 't3a.2xlarge', 't3a.large', 't3a.medium', 't3a.micro', 't3a.nano', 't3a.small', 't3a.xlarge', 't4g.2xlarge', 't4g.large', 't4g.medium', 't4g.micro', 't4g.nano', 't4g.small', 't4g.xlarge', 'u-3tb1.56xlarge', 'u-6tb1.112xlarge', 'u-6tb1.56xlarge', 'x1.16xlarge', 'x1.32xlarge', 'x1e.16xlarge', 'x1e.2xlarge', 'x1e.32xlarge', 'x1e.4xlarge', 'x1e.8xlarge', 'x1e.xlarge', 'x2idn.16xlarge', 'x2idn.24xlarge', 'x2idn.32xlarge', 'x2idn.metal', 'x2iedn.16xlarge', 'x2iedn.24xlarge', 'x2iedn.2xlarge', 'x2iedn.32xlarge', 'x2iedn.4xlarge', 'x2iedn.8xlarge', 'x2iedn.metal', 'x2iedn.xlarge'], - 'ap-northeast-1': ['a1.2xlarge', 'a1.4xlarge', 'a1.large', 'a1.medium', 'a1.metal', 'a1.xlarge', 'c1.medium', 'c1.xlarge', 'c3.2xlarge', 'c3.4xlarge', 'c3.8xlarge', 'c3.large', 'c3.xlarge', 'c4.2xlarge', 'c4.4xlarge', 'c4.8xlarge', 'c4.large', 'c4.xlarge', 'c5.12xlarge', 'c5.18xlarge', 'c5.24xlarge', 'c5.2xlarge', 'c5.4xlarge', 'c5.9xlarge', 'c5.large', 'c5.metal', 'c5.xlarge', 'c5a.12xlarge', 'c5a.16xlarge', 'c5a.24xlarge', 'c5a.2xlarge', 'c5a.4xlarge', 'c5a.8xlarge', 'c5a.large', 'c5a.xlarge', 'c5d.12xlarge', 'c5d.18xlarge', 'c5d.24xlarge', 'c5d.2xlarge', 'c5d.4xlarge', 'c5d.9xlarge', 'c5d.large', 'c5d.metal', 'c5d.xlarge', 'c5n.18xlarge', 'c5n.2xlarge', 'c5n.4xlarge', 'c5n.9xlarge', 'c5n.large', 'c5n.metal', 'c5n.xlarge', 'c6a.12xlarge', 'c6a.16xlarge', 'c6a.24xlarge', 'c6a.2xlarge', 'c6a.32xlarge', 'c6a.48xlarge', 'c6a.4xlarge', 'c6a.8xlarge', 'c6a.large', 'c6a.metal', 'c6a.xlarge', 'c6g.12xlarge', 'c6g.16xlarge', 'c6g.2xlarge', 'c6g.4xlarge', 'c6g.8xlarge', 'c6g.large', 'c6g.medium', 'c6g.metal', 'c6g.xlarge', 'c6gd.12xlarge', 'c6gd.16xlarge', 'c6gd.2xlarge', 'c6gd.4xlarge', 'c6gd.8xlarge', 'c6gd.large', 'c6gd.medium', 'c6gd.metal', 'c6gd.xlarge', 'c6gn.12xlarge', 'c6gn.16xlarge', 'c6gn.2xlarge', 'c6gn.4xlarge', 'c6gn.8xlarge', 'c6gn.large', 'c6gn.medium', 'c6gn.xlarge', 'c6i.12xlarge', 'c6i.16xlarge', 'c6i.24xlarge', 'c6i.2xlarge', 'c6i.32xlarge', 'c6i.4xlarge', 'c6i.8xlarge', 'c6i.large', 'c6i.metal', 'c6i.xlarge', 'c6id.12xlarge', 'c6id.16xlarge', 'c6id.24xlarge', 'c6id.2xlarge', 'c6id.32xlarge', 'c6id.4xlarge', 'c6id.8xlarge', 'c6id.large', 'c6id.metal', 'c6id.xlarge', 'c6in.12xlarge', 'c6in.16xlarge', 'c6in.24xlarge', 'c6in.2xlarge', 'c6in.32xlarge', 'c6in.4xlarge', 'c6in.8xlarge', 'c6in.large', 'c6in.metal', 'c6in.xlarge', 'c7g.12xlarge', 'c7g.16xlarge', 'c7g.2xlarge', 'c7g.4xlarge', 'c7g.8xlarge', 'c7g.large', 'c7g.medium', 'c7g.metal', 'c7g.xlarge', 'c7gd.12xlarge', 'c7gd.16xlarge', 'c7gd.2xlarge', 'c7gd.4xlarge', 'c7gd.8xlarge', 'c7gd.large', 'c7gd.medium', 'c7gd.xlarge', 'c7gn.12xlarge', 'c7gn.16xlarge', 'c7gn.2xlarge', 'c7gn.4xlarge', 'c7gn.8xlarge', 'c7gn.large', 'c7gn.medium', 'c7gn.xlarge', 'c7i.12xlarge', 'c7i.16xlarge', 'c7i.24xlarge', 'c7i.2xlarge', 'c7i.48xlarge', 'c7i.4xlarge', 'c7i.8xlarge', 'c7i.large', 'c7i.metal-24xl', 'c7i.metal-48xl', 'c7i.xlarge', 'cr1.8xlarge', 'd2.2xlarge', 'd2.4xlarge', 'd2.8xlarge', 'd2.xlarge', 'd3.2xlarge', 'd3.4xlarge', 'd3.8xlarge', 'd3.xlarge', 'd3en.12xlarge', 'd3en.2xlarge', 'd3en.4xlarge', 'd3en.6xlarge', 'd3en.8xlarge', 'd3en.xlarge', 'g2.2xlarge', 'g2.8xlarge', 'g3.16xlarge', 'g3.4xlarge', 'g3.8xlarge', 'g3s.xlarge', 'g4ad.16xlarge', 'g4ad.2xlarge', 'g4ad.4xlarge', 'g4ad.8xlarge', 'g4ad.xlarge', 'g4dn.12xlarge', 'g4dn.16xlarge', 'g4dn.2xlarge', 'g4dn.4xlarge', 'g4dn.8xlarge', 'g4dn.metal', 'g4dn.xlarge', 'g5.12xlarge', 'g5.16xlarge', 'g5.24xlarge', 'g5.2xlarge', 'g5.48xlarge', 'g5.4xlarge', 'g5.8xlarge', 'g5.xlarge', 'g5g.16xlarge', 'g5g.2xlarge', 'g5g.4xlarge', 'g5g.8xlarge', 'g5g.metal', 'g5g.xlarge', 'hpc7g.16xlarge', 'hpc7g.4xlarge', 'hpc7g.8xlarge', 'i2.2xlarge', 'i2.4xlarge', 'i2.8xlarge', 'i2.xlarge', 'i3.16xlarge', 'i3.2xlarge', 'i3.4xlarge', 'i3.8xlarge', 'i3.large', 'i3.metal', 'i3.xlarge', 'i3en.12xlarge', 'i3en.24xlarge', 'i3en.2xlarge', 'i3en.3xlarge', 'i3en.6xlarge', 'i3en.large', 'i3en.metal', 'i3en.xlarge', 'i4i.12xlarge', 'i4i.16xlarge', 'i4i.24xlarge', 'i4i.2xlarge', 'i4i.32xlarge', 'i4i.4xlarge', 'i4i.8xlarge', 'i4i.large', 'i4i.metal', 'i4i.xlarge', 'im4gn.16xlarge', 'im4gn.2xlarge', 'im4gn.4xlarge', 'im4gn.8xlarge', 'im4gn.large', 'im4gn.xlarge', 'inf1.24xlarge', 'inf1.2xlarge', 'inf1.6xlarge', 'inf1.xlarge', 'inf2.24xlarge', 'inf2.48xlarge', 'inf2.8xlarge', 'inf2.xlarge', 'is4gen.2xlarge', 'is4gen.4xlarge', 'is4gen.8xlarge', 'is4gen.large', 'is4gen.medium', 'is4gen.xlarge', 'm1.large', 'm1.medium', 'm1.xlarge', 'm2.2xlarge', 'm2.4xlarge', 'm2.xlarge', 'm3.2xlarge', 'm3.large', 'm3.medium', 'm3.xlarge', 'm4.10xlarge', 'm4.16xlarge', 'm4.2xlarge', 'm4.4xlarge', 'm4.large', 'm4.xlarge', 'm5.12xlarge', 'm5.16xlarge', 'm5.24xlarge', 'm5.2xlarge', 'm5.4xlarge', 'm5.8xlarge', 'm5.large', 'm5.metal', 'm5.xlarge', 'm5a.12xlarge', 'm5a.16xlarge', 'm5a.24xlarge', 'm5a.2xlarge', 'm5a.4xlarge', 'm5a.8xlarge', 'm5a.large', 'm5a.xlarge', 'm5ad.12xlarge', 'm5ad.16xlarge', 'm5ad.24xlarge', 'm5ad.2xlarge', 'm5ad.4xlarge', 'm5ad.8xlarge', 'm5ad.large', 'm5ad.xlarge', 'm5d.12xlarge', 'm5d.16xlarge', 'm5d.24xlarge', 'm5d.2xlarge', 'm5d.4xlarge', 'm5d.8xlarge', 'm5d.large', 'm5d.metal', 'm5d.xlarge', 'm5dn.12xlarge', 'm5dn.16xlarge', 'm5dn.24xlarge', 'm5dn.2xlarge', 'm5dn.4xlarge', 'm5dn.8xlarge', 'm5dn.large', 'm5dn.metal', 'm5dn.xlarge', 'm5n.12xlarge', 'm5n.16xlarge', 'm5n.24xlarge', 'm5n.2xlarge', 'm5n.4xlarge', 'm5n.8xlarge', 'm5n.large', 'm5n.metal', 'm5n.xlarge', 'm5zn.12xlarge', 'm5zn.2xlarge', 'm5zn.3xlarge', 'm5zn.6xlarge', 'm5zn.large', 'm5zn.metal', 'm5zn.xlarge', 'm6a.12xlarge', 'm6a.16xlarge', 'm6a.24xlarge', 'm6a.2xlarge', 'm6a.32xlarge', 'm6a.48xlarge', 'm6a.4xlarge', 'm6a.8xlarge', 'm6a.large', 'm6a.metal', 'm6a.xlarge', 'm6g.12xlarge', 'm6g.16xlarge', 'm6g.2xlarge', 'm6g.4xlarge', 'm6g.8xlarge', 'm6g.large', 'm6g.medium', 'm6g.metal', 'm6g.xlarge', 'm6gd.12xlarge', 'm6gd.16xlarge', 'm6gd.2xlarge', 'm6gd.4xlarge', 'm6gd.8xlarge', 'm6gd.large', 'm6gd.medium', 'm6gd.metal', 'm6gd.xlarge', 'm6i.12xlarge', 'm6i.16xlarge', 'm6i.24xlarge', 'm6i.2xlarge', 'm6i.32xlarge', 'm6i.4xlarge', 'm6i.8xlarge', 'm6i.large', 'm6i.metal', 'm6i.xlarge', 'm6id.12xlarge', 'm6id.16xlarge', 'm6id.24xlarge', 'm6id.2xlarge', 'm6id.32xlarge', 'm6id.4xlarge', 'm6id.8xlarge', 'm6id.large', 'm6id.metal', 'm6id.xlarge', 'm6idn.12xlarge', 'm6idn.16xlarge', 'm6idn.24xlarge', 'm6idn.2xlarge', 'm6idn.32xlarge', 'm6idn.4xlarge', 'm6idn.8xlarge', 'm6idn.large', 'm6idn.metal', 'm6idn.xlarge', 'm6in.12xlarge', 'm6in.16xlarge', 'm6in.24xlarge', 'm6in.2xlarge', 'm6in.32xlarge', 'm6in.4xlarge', 'm6in.8xlarge', 'm6in.large', 'm6in.metal', 'm6in.xlarge', 'm7a.12xlarge', 'm7a.16xlarge', 'm7a.24xlarge', 'm7a.2xlarge', 'm7a.32xlarge', 'm7a.48xlarge', 'm7a.4xlarge', 'm7a.8xlarge', 'm7a.large', 'm7a.medium', 'm7a.metal-48xl', 'm7a.xlarge', 'm7g.12xlarge', 'm7g.16xlarge', 'm7g.2xlarge', 'm7g.4xlarge', 'm7g.8xlarge', 'm7g.large', 'm7g.medium', 'm7g.metal', 'm7g.xlarge', 'm7gd.12xlarge', 'm7gd.16xlarge', 'm7gd.2xlarge', 'm7gd.4xlarge', 'm7gd.8xlarge', 'm7gd.large', 'm7gd.medium', 'm7gd.xlarge', 'm7i-flex.2xlarge', 'm7i-flex.4xlarge', 'm7i-flex.8xlarge', 'm7i-flex.large', 'm7i-flex.xlarge', 'm7i.12xlarge', 'm7i.16xlarge', 'm7i.24xlarge', 'm7i.2xlarge', 'm7i.48xlarge', 'm7i.4xlarge', 'm7i.8xlarge', 'm7i.large', 'm7i.metal-24xl', 'm7i.metal-48xl', 'm7i.xlarge', 'p2.16xlarge', 'p2.8xlarge', 'p2.xlarge', 'p3.16xlarge', 'p3.2xlarge', 'p3.8xlarge', 'p3dn.24xlarge', 'p4d.24xlarge', 'r3.2xlarge', 'r3.4xlarge', 'r3.8xlarge', 'r3.large', 'r3.xlarge', 'r4.16xlarge', 'r4.2xlarge', 'r4.4xlarge', 'r4.8xlarge', 'r4.large', 'r4.xlarge', 'r5.12xlarge', 'r5.16xlarge', 'r5.24xlarge', 'r5.2xlarge', 'r5.4xlarge', 'r5.8xlarge', 'r5.large', 'r5.metal', 'r5.xlarge', 'r5a.12xlarge', 'r5a.16xlarge', 'r5a.24xlarge', 'r5a.2xlarge', 'r5a.4xlarge', 'r5a.8xlarge', 'r5a.large', 'r5a.xlarge', 'r5ad.12xlarge', 'r5ad.16xlarge', 'r5ad.24xlarge', 'r5ad.2xlarge', 'r5ad.4xlarge', 'r5ad.8xlarge', 'r5ad.large', 'r5ad.xlarge', 'r5b.12xlarge', 'r5b.16xlarge', 'r5b.24xlarge', 'r5b.2xlarge', 'r5b.4xlarge', 'r5b.8xlarge', 'r5b.large', 'r5b.metal', 'r5b.xlarge', 'r5d.12xlarge', 'r5d.16xlarge', 'r5d.24xlarge', 'r5d.2xlarge', 'r5d.4xlarge', 'r5d.8xlarge', 'r5d.large', 'r5d.metal', 'r5d.xlarge', 'r5dn.12xlarge', 'r5dn.16xlarge', 'r5dn.24xlarge', 'r5dn.2xlarge', 'r5dn.4xlarge', 'r5dn.8xlarge', 'r5dn.large', 'r5dn.metal', 'r5dn.xlarge', 'r5n.12xlarge', 'r5n.16xlarge', 'r5n.24xlarge', 'r5n.2xlarge', 'r5n.4xlarge', 'r5n.8xlarge', 'r5n.large', 'r5n.metal', 'r5n.xlarge', 'r6a.12xlarge', 'r6a.16xlarge', 'r6a.24xlarge', 'r6a.2xlarge', 'r6a.32xlarge', 'r6a.48xlarge', 'r6a.4xlarge', 'r6a.8xlarge', 'r6a.large', 'r6a.metal', 'r6a.xlarge', 'r6g.12xlarge', 'r6g.16xlarge', 'r6g.2xlarge', 'r6g.4xlarge', 'r6g.8xlarge', 'r6g.large', 'r6g.medium', 'r6g.metal', 'r6g.xlarge', 'r6gd.12xlarge', 'r6gd.16xlarge', 'r6gd.2xlarge', 'r6gd.4xlarge', 'r6gd.8xlarge', 'r6gd.large', 'r6gd.medium', 'r6gd.metal', 'r6gd.xlarge', 'r6i.12xlarge', 'r6i.16xlarge', 'r6i.24xlarge', 'r6i.2xlarge', 'r6i.32xlarge', 'r6i.4xlarge', 'r6i.8xlarge', 'r6i.large', 'r6i.metal', 'r6i.xlarge', 'r6id.12xlarge', 'r6id.16xlarge', 'r6id.24xlarge', 'r6id.2xlarge', 'r6id.32xlarge', 'r6id.4xlarge', 'r6id.8xlarge', 'r6id.large', 'r6id.metal', 'r6id.xlarge', 'r6idn.12xlarge', 'r6idn.16xlarge', 'r6idn.24xlarge', 'r6idn.2xlarge', 'r6idn.32xlarge', 'r6idn.4xlarge', 'r6idn.8xlarge', 'r6idn.large', 'r6idn.metal', 'r6idn.xlarge', 'r6in.12xlarge', 'r6in.16xlarge', 'r6in.24xlarge', 'r6in.2xlarge', 'r6in.32xlarge', 'r6in.4xlarge', 'r6in.8xlarge', 'r6in.large', 'r6in.metal', 'r6in.xlarge', 'r7a.12xlarge', 'r7a.16xlarge', 'r7a.24xlarge', 'r7a.2xlarge', 'r7a.32xlarge', 'r7a.48xlarge', 'r7a.4xlarge', 'r7a.8xlarge', 'r7a.large', 'r7a.medium', 'r7a.metal-48xl', 'r7a.xlarge', 'r7g.12xlarge', 'r7g.16xlarge', 'r7g.2xlarge', 'r7g.4xlarge', 'r7g.8xlarge', 'r7g.large', 'r7g.medium', 'r7g.metal', 'r7g.xlarge', 'r7gd.12xlarge', 'r7gd.16xlarge', 'r7gd.2xlarge', 'r7gd.4xlarge', 'r7gd.8xlarge', 'r7gd.large', 'r7gd.medium', 'r7gd.xlarge', 'r7i.12xlarge', 'r7i.16xlarge', 'r7i.24xlarge', 'r7i.2xlarge', 'r7i.48xlarge', 'r7i.4xlarge', 'r7i.8xlarge', 'r7i.large', 'r7i.metal-24xl', 'r7i.metal-48xl', 'r7i.xlarge', 'r7iz.12xlarge', 'r7iz.16xlarge', 'r7iz.2xlarge', 'r7iz.32xlarge', 'r7iz.4xlarge', 'r7iz.8xlarge', 'r7iz.large', 'r7iz.metal-16xl', 'r7iz.metal-32xl', 'r7iz.xlarge', 't1.micro', 't2.2xlarge', 't2.large', 't2.medium', 't2.micro', 't2.nano', 't2.small', 't2.xlarge', 't3.2xlarge', 't3.large', 't3.medium', 't3.micro', 't3.nano', 't3.small', 't3.xlarge', 't3a.2xlarge', 't3a.large', 't3a.medium', 't3a.micro', 't3a.nano', 't3a.small', 't3a.xlarge', 't4g.2xlarge', 't4g.large', 't4g.medium', 't4g.micro', 't4g.nano', 't4g.small', 't4g.xlarge', 'u-12tb1.112xlarge', 'u-3tb1.56xlarge', 'u-6tb1.112xlarge', 'u-6tb1.56xlarge', 'u-9tb1.112xlarge', 'vt1.24xlarge', 'vt1.3xlarge', 'vt1.6xlarge', 'x1.16xlarge', 'x1.32xlarge', 'x1e.16xlarge', 'x1e.2xlarge', 'x1e.32xlarge', 'x1e.4xlarge', 'x1e.8xlarge', 'x1e.xlarge', 'x2idn.16xlarge', 'x2idn.24xlarge', 'x2idn.32xlarge', 'x2idn.metal', 'x2iedn.16xlarge', 'x2iedn.24xlarge', 'x2iedn.2xlarge', 'x2iedn.32xlarge', 'x2iedn.4xlarge', 'x2iedn.8xlarge', 'x2iedn.metal', 'x2iedn.xlarge', 'x2iezn.12xlarge', 'x2iezn.2xlarge', 'x2iezn.4xlarge', 'x2iezn.6xlarge', 'x2iezn.8xlarge', 'x2iezn.metal', 'z1d.12xlarge', 'z1d.2xlarge', 'z1d.3xlarge', 'z1d.6xlarge', 'z1d.large', 'z1d.metal', 'z1d.xlarge'], - 'ap-northeast-2': ['c3.2xlarge', 'c3.4xlarge', 'c3.8xlarge', 'c3.large', 'c3.xlarge', 'c4.2xlarge', 'c4.4xlarge', 'c4.8xlarge', 'c4.large', 'c4.xlarge', 'c5.12xlarge', 'c5.18xlarge', 'c5.24xlarge', 'c5.2xlarge', 'c5.4xlarge', 'c5.9xlarge', 'c5.large', 'c5.metal', 'c5.xlarge', 'c5a.12xlarge', 'c5a.16xlarge', 'c5a.24xlarge', 'c5a.2xlarge', 'c5a.4xlarge', 'c5a.8xlarge', 'c5a.large', 'c5a.xlarge', 'c5d.12xlarge', 'c5d.18xlarge', 'c5d.24xlarge', 'c5d.2xlarge', 'c5d.4xlarge', 'c5d.9xlarge', 'c5d.large', 'c5d.metal', 'c5d.xlarge', 'c5n.18xlarge', 'c5n.2xlarge', 'c5n.4xlarge', 'c5n.9xlarge', 'c5n.large', 'c5n.metal', 'c5n.xlarge', 'c6g.12xlarge', 'c6g.16xlarge', 'c6g.2xlarge', 'c6g.4xlarge', 'c6g.8xlarge', 'c6g.large', 'c6g.medium', 'c6g.metal', 'c6g.xlarge', 'c6gd.12xlarge', 'c6gd.16xlarge', 'c6gd.2xlarge', 'c6gd.4xlarge', 'c6gd.8xlarge', 'c6gd.large', 'c6gd.medium', 'c6gd.metal', 'c6gd.xlarge', 'c6gn.12xlarge', 'c6gn.16xlarge', 'c6gn.2xlarge', 'c6gn.4xlarge', 'c6gn.8xlarge', 'c6gn.large', 'c6gn.medium', 'c6gn.xlarge', 'c6i.12xlarge', 'c6i.16xlarge', 'c6i.24xlarge', 'c6i.2xlarge', 'c6i.32xlarge', 'c6i.4xlarge', 'c6i.8xlarge', 'c6i.large', 'c6i.metal', 'c6i.xlarge', 'c6id.12xlarge', 'c6id.16xlarge', 'c6id.24xlarge', 'c6id.2xlarge', 'c6id.32xlarge', 'c6id.4xlarge', 'c6id.8xlarge', 'c6id.large', 'c6id.metal', 'c6id.xlarge', 'c6in.12xlarge', 'c6in.16xlarge', 'c6in.24xlarge', 'c6in.2xlarge', 'c6in.32xlarge', 'c6in.4xlarge', 'c6in.8xlarge', 'c6in.large', 'c6in.metal', 'c6in.xlarge', 'c7g.12xlarge', 'c7g.16xlarge', 'c7g.2xlarge', 'c7g.4xlarge', 'c7g.8xlarge', 'c7g.large', 'c7g.medium', 'c7g.metal', 'c7g.xlarge', 'c7i.12xlarge', 'c7i.16xlarge', 'c7i.24xlarge', 'c7i.2xlarge', 'c7i.48xlarge', 'c7i.4xlarge', 'c7i.8xlarge', 'c7i.large', 'c7i.metal-24xl', 'c7i.metal-48xl', 'c7i.xlarge', 'd2.2xlarge', 'd2.4xlarge', 'd2.8xlarge', 'd2.xlarge', 'g2.2xlarge', 'g2.8xlarge', 'g3.16xlarge', 'g3.4xlarge', 'g3.8xlarge', 'g3s.xlarge', 'g4dn.12xlarge', 'g4dn.16xlarge', 'g4dn.2xlarge', 'g4dn.4xlarge', 'g4dn.8xlarge', 'g4dn.metal', 'g4dn.xlarge', 'g5.12xlarge', 'g5.16xlarge', 'g5.24xlarge', 'g5.2xlarge', 'g5.48xlarge', 'g5.4xlarge', 'g5.8xlarge', 'g5.xlarge', 'g5g.16xlarge', 'g5g.2xlarge', 'g5g.4xlarge', 'g5g.8xlarge', 'g5g.metal', 'g5g.xlarge', 'i2.2xlarge', 'i2.4xlarge', 'i2.8xlarge', 'i2.xlarge', 'i3.16xlarge', 'i3.2xlarge', 'i3.4xlarge', 'i3.8xlarge', 'i3.large', 'i3.metal', 'i3.xlarge', 'i3en.12xlarge', 'i3en.24xlarge', 'i3en.2xlarge', 'i3en.3xlarge', 'i3en.6xlarge', 'i3en.large', 'i3en.metal', 'i3en.xlarge', 'i4i.12xlarge', 'i4i.16xlarge', 'i4i.24xlarge', 'i4i.2xlarge', 'i4i.32xlarge', 'i4i.4xlarge', 'i4i.8xlarge', 'i4i.large', 'i4i.metal', 'i4i.xlarge', 'inf1.24xlarge', 'inf1.2xlarge', 'inf1.6xlarge', 'inf1.xlarge', 'm3.2xlarge', 'm3.large', 'm3.medium', 'm3.xlarge', 'm4.10xlarge', 'm4.16xlarge', 'm4.2xlarge', 'm4.4xlarge', 'm4.large', 'm4.xlarge', 'm5.12xlarge', 'm5.16xlarge', 'm5.24xlarge', 'm5.2xlarge', 'm5.4xlarge', 'm5.8xlarge', 'm5.large', 'm5.metal', 'm5.xlarge', 'm5a.12xlarge', 'm5a.16xlarge', 'm5a.24xlarge', 'm5a.2xlarge', 'm5a.4xlarge', 'm5a.8xlarge', 'm5a.large', 'm5a.xlarge', 'm5ad.12xlarge', 'm5ad.16xlarge', 'm5ad.24xlarge', 'm5ad.2xlarge', 'm5ad.4xlarge', 'm5ad.8xlarge', 'm5ad.large', 'm5ad.xlarge', 'm5d.12xlarge', 'm5d.16xlarge', 'm5d.24xlarge', 'm5d.2xlarge', 'm5d.4xlarge', 'm5d.8xlarge', 'm5d.large', 'm5d.metal', 'm5d.xlarge', 'm5zn.12xlarge', 'm5zn.2xlarge', 'm5zn.3xlarge', 'm5zn.6xlarge', 'm5zn.large', 'm5zn.metal', 'm5zn.xlarge', 'm6g.12xlarge', 'm6g.16xlarge', 'm6g.2xlarge', 'm6g.4xlarge', 'm6g.8xlarge', 'm6g.large', 'm6g.medium', 'm6g.metal', 'm6g.xlarge', 'm6gd.12xlarge', 'm6gd.16xlarge', 'm6gd.2xlarge', 'm6gd.4xlarge', 'm6gd.8xlarge', 'm6gd.large', 'm6gd.medium', 'm6gd.metal', 'm6gd.xlarge', 'm6i.12xlarge', 'm6i.16xlarge', 'm6i.24xlarge', 'm6i.2xlarge', 'm6i.32xlarge', 'm6i.4xlarge', 'm6i.8xlarge', 'm6i.large', 'm6i.metal', 'm6i.xlarge', 'm6id.12xlarge', 'm6id.16xlarge', 'm6id.24xlarge', 'm6id.2xlarge', 'm6id.32xlarge', 'm6id.4xlarge', 'm6id.8xlarge', 'm6id.large', 'm6id.metal', 'm6id.xlarge', 'm7g.12xlarge', 'm7g.16xlarge', 'm7g.2xlarge', 'm7g.4xlarge', 'm7g.8xlarge', 'm7g.large', 'm7g.medium', 'm7g.metal', 'm7g.xlarge', 'm7i-flex.2xlarge', 'm7i-flex.4xlarge', 'm7i-flex.8xlarge', 'm7i-flex.large', 'm7i-flex.xlarge', 'm7i.12xlarge', 'm7i.16xlarge', 'm7i.24xlarge', 'm7i.2xlarge', 'm7i.48xlarge', 'm7i.4xlarge', 'm7i.8xlarge', 'm7i.large', 'm7i.metal-24xl', 'm7i.metal-48xl', 'm7i.xlarge', 'p2.16xlarge', 'p2.8xlarge', 'p2.xlarge', 'p3.16xlarge', 'p3.2xlarge', 'p3.8xlarge', 'p4d.24xlarge', 'r3.2xlarge', 'r3.4xlarge', 'r3.8xlarge', 'r3.large', 'r3.xlarge', 'r4.16xlarge', 'r4.2xlarge', 'r4.4xlarge', 'r4.8xlarge', 'r4.large', 'r4.xlarge', 'r5.12xlarge', 'r5.16xlarge', 'r5.24xlarge', 'r5.2xlarge', 'r5.4xlarge', 'r5.8xlarge', 'r5.large', 'r5.metal', 'r5.xlarge', 'r5a.12xlarge', 'r5a.16xlarge', 'r5a.24xlarge', 'r5a.2xlarge', 'r5a.4xlarge', 'r5a.8xlarge', 'r5a.large', 'r5a.xlarge', 'r5ad.12xlarge', 'r5ad.16xlarge', 'r5ad.24xlarge', 'r5ad.2xlarge', 'r5ad.4xlarge', 'r5ad.8xlarge', 'r5ad.large', 'r5ad.xlarge', 'r5b.12xlarge', 'r5b.16xlarge', 'r5b.24xlarge', 'r5b.2xlarge', 'r5b.4xlarge', 'r5b.8xlarge', 'r5b.large', 'r5b.metal', 'r5b.xlarge', 'r5d.12xlarge', 'r5d.16xlarge', 'r5d.24xlarge', 'r5d.2xlarge', 'r5d.4xlarge', 'r5d.8xlarge', 'r5d.large', 'r5d.metal', 'r5d.xlarge', 'r5dn.12xlarge', 'r5dn.16xlarge', 'r5dn.24xlarge', 'r5dn.2xlarge', 'r5dn.4xlarge', 'r5dn.8xlarge', 'r5dn.large', 'r5dn.metal', 'r5dn.xlarge', 'r5n.12xlarge', 'r5n.16xlarge', 'r5n.24xlarge', 'r5n.2xlarge', 'r5n.4xlarge', 'r5n.8xlarge', 'r5n.large', 'r5n.metal', 'r5n.xlarge', 'r6g.12xlarge', 'r6g.16xlarge', 'r6g.2xlarge', 'r6g.4xlarge', 'r6g.8xlarge', 'r6g.large', 'r6g.medium', 'r6g.metal', 'r6g.xlarge', 'r6gd.12xlarge', 'r6gd.16xlarge', 'r6gd.2xlarge', 'r6gd.4xlarge', 'r6gd.8xlarge', 'r6gd.large', 'r6gd.medium', 'r6gd.metal', 'r6gd.xlarge', 'r6i.12xlarge', 'r6i.16xlarge', 'r6i.24xlarge', 'r6i.2xlarge', 'r6i.32xlarge', 'r6i.4xlarge', 'r6i.8xlarge', 'r6i.large', 'r6i.metal', 'r6i.xlarge', 'r6id.12xlarge', 'r6id.16xlarge', 'r6id.24xlarge', 'r6id.2xlarge', 'r6id.32xlarge', 'r6id.4xlarge', 'r6id.8xlarge', 'r6id.large', 'r6id.metal', 'r6id.xlarge', 'r7g.12xlarge', 'r7g.16xlarge', 'r7g.2xlarge', 'r7g.4xlarge', 'r7g.8xlarge', 'r7g.large', 'r7g.medium', 'r7g.metal', 'r7g.xlarge', 'r7i.12xlarge', 'r7i.16xlarge', 'r7i.24xlarge', 'r7i.2xlarge', 'r7i.48xlarge', 'r7i.4xlarge', 'r7i.8xlarge', 'r7i.large', 'r7i.metal-24xl', 'r7i.metal-48xl', 'r7i.xlarge', 't2.2xlarge', 't2.large', 't2.medium', 't2.micro', 't2.nano', 't2.small', 't2.xlarge', 't3.2xlarge', 't3.large', 't3.medium', 't3.micro', 't3.nano', 't3.small', 't3.xlarge', 't3a.2xlarge', 't3a.large', 't3a.medium', 't3a.micro', 't3a.nano', 't3a.small', 't3a.xlarge', 't4g.2xlarge', 't4g.large', 't4g.medium', 't4g.micro', 't4g.nano', 't4g.small', 't4g.xlarge', 'u-12tb1.112xlarge', 'u-24tb1.112xlarge', 'u-6tb1.112xlarge', 'u-6tb1.56xlarge', 'u-9tb1.112xlarge', 'x1.16xlarge', 'x1.32xlarge', 'x1e.16xlarge', 'x1e.2xlarge', 'x1e.32xlarge', 'x1e.4xlarge', 'x1e.8xlarge', 'x1e.xlarge', 'x2idn.16xlarge', 'x2idn.24xlarge', 'x2idn.32xlarge', 'x2idn.metal', 'x2iedn.16xlarge', 'x2iedn.24xlarge', 'x2iedn.2xlarge', 'x2iedn.32xlarge', 'x2iedn.4xlarge', 'x2iedn.8xlarge', 'x2iedn.metal', 'x2iedn.xlarge', 'z1d.12xlarge', 'z1d.2xlarge', 'z1d.3xlarge', 'z1d.6xlarge', 'z1d.large', 'z1d.metal', 'z1d.xlarge'], - 'ap-northeast-3': ['c4.2xlarge', 'c4.4xlarge', 'c4.8xlarge', 'c4.large', 'c4.xlarge', 'c5.12xlarge', 'c5.18xlarge', 'c5.24xlarge', 'c5.2xlarge', 'c5.4xlarge', 'c5.9xlarge', 'c5.large', 'c5.metal', 'c5.xlarge', 'c5d.12xlarge', 'c5d.18xlarge', 'c5d.24xlarge', 'c5d.2xlarge', 'c5d.4xlarge', 'c5d.9xlarge', 'c5d.large', 'c5d.metal', 'c5d.xlarge', 'c5n.18xlarge', 'c5n.2xlarge', 'c5n.4xlarge', 'c5n.9xlarge', 'c5n.large', 'c5n.metal', 'c5n.xlarge', 'c6g.12xlarge', 'c6g.16xlarge', 'c6g.2xlarge', 'c6g.4xlarge', 'c6g.8xlarge', 'c6g.large', 'c6g.medium', 'c6g.metal', 'c6g.xlarge', 'c6gd.12xlarge', 'c6gd.16xlarge', 'c6gd.2xlarge', 'c6gd.4xlarge', 'c6gd.8xlarge', 'c6gd.large', 'c6gd.medium', 'c6gd.metal', 'c6gd.xlarge', 'c6gn.12xlarge', 'c6gn.16xlarge', 'c6gn.2xlarge', 'c6gn.4xlarge', 'c6gn.8xlarge', 'c6gn.large', 'c6gn.medium', 'c6gn.xlarge', 'c6i.12xlarge', 'c6i.16xlarge', 'c6i.24xlarge', 'c6i.2xlarge', 'c6i.32xlarge', 'c6i.4xlarge', 'c6i.8xlarge', 'c6i.large', 'c6i.metal', 'c6i.xlarge', 'd2.2xlarge', 'd2.4xlarge', 'd2.8xlarge', 'd2.xlarge', 'g4dn.12xlarge', 'g4dn.16xlarge', 'g4dn.2xlarge', 'g4dn.4xlarge', 'g4dn.8xlarge', 'g4dn.metal', 'g4dn.xlarge', 'i3.16xlarge', 'i3.2xlarge', 'i3.4xlarge', 'i3.8xlarge', 'i3.large', 'i3.metal', 'i3.xlarge', 'i3en.12xlarge', 'i3en.24xlarge', 'i3en.2xlarge', 'i3en.3xlarge', 'i3en.6xlarge', 'i3en.large', 'i3en.metal', 'i3en.xlarge', 'i4i.12xlarge', 'i4i.16xlarge', 'i4i.24xlarge', 'i4i.2xlarge', 'i4i.32xlarge', 'i4i.4xlarge', 'i4i.8xlarge', 'i4i.large', 'i4i.metal', 'i4i.xlarge', 'm4.16xlarge', 'm4.2xlarge', 'm4.4xlarge', 'm4.large', 'm4.xlarge', 'm5.12xlarge', 'm5.16xlarge', 'm5.24xlarge', 'm5.2xlarge', 'm5.4xlarge', 'm5.8xlarge', 'm5.large', 'm5.metal', 'm5.xlarge', 'm5d.12xlarge', 'm5d.16xlarge', 'm5d.24xlarge', 'm5d.2xlarge', 'm5d.4xlarge', 'm5d.8xlarge', 'm5d.large', 'm5d.metal', 'm5d.xlarge', 'm6g.12xlarge', 'm6g.16xlarge', 'm6g.2xlarge', 'm6g.4xlarge', 'm6g.8xlarge', 'm6g.large', 'm6g.medium', 'm6g.metal', 'm6g.xlarge', 'm6gd.12xlarge', 'm6gd.16xlarge', 'm6gd.2xlarge', 'm6gd.4xlarge', 'm6gd.8xlarge', 'm6gd.large', 'm6gd.medium', 'm6gd.metal', 'm6gd.xlarge', 'm6i.12xlarge', 'm6i.16xlarge', 'm6i.24xlarge', 'm6i.2xlarge', 'm6i.32xlarge', 'm6i.4xlarge', 'm6i.8xlarge', 'm6i.large', 'm6i.metal', 'm6i.xlarge', 'r4.16xlarge', 'r4.2xlarge', 'r4.4xlarge', 'r4.8xlarge', 'r4.large', 'r4.xlarge', 'r5.12xlarge', 'r5.16xlarge', 'r5.24xlarge', 'r5.2xlarge', 'r5.4xlarge', 'r5.8xlarge', 'r5.large', 'r5.metal', 'r5.xlarge', 'r5d.12xlarge', 'r5d.16xlarge', 'r5d.24xlarge', 'r5d.2xlarge', 'r5d.4xlarge', 'r5d.8xlarge', 'r5d.large', 'r5d.metal', 'r5d.xlarge', 'r6g.12xlarge', 'r6g.16xlarge', 'r6g.2xlarge', 'r6g.4xlarge', 'r6g.8xlarge', 'r6g.large', 'r6g.medium', 'r6g.metal', 'r6g.xlarge', 'r6gd.12xlarge', 'r6gd.16xlarge', 'r6gd.2xlarge', 'r6gd.4xlarge', 'r6gd.8xlarge', 'r6gd.large', 'r6gd.medium', 'r6gd.metal', 'r6gd.xlarge', 'r6i.12xlarge', 'r6i.16xlarge', 'r6i.24xlarge', 'r6i.2xlarge', 'r6i.32xlarge', 'r6i.4xlarge', 'r6i.8xlarge', 'r6i.large', 'r6i.metal', 'r6i.xlarge', 't2.2xlarge', 't2.large', 't2.medium', 't2.micro', 't2.nano', 't2.small', 't2.xlarge', 't3.2xlarge', 't3.large', 't3.medium', 't3.micro', 't3.nano', 't3.small', 't3.xlarge', 't4g.2xlarge', 't4g.large', 't4g.medium', 't4g.micro', 't4g.nano', 't4g.small', 't4g.xlarge', 'x1.16xlarge', 'x1.32xlarge', 'x1e.16xlarge', 'x1e.2xlarge', 'x1e.32xlarge', 'x1e.4xlarge', 'x1e.8xlarge', 'x1e.xlarge', 'x2idn.16xlarge', 'x2idn.24xlarge', 'x2idn.32xlarge', 'x2idn.metal', 'x2iedn.16xlarge', 'x2iedn.24xlarge', 'x2iedn.2xlarge', 'x2iedn.32xlarge', 'x2iedn.4xlarge', 'x2iedn.8xlarge', 'x2iedn.metal', 'x2iedn.xlarge'], - 'ap-southeast-1': ['a1.2xlarge', 'a1.4xlarge', 'a1.large', 'a1.medium', 'a1.metal', 'a1.xlarge', 'c1.medium', 'c1.xlarge', 'c3.2xlarge', 'c3.4xlarge', 'c3.8xlarge', 'c3.large', 'c3.xlarge', 'c4.2xlarge', 'c4.4xlarge', 'c4.8xlarge', 'c4.large', 'c4.xlarge', 'c5.12xlarge', 'c5.18xlarge', 'c5.24xlarge', 'c5.2xlarge', 'c5.4xlarge', 'c5.9xlarge', 'c5.large', 'c5.metal', 'c5.xlarge', 'c5a.12xlarge', 'c5a.16xlarge', 'c5a.24xlarge', 'c5a.2xlarge', 'c5a.4xlarge', 'c5a.8xlarge', 'c5a.large', 'c5a.xlarge', 'c5ad.12xlarge', 'c5ad.16xlarge', 'c5ad.24xlarge', 'c5ad.2xlarge', 'c5ad.4xlarge', 'c5ad.8xlarge', 'c5ad.large', 'c5ad.xlarge', 'c5d.12xlarge', 'c5d.18xlarge', 'c5d.24xlarge', 'c5d.2xlarge', 'c5d.4xlarge', 'c5d.9xlarge', 'c5d.large', 'c5d.metal', 'c5d.xlarge', 'c5n.18xlarge', 'c5n.2xlarge', 'c5n.4xlarge', 'c5n.9xlarge', 'c5n.large', 'c5n.metal', 'c5n.xlarge', 'c6a.12xlarge', 'c6a.16xlarge', 'c6a.24xlarge', 'c6a.2xlarge', 'c6a.32xlarge', 'c6a.48xlarge', 'c6a.4xlarge', 'c6a.8xlarge', 'c6a.large', 'c6a.metal', 'c6a.xlarge', 'c6g.12xlarge', 'c6g.16xlarge', 'c6g.2xlarge', 'c6g.4xlarge', 'c6g.8xlarge', 'c6g.large', 'c6g.medium', 'c6g.metal', 'c6g.xlarge', 'c6gd.12xlarge', 'c6gd.16xlarge', 'c6gd.2xlarge', 'c6gd.4xlarge', 'c6gd.8xlarge', 'c6gd.large', 'c6gd.medium', 'c6gd.metal', 'c6gd.xlarge', 'c6gn.12xlarge', 'c6gn.16xlarge', 'c6gn.2xlarge', 'c6gn.4xlarge', 'c6gn.8xlarge', 'c6gn.large', 'c6gn.medium', 'c6gn.xlarge', 'c6i.12xlarge', 'c6i.16xlarge', 'c6i.24xlarge', 'c6i.2xlarge', 'c6i.32xlarge', 'c6i.4xlarge', 'c6i.8xlarge', 'c6i.large', 'c6i.metal', 'c6i.xlarge', 'c6id.12xlarge', 'c6id.16xlarge', 'c6id.24xlarge', 'c6id.2xlarge', 'c6id.32xlarge', 'c6id.4xlarge', 'c6id.8xlarge', 'c6id.large', 'c6id.metal', 'c6id.xlarge', 'c6in.12xlarge', 'c6in.16xlarge', 'c6in.24xlarge', 'c6in.2xlarge', 'c6in.32xlarge', 'c6in.4xlarge', 'c6in.8xlarge', 'c6in.large', 'c6in.metal', 'c6in.xlarge', 'c7g.12xlarge', 'c7g.16xlarge', 'c7g.2xlarge', 'c7g.4xlarge', 'c7g.8xlarge', 'c7g.large', 'c7g.medium', 'c7g.metal', 'c7g.xlarge', 'c7gd.12xlarge', 'c7gd.16xlarge', 'c7gd.2xlarge', 'c7gd.4xlarge', 'c7gd.8xlarge', 'c7gd.large', 'c7gd.medium', 'c7gd.xlarge', 'c7i.12xlarge', 'c7i.16xlarge', 'c7i.24xlarge', 'c7i.2xlarge', 'c7i.48xlarge', 'c7i.4xlarge', 'c7i.8xlarge', 'c7i.large', 'c7i.metal-24xl', 'c7i.metal-48xl', 'c7i.xlarge', 'd2.2xlarge', 'd2.4xlarge', 'd2.8xlarge', 'd2.xlarge', 'd3.2xlarge', 'd3.4xlarge', 'd3.8xlarge', 'd3.xlarge', 'd3en.12xlarge', 'd3en.2xlarge', 'd3en.4xlarge', 'd3en.6xlarge', 'd3en.8xlarge', 'd3en.xlarge', 'g2.2xlarge', 'g2.8xlarge', 'g3.16xlarge', 'g3.4xlarge', 'g3.8xlarge', 'g4dn.12xlarge', 'g4dn.16xlarge', 'g4dn.2xlarge', 'g4dn.4xlarge', 'g4dn.8xlarge', 'g4dn.metal', 'g4dn.xlarge', 'g5g.16xlarge', 'g5g.2xlarge', 'g5g.4xlarge', 'g5g.8xlarge', 'g5g.metal', 'g5g.xlarge', 'hpc6a.48xlarge', 'i2.2xlarge', 'i2.4xlarge', 'i2.8xlarge', 'i2.xlarge', 'i3.16xlarge', 'i3.2xlarge', 'i3.4xlarge', 'i3.8xlarge', 'i3.large', 'i3.metal', 'i3.xlarge', 'i3en.12xlarge', 'i3en.24xlarge', 'i3en.2xlarge', 'i3en.3xlarge', 'i3en.6xlarge', 'i3en.large', 'i3en.metal', 'i3en.xlarge', 'i4g.16xlarge', 'i4g.2xlarge', 'i4g.4xlarge', 'i4g.8xlarge', 'i4g.large', 'i4g.xlarge', 'i4i.12xlarge', 'i4i.16xlarge', 'i4i.24xlarge', 'i4i.2xlarge', 'i4i.32xlarge', 'i4i.4xlarge', 'i4i.8xlarge', 'i4i.large', 'i4i.metal', 'i4i.xlarge', 'im4gn.16xlarge', 'im4gn.2xlarge', 'im4gn.4xlarge', 'im4gn.8xlarge', 'im4gn.large', 'im4gn.xlarge', 'inf1.24xlarge', 'inf1.2xlarge', 'inf1.6xlarge', 'inf1.xlarge', 'inf2.24xlarge', 'inf2.48xlarge', 'inf2.8xlarge', 'inf2.xlarge', 'is4gen.2xlarge', 'is4gen.4xlarge', 'is4gen.8xlarge', 'is4gen.large', 'is4gen.medium', 'is4gen.xlarge', 'm1.large', 'm1.medium', 'm1.xlarge', 'm2.2xlarge', 'm2.4xlarge', 'm2.xlarge', 'm3.2xlarge', 'm3.large', 'm3.medium', 'm3.xlarge', 'm4.10xlarge', 'm4.16xlarge', 'm4.2xlarge', 'm4.4xlarge', 'm4.large', 'm4.xlarge', 'm5.12xlarge', 'm5.16xlarge', 'm5.24xlarge', 'm5.2xlarge', 'm5.4xlarge', 'm5.8xlarge', 'm5.large', 'm5.metal', 'm5.xlarge', 'm5a.12xlarge', 'm5a.16xlarge', 'm5a.24xlarge', 'm5a.2xlarge', 'm5a.4xlarge', 'm5a.8xlarge', 'm5a.large', 'm5a.xlarge', 'm5ad.12xlarge', 'm5ad.16xlarge', 'm5ad.24xlarge', 'm5ad.2xlarge', 'm5ad.4xlarge', 'm5ad.8xlarge', 'm5ad.large', 'm5ad.xlarge', 'm5d.12xlarge', 'm5d.16xlarge', 'm5d.24xlarge', 'm5d.2xlarge', 'm5d.4xlarge', 'm5d.8xlarge', 'm5d.large', 'm5d.metal', 'm5d.xlarge', 'm5dn.12xlarge', 'm5dn.16xlarge', 'm5dn.24xlarge', 'm5dn.2xlarge', 'm5dn.4xlarge', 'm5dn.8xlarge', 'm5dn.large', 'm5dn.metal', 'm5dn.xlarge', 'm5n.12xlarge', 'm5n.16xlarge', 'm5n.24xlarge', 'm5n.2xlarge', 'm5n.4xlarge', 'm5n.8xlarge', 'm5n.large', 'm5n.metal', 'm5n.xlarge', 'm5zn.12xlarge', 'm5zn.2xlarge', 'm5zn.3xlarge', 'm5zn.6xlarge', 'm5zn.large', 'm5zn.metal', 'm5zn.xlarge', 'm6a.12xlarge', 'm6a.16xlarge', 'm6a.24xlarge', 'm6a.2xlarge', 'm6a.32xlarge', 'm6a.48xlarge', 'm6a.4xlarge', 'm6a.8xlarge', 'm6a.large', 'm6a.metal', 'm6a.xlarge', 'm6g.12xlarge', 'm6g.16xlarge', 'm6g.2xlarge', 'm6g.4xlarge', 'm6g.8xlarge', 'm6g.large', 'm6g.medium', 'm6g.metal', 'm6g.xlarge', 'm6gd.12xlarge', 'm6gd.16xlarge', 'm6gd.2xlarge', 'm6gd.4xlarge', 'm6gd.8xlarge', 'm6gd.large', 'm6gd.medium', 'm6gd.metal', 'm6gd.xlarge', 'm6i.12xlarge', 'm6i.16xlarge', 'm6i.24xlarge', 'm6i.2xlarge', 'm6i.32xlarge', 'm6i.4xlarge', 'm6i.8xlarge', 'm6i.large', 'm6i.metal', 'm6i.xlarge', 'm6id.12xlarge', 'm6id.16xlarge', 'm6id.24xlarge', 'm6id.2xlarge', 'm6id.32xlarge', 'm6id.4xlarge', 'm6id.8xlarge', 'm6id.large', 'm6id.metal', 'm6id.xlarge', 'm6idn.12xlarge', 'm6idn.16xlarge', 'm6idn.24xlarge', 'm6idn.2xlarge', 'm6idn.32xlarge', 'm6idn.4xlarge', 'm6idn.8xlarge', 'm6idn.large', 'm6idn.metal', 'm6idn.xlarge', 'm6in.12xlarge', 'm6in.16xlarge', 'm6in.24xlarge', 'm6in.2xlarge', 'm6in.32xlarge', 'm6in.4xlarge', 'm6in.8xlarge', 'm6in.large', 'm6in.metal', 'm6in.xlarge', 'm7g.12xlarge', 'm7g.16xlarge', 'm7g.2xlarge', 'm7g.4xlarge', 'm7g.8xlarge', 'm7g.large', 'm7g.medium', 'm7g.metal', 'm7g.xlarge', 'm7gd.12xlarge', 'm7gd.16xlarge', 'm7gd.2xlarge', 'm7gd.4xlarge', 'm7gd.8xlarge', 'm7gd.large', 'm7gd.medium', 'm7gd.xlarge', 'm7i-flex.2xlarge', 'm7i-flex.4xlarge', 'm7i-flex.8xlarge', 'm7i-flex.large', 'm7i-flex.xlarge', 'm7i.12xlarge', 'm7i.16xlarge', 'm7i.24xlarge', 'm7i.2xlarge', 'm7i.48xlarge', 'm7i.4xlarge', 'm7i.8xlarge', 'm7i.large', 'm7i.metal-24xl', 'm7i.metal-48xl', 'm7i.xlarge', 'p2.16xlarge', 'p2.8xlarge', 'p2.xlarge', 'p3.16xlarge', 'p3.2xlarge', 'p3.8xlarge', 'p4d.24xlarge', 'p4de.24xlarge', 'r3.2xlarge', 'r3.4xlarge', 'r3.8xlarge', 'r3.large', 'r3.xlarge', 'r4.16xlarge', 'r4.2xlarge', 'r4.4xlarge', 'r4.8xlarge', 'r4.large', 'r4.xlarge', 'r5.12xlarge', 'r5.16xlarge', 'r5.24xlarge', 'r5.2xlarge', 'r5.4xlarge', 'r5.8xlarge', 'r5.large', 'r5.metal', 'r5.xlarge', 'r5a.12xlarge', 'r5a.16xlarge', 'r5a.24xlarge', 'r5a.2xlarge', 'r5a.4xlarge', 'r5a.8xlarge', 'r5a.large', 'r5a.xlarge', 'r5ad.12xlarge', 'r5ad.16xlarge', 'r5ad.24xlarge', 'r5ad.2xlarge', 'r5ad.4xlarge', 'r5ad.8xlarge', 'r5ad.large', 'r5ad.xlarge', 'r5b.12xlarge', 'r5b.16xlarge', 'r5b.24xlarge', 'r5b.2xlarge', 'r5b.4xlarge', 'r5b.8xlarge', 'r5b.large', 'r5b.metal', 'r5b.xlarge', 'r5d.12xlarge', 'r5d.16xlarge', 'r5d.24xlarge', 'r5d.2xlarge', 'r5d.4xlarge', 'r5d.8xlarge', 'r5d.large', 'r5d.metal', 'r5d.xlarge', 'r5dn.12xlarge', 'r5dn.16xlarge', 'r5dn.24xlarge', 'r5dn.2xlarge', 'r5dn.4xlarge', 'r5dn.8xlarge', 'r5dn.large', 'r5dn.metal', 'r5dn.xlarge', 'r5n.12xlarge', 'r5n.16xlarge', 'r5n.24xlarge', 'r5n.2xlarge', 'r5n.4xlarge', 'r5n.8xlarge', 'r5n.large', 'r5n.metal', 'r5n.xlarge', 'r6a.12xlarge', 'r6a.16xlarge', 'r6a.24xlarge', 'r6a.2xlarge', 'r6a.32xlarge', 'r6a.48xlarge', 'r6a.4xlarge', 'r6a.8xlarge', 'r6a.large', 'r6a.metal', 'r6a.xlarge', 'r6g.12xlarge', 'r6g.16xlarge', 'r6g.2xlarge', 'r6g.4xlarge', 'r6g.8xlarge', 'r6g.large', 'r6g.medium', 'r6g.metal', 'r6g.xlarge', 'r6gd.12xlarge', 'r6gd.16xlarge', 'r6gd.2xlarge', 'r6gd.4xlarge', 'r6gd.8xlarge', 'r6gd.large', 'r6gd.medium', 'r6gd.metal', 'r6gd.xlarge', 'r6i.12xlarge', 'r6i.16xlarge', 'r6i.24xlarge', 'r6i.2xlarge', 'r6i.32xlarge', 'r6i.4xlarge', 'r6i.8xlarge', 'r6i.large', 'r6i.metal', 'r6i.xlarge', 'r6id.12xlarge', 'r6id.16xlarge', 'r6id.24xlarge', 'r6id.2xlarge', 'r6id.32xlarge', 'r6id.4xlarge', 'r6id.8xlarge', 'r6id.large', 'r6id.metal', 'r6id.xlarge', 'r6idn.12xlarge', 'r6idn.16xlarge', 'r6idn.24xlarge', 'r6idn.2xlarge', 'r6idn.32xlarge', 'r6idn.4xlarge', 'r6idn.8xlarge', 'r6idn.large', 'r6idn.metal', 'r6idn.xlarge', 'r6in.12xlarge', 'r6in.16xlarge', 'r6in.24xlarge', 'r6in.2xlarge', 'r6in.32xlarge', 'r6in.4xlarge', 'r6in.8xlarge', 'r6in.large', 'r6in.metal', 'r6in.xlarge', 'r7g.12xlarge', 'r7g.16xlarge', 'r7g.2xlarge', 'r7g.4xlarge', 'r7g.8xlarge', 'r7g.large', 'r7g.medium', 'r7g.metal', 'r7g.xlarge', 'r7gd.12xlarge', 'r7gd.16xlarge', 'r7gd.2xlarge', 'r7gd.4xlarge', 'r7gd.8xlarge', 'r7gd.large', 'r7gd.medium', 'r7gd.xlarge', 'r7i.12xlarge', 'r7i.16xlarge', 'r7i.24xlarge', 'r7i.2xlarge', 'r7i.48xlarge', 'r7i.4xlarge', 'r7i.8xlarge', 'r7i.large', 'r7i.metal-24xl', 'r7i.metal-48xl', 'r7i.xlarge', 't1.micro', 't2.2xlarge', 't2.large', 't2.medium', 't2.micro', 't2.nano', 't2.small', 't2.xlarge', 't3.2xlarge', 't3.large', 't3.medium', 't3.micro', 't3.nano', 't3.small', 't3.xlarge', 't3a.2xlarge', 't3a.large', 't3a.medium', 't3a.micro', 't3a.nano', 't3a.small', 't3a.xlarge', 't4g.2xlarge', 't4g.large', 't4g.medium', 't4g.micro', 't4g.nano', 't4g.small', 't4g.xlarge', 'u-12tb1.112xlarge', 'u-3tb1.56xlarge', 'u-6tb1.112xlarge', 'u-6tb1.56xlarge', 'u-9tb1.112xlarge', 'x1.16xlarge', 'x1.32xlarge', 'x1e.16xlarge', 'x1e.2xlarge', 'x1e.32xlarge', 'x1e.4xlarge', 'x1e.8xlarge', 'x1e.xlarge', 'x2idn.16xlarge', 'x2idn.24xlarge', 'x2idn.32xlarge', 'x2idn.metal', 'x2iedn.16xlarge', 'x2iedn.24xlarge', 'x2iedn.2xlarge', 'x2iedn.32xlarge', 'x2iedn.4xlarge', 'x2iedn.8xlarge', 'x2iedn.metal', 'x2iedn.xlarge', 'z1d.12xlarge', 'z1d.2xlarge', 'z1d.3xlarge', 'z1d.6xlarge', 'z1d.large', 'z1d.metal', 'z1d.xlarge'], - 'ap-southeast-2': ['a1.2xlarge', 'a1.4xlarge', 'a1.large', 'a1.medium', 'a1.metal', 'a1.xlarge', 'c1.medium', 'c1.xlarge', 'c3.2xlarge', 'c3.4xlarge', 'c3.8xlarge', 'c3.large', 'c3.xlarge', 'c4.2xlarge', 'c4.4xlarge', 'c4.8xlarge', 'c4.large', 'c4.xlarge', 'c5.12xlarge', 'c5.18xlarge', 'c5.24xlarge', 'c5.2xlarge', 'c5.4xlarge', 'c5.9xlarge', 'c5.large', 'c5.metal', 'c5.xlarge', 'c5a.12xlarge', 'c5a.16xlarge', 'c5a.24xlarge', 'c5a.2xlarge', 'c5a.4xlarge', 'c5a.8xlarge', 'c5a.large', 'c5a.xlarge', 'c5ad.12xlarge', 'c5ad.16xlarge', 'c5ad.24xlarge', 'c5ad.2xlarge', 'c5ad.4xlarge', 'c5ad.8xlarge', 'c5ad.large', 'c5ad.xlarge', 'c5d.12xlarge', 'c5d.18xlarge', 'c5d.24xlarge', 'c5d.2xlarge', 'c5d.4xlarge', 'c5d.9xlarge', 'c5d.large', 'c5d.metal', 'c5d.xlarge', 'c5n.18xlarge', 'c5n.2xlarge', 'c5n.4xlarge', 'c5n.9xlarge', 'c5n.large', 'c5n.metal', 'c5n.xlarge', 'c6a.12xlarge', 'c6a.16xlarge', 'c6a.24xlarge', 'c6a.2xlarge', 'c6a.32xlarge', 'c6a.48xlarge', 'c6a.4xlarge', 'c6a.8xlarge', 'c6a.large', 'c6a.metal', 'c6a.xlarge', 'c6g.12xlarge', 'c6g.16xlarge', 'c6g.2xlarge', 'c6g.4xlarge', 'c6g.8xlarge', 'c6g.large', 'c6g.medium', 'c6g.metal', 'c6g.xlarge', 'c6gd.12xlarge', 'c6gd.16xlarge', 'c6gd.2xlarge', 'c6gd.4xlarge', 'c6gd.8xlarge', 'c6gd.large', 'c6gd.medium', 'c6gd.metal', 'c6gd.xlarge', 'c6gn.12xlarge', 'c6gn.16xlarge', 'c6gn.2xlarge', 'c6gn.4xlarge', 'c6gn.8xlarge', 'c6gn.large', 'c6gn.medium', 'c6gn.xlarge', 'c6i.12xlarge', 'c6i.16xlarge', 'c6i.24xlarge', 'c6i.2xlarge', 'c6i.32xlarge', 'c6i.4xlarge', 'c6i.8xlarge', 'c6i.large', 'c6i.metal', 'c6i.xlarge', 'c6id.12xlarge', 'c6id.16xlarge', 'c6id.24xlarge', 'c6id.2xlarge', 'c6id.32xlarge', 'c6id.4xlarge', 'c6id.8xlarge', 'c6id.large', 'c6id.metal', 'c6id.xlarge', 'c6in.12xlarge', 'c6in.16xlarge', 'c6in.24xlarge', 'c6in.2xlarge', 'c6in.32xlarge', 'c6in.4xlarge', 'c6in.8xlarge', 'c6in.large', 'c6in.metal', 'c6in.xlarge', 'c7g.12xlarge', 'c7g.16xlarge', 'c7g.2xlarge', 'c7g.4xlarge', 'c7g.8xlarge', 'c7g.large', 'c7g.medium', 'c7g.metal', 'c7g.xlarge', 'c7gd.12xlarge', 'c7gd.16xlarge', 'c7gd.2xlarge', 'c7gd.4xlarge', 'c7gd.8xlarge', 'c7gd.large', 'c7gd.medium', 'c7gd.xlarge', 'c7i.12xlarge', 'c7i.16xlarge', 'c7i.24xlarge', 'c7i.2xlarge', 'c7i.48xlarge', 'c7i.4xlarge', 'c7i.8xlarge', 'c7i.large', 'c7i.metal-24xl', 'c7i.metal-48xl', 'c7i.xlarge', 'd2.2xlarge', 'd2.4xlarge', 'd2.8xlarge', 'd2.xlarge', 'd3.2xlarge', 'd3.4xlarge', 'd3.8xlarge', 'd3.xlarge', 'f1.16xlarge', 'f1.2xlarge', 'f1.4xlarge', 'g2.2xlarge', 'g2.8xlarge', 'g3.16xlarge', 'g3.4xlarge', 'g3.8xlarge', 'g3s.xlarge', 'g4dn.12xlarge', 'g4dn.16xlarge', 'g4dn.2xlarge', 'g4dn.4xlarge', 'g4dn.8xlarge', 'g4dn.metal', 'g4dn.xlarge', 'g5.12xlarge', 'g5.16xlarge', 'g5.24xlarge', 'g5.2xlarge', 'g5.48xlarge', 'g5.4xlarge', 'g5.8xlarge', 'g5.xlarge', 'hpc6a.48xlarge', 'i2.2xlarge', 'i2.4xlarge', 'i2.8xlarge', 'i2.xlarge', 'i3.16xlarge', 'i3.2xlarge', 'i3.4xlarge', 'i3.8xlarge', 'i3.large', 'i3.metal', 'i3.xlarge', 'i3en.12xlarge', 'i3en.24xlarge', 'i3en.2xlarge', 'i3en.3xlarge', 'i3en.6xlarge', 'i3en.large', 'i3en.metal', 'i3en.xlarge', 'i4i.12xlarge', 'i4i.16xlarge', 'i4i.24xlarge', 'i4i.2xlarge', 'i4i.32xlarge', 'i4i.4xlarge', 'i4i.8xlarge', 'i4i.large', 'i4i.metal', 'i4i.xlarge', 'im4gn.16xlarge', 'im4gn.2xlarge', 'im4gn.4xlarge', 'im4gn.8xlarge', 'im4gn.large', 'im4gn.xlarge', 'inf1.24xlarge', 'inf1.2xlarge', 'inf1.6xlarge', 'inf1.xlarge', 'is4gen.2xlarge', 'is4gen.4xlarge', 'is4gen.8xlarge', 'is4gen.large', 'is4gen.medium', 'is4gen.xlarge', 'm1.large', 'm1.medium', 'm1.xlarge', 'm2.2xlarge', 'm2.4xlarge', 'm2.xlarge', 'm3.2xlarge', 'm3.large', 'm3.medium', 'm3.xlarge', 'm4.10xlarge', 'm4.16xlarge', 'm4.2xlarge', 'm4.4xlarge', 'm4.large', 'm4.xlarge', 'm5.12xlarge', 'm5.16xlarge', 'm5.24xlarge', 'm5.2xlarge', 'm5.4xlarge', 'm5.8xlarge', 'm5.large', 'm5.metal', 'm5.xlarge', 'm5a.12xlarge', 'm5a.16xlarge', 'm5a.24xlarge', 'm5a.2xlarge', 'm5a.4xlarge', 'm5a.8xlarge', 'm5a.large', 'm5a.xlarge', 'm5ad.12xlarge', 'm5ad.16xlarge', 'm5ad.24xlarge', 'm5ad.2xlarge', 'm5ad.4xlarge', 'm5ad.8xlarge', 'm5ad.large', 'm5ad.xlarge', 'm5d.12xlarge', 'm5d.16xlarge', 'm5d.24xlarge', 'm5d.2xlarge', 'm5d.4xlarge', 'm5d.8xlarge', 'm5d.large', 'm5d.metal', 'm5d.xlarge', 'm5zn.12xlarge', 'm5zn.2xlarge', 'm5zn.3xlarge', 'm5zn.6xlarge', 'm5zn.large', 'm5zn.metal', 'm5zn.xlarge', 'm6a.12xlarge', 'm6a.16xlarge', 'm6a.24xlarge', 'm6a.2xlarge', 'm6a.32xlarge', 'm6a.48xlarge', 'm6a.4xlarge', 'm6a.8xlarge', 'm6a.large', 'm6a.metal', 'm6a.xlarge', 'm6g.12xlarge', 'm6g.16xlarge', 'm6g.2xlarge', 'm6g.4xlarge', 'm6g.8xlarge', 'm6g.large', 'm6g.medium', 'm6g.metal', 'm6g.xlarge', 'm6gd.12xlarge', 'm6gd.16xlarge', 'm6gd.2xlarge', 'm6gd.4xlarge', 'm6gd.8xlarge', 'm6gd.large', 'm6gd.medium', 'm6gd.metal', 'm6gd.xlarge', 'm6i.12xlarge', 'm6i.16xlarge', 'm6i.24xlarge', 'm6i.2xlarge', 'm6i.32xlarge', 'm6i.4xlarge', 'm6i.8xlarge', 'm6i.large', 'm6i.metal', 'm6i.xlarge', 'm6id.12xlarge', 'm6id.16xlarge', 'm6id.24xlarge', 'm6id.2xlarge', 'm6id.32xlarge', 'm6id.4xlarge', 'm6id.8xlarge', 'm6id.large', 'm6id.metal', 'm6id.xlarge', 'm6idn.12xlarge', 'm6idn.16xlarge', 'm6idn.24xlarge', 'm6idn.2xlarge', 'm6idn.32xlarge', 'm6idn.4xlarge', 'm6idn.8xlarge', 'm6idn.large', 'm6idn.metal', 'm6idn.xlarge', 'm6in.12xlarge', 'm6in.16xlarge', 'm6in.24xlarge', 'm6in.2xlarge', 'm6in.32xlarge', 'm6in.4xlarge', 'm6in.8xlarge', 'm6in.large', 'm6in.metal', 'm6in.xlarge', 'm7g.12xlarge', 'm7g.16xlarge', 'm7g.2xlarge', 'm7g.4xlarge', 'm7g.8xlarge', 'm7g.large', 'm7g.medium', 'm7g.metal', 'm7g.xlarge', 'm7gd.12xlarge', 'm7gd.16xlarge', 'm7gd.2xlarge', 'm7gd.4xlarge', 'm7gd.8xlarge', 'm7gd.large', 'm7gd.medium', 'm7gd.xlarge', 'm7i-flex.2xlarge', 'm7i-flex.4xlarge', 'm7i-flex.8xlarge', 'm7i-flex.large', 'm7i-flex.xlarge', 'm7i.12xlarge', 'm7i.16xlarge', 'm7i.24xlarge', 'm7i.2xlarge', 'm7i.48xlarge', 'm7i.4xlarge', 'm7i.8xlarge', 'm7i.large', 'm7i.metal-24xl', 'm7i.metal-48xl', 'm7i.xlarge', 'p2.16xlarge', 'p2.8xlarge', 'p2.xlarge', 'p3.16xlarge', 'p3.2xlarge', 'p3.8xlarge', 'r3.2xlarge', 'r3.4xlarge', 'r3.8xlarge', 'r3.large', 'r3.xlarge', 'r4.16xlarge', 'r4.2xlarge', 'r4.4xlarge', 'r4.8xlarge', 'r4.large', 'r4.xlarge', 'r5.12xlarge', 'r5.16xlarge', 'r5.24xlarge', 'r5.2xlarge', 'r5.4xlarge', 'r5.8xlarge', 'r5.large', 'r5.metal', 'r5.xlarge', 'r5a.12xlarge', 'r5a.16xlarge', 'r5a.24xlarge', 'r5a.2xlarge', 'r5a.4xlarge', 'r5a.8xlarge', 'r5a.large', 'r5a.xlarge', 'r5ad.12xlarge', 'r5ad.16xlarge', 'r5ad.24xlarge', 'r5ad.2xlarge', 'r5ad.4xlarge', 'r5ad.8xlarge', 'r5ad.large', 'r5ad.xlarge', 'r5b.12xlarge', 'r5b.16xlarge', 'r5b.24xlarge', 'r5b.2xlarge', 'r5b.4xlarge', 'r5b.8xlarge', 'r5b.large', 'r5b.metal', 'r5b.xlarge', 'r5d.12xlarge', 'r5d.16xlarge', 'r5d.24xlarge', 'r5d.2xlarge', 'r5d.4xlarge', 'r5d.8xlarge', 'r5d.large', 'r5d.metal', 'r5d.xlarge', 'r5dn.12xlarge', 'r5dn.16xlarge', 'r5dn.24xlarge', 'r5dn.2xlarge', 'r5dn.4xlarge', 'r5dn.8xlarge', 'r5dn.large', 'r5dn.metal', 'r5dn.xlarge', 'r5n.12xlarge', 'r5n.16xlarge', 'r5n.24xlarge', 'r5n.2xlarge', 'r5n.4xlarge', 'r5n.8xlarge', 'r5n.large', 'r5n.metal', 'r5n.xlarge', 'r6a.12xlarge', 'r6a.16xlarge', 'r6a.24xlarge', 'r6a.2xlarge', 'r6a.32xlarge', 'r6a.48xlarge', 'r6a.4xlarge', 'r6a.8xlarge', 'r6a.large', 'r6a.metal', 'r6a.xlarge', 'r6g.12xlarge', 'r6g.16xlarge', 'r6g.2xlarge', 'r6g.4xlarge', 'r6g.8xlarge', 'r6g.large', 'r6g.medium', 'r6g.metal', 'r6g.xlarge', 'r6gd.12xlarge', 'r6gd.16xlarge', 'r6gd.2xlarge', 'r6gd.4xlarge', 'r6gd.8xlarge', 'r6gd.large', 'r6gd.medium', 'r6gd.metal', 'r6gd.xlarge', 'r6i.12xlarge', 'r6i.16xlarge', 'r6i.24xlarge', 'r6i.2xlarge', 'r6i.32xlarge', 'r6i.4xlarge', 'r6i.8xlarge', 'r6i.large', 'r6i.metal', 'r6i.xlarge', 'r6id.12xlarge', 'r6id.16xlarge', 'r6id.24xlarge', 'r6id.2xlarge', 'r6id.32xlarge', 'r6id.4xlarge', 'r6id.8xlarge', 'r6id.large', 'r6id.metal', 'r6id.xlarge', 'r7g.12xlarge', 'r7g.16xlarge', 'r7g.2xlarge', 'r7g.4xlarge', 'r7g.8xlarge', 'r7g.large', 'r7g.medium', 'r7g.metal', 'r7g.xlarge', 'r7gd.12xlarge', 'r7gd.16xlarge', 'r7gd.2xlarge', 'r7gd.4xlarge', 'r7gd.8xlarge', 'r7gd.large', 'r7gd.medium', 'r7gd.xlarge', 'r7i.12xlarge', 'r7i.16xlarge', 'r7i.24xlarge', 'r7i.2xlarge', 'r7i.48xlarge', 'r7i.4xlarge', 'r7i.8xlarge', 'r7i.large', 'r7i.metal-24xl', 'r7i.metal-48xl', 'r7i.xlarge', 't1.micro', 't2.2xlarge', 't2.large', 't2.medium', 't2.micro', 't2.nano', 't2.small', 't2.xlarge', 't3.2xlarge', 't3.large', 't3.medium', 't3.micro', 't3.nano', 't3.small', 't3.xlarge', 't3a.2xlarge', 't3a.large', 't3a.medium', 't3a.micro', 't3a.nano', 't3a.small', 't3a.xlarge', 't4g.2xlarge', 't4g.large', 't4g.medium', 't4g.micro', 't4g.nano', 't4g.small', 't4g.xlarge', 'u-12tb1.112xlarge', 'u-3tb1.56xlarge', 'u-6tb1.112xlarge', 'u-6tb1.56xlarge', 'x1.16xlarge', 'x1.32xlarge', 'x1e.16xlarge', 'x1e.2xlarge', 'x1e.32xlarge', 'x1e.4xlarge', 'x1e.8xlarge', 'x1e.xlarge', 'x2idn.16xlarge', 'x2idn.24xlarge', 'x2idn.32xlarge', 'x2idn.metal', 'x2iedn.16xlarge', 'x2iedn.24xlarge', 'x2iedn.2xlarge', 'x2iedn.32xlarge', 'x2iedn.4xlarge', 'x2iedn.8xlarge', 'x2iedn.metal', 'x2iedn.xlarge', 'z1d.12xlarge', 'z1d.2xlarge', 'z1d.3xlarge', 'z1d.6xlarge', 'z1d.large', 'z1d.metal', 'z1d.xlarge'], - 'ap-south-1': ['a1.2xlarge', 'a1.4xlarge', 'a1.large', 'a1.medium', 'a1.metal', 'a1.xlarge', 'c4.2xlarge', 'c4.4xlarge', 'c4.8xlarge', 'c4.large', 'c4.xlarge', 'c5.12xlarge', 'c5.18xlarge', 'c5.24xlarge', 'c5.2xlarge', 'c5.4xlarge', 'c5.9xlarge', 'c5.large', 'c5.metal', 'c5.xlarge', 'c5a.12xlarge', 'c5a.16xlarge', 'c5a.24xlarge', 'c5a.2xlarge', 'c5a.4xlarge', 'c5a.8xlarge', 'c5a.large', 'c5a.xlarge', 'c5d.12xlarge', 'c5d.18xlarge', 'c5d.24xlarge', 'c5d.2xlarge', 'c5d.4xlarge', 'c5d.9xlarge', 'c5d.large', 'c5d.metal', 'c5d.xlarge', 'c5n.18xlarge', 'c5n.2xlarge', 'c5n.4xlarge', 'c5n.9xlarge', 'c5n.large', 'c5n.metal', 'c5n.xlarge', 'c6a.12xlarge', 'c6a.16xlarge', 'c6a.24xlarge', 'c6a.2xlarge', 'c6a.32xlarge', 'c6a.48xlarge', 'c6a.4xlarge', 'c6a.8xlarge', 'c6a.large', 'c6a.metal', 'c6a.xlarge', 'c6g.12xlarge', 'c6g.16xlarge', 'c6g.2xlarge', 'c6g.4xlarge', 'c6g.8xlarge', 'c6g.large', 'c6g.medium', 'c6g.metal', 'c6g.xlarge', 'c6gd.12xlarge', 'c6gd.16xlarge', 'c6gd.2xlarge', 'c6gd.4xlarge', 'c6gd.8xlarge', 'c6gd.large', 'c6gd.medium', 'c6gd.metal', 'c6gd.xlarge', 'c6gn.12xlarge', 'c6gn.16xlarge', 'c6gn.2xlarge', 'c6gn.4xlarge', 'c6gn.8xlarge', 'c6gn.large', 'c6gn.medium', 'c6gn.xlarge', 'c6i.12xlarge', 'c6i.16xlarge', 'c6i.24xlarge', 'c6i.2xlarge', 'c6i.32xlarge', 'c6i.4xlarge', 'c6i.8xlarge', 'c6i.large', 'c6i.metal', 'c6i.xlarge', 'c6in.12xlarge', 'c6in.16xlarge', 'c6in.24xlarge', 'c6in.2xlarge', 'c6in.32xlarge', 'c6in.4xlarge', 'c6in.8xlarge', 'c6in.large', 'c6in.metal', 'c6in.xlarge', 'c7g.12xlarge', 'c7g.16xlarge', 'c7g.2xlarge', 'c7g.4xlarge', 'c7g.8xlarge', 'c7g.large', 'c7g.medium', 'c7g.metal', 'c7g.xlarge', 'c7gd.12xlarge', 'c7gd.16xlarge', 'c7gd.2xlarge', 'c7gd.4xlarge', 'c7gd.8xlarge', 'c7gd.large', 'c7gd.medium', 'c7gd.xlarge', 'c7i.12xlarge', 'c7i.16xlarge', 'c7i.24xlarge', 'c7i.2xlarge', 'c7i.48xlarge', 'c7i.4xlarge', 'c7i.8xlarge', 'c7i.large', 'c7i.metal-24xl', 'c7i.metal-48xl', 'c7i.xlarge', 'd2.2xlarge', 'd2.4xlarge', 'd2.8xlarge', 'd2.xlarge', 'd3.2xlarge', 'd3.4xlarge', 'd3.8xlarge', 'd3.xlarge', 'g4dn.12xlarge', 'g4dn.16xlarge', 'g4dn.2xlarge', 'g4dn.4xlarge', 'g4dn.8xlarge', 'g4dn.metal', 'g4dn.xlarge', 'g5.12xlarge', 'g5.16xlarge', 'g5.24xlarge', 'g5.2xlarge', 'g5.48xlarge', 'g5.4xlarge', 'g5.8xlarge', 'g5.xlarge', 'i2.2xlarge', 'i2.4xlarge', 'i2.8xlarge', 'i2.xlarge', 'i3.16xlarge', 'i3.2xlarge', 'i3.4xlarge', 'i3.8xlarge', 'i3.large', 'i3.metal', 'i3.xlarge', 'i3en.12xlarge', 'i3en.24xlarge', 'i3en.2xlarge', 'i3en.3xlarge', 'i3en.6xlarge', 'i3en.large', 'i3en.metal', 'i3en.xlarge', 'i4i.12xlarge', 'i4i.16xlarge', 'i4i.24xlarge', 'i4i.2xlarge', 'i4i.32xlarge', 'i4i.4xlarge', 'i4i.8xlarge', 'i4i.large', 'i4i.metal', 'i4i.xlarge', 'inf1.24xlarge', 'inf1.2xlarge', 'inf1.6xlarge', 'inf1.xlarge', 'inf2.24xlarge', 'inf2.48xlarge', 'inf2.8xlarge', 'inf2.xlarge', 'is4gen.2xlarge', 'is4gen.4xlarge', 'is4gen.8xlarge', 'is4gen.large', 'is4gen.medium', 'is4gen.xlarge', 'm4.10xlarge', 'm4.16xlarge', 'm4.2xlarge', 'm4.4xlarge', 'm4.large', 'm4.xlarge', 'm5.12xlarge', 'm5.16xlarge', 'm5.24xlarge', 'm5.2xlarge', 'm5.4xlarge', 'm5.8xlarge', 'm5.large', 'm5.metal', 'm5.xlarge', 'm5a.12xlarge', 'm5a.16xlarge', 'm5a.24xlarge', 'm5a.2xlarge', 'm5a.4xlarge', 'm5a.8xlarge', 'm5a.large', 'm5a.xlarge', 'm5ad.12xlarge', 'm5ad.16xlarge', 'm5ad.24xlarge', 'm5ad.2xlarge', 'm5ad.4xlarge', 'm5ad.8xlarge', 'm5ad.large', 'm5ad.xlarge', 'm5d.12xlarge', 'm5d.16xlarge', 'm5d.24xlarge', 'm5d.2xlarge', 'm5d.4xlarge', 'm5d.8xlarge', 'm5d.large', 'm5d.metal', 'm5d.xlarge', 'm6a.12xlarge', 'm6a.16xlarge', 'm6a.24xlarge', 'm6a.2xlarge', 'm6a.32xlarge', 'm6a.48xlarge', 'm6a.4xlarge', 'm6a.8xlarge', 'm6a.large', 'm6a.metal', 'm6a.xlarge', 'm6g.12xlarge', 'm6g.16xlarge', 'm6g.2xlarge', 'm6g.4xlarge', 'm6g.8xlarge', 'm6g.large', 'm6g.medium', 'm6g.metal', 'm6g.xlarge', 'm6gd.12xlarge', 'm6gd.16xlarge', 'm6gd.2xlarge', 'm6gd.4xlarge', 'm6gd.8xlarge', 'm6gd.large', 'm6gd.medium', 'm6gd.metal', 'm6gd.xlarge', 'm6i.12xlarge', 'm6i.16xlarge', 'm6i.24xlarge', 'm6i.2xlarge', 'm6i.32xlarge', 'm6i.4xlarge', 'm6i.8xlarge', 'm6i.large', 'm6i.metal', 'm6i.xlarge', 'm6id.12xlarge', 'm6id.16xlarge', 'm6id.24xlarge', 'm6id.2xlarge', 'm6id.32xlarge', 'm6id.4xlarge', 'm6id.8xlarge', 'm6id.large', 'm6id.metal', 'm6id.xlarge', 'm7g.12xlarge', 'm7g.16xlarge', 'm7g.2xlarge', 'm7g.4xlarge', 'm7g.8xlarge', 'm7g.large', 'm7g.medium', 'm7g.metal', 'm7g.xlarge', 'm7gd.12xlarge', 'm7gd.16xlarge', 'm7gd.2xlarge', 'm7gd.4xlarge', 'm7gd.8xlarge', 'm7gd.large', 'm7gd.medium', 'm7gd.xlarge', 'm7i-flex.2xlarge', 'm7i-flex.4xlarge', 'm7i-flex.8xlarge', 'm7i-flex.large', 'm7i-flex.xlarge', 'm7i.12xlarge', 'm7i.16xlarge', 'm7i.24xlarge', 'm7i.2xlarge', 'm7i.48xlarge', 'm7i.4xlarge', 'm7i.8xlarge', 'm7i.large', 'm7i.metal-24xl', 'm7i.metal-48xl', 'm7i.xlarge', 'p2.16xlarge', 'p2.8xlarge', 'p2.xlarge', 'r3.2xlarge', 'r3.4xlarge', 'r3.8xlarge', 'r3.large', 'r3.xlarge', 'r4.16xlarge', 'r4.2xlarge', 'r4.4xlarge', 'r4.8xlarge', 'r4.large', 'r4.xlarge', 'r5.12xlarge', 'r5.16xlarge', 'r5.24xlarge', 'r5.2xlarge', 'r5.4xlarge', 'r5.8xlarge', 'r5.large', 'r5.metal', 'r5.xlarge', 'r5a.12xlarge', 'r5a.16xlarge', 'r5a.24xlarge', 'r5a.2xlarge', 'r5a.4xlarge', 'r5a.8xlarge', 'r5a.large', 'r5a.xlarge', 'r5ad.12xlarge', 'r5ad.16xlarge', 'r5ad.24xlarge', 'r5ad.2xlarge', 'r5ad.4xlarge', 'r5ad.8xlarge', 'r5ad.large', 'r5ad.xlarge', 'r5d.12xlarge', 'r5d.16xlarge', 'r5d.24xlarge', 'r5d.2xlarge', 'r5d.4xlarge', 'r5d.8xlarge', 'r5d.large', 'r5d.metal', 'r5d.xlarge', 'r5n.12xlarge', 'r5n.16xlarge', 'r5n.24xlarge', 'r5n.2xlarge', 'r5n.4xlarge', 'r5n.8xlarge', 'r5n.large', 'r5n.metal', 'r5n.xlarge', 'r6a.12xlarge', 'r6a.16xlarge', 'r6a.24xlarge', 'r6a.2xlarge', 'r6a.32xlarge', 'r6a.48xlarge', 'r6a.4xlarge', 'r6a.8xlarge', 'r6a.large', 'r6a.metal', 'r6a.xlarge', 'r6g.12xlarge', 'r6g.16xlarge', 'r6g.2xlarge', 'r6g.4xlarge', 'r6g.8xlarge', 'r6g.large', 'r6g.medium', 'r6g.metal', 'r6g.xlarge', 'r6gd.12xlarge', 'r6gd.16xlarge', 'r6gd.2xlarge', 'r6gd.4xlarge', 'r6gd.8xlarge', 'r6gd.large', 'r6gd.medium', 'r6gd.metal', 'r6gd.xlarge', 'r6i.12xlarge', 'r6i.16xlarge', 'r6i.24xlarge', 'r6i.2xlarge', 'r6i.32xlarge', 'r6i.4xlarge', 'r6i.8xlarge', 'r6i.large', 'r6i.metal', 'r6i.xlarge', 'r6id.12xlarge', 'r6id.16xlarge', 'r6id.24xlarge', 'r6id.2xlarge', 'r6id.32xlarge', 'r6id.4xlarge', 'r6id.8xlarge', 'r6id.large', 'r6id.metal', 'r6id.xlarge', 'r7g.12xlarge', 'r7g.16xlarge', 'r7g.2xlarge', 'r7g.4xlarge', 'r7g.8xlarge', 'r7g.large', 'r7g.medium', 'r7g.metal', 'r7g.xlarge', 'r7gd.12xlarge', 'r7gd.16xlarge', 'r7gd.2xlarge', 'r7gd.4xlarge', 'r7gd.8xlarge', 'r7gd.large', 'r7gd.medium', 'r7gd.xlarge', 'r7i.12xlarge', 'r7i.16xlarge', 'r7i.24xlarge', 'r7i.2xlarge', 'r7i.48xlarge', 'r7i.4xlarge', 'r7i.8xlarge', 'r7i.large', 'r7i.metal-24xl', 'r7i.metal-48xl', 'r7i.xlarge', 't2.2xlarge', 't2.large', 't2.medium', 't2.micro', 't2.nano', 't2.small', 't2.xlarge', 't3.2xlarge', 't3.large', 't3.medium', 't3.micro', 't3.nano', 't3.small', 't3.xlarge', 't3a.2xlarge', 't3a.large', 't3a.medium', 't3a.micro', 't3a.nano', 't3a.small', 't3a.xlarge', 't4g.2xlarge', 't4g.large', 't4g.medium', 't4g.micro', 't4g.nano', 't4g.small', 't4g.xlarge', 'u-12tb1.112xlarge', 'u-6tb1.112xlarge', 'u-6tb1.56xlarge', 'x1.16xlarge', 'x1.32xlarge', 'x1e.16xlarge', 'x1e.2xlarge', 'x1e.32xlarge', 'x1e.4xlarge', 'x1e.8xlarge', 'x1e.xlarge', 'x2idn.16xlarge', 'x2idn.24xlarge', 'x2idn.32xlarge', 'x2idn.metal', 'x2iedn.16xlarge', 'x2iedn.24xlarge', 'x2iedn.2xlarge', 'x2iedn.32xlarge', 'x2iedn.4xlarge', 'x2iedn.8xlarge', 'x2iedn.metal', 'x2iedn.xlarge', 'z1d.12xlarge', 'z1d.2xlarge', 'z1d.3xlarge', 'z1d.6xlarge', 'z1d.large', 'z1d.metal', 'z1d.xlarge'], - 'eu-west-1': ['a1.2xlarge', 'a1.4xlarge', 'a1.large', 'a1.medium', 'a1.metal', 'a1.xlarge', 'c1.medium', 'c1.xlarge', 'c3.2xlarge', 'c3.4xlarge', 'c3.8xlarge', 'c3.large', 'c3.xlarge', 'c4.2xlarge', 'c4.4xlarge', 'c4.8xlarge', 'c4.large', 'c4.xlarge', 'c5.12xlarge', 'c5.18xlarge', 'c5.24xlarge', 'c5.2xlarge', 'c5.4xlarge', 'c5.9xlarge', 'c5.large', 'c5.metal', 'c5.xlarge', 'c5a.12xlarge', 'c5a.16xlarge', 'c5a.24xlarge', 'c5a.2xlarge', 'c5a.4xlarge', 'c5a.8xlarge', 'c5a.large', 'c5a.xlarge', 'c5ad.12xlarge', 'c5ad.16xlarge', 'c5ad.24xlarge', 'c5ad.2xlarge', 'c5ad.4xlarge', 'c5ad.8xlarge', 'c5ad.large', 'c5ad.xlarge', 'c5d.12xlarge', 'c5d.18xlarge', 'c5d.24xlarge', 'c5d.2xlarge', 'c5d.4xlarge', 'c5d.9xlarge', 'c5d.large', 'c5d.metal', 'c5d.xlarge', 'c5n.18xlarge', 'c5n.2xlarge', 'c5n.4xlarge', 'c5n.9xlarge', 'c5n.large', 'c5n.metal', 'c5n.xlarge', 'c6a.12xlarge', 'c6a.16xlarge', 'c6a.24xlarge', 'c6a.2xlarge', 'c6a.32xlarge', 'c6a.48xlarge', 'c6a.4xlarge', 'c6a.8xlarge', 'c6a.large', 'c6a.metal', 'c6a.xlarge', 'c6g.12xlarge', 'c6g.16xlarge', 'c6g.2xlarge', 'c6g.4xlarge', 'c6g.8xlarge', 'c6g.large', 'c6g.medium', 'c6g.metal', 'c6g.xlarge', 'c6gd.12xlarge', 'c6gd.16xlarge', 'c6gd.2xlarge', 'c6gd.4xlarge', 'c6gd.8xlarge', 'c6gd.large', 'c6gd.medium', 'c6gd.metal', 'c6gd.xlarge', 'c6gn.12xlarge', 'c6gn.16xlarge', 'c6gn.2xlarge', 'c6gn.4xlarge', 'c6gn.8xlarge', 'c6gn.large', 'c6gn.medium', 'c6gn.xlarge', 'c6i.12xlarge', 'c6i.16xlarge', 'c6i.24xlarge', 'c6i.2xlarge', 'c6i.32xlarge', 'c6i.4xlarge', 'c6i.8xlarge', 'c6i.large', 'c6i.metal', 'c6i.xlarge', 'c6id.12xlarge', 'c6id.16xlarge', 'c6id.24xlarge', 'c6id.2xlarge', 'c6id.32xlarge', 'c6id.4xlarge', 'c6id.8xlarge', 'c6id.large', 'c6id.metal', 'c6id.xlarge', 'c6in.12xlarge', 'c6in.16xlarge', 'c6in.24xlarge', 'c6in.2xlarge', 'c6in.32xlarge', 'c6in.4xlarge', 'c6in.8xlarge', 'c6in.large', 'c6in.metal', 'c6in.xlarge', 'c7a.12xlarge', 'c7a.16xlarge', 'c7a.24xlarge', 'c7a.2xlarge', 'c7a.32xlarge', 'c7a.48xlarge', 'c7a.4xlarge', 'c7a.8xlarge', 'c7a.large', 'c7a.medium', 'c7a.metal-48xl', 'c7a.xlarge', 'c7g.12xlarge', 'c7g.16xlarge', 'c7g.2xlarge', 'c7g.4xlarge', 'c7g.8xlarge', 'c7g.large', 'c7g.medium', 'c7g.metal', 'c7g.xlarge', 'c7gd.12xlarge', 'c7gd.16xlarge', 'c7gd.2xlarge', 'c7gd.4xlarge', 'c7gd.8xlarge', 'c7gd.large', 'c7gd.medium', 'c7gd.xlarge', 'c7gn.12xlarge', 'c7gn.16xlarge', 'c7gn.2xlarge', 'c7gn.4xlarge', 'c7gn.8xlarge', 'c7gn.large', 'c7gn.medium', 'c7gn.xlarge', 'c7i.12xlarge', 'c7i.16xlarge', 'c7i.24xlarge', 'c7i.2xlarge', 'c7i.48xlarge', 'c7i.4xlarge', 'c7i.8xlarge', 'c7i.large', 'c7i.metal-24xl', 'c7i.metal-48xl', 'c7i.xlarge', 'cr1.8xlarge', 'd2.2xlarge', 'd2.4xlarge', 'd2.8xlarge', 'd2.xlarge', 'd3.2xlarge', 'd3.4xlarge', 'd3.8xlarge', 'd3.xlarge', 'd3en.12xlarge', 'd3en.2xlarge', 'd3en.4xlarge', 'd3en.6xlarge', 'd3en.8xlarge', 'd3en.xlarge', 'f1.16xlarge', 'f1.2xlarge', 'f1.4xlarge', 'g2.2xlarge', 'g2.8xlarge', 'g3.16xlarge', 'g3.4xlarge', 'g3.8xlarge', 'g3s.xlarge', 'g4ad.16xlarge', 'g4ad.2xlarge', 'g4ad.4xlarge', 'g4ad.8xlarge', 'g4ad.xlarge', 'g4dn.12xlarge', 'g4dn.16xlarge', 'g4dn.2xlarge', 'g4dn.4xlarge', 'g4dn.8xlarge', 'g4dn.metal', 'g4dn.xlarge', 'g5.12xlarge', 'g5.16xlarge', 'g5.24xlarge', 'g5.2xlarge', 'g5.48xlarge', 'g5.4xlarge', 'g5.8xlarge', 'g5.xlarge', 'h1.16xlarge', 'h1.2xlarge', 'h1.4xlarge', 'h1.8xlarge', 'hpc7a.12xlarge', 'hpc7a.24xlarge', 'hpc7a.48xlarge', 'hpc7a.96xlarge', 'hpc7g.16xlarge', 'hpc7g.4xlarge', 'hpc7g.8xlarge', 'i2.2xlarge', 'i2.4xlarge', 'i2.8xlarge', 'i2.xlarge', 'i3.16xlarge', 'i3.2xlarge', 'i3.4xlarge', 'i3.8xlarge', 'i3.large', 'i3.metal', 'i3.xlarge', 'i3en.12xlarge', 'i3en.24xlarge', 'i3en.2xlarge', 'i3en.3xlarge', 'i3en.6xlarge', 'i3en.large', 'i3en.metal', 'i3en.xlarge', 'i4g.16xlarge', 'i4g.2xlarge', 'i4g.4xlarge', 'i4g.8xlarge', 'i4g.large', 'i4g.xlarge', 'i4i.12xlarge', 'i4i.16xlarge', 'i4i.24xlarge', 'i4i.2xlarge', 'i4i.32xlarge', 'i4i.4xlarge', 'i4i.8xlarge', 'i4i.large', 'i4i.metal', 'i4i.xlarge', 'im4gn.16xlarge', 'im4gn.2xlarge', 'im4gn.4xlarge', 'im4gn.8xlarge', 'im4gn.large', 'im4gn.xlarge', 'inf1.24xlarge', 'inf1.2xlarge', 'inf1.6xlarge', 'inf1.xlarge', 'inf2.24xlarge', 'inf2.48xlarge', 'inf2.8xlarge', 'inf2.xlarge', 'is4gen.2xlarge', 'is4gen.4xlarge', 'is4gen.8xlarge', 'is4gen.large', 'is4gen.medium', 'is4gen.xlarge', 'm1.large', 'm1.medium', 'm1.xlarge', 'm2.2xlarge', 'm2.4xlarge', 'm2.xlarge', 'm3.2xlarge', 'm3.large', 'm3.medium', 'm3.xlarge', 'm4.10xlarge', 'm4.16xlarge', 'm4.2xlarge', 'm4.4xlarge', 'm4.large', 'm4.xlarge', 'm5.12xlarge', 'm5.16xlarge', 'm5.24xlarge', 'm5.2xlarge', 'm5.4xlarge', 'm5.8xlarge', 'm5.large', 'm5.metal', 'm5.xlarge', 'm5a.12xlarge', 'm5a.16xlarge', 'm5a.24xlarge', 'm5a.2xlarge', 'm5a.4xlarge', 'm5a.8xlarge', 'm5a.large', 'm5a.xlarge', 'm5ad.12xlarge', 'm5ad.16xlarge', 'm5ad.24xlarge', 'm5ad.2xlarge', 'm5ad.4xlarge', 'm5ad.8xlarge', 'm5ad.large', 'm5ad.xlarge', 'm5d.12xlarge', 'm5d.16xlarge', 'm5d.24xlarge', 'm5d.2xlarge', 'm5d.4xlarge', 'm5d.8xlarge', 'm5d.large', 'm5d.metal', 'm5d.xlarge', 'm5dn.12xlarge', 'm5dn.16xlarge', 'm5dn.24xlarge', 'm5dn.2xlarge', 'm5dn.4xlarge', 'm5dn.8xlarge', 'm5dn.large', 'm5dn.metal', 'm5dn.xlarge', 'm5n.12xlarge', 'm5n.16xlarge', 'm5n.24xlarge', 'm5n.2xlarge', 'm5n.4xlarge', 'm5n.8xlarge', 'm5n.large', 'm5n.xlarge', 'm5zn.12xlarge', 'm5zn.2xlarge', 'm5zn.3xlarge', 'm5zn.6xlarge', 'm5zn.large', 'm5zn.metal', 'm5zn.xlarge', 'm6a.12xlarge', 'm6a.16xlarge', 'm6a.24xlarge', 'm6a.2xlarge', 'm6a.32xlarge', 'm6a.48xlarge', 'm6a.4xlarge', 'm6a.8xlarge', 'm6a.large', 'm6a.metal', 'm6a.xlarge', 'm6g.12xlarge', 'm6g.16xlarge', 'm6g.2xlarge', 'm6g.4xlarge', 'm6g.8xlarge', 'm6g.large', 'm6g.medium', 'm6g.metal', 'm6g.xlarge', 'm6gd.12xlarge', 'm6gd.16xlarge', 'm6gd.2xlarge', 'm6gd.4xlarge', 'm6gd.8xlarge', 'm6gd.large', 'm6gd.medium', 'm6gd.metal', 'm6gd.xlarge', 'm6i.12xlarge', 'm6i.16xlarge', 'm6i.24xlarge', 'm6i.2xlarge', 'm6i.32xlarge', 'm6i.4xlarge', 'm6i.8xlarge', 'm6i.large', 'm6i.metal', 'm6i.xlarge', 'm6id.12xlarge', 'm6id.16xlarge', 'm6id.24xlarge', 'm6id.2xlarge', 'm6id.32xlarge', 'm6id.4xlarge', 'm6id.8xlarge', 'm6id.large', 'm6id.metal', 'm6id.xlarge', 'm6idn.12xlarge', 'm6idn.16xlarge', 'm6idn.24xlarge', 'm6idn.2xlarge', 'm6idn.32xlarge', 'm6idn.4xlarge', 'm6idn.8xlarge', 'm6idn.large', 'm6idn.metal', 'm6idn.xlarge', 'm6in.12xlarge', 'm6in.16xlarge', 'm6in.24xlarge', 'm6in.2xlarge', 'm6in.32xlarge', 'm6in.4xlarge', 'm6in.8xlarge', 'm6in.large', 'm6in.metal', 'm6in.xlarge', 'm7a.12xlarge', 'm7a.16xlarge', 'm7a.24xlarge', 'm7a.2xlarge', 'm7a.32xlarge', 'm7a.48xlarge', 'm7a.4xlarge', 'm7a.8xlarge', 'm7a.large', 'm7a.medium', 'm7a.metal-48xl', 'm7a.xlarge', 'm7g.12xlarge', 'm7g.16xlarge', 'm7g.2xlarge', 'm7g.4xlarge', 'm7g.8xlarge', 'm7g.large', 'm7g.medium', 'm7g.metal', 'm7g.xlarge', 'm7gd.12xlarge', 'm7gd.16xlarge', 'm7gd.2xlarge', 'm7gd.4xlarge', 'm7gd.8xlarge', 'm7gd.large', 'm7gd.medium', 'm7gd.xlarge', 'm7i-flex.2xlarge', 'm7i-flex.4xlarge', 'm7i-flex.8xlarge', 'm7i-flex.large', 'm7i-flex.xlarge', 'm7i.12xlarge', 'm7i.16xlarge', 'm7i.24xlarge', 'm7i.2xlarge', 'm7i.48xlarge', 'm7i.4xlarge', 'm7i.8xlarge', 'm7i.large', 'm7i.metal-24xl', 'm7i.metal-48xl', 'm7i.xlarge', 'p2.16xlarge', 'p2.8xlarge', 'p2.xlarge', 'p3.16xlarge', 'p3.2xlarge', 'p3.8xlarge', 'p3dn.24xlarge', 'p4d.24xlarge', 'r3.2xlarge', 'r3.4xlarge', 'r3.8xlarge', 'r3.large', 'r3.xlarge', 'r4.16xlarge', 'r4.2xlarge', 'r4.4xlarge', 'r4.8xlarge', 'r4.large', 'r4.xlarge', 'r5.12xlarge', 'r5.16xlarge', 'r5.24xlarge', 'r5.2xlarge', 'r5.4xlarge', 'r5.8xlarge', 'r5.large', 'r5.metal', 'r5.xlarge', 'r5a.12xlarge', 'r5a.16xlarge', 'r5a.24xlarge', 'r5a.2xlarge', 'r5a.4xlarge', 'r5a.8xlarge', 'r5a.large', 'r5a.xlarge', 'r5ad.12xlarge', 'r5ad.16xlarge', 'r5ad.24xlarge', 'r5ad.2xlarge', 'r5ad.4xlarge', 'r5ad.8xlarge', 'r5ad.large', 'r5ad.xlarge', 'r5b.12xlarge', 'r5b.16xlarge', 'r5b.24xlarge', 'r5b.2xlarge', 'r5b.4xlarge', 'r5b.8xlarge', 'r5b.large', 'r5b.metal', 'r5b.xlarge', 'r5d.12xlarge', 'r5d.16xlarge', 'r5d.24xlarge', 'r5d.2xlarge', 'r5d.4xlarge', 'r5d.8xlarge', 'r5d.large', 'r5d.metal', 'r5d.xlarge', 'r5dn.12xlarge', 'r5dn.16xlarge', 'r5dn.24xlarge', 'r5dn.2xlarge', 'r5dn.4xlarge', 'r5dn.8xlarge', 'r5dn.large', 'r5dn.metal', 'r5dn.xlarge', 'r5n.12xlarge', 'r5n.16xlarge', 'r5n.24xlarge', 'r5n.2xlarge', 'r5n.4xlarge', 'r5n.8xlarge', 'r5n.large', 'r5n.metal', 'r5n.xlarge', 'r6a.12xlarge', 'r6a.16xlarge', 'r6a.24xlarge', 'r6a.2xlarge', 'r6a.32xlarge', 'r6a.48xlarge', 'r6a.4xlarge', 'r6a.8xlarge', 'r6a.large', 'r6a.metal', 'r6a.xlarge', 'r6g.12xlarge', 'r6g.16xlarge', 'r6g.2xlarge', 'r6g.4xlarge', 'r6g.8xlarge', 'r6g.large', 'r6g.medium', 'r6g.metal', 'r6g.xlarge', 'r6gd.12xlarge', 'r6gd.16xlarge', 'r6gd.2xlarge', 'r6gd.4xlarge', 'r6gd.8xlarge', 'r6gd.large', 'r6gd.medium', 'r6gd.metal', 'r6gd.xlarge', 'r6i.12xlarge', 'r6i.16xlarge', 'r6i.24xlarge', 'r6i.2xlarge', 'r6i.32xlarge', 'r6i.4xlarge', 'r6i.8xlarge', 'r6i.large', 'r6i.metal', 'r6i.xlarge', 'r6id.12xlarge', 'r6id.16xlarge', 'r6id.24xlarge', 'r6id.2xlarge', 'r6id.32xlarge', 'r6id.4xlarge', 'r6id.8xlarge', 'r6id.large', 'r6id.metal', 'r6id.xlarge', 'r6idn.12xlarge', 'r6idn.16xlarge', 'r6idn.24xlarge', 'r6idn.2xlarge', 'r6idn.32xlarge', 'r6idn.4xlarge', 'r6idn.8xlarge', 'r6idn.large', 'r6idn.metal', 'r6idn.xlarge', 'r6in.12xlarge', 'r6in.16xlarge', 'r6in.24xlarge', 'r6in.2xlarge', 'r6in.32xlarge', 'r6in.4xlarge', 'r6in.8xlarge', 'r6in.large', 'r6in.metal', 'r6in.xlarge', 'r7a.12xlarge', 'r7a.16xlarge', 'r7a.24xlarge', 'r7a.2xlarge', 'r7a.32xlarge', 'r7a.48xlarge', 'r7a.4xlarge', 'r7a.8xlarge', 'r7a.large', 'r7a.medium', 'r7a.metal-48xl', 'r7a.xlarge', 'r7g.12xlarge', 'r7g.16xlarge', 'r7g.2xlarge', 'r7g.4xlarge', 'r7g.8xlarge', 'r7g.large', 'r7g.medium', 'r7g.metal', 'r7g.xlarge', 'r7gd.12xlarge', 'r7gd.16xlarge', 'r7gd.2xlarge', 'r7gd.4xlarge', 'r7gd.8xlarge', 'r7gd.large', 'r7gd.medium', 'r7gd.xlarge', 'r7i.12xlarge', 'r7i.16xlarge', 'r7i.24xlarge', 'r7i.2xlarge', 'r7i.48xlarge', 'r7i.4xlarge', 'r7i.8xlarge', 'r7i.large', 'r7i.metal-24xl', 'r7i.metal-48xl', 'r7i.xlarge', 'r7iz.12xlarge', 'r7iz.16xlarge', 'r7iz.2xlarge', 'r7iz.32xlarge', 'r7iz.4xlarge', 'r7iz.8xlarge', 'r7iz.large', 'r7iz.metal-16xl', 'r7iz.metal-32xl', 'r7iz.xlarge', 't1.micro', 't2.2xlarge', 't2.large', 't2.medium', 't2.micro', 't2.nano', 't2.small', 't2.xlarge', 't3.2xlarge', 't3.large', 't3.medium', 't3.micro', 't3.nano', 't3.small', 't3.xlarge', 't3a.2xlarge', 't3a.large', 't3a.medium', 't3a.micro', 't3a.nano', 't3a.small', 't3a.xlarge', 't4g.2xlarge', 't4g.large', 't4g.medium', 't4g.micro', 't4g.nano', 't4g.small', 't4g.xlarge', 'u-12tb1.112xlarge', 'u-18tb1.112xlarge', 'u-3tb1.56xlarge', 'u-6tb1.112xlarge', 'u-6tb1.56xlarge', 'u-9tb1.112xlarge', 'vt1.24xlarge', 'vt1.3xlarge', 'vt1.6xlarge', 'x1.16xlarge', 'x1.32xlarge', 'x1e.16xlarge', 'x1e.2xlarge', 'x1e.32xlarge', 'x1e.4xlarge', 'x1e.8xlarge', 'x1e.xlarge', 'x2gd.12xlarge', 'x2gd.16xlarge', 'x2gd.2xlarge', 'x2gd.4xlarge', 'x2gd.8xlarge', 'x2gd.large', 'x2gd.medium', 'x2gd.metal', 'x2gd.xlarge', 'x2idn.16xlarge', 'x2idn.24xlarge', 'x2idn.32xlarge', 'x2idn.metal', 'x2iedn.16xlarge', 'x2iedn.24xlarge', 'x2iedn.2xlarge', 'x2iedn.32xlarge', 'x2iedn.4xlarge', 'x2iedn.8xlarge', 'x2iedn.metal', 'x2iedn.xlarge', 'x2iezn.12xlarge', 'x2iezn.2xlarge', 'x2iezn.4xlarge', 'x2iezn.6xlarge', 'x2iezn.8xlarge', 'x2iezn.metal', 'z1d.12xlarge', 'z1d.2xlarge', 'z1d.3xlarge', 'z1d.6xlarge', 'z1d.large', 'z1d.metal', 'z1d.xlarge'], - 'eu-west-2': ['c4.2xlarge', 'c4.4xlarge', 'c4.8xlarge', 'c4.large', 'c4.xlarge', 'c5.12xlarge', 'c5.18xlarge', 'c5.24xlarge', 'c5.2xlarge', 'c5.4xlarge', 'c5.9xlarge', 'c5.large', 'c5.metal', 'c5.xlarge', 'c5a.12xlarge', 'c5a.16xlarge', 'c5a.24xlarge', 'c5a.2xlarge', 'c5a.4xlarge', 'c5a.8xlarge', 'c5a.large', 'c5a.xlarge', 'c5d.12xlarge', 'c5d.18xlarge', 'c5d.24xlarge', 'c5d.2xlarge', 'c5d.4xlarge', 'c5d.9xlarge', 'c5d.large', 'c5d.metal', 'c5d.xlarge', 'c5n.18xlarge', 'c5n.2xlarge', 'c5n.4xlarge', 'c5n.9xlarge', 'c5n.large', 'c5n.metal', 'c5n.xlarge', 'c6a.12xlarge', 'c6a.16xlarge', 'c6a.24xlarge', 'c6a.2xlarge', 'c6a.32xlarge', 'c6a.48xlarge', 'c6a.4xlarge', 'c6a.8xlarge', 'c6a.large', 'c6a.metal', 'c6a.xlarge', 'c6g.12xlarge', 'c6g.16xlarge', 'c6g.2xlarge', 'c6g.4xlarge', 'c6g.8xlarge', 'c6g.large', 'c6g.medium', 'c6g.metal', 'c6g.xlarge', 'c6gd.12xlarge', 'c6gd.16xlarge', 'c6gd.2xlarge', 'c6gd.4xlarge', 'c6gd.8xlarge', 'c6gd.large', 'c6gd.medium', 'c6gd.metal', 'c6gd.xlarge', 'c6gn.12xlarge', 'c6gn.16xlarge', 'c6gn.2xlarge', 'c6gn.4xlarge', 'c6gn.8xlarge', 'c6gn.large', 'c6gn.medium', 'c6gn.xlarge', 'c6i.12xlarge', 'c6i.16xlarge', 'c6i.24xlarge', 'c6i.2xlarge', 'c6i.32xlarge', 'c6i.4xlarge', 'c6i.8xlarge', 'c6i.large', 'c6i.metal', 'c6i.xlarge', 'c6id.12xlarge', 'c6id.16xlarge', 'c6id.24xlarge', 'c6id.2xlarge', 'c6id.32xlarge', 'c6id.4xlarge', 'c6id.8xlarge', 'c6id.large', 'c6id.metal', 'c6id.xlarge', 'c6in.12xlarge', 'c6in.16xlarge', 'c6in.24xlarge', 'c6in.2xlarge', 'c6in.32xlarge', 'c6in.4xlarge', 'c6in.8xlarge', 'c6in.large', 'c6in.metal', 'c6in.xlarge', 'c7g.12xlarge', 'c7g.16xlarge', 'c7g.2xlarge', 'c7g.4xlarge', 'c7g.8xlarge', 'c7g.large', 'c7g.medium', 'c7g.metal', 'c7g.xlarge', 'c7i.12xlarge', 'c7i.16xlarge', 'c7i.24xlarge', 'c7i.2xlarge', 'c7i.48xlarge', 'c7i.4xlarge', 'c7i.8xlarge', 'c7i.large', 'c7i.metal-24xl', 'c7i.metal-48xl', 'c7i.xlarge', 'd2.2xlarge', 'd2.4xlarge', 'd2.8xlarge', 'd2.xlarge', 'd3.2xlarge', 'd3.4xlarge', 'd3.8xlarge', 'd3.xlarge', 'f1.2xlarge', 'f1.4xlarge', 'g3.16xlarge', 'g3.4xlarge', 'g3.8xlarge', 'g3s.xlarge', 'g4ad.16xlarge', 'g4ad.2xlarge', 'g4ad.4xlarge', 'g4ad.8xlarge', 'g4ad.xlarge', 'g4dn.12xlarge', 'g4dn.16xlarge', 'g4dn.2xlarge', 'g4dn.4xlarge', 'g4dn.8xlarge', 'g4dn.metal', 'g4dn.xlarge', 'g5.12xlarge', 'g5.16xlarge', 'g5.24xlarge', 'g5.2xlarge', 'g5.48xlarge', 'g5.4xlarge', 'g5.8xlarge', 'g5.xlarge', 'i3.16xlarge', 'i3.2xlarge', 'i3.4xlarge', 'i3.8xlarge', 'i3.large', 'i3.metal', 'i3.xlarge', 'i3en.12xlarge', 'i3en.24xlarge', 'i3en.2xlarge', 'i3en.3xlarge', 'i3en.6xlarge', 'i3en.large', 'i3en.metal', 'i3en.xlarge', 'i4i.12xlarge', 'i4i.16xlarge', 'i4i.24xlarge', 'i4i.2xlarge', 'i4i.32xlarge', 'i4i.4xlarge', 'i4i.8xlarge', 'i4i.large', 'i4i.metal', 'i4i.xlarge', 'im4gn.16xlarge', 'im4gn.2xlarge', 'im4gn.4xlarge', 'im4gn.8xlarge', 'im4gn.large', 'im4gn.xlarge', 'inf1.24xlarge', 'inf1.2xlarge', 'inf1.6xlarge', 'inf1.xlarge', 'is4gen.2xlarge', 'is4gen.4xlarge', 'is4gen.8xlarge', 'is4gen.large', 'is4gen.medium', 'is4gen.xlarge', 'm4.10xlarge', 'm4.16xlarge', 'm4.2xlarge', 'm4.4xlarge', 'm4.large', 'm4.xlarge', 'm5.12xlarge', 'm5.16xlarge', 'm5.24xlarge', 'm5.2xlarge', 'm5.4xlarge', 'm5.8xlarge', 'm5.large', 'm5.metal', 'm5.xlarge', 'm5a.12xlarge', 'm5a.16xlarge', 'm5a.24xlarge', 'm5a.2xlarge', 'm5a.4xlarge', 'm5a.8xlarge', 'm5a.large', 'm5a.xlarge', 'm5ad.12xlarge', 'm5ad.16xlarge', 'm5ad.24xlarge', 'm5ad.2xlarge', 'm5ad.4xlarge', 'm5ad.8xlarge', 'm5ad.large', 'm5ad.xlarge', 'm5d.12xlarge', 'm5d.16xlarge', 'm5d.24xlarge', 'm5d.2xlarge', 'm5d.4xlarge', 'm5d.8xlarge', 'm5d.large', 'm5d.metal', 'm5d.xlarge', 'm6a.12xlarge', 'm6a.16xlarge', 'm6a.24xlarge', 'm6a.2xlarge', 'm6a.32xlarge', 'm6a.48xlarge', 'm6a.4xlarge', 'm6a.8xlarge', 'm6a.large', 'm6a.metal', 'm6a.xlarge', 'm6g.12xlarge', 'm6g.16xlarge', 'm6g.2xlarge', 'm6g.4xlarge', 'm6g.8xlarge', 'm6g.large', 'm6g.medium', 'm6g.metal', 'm6g.xlarge', 'm6gd.12xlarge', 'm6gd.16xlarge', 'm6gd.2xlarge', 'm6gd.4xlarge', 'm6gd.8xlarge', 'm6gd.large', 'm6gd.medium', 'm6gd.metal', 'm6gd.xlarge', 'm6i.12xlarge', 'm6i.16xlarge', 'm6i.24xlarge', 'm6i.2xlarge', 'm6i.32xlarge', 'm6i.4xlarge', 'm6i.8xlarge', 'm6i.large', 'm6i.metal', 'm6i.xlarge', 'm7g.12xlarge', 'm7g.16xlarge', 'm7g.2xlarge', 'm7g.4xlarge', 'm7g.8xlarge', 'm7g.large', 'm7g.medium', 'm7g.metal', 'm7g.xlarge', 'm7i-flex.2xlarge', 'm7i-flex.4xlarge', 'm7i-flex.8xlarge', 'm7i-flex.large', 'm7i-flex.xlarge', 'm7i.12xlarge', 'm7i.16xlarge', 'm7i.24xlarge', 'm7i.2xlarge', 'm7i.48xlarge', 'm7i.4xlarge', 'm7i.8xlarge', 'm7i.large', 'm7i.metal-24xl', 'm7i.metal-48xl', 'm7i.xlarge', 'p3.16xlarge', 'p3.2xlarge', 'p3.8xlarge', 'r4.16xlarge', 'r4.2xlarge', 'r4.4xlarge', 'r4.8xlarge', 'r4.large', 'r4.xlarge', 'r5.12xlarge', 'r5.16xlarge', 'r5.24xlarge', 'r5.2xlarge', 'r5.4xlarge', 'r5.8xlarge', 'r5.large', 'r5.metal', 'r5.xlarge', 'r5a.12xlarge', 'r5a.16xlarge', 'r5a.24xlarge', 'r5a.2xlarge', 'r5a.4xlarge', 'r5a.8xlarge', 'r5a.large', 'r5a.xlarge', 'r5ad.12xlarge', 'r5ad.16xlarge', 'r5ad.24xlarge', 'r5ad.2xlarge', 'r5ad.4xlarge', 'r5ad.8xlarge', 'r5ad.large', 'r5ad.xlarge', 'r5b.12xlarge', 'r5b.16xlarge', 'r5b.24xlarge', 'r5b.2xlarge', 'r5b.4xlarge', 'r5b.8xlarge', 'r5b.large', 'r5b.metal', 'r5b.xlarge', 'r5d.12xlarge', 'r5d.16xlarge', 'r5d.24xlarge', 'r5d.2xlarge', 'r5d.4xlarge', 'r5d.8xlarge', 'r5d.large', 'r5d.metal', 'r5d.xlarge', 'r5n.12xlarge', 'r5n.16xlarge', 'r5n.24xlarge', 'r5n.2xlarge', 'r5n.4xlarge', 'r5n.8xlarge', 'r5n.large', 'r5n.metal', 'r5n.xlarge', 'r6g.12xlarge', 'r6g.16xlarge', 'r6g.2xlarge', 'r6g.4xlarge', 'r6g.8xlarge', 'r6g.large', 'r6g.medium', 'r6g.metal', 'r6g.xlarge', 'r6gd.12xlarge', 'r6gd.16xlarge', 'r6gd.2xlarge', 'r6gd.4xlarge', 'r6gd.8xlarge', 'r6gd.large', 'r6gd.medium', 'r6gd.metal', 'r6gd.xlarge', 'r6i.12xlarge', 'r6i.16xlarge', 'r6i.24xlarge', 'r6i.2xlarge', 'r6i.32xlarge', 'r6i.4xlarge', 'r6i.8xlarge', 'r6i.large', 'r6i.metal', 'r6i.xlarge', 'r6id.12xlarge', 'r6id.16xlarge', 'r6id.24xlarge', 'r6id.2xlarge', 'r6id.32xlarge', 'r6id.4xlarge', 'r6id.8xlarge', 'r6id.large', 'r6id.metal', 'r6id.xlarge', 'r7g.12xlarge', 'r7g.16xlarge', 'r7g.2xlarge', 'r7g.4xlarge', 'r7g.8xlarge', 'r7g.large', 'r7g.medium', 'r7g.metal', 'r7g.xlarge', 'r7i.12xlarge', 'r7i.16xlarge', 'r7i.24xlarge', 'r7i.2xlarge', 'r7i.48xlarge', 'r7i.4xlarge', 'r7i.8xlarge', 'r7i.large', 'r7i.metal-24xl', 'r7i.metal-48xl', 'r7i.xlarge', 't2.2xlarge', 't2.large', 't2.medium', 't2.micro', 't2.nano', 't2.small', 't2.xlarge', 't3.2xlarge', 't3.large', 't3.medium', 't3.micro', 't3.nano', 't3.small', 't3.xlarge', 't3a.2xlarge', 't3a.large', 't3a.medium', 't3a.micro', 't3a.nano', 't3a.small', 't3a.xlarge', 't4g.2xlarge', 't4g.large', 't4g.medium', 't4g.micro', 't4g.nano', 't4g.small', 't4g.xlarge', 'u-6tb1.112xlarge', 'u-6tb1.56xlarge', 'u-9tb1.112xlarge', 'x1.16xlarge', 'x1.32xlarge', 'x2idn.16xlarge', 'x2idn.24xlarge', 'x2idn.32xlarge', 'x2idn.metal', 'x2iedn.16xlarge', 'x2iedn.24xlarge', 'x2iedn.2xlarge', 'x2iedn.32xlarge', 'x2iedn.4xlarge', 'x2iedn.8xlarge', 'x2iedn.metal', 'x2iedn.xlarge', 'z1d.12xlarge', 'z1d.2xlarge', 'z1d.3xlarge', 'z1d.6xlarge', 'z1d.large', 'z1d.metal', 'z1d.xlarge'], - 'eu-west-3': ['c5.12xlarge', 'c5.18xlarge', 'c5.24xlarge', 'c5.2xlarge', 'c5.4xlarge', 'c5.9xlarge', 'c5.large', 'c5.metal', 'c5.xlarge', 'c5a.12xlarge', 'c5a.16xlarge', 'c5a.24xlarge', 'c5a.2xlarge', 'c5a.4xlarge', 'c5a.8xlarge', 'c5a.large', 'c5a.xlarge', 'c5d.18xlarge', 'c5d.2xlarge', 'c5d.4xlarge', 'c5d.9xlarge', 'c5d.large', 'c5d.xlarge', 'c5n.18xlarge', 'c5n.2xlarge', 'c5n.4xlarge', 'c5n.9xlarge', 'c5n.large', 'c5n.metal', 'c5n.xlarge', 'c6g.12xlarge', 'c6g.16xlarge', 'c6g.2xlarge', 'c6g.4xlarge', 'c6g.8xlarge', 'c6g.large', 'c6g.medium', 'c6g.metal', 'c6g.xlarge', 'c6gd.12xlarge', 'c6gd.16xlarge', 'c6gd.2xlarge', 'c6gd.4xlarge', 'c6gd.8xlarge', 'c6gd.large', 'c6gd.medium', 'c6gd.metal', 'c6gd.xlarge', 'c6gn.12xlarge', 'c6gn.16xlarge', 'c6gn.2xlarge', 'c6gn.4xlarge', 'c6gn.8xlarge', 'c6gn.large', 'c6gn.medium', 'c6gn.metal', 'c6gn.xlarge', 'c6i.12xlarge', 'c6i.16xlarge', 'c6i.24xlarge', 'c6i.2xlarge', 'c6i.32xlarge', 'c6i.4xlarge', 'c6i.8xlarge', 'c6i.large', 'c6i.metal', 'c6i.xlarge', 'c6in.12xlarge', 'c6in.16xlarge', 'c6in.24xlarge', 'c6in.2xlarge', 'c6in.32xlarge', 'c6in.4xlarge', 'c6in.8xlarge', 'c6in.large', 'c6in.metal', 'c6in.xlarge', 'd2.2xlarge', 'd2.4xlarge', 'd2.8xlarge', 'd2.xlarge', 'g4dn.12xlarge', 'g4dn.16xlarge', 'g4dn.2xlarge', 'g4dn.4xlarge', 'g4dn.8xlarge', 'g4dn.metal', 'g4dn.xlarge', 'i3.16xlarge', 'i3.2xlarge', 'i3.4xlarge', 'i3.8xlarge', 'i3.large', 'i3.metal', 'i3.xlarge', 'i3en.12xlarge', 'i3en.24xlarge', 'i3en.2xlarge', 'i3en.3xlarge', 'i3en.6xlarge', 'i3en.large', 'i3en.metal', 'i3en.xlarge', 'i4i.12xlarge', 'i4i.16xlarge', 'i4i.24xlarge', 'i4i.2xlarge', 'i4i.32xlarge', 'i4i.4xlarge', 'i4i.8xlarge', 'i4i.large', 'i4i.metal', 'i4i.xlarge', 'im4gn.16xlarge', 'im4gn.2xlarge', 'im4gn.4xlarge', 'im4gn.8xlarge', 'im4gn.large', 'im4gn.xlarge', 'inf1.24xlarge', 'inf1.2xlarge', 'inf1.6xlarge', 'inf1.xlarge', 'is4gen.2xlarge', 'is4gen.4xlarge', 'is4gen.8xlarge', 'is4gen.large', 'is4gen.medium', 'is4gen.xlarge', 'm5.12xlarge', 'm5.16xlarge', 'm5.24xlarge', 'm5.2xlarge', 'm5.4xlarge', 'm5.8xlarge', 'm5.large', 'm5.metal', 'm5.xlarge', 'm5a.12xlarge', 'm5a.16xlarge', 'm5a.24xlarge', 'm5a.2xlarge', 'm5a.4xlarge', 'm5a.8xlarge', 'm5a.large', 'm5a.xlarge', 'm5ad.12xlarge', 'm5ad.16xlarge', 'm5ad.24xlarge', 'm5ad.2xlarge', 'm5ad.4xlarge', 'm5ad.8xlarge', 'm5ad.large', 'm5ad.xlarge', 'm5d.12xlarge', 'm5d.16xlarge', 'm5d.24xlarge', 'm5d.2xlarge', 'm5d.4xlarge', 'm5d.8xlarge', 'm5d.large', 'm5d.metal', 'm5d.xlarge', 'm6g.12xlarge', 'm6g.16xlarge', 'm6g.2xlarge', 'm6g.4xlarge', 'm6g.8xlarge', 'm6g.large', 'm6g.medium', 'm6g.metal', 'm6g.xlarge', 'm6gd.12xlarge', 'm6gd.16xlarge', 'm6gd.2xlarge', 'm6gd.4xlarge', 'm6gd.8xlarge', 'm6gd.large', 'm6gd.medium', 'm6gd.metal', 'm6gd.xlarge', 'm6i.12xlarge', 'm6i.16xlarge', 'm6i.24xlarge', 'm6i.2xlarge', 'm6i.32xlarge', 'm6i.4xlarge', 'm6i.8xlarge', 'm6i.large', 'm6i.metal', 'm6i.xlarge', 'm7g.12xlarge', 'm7g.16xlarge', 'm7g.2xlarge', 'm7g.4xlarge', 'm7g.8xlarge', 'm7g.large', 'm7g.medium', 'm7g.metal', 'm7g.xlarge', 'm7i-flex.2xlarge', 'm7i-flex.4xlarge', 'm7i-flex.8xlarge', 'm7i-flex.large', 'm7i-flex.xlarge', 'm7i.12xlarge', 'm7i.16xlarge', 'm7i.24xlarge', 'm7i.2xlarge', 'm7i.48xlarge', 'm7i.4xlarge', 'm7i.8xlarge', 'm7i.large', 'm7i.metal-24xl', 'm7i.metal-48xl', 'm7i.xlarge', 'r4.16xlarge', 'r4.2xlarge', 'r4.4xlarge', 'r4.8xlarge', 'r4.large', 'r4.xlarge', 'r5.12xlarge', 'r5.16xlarge', 'r5.24xlarge', 'r5.2xlarge', 'r5.4xlarge', 'r5.8xlarge', 'r5.large', 'r5.metal', 'r5.xlarge', 'r5a.12xlarge', 'r5a.16xlarge', 'r5a.24xlarge', 'r5a.2xlarge', 'r5a.4xlarge', 'r5a.8xlarge', 'r5a.large', 'r5a.xlarge', 'r5ad.12xlarge', 'r5ad.16xlarge', 'r5ad.24xlarge', 'r5ad.2xlarge', 'r5ad.4xlarge', 'r5ad.8xlarge', 'r5ad.large', 'r5ad.xlarge', 'r5d.12xlarge', 'r5d.16xlarge', 'r5d.24xlarge', 'r5d.2xlarge', 'r5d.4xlarge', 'r5d.8xlarge', 'r5d.large', 'r5d.metal', 'r5d.xlarge', 'r5dn.12xlarge', 'r5dn.16xlarge', 'r5dn.24xlarge', 'r5dn.2xlarge', 'r5dn.4xlarge', 'r5dn.8xlarge', 'r5dn.large', 'r5dn.metal', 'r5dn.xlarge', 'r5n.12xlarge', 'r5n.16xlarge', 'r5n.24xlarge', 'r5n.2xlarge', 'r5n.4xlarge', 'r5n.8xlarge', 'r5n.large', 'r5n.metal', 'r5n.xlarge', 'r6g.12xlarge', 'r6g.16xlarge', 'r6g.2xlarge', 'r6g.4xlarge', 'r6g.8xlarge', 'r6g.large', 'r6g.medium', 'r6g.metal', 'r6g.xlarge', 'r6gd.12xlarge', 'r6gd.16xlarge', 'r6gd.2xlarge', 'r6gd.4xlarge', 'r6gd.8xlarge', 'r6gd.large', 'r6gd.medium', 'r6gd.metal', 'r6gd.xlarge', 'r6i.12xlarge', 'r6i.16xlarge', 'r6i.24xlarge', 'r6i.2xlarge', 'r6i.32xlarge', 'r6i.4xlarge', 'r6i.8xlarge', 'r6i.large', 'r6i.metal', 'r6i.xlarge', 't2.2xlarge', 't2.large', 't2.medium', 't2.micro', 't2.nano', 't2.small', 't2.xlarge', 't3.2xlarge', 't3.large', 't3.medium', 't3.micro', 't3.nano', 't3.small', 't3.xlarge', 't3a.2xlarge', 't3a.large', 't3a.medium', 't3a.micro', 't3a.nano', 't3a.small', 't3a.xlarge', 't4g.2xlarge', 't4g.large', 't4g.medium', 't4g.micro', 't4g.nano', 't4g.small', 't4g.xlarge', 'u-6tb1.112xlarge', 'u-6tb1.56xlarge', 'x1.16xlarge', 'x1.32xlarge', 'x2idn.16xlarge', 'x2idn.24xlarge', 'x2idn.32xlarge', 'x2idn.metal', 'x2iedn.16xlarge', 'x2iedn.24xlarge', 'x2iedn.2xlarge', 'x2iedn.32xlarge', 'x2iedn.4xlarge', 'x2iedn.8xlarge', 'x2iedn.metal', 'x2iedn.xlarge'], - 'eu-central-1': ['a1.2xlarge', 'a1.4xlarge', 'a1.large', 'a1.medium', 'a1.metal', 'a1.xlarge', 'c3.2xlarge', 'c3.4xlarge', 'c3.8xlarge', 'c3.large', 'c3.xlarge', 'c4.2xlarge', 'c4.4xlarge', 'c4.8xlarge', 'c4.large', 'c4.xlarge', 'c5.12xlarge', 'c5.18xlarge', 'c5.24xlarge', 'c5.2xlarge', 'c5.4xlarge', 'c5.9xlarge', 'c5.large', 'c5.metal', 'c5.xlarge', 'c5a.12xlarge', 'c5a.16xlarge', 'c5a.24xlarge', 'c5a.2xlarge', 'c5a.4xlarge', 'c5a.8xlarge', 'c5a.large', 'c5a.xlarge', 'c5ad.12xlarge', 'c5ad.16xlarge', 'c5ad.24xlarge', 'c5ad.2xlarge', 'c5ad.4xlarge', 'c5ad.8xlarge', 'c5ad.large', 'c5ad.xlarge', 'c5d.12xlarge', 'c5d.18xlarge', 'c5d.24xlarge', 'c5d.2xlarge', 'c5d.4xlarge', 'c5d.9xlarge', 'c5d.large', 'c5d.metal', 'c5d.xlarge', 'c5n.18xlarge', 'c5n.2xlarge', 'c5n.4xlarge', 'c5n.9xlarge', 'c5n.large', 'c5n.metal', 'c5n.xlarge', 'c6a.12xlarge', 'c6a.16xlarge', 'c6a.24xlarge', 'c6a.2xlarge', 'c6a.32xlarge', 'c6a.48xlarge', 'c6a.4xlarge', 'c6a.8xlarge', 'c6a.large', 'c6a.metal', 'c6a.xlarge', 'c6g.12xlarge', 'c6g.16xlarge', 'c6g.2xlarge', 'c6g.4xlarge', 'c6g.8xlarge', 'c6g.large', 'c6g.medium', 'c6g.metal', 'c6g.xlarge', 'c6gd.12xlarge', 'c6gd.16xlarge', 'c6gd.2xlarge', 'c6gd.4xlarge', 'c6gd.8xlarge', 'c6gd.large', 'c6gd.medium', 'c6gd.metal', 'c6gd.xlarge', 'c6gn.12xlarge', 'c6gn.16xlarge', 'c6gn.2xlarge', 'c6gn.4xlarge', 'c6gn.8xlarge', 'c6gn.large', 'c6gn.medium', 'c6gn.xlarge', 'c6i.12xlarge', 'c6i.16xlarge', 'c6i.24xlarge', 'c6i.2xlarge', 'c6i.32xlarge', 'c6i.4xlarge', 'c6i.8xlarge', 'c6i.large', 'c6i.metal', 'c6i.xlarge', 'c6id.12xlarge', 'c6id.16xlarge', 'c6id.24xlarge', 'c6id.2xlarge', 'c6id.32xlarge', 'c6id.4xlarge', 'c6id.8xlarge', 'c6id.large', 'c6id.metal', 'c6id.xlarge', 'c6in.12xlarge', 'c6in.16xlarge', 'c6in.24xlarge', 'c6in.2xlarge', 'c6in.32xlarge', 'c6in.4xlarge', 'c6in.8xlarge', 'c6in.large', 'c6in.metal', 'c6in.xlarge', 'c7g.12xlarge', 'c7g.16xlarge', 'c7g.2xlarge', 'c7g.4xlarge', 'c7g.8xlarge', 'c7g.large', 'c7g.medium', 'c7g.metal', 'c7g.xlarge', 'c7gd.12xlarge', 'c7gd.16xlarge', 'c7gd.2xlarge', 'c7gd.4xlarge', 'c7gd.8xlarge', 'c7gd.large', 'c7gd.medium', 'c7gd.xlarge', 'c7i.12xlarge', 'c7i.16xlarge', 'c7i.24xlarge', 'c7i.2xlarge', 'c7i.48xlarge', 'c7i.4xlarge', 'c7i.8xlarge', 'c7i.large', 'c7i.metal-24xl', 'c7i.metal-48xl', 'c7i.xlarge', 'd2.2xlarge', 'd2.4xlarge', 'd2.8xlarge', 'd2.xlarge', 'd3.2xlarge', 'd3.4xlarge', 'd3.8xlarge', 'd3.xlarge', 'd3en.12xlarge', 'd3en.2xlarge', 'd3en.4xlarge', 'd3en.6xlarge', 'd3en.8xlarge', 'd3en.xlarge', 'dl2q.24xlarge', 'f1.2xlarge', 'f1.4xlarge', 'g2.2xlarge', 'g2.8xlarge', 'g3.16xlarge', 'g3.4xlarge', 'g3.8xlarge', 'g3s.xlarge', 'g4ad.16xlarge', 'g4ad.2xlarge', 'g4ad.4xlarge', 'g4ad.8xlarge', 'g4ad.xlarge', 'g4dn.12xlarge', 'g4dn.16xlarge', 'g4dn.2xlarge', 'g4dn.4xlarge', 'g4dn.8xlarge', 'g4dn.metal', 'g4dn.xlarge', 'g5.12xlarge', 'g5.16xlarge', 'g5.24xlarge', 'g5.2xlarge', 'g5.48xlarge', 'g5.4xlarge', 'g5.8xlarge', 'g5.xlarge', 'g5g.16xlarge', 'g5g.2xlarge', 'g5g.4xlarge', 'g5g.8xlarge', 'g5g.metal', 'g5g.xlarge', 'i2.2xlarge', 'i2.4xlarge', 'i2.8xlarge', 'i2.xlarge', 'i3.16xlarge', 'i3.2xlarge', 'i3.4xlarge', 'i3.8xlarge', 'i3.large', 'i3.metal', 'i3.xlarge', 'i3en.12xlarge', 'i3en.24xlarge', 'i3en.2xlarge', 'i3en.3xlarge', 'i3en.6xlarge', 'i3en.large', 'i3en.metal', 'i3en.xlarge', 'i4i.12xlarge', 'i4i.16xlarge', 'i4i.24xlarge', 'i4i.2xlarge', 'i4i.32xlarge', 'i4i.4xlarge', 'i4i.8xlarge', 'i4i.large', 'i4i.metal', 'i4i.xlarge', 'im4gn.16xlarge', 'im4gn.2xlarge', 'im4gn.4xlarge', 'im4gn.8xlarge', 'im4gn.large', 'im4gn.xlarge', 'inf1.24xlarge', 'inf1.2xlarge', 'inf1.6xlarge', 'inf1.xlarge', 'inf2.24xlarge', 'inf2.48xlarge', 'inf2.8xlarge', 'inf2.xlarge', 'is4gen.2xlarge', 'is4gen.4xlarge', 'is4gen.8xlarge', 'is4gen.large', 'is4gen.medium', 'is4gen.xlarge', 'm3.2xlarge', 'm3.large', 'm3.medium', 'm3.xlarge', 'm4.10xlarge', 'm4.16xlarge', 'm4.2xlarge', 'm4.4xlarge', 'm4.large', 'm4.xlarge', 'm5.12xlarge', 'm5.16xlarge', 'm5.24xlarge', 'm5.2xlarge', 'm5.4xlarge', 'm5.8xlarge', 'm5.large', 'm5.metal', 'm5.xlarge', 'm5a.12xlarge', 'm5a.16xlarge', 'm5a.24xlarge', 'm5a.2xlarge', 'm5a.4xlarge', 'm5a.8xlarge', 'm5a.large', 'm5a.xlarge', 'm5ad.12xlarge', 'm5ad.16xlarge', 'm5ad.24xlarge', 'm5ad.2xlarge', 'm5ad.4xlarge', 'm5ad.8xlarge', 'm5ad.large', 'm5ad.xlarge', 'm5d.12xlarge', 'm5d.16xlarge', 'm5d.24xlarge', 'm5d.2xlarge', 'm5d.4xlarge', 'm5d.8xlarge', 'm5d.large', 'm5d.metal', 'm5d.xlarge', 'm5dn.12xlarge', 'm5dn.16xlarge', 'm5dn.24xlarge', 'm5dn.2xlarge', 'm5dn.4xlarge', 'm5dn.8xlarge', 'm5dn.large', 'm5dn.metal', 'm5dn.xlarge', 'm5n.12xlarge', 'm5n.16xlarge', 'm5n.24xlarge', 'm5n.2xlarge', 'm5n.4xlarge', 'm5n.8xlarge', 'm5n.large', 'm5n.metal', 'm5n.xlarge', 'm5zn.12xlarge', 'm5zn.2xlarge', 'm5zn.3xlarge', 'm5zn.6xlarge', 'm5zn.large', 'm5zn.metal', 'm5zn.xlarge', 'm6a.12xlarge', 'm6a.16xlarge', 'm6a.24xlarge', 'm6a.2xlarge', 'm6a.32xlarge', 'm6a.48xlarge', 'm6a.4xlarge', 'm6a.8xlarge', 'm6a.large', 'm6a.metal', 'm6a.xlarge', 'm6g.12xlarge', 'm6g.16xlarge', 'm6g.2xlarge', 'm6g.4xlarge', 'm6g.8xlarge', 'm6g.large', 'm6g.medium', 'm6g.metal', 'm6g.xlarge', 'm6gd.12xlarge', 'm6gd.16xlarge', 'm6gd.2xlarge', 'm6gd.4xlarge', 'm6gd.8xlarge', 'm6gd.large', 'm6gd.medium', 'm6gd.metal', 'm6gd.xlarge', 'm6i.12xlarge', 'm6i.16xlarge', 'm6i.24xlarge', 'm6i.2xlarge', 'm6i.32xlarge', 'm6i.4xlarge', 'm6i.8xlarge', 'm6i.large', 'm6i.metal', 'm6i.xlarge', 'm6id.12xlarge', 'm6id.16xlarge', 'm6id.24xlarge', 'm6id.2xlarge', 'm6id.32xlarge', 'm6id.4xlarge', 'm6id.8xlarge', 'm6id.large', 'm6id.metal', 'm6id.xlarge', 'm6idn.12xlarge', 'm6idn.16xlarge', 'm6idn.24xlarge', 'm6idn.2xlarge', 'm6idn.32xlarge', 'm6idn.4xlarge', 'm6idn.8xlarge', 'm6idn.large', 'm6idn.metal', 'm6idn.xlarge', 'm6in.12xlarge', 'm6in.16xlarge', 'm6in.24xlarge', 'm6in.2xlarge', 'm6in.32xlarge', 'm6in.4xlarge', 'm6in.8xlarge', 'm6in.large', 'm6in.metal', 'm6in.xlarge', 'm7g.12xlarge', 'm7g.16xlarge', 'm7g.2xlarge', 'm7g.4xlarge', 'm7g.8xlarge', 'm7g.large', 'm7g.medium', 'm7g.metal', 'm7g.xlarge', 'm7gd.12xlarge', 'm7gd.16xlarge', 'm7gd.2xlarge', 'm7gd.4xlarge', 'm7gd.8xlarge', 'm7gd.large', 'm7gd.medium', 'm7gd.xlarge', 'm7i-flex.2xlarge', 'm7i-flex.4xlarge', 'm7i-flex.8xlarge', 'm7i-flex.large', 'm7i-flex.xlarge', 'm7i.12xlarge', 'm7i.16xlarge', 'm7i.24xlarge', 'm7i.2xlarge', 'm7i.48xlarge', 'm7i.4xlarge', 'm7i.8xlarge', 'm7i.large', 'm7i.metal-24xl', 'm7i.metal-48xl', 'm7i.xlarge', 'p2.16xlarge', 'p2.8xlarge', 'p2.xlarge', 'p3.16xlarge', 'p3.2xlarge', 'p3.8xlarge', 'p4d.24xlarge', 'r3.2xlarge', 'r3.4xlarge', 'r3.8xlarge', 'r3.large', 'r3.xlarge', 'r4.16xlarge', 'r4.2xlarge', 'r4.4xlarge', 'r4.8xlarge', 'r4.large', 'r4.xlarge', 'r5.12xlarge', 'r5.16xlarge', 'r5.24xlarge', 'r5.2xlarge', 'r5.4xlarge', 'r5.8xlarge', 'r5.large', 'r5.metal', 'r5.xlarge', 'r5a.12xlarge', 'r5a.16xlarge', 'r5a.24xlarge', 'r5a.2xlarge', 'r5a.4xlarge', 'r5a.8xlarge', 'r5a.large', 'r5a.xlarge', 'r5ad.12xlarge', 'r5ad.16xlarge', 'r5ad.24xlarge', 'r5ad.2xlarge', 'r5ad.4xlarge', 'r5ad.8xlarge', 'r5ad.large', 'r5ad.xlarge', 'r5b.12xlarge', 'r5b.16xlarge', 'r5b.24xlarge', 'r5b.2xlarge', 'r5b.4xlarge', 'r5b.8xlarge', 'r5b.large', 'r5b.metal', 'r5b.xlarge', 'r5d.12xlarge', 'r5d.16xlarge', 'r5d.24xlarge', 'r5d.2xlarge', 'r5d.4xlarge', 'r5d.8xlarge', 'r5d.large', 'r5d.metal', 'r5d.xlarge', 'r5dn.12xlarge', 'r5dn.16xlarge', 'r5dn.24xlarge', 'r5dn.2xlarge', 'r5dn.4xlarge', 'r5dn.8xlarge', 'r5dn.large', 'r5dn.metal', 'r5dn.xlarge', 'r5n.12xlarge', 'r5n.16xlarge', 'r5n.24xlarge', 'r5n.2xlarge', 'r5n.4xlarge', 'r5n.8xlarge', 'r5n.large', 'r5n.metal', 'r5n.xlarge', 'r6a.12xlarge', 'r6a.16xlarge', 'r6a.24xlarge', 'r6a.2xlarge', 'r6a.32xlarge', 'r6a.48xlarge', 'r6a.4xlarge', 'r6a.8xlarge', 'r6a.large', 'r6a.metal', 'r6a.xlarge', 'r6g.12xlarge', 'r6g.16xlarge', 'r6g.2xlarge', 'r6g.4xlarge', 'r6g.8xlarge', 'r6g.large', 'r6g.medium', 'r6g.metal', 'r6g.xlarge', 'r6gd.12xlarge', 'r6gd.16xlarge', 'r6gd.2xlarge', 'r6gd.4xlarge', 'r6gd.8xlarge', 'r6gd.large', 'r6gd.medium', 'r6gd.metal', 'r6gd.xlarge', 'r6i.12xlarge', 'r6i.16xlarge', 'r6i.24xlarge', 'r6i.2xlarge', 'r6i.32xlarge', 'r6i.4xlarge', 'r6i.8xlarge', 'r6i.large', 'r6i.metal', 'r6i.xlarge', 'r6id.12xlarge', 'r6id.16xlarge', 'r6id.24xlarge', 'r6id.2xlarge', 'r6id.32xlarge', 'r6id.4xlarge', 'r6id.8xlarge', 'r6id.large', 'r6id.metal', 'r6id.xlarge', 'r6idn.12xlarge', 'r6idn.16xlarge', 'r6idn.24xlarge', 'r6idn.2xlarge', 'r6idn.32xlarge', 'r6idn.4xlarge', 'r6idn.8xlarge', 'r6idn.large', 'r6idn.metal', 'r6idn.xlarge', 'r6in.12xlarge', 'r6in.16xlarge', 'r6in.24xlarge', 'r6in.2xlarge', 'r6in.32xlarge', 'r6in.4xlarge', 'r6in.8xlarge', 'r6in.large', 'r6in.metal', 'r6in.xlarge', 'r7g.12xlarge', 'r7g.16xlarge', 'r7g.2xlarge', 'r7g.4xlarge', 'r7g.8xlarge', 'r7g.large', 'r7g.medium', 'r7g.metal', 'r7g.xlarge', 'r7gd.12xlarge', 'r7gd.16xlarge', 'r7gd.2xlarge', 'r7gd.4xlarge', 'r7gd.8xlarge', 'r7gd.large', 'r7gd.medium', 'r7gd.xlarge', 'r7i.12xlarge', 'r7i.16xlarge', 'r7i.24xlarge', 'r7i.2xlarge', 'r7i.48xlarge', 'r7i.4xlarge', 'r7i.8xlarge', 'r7i.large', 'r7i.metal-24xl', 'r7i.metal-48xl', 'r7i.xlarge', 'r7iz.12xlarge', 'r7iz.16xlarge', 'r7iz.2xlarge', 'r7iz.32xlarge', 'r7iz.4xlarge', 'r7iz.8xlarge', 'r7iz.large', 'r7iz.metal-16xl', 'r7iz.metal-32xl', 'r7iz.xlarge', 't2.2xlarge', 't2.large', 't2.medium', 't2.micro', 't2.nano', 't2.small', 't2.xlarge', 't3.2xlarge', 't3.large', 't3.medium', 't3.micro', 't3.nano', 't3.small', 't3.xlarge', 't3a.2xlarge', 't3a.large', 't3a.medium', 't3a.micro', 't3a.nano', 't3a.small', 't3a.xlarge', 't4g.2xlarge', 't4g.large', 't4g.medium', 't4g.micro', 't4g.nano', 't4g.small', 't4g.xlarge', 'u-12tb1.112xlarge', 'u-3tb1.56xlarge', 'u-6tb1.112xlarge', 'u-6tb1.56xlarge', 'u-9tb1.112xlarge', 'x1.16xlarge', 'x1.32xlarge', 'x1e.16xlarge', 'x1e.2xlarge', 'x1e.32xlarge', 'x1e.4xlarge', 'x1e.8xlarge', 'x1e.xlarge', 'x2idn.16xlarge', 'x2idn.24xlarge', 'x2idn.32xlarge', 'x2idn.metal', 'x2iedn.16xlarge', 'x2iedn.24xlarge', 'x2iedn.2xlarge', 'x2iedn.32xlarge', 'x2iedn.4xlarge', 'x2iedn.8xlarge', 'x2iedn.metal', 'x2iedn.xlarge', 'z1d.12xlarge', 'z1d.2xlarge', 'z1d.3xlarge', 'z1d.6xlarge', 'z1d.large', 'z1d.metal', 'z1d.xlarge'], - 'sa-east-1': ['c1.medium', 'c1.xlarge', 'c3.2xlarge', 'c3.4xlarge', 'c3.8xlarge', 'c3.large', 'c3.xlarge', 'c4.2xlarge', 'c4.4xlarge', 'c4.8xlarge', 'c4.large', 'c4.xlarge', 'c5.12xlarge', 'c5.18xlarge', 'c5.24xlarge', 'c5.2xlarge', 'c5.4xlarge', 'c5.9xlarge', 'c5.large', 'c5.metal', 'c5.xlarge', 'c5a.12xlarge', 'c5a.16xlarge', 'c5a.24xlarge', 'c5a.2xlarge', 'c5a.4xlarge', 'c5a.8xlarge', 'c5a.large', 'c5a.xlarge', 'c5ad.12xlarge', 'c5ad.16xlarge', 'c5ad.24xlarge', 'c5ad.2xlarge', 'c5ad.4xlarge', 'c5ad.8xlarge', 'c5ad.large', 'c5ad.xlarge', 'c5d.12xlarge', 'c5d.18xlarge', 'c5d.24xlarge', 'c5d.2xlarge', 'c5d.4xlarge', 'c5d.9xlarge', 'c5d.large', 'c5d.metal', 'c5d.xlarge', 'c5n.18xlarge', 'c5n.2xlarge', 'c5n.4xlarge', 'c5n.9xlarge', 'c5n.large', 'c5n.metal', 'c5n.xlarge', 'c6a.12xlarge', 'c6a.16xlarge', 'c6a.24xlarge', 'c6a.2xlarge', 'c6a.32xlarge', 'c6a.48xlarge', 'c6a.4xlarge', 'c6a.8xlarge', 'c6a.large', 'c6a.metal', 'c6a.xlarge', 'c6g.12xlarge', 'c6g.16xlarge', 'c6g.2xlarge', 'c6g.4xlarge', 'c6g.8xlarge', 'c6g.large', 'c6g.medium', 'c6g.metal', 'c6g.xlarge', 'c6gd.12xlarge', 'c6gd.16xlarge', 'c6gd.2xlarge', 'c6gd.4xlarge', 'c6gd.8xlarge', 'c6gd.large', 'c6gd.medium', 'c6gd.metal', 'c6gd.xlarge', 'c6gn.12xlarge', 'c6gn.16xlarge', 'c6gn.2xlarge', 'c6gn.4xlarge', 'c6gn.8xlarge', 'c6gn.large', 'c6gn.medium', 'c6gn.xlarge', 'c6i.12xlarge', 'c6i.16xlarge', 'c6i.24xlarge', 'c6i.2xlarge', 'c6i.32xlarge', 'c6i.4xlarge', 'c6i.8xlarge', 'c6i.large', 'c6i.metal', 'c6i.xlarge', 'c6in.12xlarge', 'c6in.16xlarge', 'c6in.24xlarge', 'c6in.2xlarge', 'c6in.32xlarge', 'c6in.4xlarge', 'c6in.8xlarge', 'c6in.large', 'c6in.metal', 'c6in.xlarge', 'c7i.12xlarge', 'c7i.16xlarge', 'c7i.24xlarge', 'c7i.2xlarge', 'c7i.48xlarge', 'c7i.4xlarge', 'c7i.8xlarge', 'c7i.large', 'c7i.metal-24xl', 'c7i.metal-48xl', 'c7i.xlarge', 'g4dn.12xlarge', 'g4dn.16xlarge', 'g4dn.2xlarge', 'g4dn.4xlarge', 'g4dn.8xlarge', 'g4dn.metal', 'g4dn.xlarge', 'g5.12xlarge', 'g5.16xlarge', 'g5.24xlarge', 'g5.2xlarge', 'g5.48xlarge', 'g5.4xlarge', 'g5.8xlarge', 'g5.xlarge', 'i3.16xlarge', 'i3.2xlarge', 'i3.4xlarge', 'i3.8xlarge', 'i3.large', 'i3.metal', 'i3.xlarge', 'i3en.12xlarge', 'i3en.24xlarge', 'i3en.2xlarge', 'i3en.3xlarge', 'i3en.6xlarge', 'i3en.large', 'i3en.metal', 'i3en.xlarge', 'i4i.12xlarge', 'i4i.16xlarge', 'i4i.24xlarge', 'i4i.2xlarge', 'i4i.32xlarge', 'i4i.4xlarge', 'i4i.8xlarge', 'i4i.large', 'i4i.metal', 'i4i.xlarge', 'inf1.24xlarge', 'inf1.2xlarge', 'inf1.6xlarge', 'inf1.xlarge', 'm1.large', 'm1.medium', 'm1.xlarge', 'm2.2xlarge', 'm2.4xlarge', 'm2.xlarge', 'm3.2xlarge', 'm3.large', 'm3.medium', 'm3.xlarge', 'm4.10xlarge', 'm4.16xlarge', 'm4.2xlarge', 'm4.4xlarge', 'm4.large', 'm4.xlarge', 'm5.12xlarge', 'm5.16xlarge', 'm5.24xlarge', 'm5.2xlarge', 'm5.4xlarge', 'm5.8xlarge', 'm5.large', 'm5.metal', 'm5.xlarge', 'm5a.12xlarge', 'm5a.16xlarge', 'm5a.24xlarge', 'm5a.2xlarge', 'm5a.4xlarge', 'm5a.8xlarge', 'm5a.large', 'm5a.xlarge', 'm5ad.12xlarge', 'm5ad.16xlarge', 'm5ad.24xlarge', 'm5ad.2xlarge', 'm5ad.4xlarge', 'm5ad.8xlarge', 'm5ad.large', 'm5ad.xlarge', 'm5d.12xlarge', 'm5d.16xlarge', 'm5d.24xlarge', 'm5d.2xlarge', 'm5d.4xlarge', 'm5d.8xlarge', 'm5d.large', 'm5d.metal', 'm5d.xlarge', 'm5zn.12xlarge', 'm5zn.2xlarge', 'm5zn.3xlarge', 'm5zn.6xlarge', 'm5zn.large', 'm5zn.metal', 'm5zn.xlarge', 'm6a.12xlarge', 'm6a.16xlarge', 'm6a.24xlarge', 'm6a.2xlarge', 'm6a.32xlarge', 'm6a.48xlarge', 'm6a.4xlarge', 'm6a.8xlarge', 'm6a.large', 'm6a.metal', 'm6a.xlarge', 'm6g.12xlarge', 'm6g.16xlarge', 'm6g.2xlarge', 'm6g.4xlarge', 'm6g.8xlarge', 'm6g.large', 'm6g.medium', 'm6g.metal', 'm6g.xlarge', 'm6gd.12xlarge', 'm6gd.16xlarge', 'm6gd.2xlarge', 'm6gd.4xlarge', 'm6gd.8xlarge', 'm6gd.large', 'm6gd.medium', 'm6gd.metal', 'm6gd.xlarge', 'm6i.12xlarge', 'm6i.16xlarge', 'm6i.24xlarge', 'm6i.2xlarge', 'm6i.32xlarge', 'm6i.4xlarge', 'm6i.8xlarge', 'm6i.large', 'm6i.metal', 'm6i.xlarge', 'm6id.12xlarge', 'm6id.16xlarge', 'm6id.24xlarge', 'm6id.2xlarge', 'm6id.32xlarge', 'm6id.4xlarge', 'm6id.8xlarge', 'm6id.large', 'm6id.metal', 'm6id.xlarge', 'm7g.12xlarge', 'm7g.16xlarge', 'm7g.2xlarge', 'm7g.4xlarge', 'm7g.8xlarge', 'm7g.large', 'm7g.medium', 'm7g.metal', 'm7g.xlarge', 'm7i-flex.2xlarge', 'm7i-flex.4xlarge', 'm7i-flex.8xlarge', 'm7i-flex.large', 'm7i-flex.xlarge', 'm7i.12xlarge', 'm7i.16xlarge', 'm7i.24xlarge', 'm7i.2xlarge', 'm7i.48xlarge', 'm7i.4xlarge', 'm7i.8xlarge', 'm7i.large', 'm7i.metal-24xl', 'm7i.metal-48xl', 'm7i.xlarge', 'r3.2xlarge', 'r3.4xlarge', 'r3.8xlarge', 'r3.large', 'r3.xlarge', 'r4.16xlarge', 'r4.2xlarge', 'r4.4xlarge', 'r4.8xlarge', 'r4.large', 'r4.xlarge', 'r5.12xlarge', 'r5.16xlarge', 'r5.24xlarge', 'r5.2xlarge', 'r5.4xlarge', 'r5.8xlarge', 'r5.large', 'r5.metal', 'r5.xlarge', 'r5a.12xlarge', 'r5a.16xlarge', 'r5a.24xlarge', 'r5a.2xlarge', 'r5a.4xlarge', 'r5a.8xlarge', 'r5a.large', 'r5a.xlarge', 'r5ad.12xlarge', 'r5ad.16xlarge', 'r5ad.24xlarge', 'r5ad.2xlarge', 'r5ad.4xlarge', 'r5ad.8xlarge', 'r5ad.large', 'r5ad.xlarge', 'r5b.12xlarge', 'r5b.16xlarge', 'r5b.24xlarge', 'r5b.2xlarge', 'r5b.4xlarge', 'r5b.8xlarge', 'r5b.large', 'r5b.metal', 'r5b.xlarge', 'r5d.12xlarge', 'r5d.16xlarge', 'r5d.24xlarge', 'r5d.2xlarge', 'r5d.4xlarge', 'r5d.8xlarge', 'r5d.large', 'r5d.metal', 'r5d.xlarge', 'r5n.12xlarge', 'r5n.16xlarge', 'r5n.24xlarge', 'r5n.2xlarge', 'r5n.4xlarge', 'r5n.8xlarge', 'r5n.large', 'r5n.metal', 'r5n.xlarge', 'r6g.12xlarge', 'r6g.16xlarge', 'r6g.2xlarge', 'r6g.4xlarge', 'r6g.8xlarge', 'r6g.large', 'r6g.medium', 'r6g.metal', 'r6g.xlarge', 'r6gd.12xlarge', 'r6gd.16xlarge', 'r6gd.2xlarge', 'r6gd.4xlarge', 'r6gd.8xlarge', 'r6gd.large', 'r6gd.medium', 'r6gd.metal', 'r6gd.xlarge', 'r6i.12xlarge', 'r6i.16xlarge', 'r6i.24xlarge', 'r6i.2xlarge', 'r6i.32xlarge', 'r6i.4xlarge', 'r6i.8xlarge', 'r6i.large', 'r6i.metal', 'r6i.xlarge', 'r7i.12xlarge', 'r7i.16xlarge', 'r7i.24xlarge', 'r7i.2xlarge', 'r7i.48xlarge', 'r7i.4xlarge', 'r7i.8xlarge', 'r7i.large', 'r7i.metal-24xl', 'r7i.metal-48xl', 'r7i.xlarge', 't1.micro', 't2.2xlarge', 't2.large', 't2.medium', 't2.micro', 't2.nano', 't2.small', 't2.xlarge', 't3.2xlarge', 't3.large', 't3.medium', 't3.micro', 't3.nano', 't3.small', 't3.xlarge', 't3a.2xlarge', 't3a.large', 't3a.medium', 't3a.micro', 't3a.nano', 't3a.small', 't3a.xlarge', 't4g.2xlarge', 't4g.large', 't4g.medium', 't4g.micro', 't4g.nano', 't4g.small', 't4g.xlarge', 'u-12tb1.112xlarge', 'u-3tb1.56xlarge', 'u-6tb1.112xlarge', 'u-6tb1.56xlarge', 'x1.16xlarge', 'x1.32xlarge', 'x1e.16xlarge', 'x1e.2xlarge', 'x1e.32xlarge', 'x1e.4xlarge', 'x1e.8xlarge', 'x1e.xlarge', 'x2idn.16xlarge', 'x2idn.24xlarge', 'x2idn.32xlarge', 'x2idn.metal', 'x2iedn.16xlarge', 'x2iedn.24xlarge', 'x2iedn.2xlarge', 'x2iedn.32xlarge', 'x2iedn.4xlarge', 'x2iedn.8xlarge', 'x2iedn.metal', 'x2iedn.xlarge']} + "us-west-1": [ + "c1.medium", + "c1.xlarge", + "c3.2xlarge", + "c3.4xlarge", + "c3.8xlarge", + "c3.large", + "c3.xlarge", + "c4.2xlarge", + "c4.4xlarge", + "c4.8xlarge", + "c4.large", + "c4.xlarge", + "c5.12xlarge", + "c5.18xlarge", + "c5.24xlarge", + "c5.2xlarge", + "c5.4xlarge", + "c5.9xlarge", + "c5.large", + "c5.metal", + "c5.xlarge", + "c5a.12xlarge", + "c5a.16xlarge", + "c5a.24xlarge", + "c5a.2xlarge", + "c5a.4xlarge", + "c5a.8xlarge", + "c5a.large", + "c5a.xlarge", + "c5d.12xlarge", + "c5d.18xlarge", + "c5d.24xlarge", + "c5d.2xlarge", + "c5d.4xlarge", + "c5d.9xlarge", + "c5d.large", + "c5d.metal", + "c5d.xlarge", + "c5n.18xlarge", + "c5n.2xlarge", + "c5n.4xlarge", + "c5n.9xlarge", + "c5n.large", + "c5n.metal", + "c5n.xlarge", + "c6a.12xlarge", + "c6a.16xlarge", + "c6a.24xlarge", + "c6a.2xlarge", + "c6a.32xlarge", + "c6a.48xlarge", + "c6a.4xlarge", + "c6a.8xlarge", + "c6a.large", + "c6a.metal", + "c6a.xlarge", + "c6g.12xlarge", + "c6g.16xlarge", + "c6g.2xlarge", + "c6g.4xlarge", + "c6g.8xlarge", + "c6g.large", + "c6g.medium", + "c6g.metal", + "c6g.xlarge", + "c6gd.12xlarge", + "c6gd.16xlarge", + "c6gd.2xlarge", + "c6gd.4xlarge", + "c6gd.8xlarge", + "c6gd.large", + "c6gd.medium", + "c6gd.metal", + "c6gd.xlarge", + "c6gn.12xlarge", + "c6gn.16xlarge", + "c6gn.2xlarge", + "c6gn.4xlarge", + "c6gn.8xlarge", + "c6gn.large", + "c6gn.medium", + "c6gn.xlarge", + "c6i.12xlarge", + "c6i.16xlarge", + "c6i.24xlarge", + "c6i.2xlarge", + "c6i.32xlarge", + "c6i.4xlarge", + "c6i.8xlarge", + "c6i.large", + "c6i.metal", + "c6i.xlarge", + "c6in.12xlarge", + "c6in.16xlarge", + "c6in.24xlarge", + "c6in.2xlarge", + "c6in.32xlarge", + "c6in.4xlarge", + "c6in.8xlarge", + "c6in.large", + "c6in.metal", + "c6in.xlarge", + "c7g.12xlarge", + "c7g.16xlarge", + "c7g.2xlarge", + "c7g.4xlarge", + "c7g.8xlarge", + "c7g.large", + "c7g.medium", + "c7g.metal", + "c7g.xlarge", + "c7gd.12xlarge", + "c7gd.16xlarge", + "c7gd.2xlarge", + "c7gd.4xlarge", + "c7gd.8xlarge", + "c7gd.large", + "c7gd.medium", + "c7gd.xlarge", + "c7i.12xlarge", + "c7i.16xlarge", + "c7i.24xlarge", + "c7i.2xlarge", + "c7i.48xlarge", + "c7i.4xlarge", + "c7i.8xlarge", + "c7i.large", + "c7i.metal-24xl", + "c7i.metal-48xl", + "c7i.xlarge", + "d2.2xlarge", + "d2.4xlarge", + "d2.8xlarge", + "d2.xlarge", + "g2.2xlarge", + "g2.8xlarge", + "g3.16xlarge", + "g3.4xlarge", + "g3.8xlarge", + "g4dn.12xlarge", + "g4dn.16xlarge", + "g4dn.2xlarge", + "g4dn.4xlarge", + "g4dn.8xlarge", + "g4dn.metal", + "g4dn.xlarge", + "i2.2xlarge", + "i2.4xlarge", + "i2.8xlarge", + "i2.xlarge", + "i3.16xlarge", + "i3.2xlarge", + "i3.4xlarge", + "i3.8xlarge", + "i3.large", + "i3.metal", + "i3.xlarge", + "i3en.12xlarge", + "i3en.24xlarge", + "i3en.2xlarge", + "i3en.3xlarge", + "i3en.6xlarge", + "i3en.large", + "i3en.metal", + "i3en.xlarge", + "i4i.12xlarge", + "i4i.16xlarge", + "i4i.24xlarge", + "i4i.2xlarge", + "i4i.32xlarge", + "i4i.4xlarge", + "i4i.8xlarge", + "i4i.large", + "i4i.metal", + "i4i.xlarge", + "inf1.24xlarge", + "inf1.2xlarge", + "inf1.6xlarge", + "inf1.xlarge", + "m1.large", + "m1.medium", + "m1.xlarge", + "m2.2xlarge", + "m2.4xlarge", + "m2.xlarge", + "m3.2xlarge", + "m3.large", + "m3.medium", + "m3.xlarge", + "m4.10xlarge", + "m4.16xlarge", + "m4.2xlarge", + "m4.4xlarge", + "m4.large", + "m4.xlarge", + "m5.12xlarge", + "m5.16xlarge", + "m5.24xlarge", + "m5.2xlarge", + "m5.4xlarge", + "m5.8xlarge", + "m5.large", + "m5.metal", + "m5.xlarge", + "m5a.12xlarge", + "m5a.16xlarge", + "m5a.24xlarge", + "m5a.2xlarge", + "m5a.4xlarge", + "m5a.8xlarge", + "m5a.large", + "m5a.xlarge", + "m5ad.12xlarge", + "m5ad.16xlarge", + "m5ad.24xlarge", + "m5ad.2xlarge", + "m5ad.4xlarge", + "m5ad.8xlarge", + "m5ad.large", + "m5ad.xlarge", + "m5d.12xlarge", + "m5d.16xlarge", + "m5d.24xlarge", + "m5d.2xlarge", + "m5d.4xlarge", + "m5d.8xlarge", + "m5d.large", + "m5d.metal", + "m5d.xlarge", + "m5zn.12xlarge", + "m5zn.2xlarge", + "m5zn.3xlarge", + "m5zn.6xlarge", + "m5zn.large", + "m5zn.metal", + "m5zn.xlarge", + "m6a.12xlarge", + "m6a.16xlarge", + "m6a.24xlarge", + "m6a.2xlarge", + "m6a.32xlarge", + "m6a.48xlarge", + "m6a.4xlarge", + "m6a.8xlarge", + "m6a.large", + "m6a.metal", + "m6a.xlarge", + "m6g.12xlarge", + "m6g.16xlarge", + "m6g.2xlarge", + "m6g.4xlarge", + "m6g.8xlarge", + "m6g.large", + "m6g.medium", + "m6g.metal", + "m6g.xlarge", + "m6gd.12xlarge", + "m6gd.16xlarge", + "m6gd.2xlarge", + "m6gd.4xlarge", + "m6gd.8xlarge", + "m6gd.large", + "m6gd.medium", + "m6gd.metal", + "m6gd.xlarge", + "m6i.12xlarge", + "m6i.16xlarge", + "m6i.24xlarge", + "m6i.2xlarge", + "m6i.32xlarge", + "m6i.4xlarge", + "m6i.8xlarge", + "m6i.large", + "m6i.metal", + "m6i.xlarge", + "m6idn.12xlarge", + "m6idn.16xlarge", + "m6idn.24xlarge", + "m6idn.2xlarge", + "m6idn.32xlarge", + "m6idn.4xlarge", + "m6idn.8xlarge", + "m6idn.large", + "m6idn.metal", + "m6idn.xlarge", + "m6in.12xlarge", + "m6in.16xlarge", + "m6in.24xlarge", + "m6in.2xlarge", + "m6in.32xlarge", + "m6in.4xlarge", + "m6in.8xlarge", + "m6in.large", + "m6in.metal", + "m6in.xlarge", + "m7g.12xlarge", + "m7g.16xlarge", + "m7g.2xlarge", + "m7g.4xlarge", + "m7g.8xlarge", + "m7g.large", + "m7g.medium", + "m7g.metal", + "m7g.xlarge", + "m7gd.12xlarge", + "m7gd.16xlarge", + "m7gd.2xlarge", + "m7gd.4xlarge", + "m7gd.8xlarge", + "m7gd.large", + "m7gd.medium", + "m7gd.xlarge", + "m7i-flex.2xlarge", + "m7i-flex.4xlarge", + "m7i-flex.8xlarge", + "m7i-flex.large", + "m7i-flex.xlarge", + "m7i.12xlarge", + "m7i.16xlarge", + "m7i.24xlarge", + "m7i.2xlarge", + "m7i.48xlarge", + "m7i.4xlarge", + "m7i.8xlarge", + "m7i.large", + "m7i.metal-24xl", + "m7i.metal-48xl", + "m7i.xlarge", + "r3.2xlarge", + "r3.4xlarge", + "r3.8xlarge", + "r3.large", + "r3.xlarge", + "r4.16xlarge", + "r4.2xlarge", + "r4.4xlarge", + "r4.8xlarge", + "r4.large", + "r4.xlarge", + "r5.12xlarge", + "r5.16xlarge", + "r5.24xlarge", + "r5.2xlarge", + "r5.4xlarge", + "r5.8xlarge", + "r5.large", + "r5.metal", + "r5.xlarge", + "r5a.12xlarge", + "r5a.16xlarge", + "r5a.24xlarge", + "r5a.2xlarge", + "r5a.4xlarge", + "r5a.8xlarge", + "r5a.large", + "r5a.xlarge", + "r5ad.12xlarge", + "r5ad.16xlarge", + "r5ad.24xlarge", + "r5ad.2xlarge", + "r5ad.4xlarge", + "r5ad.8xlarge", + "r5ad.large", + "r5ad.xlarge", + "r5d.12xlarge", + "r5d.16xlarge", + "r5d.24xlarge", + "r5d.2xlarge", + "r5d.4xlarge", + "r5d.8xlarge", + "r5d.large", + "r5d.metal", + "r5d.xlarge", + "r5n.12xlarge", + "r5n.16xlarge", + "r5n.24xlarge", + "r5n.2xlarge", + "r5n.4xlarge", + "r5n.8xlarge", + "r5n.large", + "r5n.metal", + "r5n.xlarge", + "r6a.12xlarge", + "r6a.16xlarge", + "r6a.24xlarge", + "r6a.2xlarge", + "r6a.32xlarge", + "r6a.48xlarge", + "r6a.4xlarge", + "r6a.8xlarge", + "r6a.large", + "r6a.metal", + "r6a.xlarge", + "r6g.12xlarge", + "r6g.16xlarge", + "r6g.2xlarge", + "r6g.4xlarge", + "r6g.8xlarge", + "r6g.large", + "r6g.medium", + "r6g.metal", + "r6g.xlarge", + "r6gd.12xlarge", + "r6gd.16xlarge", + "r6gd.2xlarge", + "r6gd.4xlarge", + "r6gd.8xlarge", + "r6gd.large", + "r6gd.medium", + "r6gd.metal", + "r6gd.xlarge", + "r6i.12xlarge", + "r6i.16xlarge", + "r6i.24xlarge", + "r6i.2xlarge", + "r6i.32xlarge", + "r6i.4xlarge", + "r6i.8xlarge", + "r6i.large", + "r6i.metal", + "r6i.xlarge", + "r7g.12xlarge", + "r7g.16xlarge", + "r7g.2xlarge", + "r7g.4xlarge", + "r7g.8xlarge", + "r7g.large", + "r7g.medium", + "r7g.metal", + "r7g.xlarge", + "r7gd.12xlarge", + "r7gd.16xlarge", + "r7gd.2xlarge", + "r7gd.4xlarge", + "r7gd.8xlarge", + "r7gd.large", + "r7gd.medium", + "r7gd.xlarge", + "r7i.12xlarge", + "r7i.16xlarge", + "r7i.24xlarge", + "r7i.2xlarge", + "r7i.48xlarge", + "r7i.4xlarge", + "r7i.8xlarge", + "r7i.large", + "r7i.metal-24xl", + "r7i.metal-48xl", + "r7i.xlarge", + "t1.micro", + "t2.2xlarge", + "t2.large", + "t2.medium", + "t2.micro", + "t2.nano", + "t2.small", + "t2.xlarge", + "t3.2xlarge", + "t3.large", + "t3.medium", + "t3.micro", + "t3.nano", + "t3.small", + "t3.xlarge", + "t3a.2xlarge", + "t3a.large", + "t3a.medium", + "t3a.micro", + "t3a.nano", + "t3a.small", + "t3a.xlarge", + "t4g.2xlarge", + "t4g.large", + "t4g.medium", + "t4g.micro", + "t4g.nano", + "t4g.small", + "t4g.xlarge", + "x2idn.16xlarge", + "x2idn.24xlarge", + "x2idn.32xlarge", + "x2idn.metal", + "x2iedn.16xlarge", + "x2iedn.24xlarge", + "x2iedn.2xlarge", + "x2iedn.32xlarge", + "x2iedn.4xlarge", + "x2iedn.8xlarge", + "x2iedn.metal", + "x2iedn.xlarge", + "z1d.12xlarge", + "z1d.2xlarge", + "z1d.3xlarge", + "z1d.6xlarge", + "z1d.large", + "z1d.metal", + "z1d.xlarge", + ], + "us-west-2": [ + "a1.2xlarge", + "a1.4xlarge", + "a1.large", + "a1.medium", + "a1.metal", + "a1.xlarge", + "c1.medium", + "c1.xlarge", + "c3.2xlarge", + "c3.4xlarge", + "c3.8xlarge", + "c3.large", + "c3.xlarge", + "c4.2xlarge", + "c4.4xlarge", + "c4.8xlarge", + "c4.large", + "c4.xlarge", + "c5.12xlarge", + "c5.18xlarge", + "c5.24xlarge", + "c5.2xlarge", + "c5.4xlarge", + "c5.9xlarge", + "c5.large", + "c5.metal", + "c5.xlarge", + "c5a.12xlarge", + "c5a.16xlarge", + "c5a.24xlarge", + "c5a.2xlarge", + "c5a.4xlarge", + "c5a.8xlarge", + "c5a.large", + "c5a.xlarge", + "c5ad.12xlarge", + "c5ad.16xlarge", + "c5ad.24xlarge", + "c5ad.2xlarge", + "c5ad.4xlarge", + "c5ad.8xlarge", + "c5ad.large", + "c5ad.xlarge", + "c5d.12xlarge", + "c5d.18xlarge", + "c5d.24xlarge", + "c5d.2xlarge", + "c5d.4xlarge", + "c5d.9xlarge", + "c5d.large", + "c5d.metal", + "c5d.xlarge", + "c5n.18xlarge", + "c5n.2xlarge", + "c5n.4xlarge", + "c5n.9xlarge", + "c5n.large", + "c5n.metal", + "c5n.xlarge", + "c6a.12xlarge", + "c6a.16xlarge", + "c6a.24xlarge", + "c6a.2xlarge", + "c6a.32xlarge", + "c6a.48xlarge", + "c6a.4xlarge", + "c6a.8xlarge", + "c6a.large", + "c6a.metal", + "c6a.xlarge", + "c6g.12xlarge", + "c6g.16xlarge", + "c6g.2xlarge", + "c6g.4xlarge", + "c6g.8xlarge", + "c6g.large", + "c6g.medium", + "c6g.metal", + "c6g.xlarge", + "c6gd.12xlarge", + "c6gd.16xlarge", + "c6gd.2xlarge", + "c6gd.4xlarge", + "c6gd.8xlarge", + "c6gd.large", + "c6gd.medium", + "c6gd.metal", + "c6gd.xlarge", + "c6gn.12xlarge", + "c6gn.16xlarge", + "c6gn.2xlarge", + "c6gn.4xlarge", + "c6gn.8xlarge", + "c6gn.large", + "c6gn.medium", + "c6gn.xlarge", + "c6i.12xlarge", + "c6i.16xlarge", + "c6i.24xlarge", + "c6i.2xlarge", + "c6i.32xlarge", + "c6i.4xlarge", + "c6i.8xlarge", + "c6i.large", + "c6i.metal", + "c6i.xlarge", + "c6id.12xlarge", + "c6id.16xlarge", + "c6id.24xlarge", + "c6id.2xlarge", + "c6id.32xlarge", + "c6id.4xlarge", + "c6id.8xlarge", + "c6id.large", + "c6id.metal", + "c6id.xlarge", + "c6in.12xlarge", + "c6in.16xlarge", + "c6in.24xlarge", + "c6in.2xlarge", + "c6in.32xlarge", + "c6in.4xlarge", + "c6in.8xlarge", + "c6in.large", + "c6in.metal", + "c6in.xlarge", + "c7a.12xlarge", + "c7a.16xlarge", + "c7a.24xlarge", + "c7a.2xlarge", + "c7a.32xlarge", + "c7a.48xlarge", + "c7a.4xlarge", + "c7a.8xlarge", + "c7a.large", + "c7a.medium", + "c7a.metal-48xl", + "c7a.xlarge", + "c7g.12xlarge", + "c7g.16xlarge", + "c7g.2xlarge", + "c7g.4xlarge", + "c7g.8xlarge", + "c7g.large", + "c7g.medium", + "c7g.metal", + "c7g.xlarge", + "c7gd.12xlarge", + "c7gd.16xlarge", + "c7gd.2xlarge", + "c7gd.4xlarge", + "c7gd.8xlarge", + "c7gd.large", + "c7gd.medium", + "c7gd.xlarge", + "c7gn.12xlarge", + "c7gn.16xlarge", + "c7gn.2xlarge", + "c7gn.4xlarge", + "c7gn.8xlarge", + "c7gn.large", + "c7gn.medium", + "c7gn.xlarge", + "c7i.12xlarge", + "c7i.16xlarge", + "c7i.24xlarge", + "c7i.2xlarge", + "c7i.48xlarge", + "c7i.4xlarge", + "c7i.8xlarge", + "c7i.large", + "c7i.metal-24xl", + "c7i.metal-48xl", + "c7i.xlarge", + "cr1.8xlarge", + "d2.2xlarge", + "d2.4xlarge", + "d2.8xlarge", + "d2.xlarge", + "d3.2xlarge", + "d3.4xlarge", + "d3.8xlarge", + "d3.xlarge", + "d3en.12xlarge", + "d3en.2xlarge", + "d3en.4xlarge", + "d3en.6xlarge", + "d3en.8xlarge", + "d3en.xlarge", + "dl1.24xlarge", + "dl2q.24xlarge", + "f1.16xlarge", + "f1.2xlarge", + "f1.4xlarge", + "g2.2xlarge", + "g2.8xlarge", + "g3.16xlarge", + "g3.4xlarge", + "g3.8xlarge", + "g3s.xlarge", + "g4ad.16xlarge", + "g4ad.2xlarge", + "g4ad.4xlarge", + "g4ad.8xlarge", + "g4ad.xlarge", + "g4dn.12xlarge", + "g4dn.16xlarge", + "g4dn.2xlarge", + "g4dn.4xlarge", + "g4dn.8xlarge", + "g4dn.metal", + "g4dn.xlarge", + "g5.12xlarge", + "g5.16xlarge", + "g5.24xlarge", + "g5.2xlarge", + "g5.48xlarge", + "g5.4xlarge", + "g5.8xlarge", + "g5.xlarge", + "g5g.16xlarge", + "g5g.2xlarge", + "g5g.4xlarge", + "g5g.8xlarge", + "g5g.metal", + "g5g.xlarge", + "h1.16xlarge", + "h1.2xlarge", + "h1.4xlarge", + "h1.8xlarge", + "i2.2xlarge", + "i2.4xlarge", + "i2.8xlarge", + "i2.xlarge", + "i3.16xlarge", + "i3.2xlarge", + "i3.4xlarge", + "i3.8xlarge", + "i3.large", + "i3.metal", + "i3.xlarge", + "i3en.12xlarge", + "i3en.24xlarge", + "i3en.2xlarge", + "i3en.3xlarge", + "i3en.6xlarge", + "i3en.large", + "i3en.metal", + "i3en.xlarge", + "i4g.16xlarge", + "i4g.2xlarge", + "i4g.4xlarge", + "i4g.8xlarge", + "i4g.large", + "i4g.xlarge", + "i4i.12xlarge", + "i4i.16xlarge", + "i4i.24xlarge", + "i4i.2xlarge", + "i4i.32xlarge", + "i4i.4xlarge", + "i4i.8xlarge", + "i4i.large", + "i4i.metal", + "i4i.xlarge", + "im4gn.16xlarge", + "im4gn.2xlarge", + "im4gn.4xlarge", + "im4gn.8xlarge", + "im4gn.large", + "im4gn.xlarge", + "inf1.24xlarge", + "inf1.2xlarge", + "inf1.6xlarge", + "inf1.xlarge", + "inf2.24xlarge", + "inf2.48xlarge", + "inf2.8xlarge", + "inf2.xlarge", + "is4gen.2xlarge", + "is4gen.4xlarge", + "is4gen.8xlarge", + "is4gen.large", + "is4gen.medium", + "is4gen.xlarge", + "m1.large", + "m1.medium", + "m1.xlarge", + "m2.2xlarge", + "m2.4xlarge", + "m2.xlarge", + "m3.2xlarge", + "m3.large", + "m3.medium", + "m3.xlarge", + "m4.10xlarge", + "m4.16xlarge", + "m4.2xlarge", + "m4.4xlarge", + "m4.large", + "m4.xlarge", + "m5.12xlarge", + "m5.16xlarge", + "m5.24xlarge", + "m5.2xlarge", + "m5.4xlarge", + "m5.8xlarge", + "m5.large", + "m5.metal", + "m5.xlarge", + "m5a.12xlarge", + "m5a.16xlarge", + "m5a.24xlarge", + "m5a.2xlarge", + "m5a.4xlarge", + "m5a.8xlarge", + "m5a.large", + "m5a.xlarge", + "m5ad.12xlarge", + "m5ad.16xlarge", + "m5ad.24xlarge", + "m5ad.2xlarge", + "m5ad.4xlarge", + "m5ad.8xlarge", + "m5ad.large", + "m5ad.xlarge", + "m5d.12xlarge", + "m5d.16xlarge", + "m5d.24xlarge", + "m5d.2xlarge", + "m5d.4xlarge", + "m5d.8xlarge", + "m5d.large", + "m5d.metal", + "m5d.xlarge", + "m5dn.12xlarge", + "m5dn.16xlarge", + "m5dn.24xlarge", + "m5dn.2xlarge", + "m5dn.4xlarge", + "m5dn.8xlarge", + "m5dn.large", + "m5dn.metal", + "m5dn.xlarge", + "m5n.12xlarge", + "m5n.16xlarge", + "m5n.24xlarge", + "m5n.2xlarge", + "m5n.4xlarge", + "m5n.8xlarge", + "m5n.large", + "m5n.metal", + "m5n.xlarge", + "m5zn.12xlarge", + "m5zn.2xlarge", + "m5zn.3xlarge", + "m5zn.6xlarge", + "m5zn.large", + "m5zn.metal", + "m5zn.xlarge", + "m6a.12xlarge", + "m6a.16xlarge", + "m6a.24xlarge", + "m6a.2xlarge", + "m6a.32xlarge", + "m6a.48xlarge", + "m6a.4xlarge", + "m6a.8xlarge", + "m6a.large", + "m6a.metal", + "m6a.xlarge", + "m6g.12xlarge", + "m6g.16xlarge", + "m6g.2xlarge", + "m6g.4xlarge", + "m6g.8xlarge", + "m6g.large", + "m6g.medium", + "m6g.metal", + "m6g.xlarge", + "m6gd.12xlarge", + "m6gd.16xlarge", + "m6gd.2xlarge", + "m6gd.4xlarge", + "m6gd.8xlarge", + "m6gd.large", + "m6gd.medium", + "m6gd.metal", + "m6gd.xlarge", + "m6i.12xlarge", + "m6i.16xlarge", + "m6i.24xlarge", + "m6i.2xlarge", + "m6i.32xlarge", + "m6i.4xlarge", + "m6i.8xlarge", + "m6i.large", + "m6i.metal", + "m6i.xlarge", + "m6id.12xlarge", + "m6id.16xlarge", + "m6id.24xlarge", + "m6id.2xlarge", + "m6id.32xlarge", + "m6id.4xlarge", + "m6id.8xlarge", + "m6id.large", + "m6id.metal", + "m6id.xlarge", + "m6idn.12xlarge", + "m6idn.16xlarge", + "m6idn.24xlarge", + "m6idn.2xlarge", + "m6idn.32xlarge", + "m6idn.4xlarge", + "m6idn.8xlarge", + "m6idn.large", + "m6idn.metal", + "m6idn.xlarge", + "m6in.12xlarge", + "m6in.16xlarge", + "m6in.24xlarge", + "m6in.2xlarge", + "m6in.32xlarge", + "m6in.4xlarge", + "m6in.8xlarge", + "m6in.large", + "m6in.metal", + "m6in.xlarge", + "m7a.12xlarge", + "m7a.16xlarge", + "m7a.24xlarge", + "m7a.2xlarge", + "m7a.32xlarge", + "m7a.48xlarge", + "m7a.4xlarge", + "m7a.8xlarge", + "m7a.large", + "m7a.medium", + "m7a.metal-48xl", + "m7a.xlarge", + "m7g.12xlarge", + "m7g.16xlarge", + "m7g.2xlarge", + "m7g.4xlarge", + "m7g.8xlarge", + "m7g.large", + "m7g.medium", + "m7g.metal", + "m7g.xlarge", + "m7gd.12xlarge", + "m7gd.16xlarge", + "m7gd.2xlarge", + "m7gd.4xlarge", + "m7gd.8xlarge", + "m7gd.large", + "m7gd.medium", + "m7gd.xlarge", + "m7i-flex.2xlarge", + "m7i-flex.4xlarge", + "m7i-flex.8xlarge", + "m7i-flex.large", + "m7i-flex.xlarge", + "m7i.12xlarge", + "m7i.16xlarge", + "m7i.24xlarge", + "m7i.2xlarge", + "m7i.48xlarge", + "m7i.4xlarge", + "m7i.8xlarge", + "m7i.large", + "m7i.metal-24xl", + "m7i.metal-48xl", + "m7i.xlarge", + "p2.16xlarge", + "p2.8xlarge", + "p2.xlarge", + "p3.16xlarge", + "p3.2xlarge", + "p3.8xlarge", + "p3dn.24xlarge", + "p4d.24xlarge", + "p4de.24xlarge", + "p5.48xlarge", + "r3.2xlarge", + "r3.4xlarge", + "r3.8xlarge", + "r3.large", + "r3.xlarge", + "r4.16xlarge", + "r4.2xlarge", + "r4.4xlarge", + "r4.8xlarge", + "r4.large", + "r4.xlarge", + "r5.12xlarge", + "r5.16xlarge", + "r5.24xlarge", + "r5.2xlarge", + "r5.4xlarge", + "r5.8xlarge", + "r5.large", + "r5.metal", + "r5.xlarge", + "r5a.12xlarge", + "r5a.16xlarge", + "r5a.24xlarge", + "r5a.2xlarge", + "r5a.4xlarge", + "r5a.8xlarge", + "r5a.large", + "r5a.xlarge", + "r5ad.12xlarge", + "r5ad.16xlarge", + "r5ad.24xlarge", + "r5ad.2xlarge", + "r5ad.4xlarge", + "r5ad.8xlarge", + "r5ad.large", + "r5ad.xlarge", + "r5b.12xlarge", + "r5b.16xlarge", + "r5b.24xlarge", + "r5b.2xlarge", + "r5b.4xlarge", + "r5b.8xlarge", + "r5b.large", + "r5b.metal", + "r5b.xlarge", + "r5d.12xlarge", + "r5d.16xlarge", + "r5d.24xlarge", + "r5d.2xlarge", + "r5d.4xlarge", + "r5d.8xlarge", + "r5d.large", + "r5d.metal", + "r5d.xlarge", + "r5dn.12xlarge", + "r5dn.16xlarge", + "r5dn.24xlarge", + "r5dn.2xlarge", + "r5dn.4xlarge", + "r5dn.8xlarge", + "r5dn.large", + "r5dn.metal", + "r5dn.xlarge", + "r5n.12xlarge", + "r5n.16xlarge", + "r5n.24xlarge", + "r5n.2xlarge", + "r5n.4xlarge", + "r5n.8xlarge", + "r5n.large", + "r5n.metal", + "r5n.xlarge", + "r6a.12xlarge", + "r6a.16xlarge", + "r6a.24xlarge", + "r6a.2xlarge", + "r6a.32xlarge", + "r6a.48xlarge", + "r6a.4xlarge", + "r6a.8xlarge", + "r6a.large", + "r6a.metal", + "r6a.xlarge", + "r6g.12xlarge", + "r6g.16xlarge", + "r6g.2xlarge", + "r6g.4xlarge", + "r6g.8xlarge", + "r6g.large", + "r6g.medium", + "r6g.metal", + "r6g.xlarge", + "r6gd.12xlarge", + "r6gd.16xlarge", + "r6gd.2xlarge", + "r6gd.4xlarge", + "r6gd.8xlarge", + "r6gd.large", + "r6gd.medium", + "r6gd.metal", + "r6gd.xlarge", + "r6i.12xlarge", + "r6i.16xlarge", + "r6i.24xlarge", + "r6i.2xlarge", + "r6i.32xlarge", + "r6i.4xlarge", + "r6i.8xlarge", + "r6i.large", + "r6i.metal", + "r6i.xlarge", + "r6id.12xlarge", + "r6id.16xlarge", + "r6id.24xlarge", + "r6id.2xlarge", + "r6id.32xlarge", + "r6id.4xlarge", + "r6id.8xlarge", + "r6id.large", + "r6id.metal", + "r6id.xlarge", + "r6idn.12xlarge", + "r6idn.16xlarge", + "r6idn.24xlarge", + "r6idn.2xlarge", + "r6idn.32xlarge", + "r6idn.4xlarge", + "r6idn.8xlarge", + "r6idn.large", + "r6idn.metal", + "r6idn.xlarge", + "r6in.12xlarge", + "r6in.16xlarge", + "r6in.24xlarge", + "r6in.2xlarge", + "r6in.32xlarge", + "r6in.4xlarge", + "r6in.8xlarge", + "r6in.large", + "r6in.metal", + "r6in.xlarge", + "r7a.12xlarge", + "r7a.16xlarge", + "r7a.24xlarge", + "r7a.2xlarge", + "r7a.32xlarge", + "r7a.48xlarge", + "r7a.4xlarge", + "r7a.8xlarge", + "r7a.large", + "r7a.medium", + "r7a.metal-48xl", + "r7a.xlarge", + "r7g.12xlarge", + "r7g.16xlarge", + "r7g.2xlarge", + "r7g.4xlarge", + "r7g.8xlarge", + "r7g.large", + "r7g.medium", + "r7g.metal", + "r7g.xlarge", + "r7gd.12xlarge", + "r7gd.16xlarge", + "r7gd.2xlarge", + "r7gd.4xlarge", + "r7gd.8xlarge", + "r7gd.large", + "r7gd.medium", + "r7gd.xlarge", + "r7i.12xlarge", + "r7i.16xlarge", + "r7i.24xlarge", + "r7i.2xlarge", + "r7i.48xlarge", + "r7i.4xlarge", + "r7i.8xlarge", + "r7i.large", + "r7i.metal-24xl", + "r7i.metal-48xl", + "r7i.xlarge", + "r7iz.12xlarge", + "r7iz.16xlarge", + "r7iz.2xlarge", + "r7iz.32xlarge", + "r7iz.4xlarge", + "r7iz.8xlarge", + "r7iz.large", + "r7iz.metal-16xl", + "r7iz.metal-32xl", + "r7iz.xlarge", + "t1.micro", + "t2.2xlarge", + "t2.large", + "t2.medium", + "t2.micro", + "t2.nano", + "t2.small", + "t2.xlarge", + "t3.2xlarge", + "t3.large", + "t3.medium", + "t3.micro", + "t3.nano", + "t3.small", + "t3.xlarge", + "t3a.2xlarge", + "t3a.large", + "t3a.medium", + "t3a.micro", + "t3a.nano", + "t3a.small", + "t3a.xlarge", + "t4g.2xlarge", + "t4g.large", + "t4g.medium", + "t4g.micro", + "t4g.nano", + "t4g.small", + "t4g.xlarge", + "trn1.2xlarge", + "trn1.32xlarge", + "trn1n.32xlarge", + "u-12tb1.112xlarge", + "u-18tb1.112xlarge", + "u-24tb1.112xlarge", + "u-3tb1.56xlarge", + "u-6tb1.112xlarge", + "u-6tb1.56xlarge", + "u-9tb1.112xlarge", + "vt1.24xlarge", + "vt1.3xlarge", + "vt1.6xlarge", + "x1.16xlarge", + "x1.32xlarge", + "x1e.16xlarge", + "x1e.2xlarge", + "x1e.32xlarge", + "x1e.4xlarge", + "x1e.8xlarge", + "x1e.xlarge", + "x2gd.12xlarge", + "x2gd.16xlarge", + "x2gd.2xlarge", + "x2gd.4xlarge", + "x2gd.8xlarge", + "x2gd.large", + "x2gd.medium", + "x2gd.metal", + "x2gd.xlarge", + "x2idn.16xlarge", + "x2idn.24xlarge", + "x2idn.32xlarge", + "x2idn.metal", + "x2iedn.16xlarge", + "x2iedn.24xlarge", + "x2iedn.2xlarge", + "x2iedn.32xlarge", + "x2iedn.4xlarge", + "x2iedn.8xlarge", + "x2iedn.metal", + "x2iedn.xlarge", + "x2iezn.12xlarge", + "x2iezn.2xlarge", + "x2iezn.4xlarge", + "x2iezn.6xlarge", + "x2iezn.8xlarge", + "x2iezn.metal", + "z1d.12xlarge", + "z1d.2xlarge", + "z1d.3xlarge", + "z1d.6xlarge", + "z1d.large", + "z1d.metal", + "z1d.xlarge", + ], + "us-east-1": [ + "a1.2xlarge", + "a1.4xlarge", + "a1.large", + "a1.medium", + "a1.metal", + "a1.xlarge", + "c1.medium", + "c1.xlarge", + "c3.2xlarge", + "c3.4xlarge", + "c3.8xlarge", + "c3.large", + "c3.xlarge", + "c4.2xlarge", + "c4.4xlarge", + "c4.8xlarge", + "c4.large", + "c4.xlarge", + "c5.12xlarge", + "c5.18xlarge", + "c5.24xlarge", + "c5.2xlarge", + "c5.4xlarge", + "c5.9xlarge", + "c5.large", + "c5.metal", + "c5.xlarge", + "c5a.12xlarge", + "c5a.16xlarge", + "c5a.24xlarge", + "c5a.2xlarge", + "c5a.4xlarge", + "c5a.8xlarge", + "c5a.large", + "c5a.xlarge", + "c5ad.12xlarge", + "c5ad.16xlarge", + "c5ad.24xlarge", + "c5ad.2xlarge", + "c5ad.4xlarge", + "c5ad.8xlarge", + "c5ad.large", + "c5ad.xlarge", + "c5d.12xlarge", + "c5d.18xlarge", + "c5d.24xlarge", + "c5d.2xlarge", + "c5d.4xlarge", + "c5d.9xlarge", + "c5d.large", + "c5d.metal", + "c5d.xlarge", + "c5n.18xlarge", + "c5n.2xlarge", + "c5n.4xlarge", + "c5n.9xlarge", + "c5n.large", + "c5n.metal", + "c5n.xlarge", + "c6a.12xlarge", + "c6a.16xlarge", + "c6a.24xlarge", + "c6a.2xlarge", + "c6a.32xlarge", + "c6a.48xlarge", + "c6a.4xlarge", + "c6a.8xlarge", + "c6a.large", + "c6a.metal", + "c6a.xlarge", + "c6g.12xlarge", + "c6g.16xlarge", + "c6g.2xlarge", + "c6g.4xlarge", + "c6g.8xlarge", + "c6g.large", + "c6g.medium", + "c6g.metal", + "c6g.xlarge", + "c6gd.12xlarge", + "c6gd.16xlarge", + "c6gd.2xlarge", + "c6gd.4xlarge", + "c6gd.8xlarge", + "c6gd.large", + "c6gd.medium", + "c6gd.metal", + "c6gd.xlarge", + "c6gn.12xlarge", + "c6gn.16xlarge", + "c6gn.2xlarge", + "c6gn.4xlarge", + "c6gn.8xlarge", + "c6gn.large", + "c6gn.medium", + "c6gn.xlarge", + "c6i.12xlarge", + "c6i.16xlarge", + "c6i.24xlarge", + "c6i.2xlarge", + "c6i.32xlarge", + "c6i.4xlarge", + "c6i.8xlarge", + "c6i.large", + "c6i.metal", + "c6i.xlarge", + "c6id.12xlarge", + "c6id.16xlarge", + "c6id.24xlarge", + "c6id.2xlarge", + "c6id.32xlarge", + "c6id.4xlarge", + "c6id.8xlarge", + "c6id.large", + "c6id.metal", + "c6id.xlarge", + "c6in.12xlarge", + "c6in.16xlarge", + "c6in.24xlarge", + "c6in.2xlarge", + "c6in.32xlarge", + "c6in.4xlarge", + "c6in.8xlarge", + "c6in.large", + "c6in.metal", + "c6in.xlarge", + "c7a.12xlarge", + "c7a.16xlarge", + "c7a.24xlarge", + "c7a.2xlarge", + "c7a.32xlarge", + "c7a.48xlarge", + "c7a.4xlarge", + "c7a.8xlarge", + "c7a.large", + "c7a.medium", + "c7a.metal-48xl", + "c7a.xlarge", + "c7g.12xlarge", + "c7g.16xlarge", + "c7g.2xlarge", + "c7g.4xlarge", + "c7g.8xlarge", + "c7g.large", + "c7g.medium", + "c7g.metal", + "c7g.xlarge", + "c7gd.12xlarge", + "c7gd.16xlarge", + "c7gd.2xlarge", + "c7gd.4xlarge", + "c7gd.8xlarge", + "c7gd.large", + "c7gd.medium", + "c7gd.xlarge", + "c7gn.12xlarge", + "c7gn.16xlarge", + "c7gn.2xlarge", + "c7gn.4xlarge", + "c7gn.8xlarge", + "c7gn.large", + "c7gn.medium", + "c7gn.xlarge", + "c7i.12xlarge", + "c7i.16xlarge", + "c7i.24xlarge", + "c7i.2xlarge", + "c7i.48xlarge", + "c7i.4xlarge", + "c7i.8xlarge", + "c7i.large", + "c7i.metal-24xl", + "c7i.metal-48xl", + "c7i.xlarge", + "cr1.8xlarge", + "d2.2xlarge", + "d2.4xlarge", + "d2.8xlarge", + "d2.xlarge", + "d3.2xlarge", + "d3.4xlarge", + "d3.8xlarge", + "d3.xlarge", + "d3en.12xlarge", + "d3en.2xlarge", + "d3en.4xlarge", + "d3en.6xlarge", + "d3en.8xlarge", + "d3en.xlarge", + "dl1.24xlarge", + "f1.16xlarge", + "f1.2xlarge", + "f1.4xlarge", + "g2.2xlarge", + "g2.8xlarge", + "g3.16xlarge", + "g3.4xlarge", + "g3.8xlarge", + "g3s.xlarge", + "g4ad.16xlarge", + "g4ad.2xlarge", + "g4ad.4xlarge", + "g4ad.8xlarge", + "g4ad.xlarge", + "g4dn.12xlarge", + "g4dn.16xlarge", + "g4dn.2xlarge", + "g4dn.4xlarge", + "g4dn.8xlarge", + "g4dn.metal", + "g4dn.xlarge", + "g5.12xlarge", + "g5.16xlarge", + "g5.24xlarge", + "g5.2xlarge", + "g5.48xlarge", + "g5.4xlarge", + "g5.8xlarge", + "g5.xlarge", + "g5g.16xlarge", + "g5g.2xlarge", + "g5g.4xlarge", + "g5g.8xlarge", + "g5g.metal", + "g5g.xlarge", + "h1.16xlarge", + "h1.2xlarge", + "h1.4xlarge", + "h1.8xlarge", + "hpc7g.16xlarge", + "hpc7g.4xlarge", + "hpc7g.8xlarge", + "i2.2xlarge", + "i2.4xlarge", + "i2.8xlarge", + "i2.xlarge", + "i3.16xlarge", + "i3.2xlarge", + "i3.4xlarge", + "i3.8xlarge", + "i3.large", + "i3.metal", + "i3.xlarge", + "i3en.12xlarge", + "i3en.24xlarge", + "i3en.2xlarge", + "i3en.3xlarge", + "i3en.6xlarge", + "i3en.large", + "i3en.metal", + "i3en.xlarge", + "i4g.16xlarge", + "i4g.2xlarge", + "i4g.4xlarge", + "i4g.8xlarge", + "i4g.large", + "i4g.xlarge", + "i4i.12xlarge", + "i4i.16xlarge", + "i4i.24xlarge", + "i4i.2xlarge", + "i4i.32xlarge", + "i4i.4xlarge", + "i4i.8xlarge", + "i4i.large", + "i4i.metal", + "i4i.xlarge", + "im4gn.16xlarge", + "im4gn.2xlarge", + "im4gn.4xlarge", + "im4gn.8xlarge", + "im4gn.large", + "im4gn.xlarge", + "inf1.24xlarge", + "inf1.2xlarge", + "inf1.6xlarge", + "inf1.xlarge", + "inf2.24xlarge", + "inf2.48xlarge", + "inf2.8xlarge", + "inf2.xlarge", + "is4gen.2xlarge", + "is4gen.4xlarge", + "is4gen.8xlarge", + "is4gen.large", + "is4gen.medium", + "is4gen.xlarge", + "m1.large", + "m1.medium", + "m1.xlarge", + "m2.2xlarge", + "m2.4xlarge", + "m2.xlarge", + "m3.2xlarge", + "m3.large", + "m3.medium", + "m3.xlarge", + "m4.10xlarge", + "m4.16xlarge", + "m4.2xlarge", + "m4.4xlarge", + "m4.large", + "m4.xlarge", + "m5.12xlarge", + "m5.16xlarge", + "m5.24xlarge", + "m5.2xlarge", + "m5.4xlarge", + "m5.8xlarge", + "m5.large", + "m5.metal", + "m5.xlarge", + "m5a.12xlarge", + "m5a.16xlarge", + "m5a.24xlarge", + "m5a.2xlarge", + "m5a.4xlarge", + "m5a.8xlarge", + "m5a.large", + "m5a.xlarge", + "m5ad.12xlarge", + "m5ad.16xlarge", + "m5ad.24xlarge", + "m5ad.2xlarge", + "m5ad.4xlarge", + "m5ad.8xlarge", + "m5ad.large", + "m5ad.xlarge", + "m5d.12xlarge", + "m5d.16xlarge", + "m5d.24xlarge", + "m5d.2xlarge", + "m5d.4xlarge", + "m5d.8xlarge", + "m5d.large", + "m5d.metal", + "m5d.xlarge", + "m5dn.12xlarge", + "m5dn.16xlarge", + "m5dn.24xlarge", + "m5dn.2xlarge", + "m5dn.4xlarge", + "m5dn.8xlarge", + "m5dn.large", + "m5dn.metal", + "m5dn.xlarge", + "m5n.12xlarge", + "m5n.16xlarge", + "m5n.24xlarge", + "m5n.2xlarge", + "m5n.4xlarge", + "m5n.8xlarge", + "m5n.large", + "m5n.metal", + "m5n.xlarge", + "m5zn.12xlarge", + "m5zn.2xlarge", + "m5zn.3xlarge", + "m5zn.6xlarge", + "m5zn.large", + "m5zn.metal", + "m5zn.xlarge", + "m6a.12xlarge", + "m6a.16xlarge", + "m6a.24xlarge", + "m6a.2xlarge", + "m6a.32xlarge", + "m6a.48xlarge", + "m6a.4xlarge", + "m6a.8xlarge", + "m6a.large", + "m6a.metal", + "m6a.xlarge", + "m6g.12xlarge", + "m6g.16xlarge", + "m6g.2xlarge", + "m6g.4xlarge", + "m6g.8xlarge", + "m6g.large", + "m6g.medium", + "m6g.metal", + "m6g.xlarge", + "m6gd.12xlarge", + "m6gd.16xlarge", + "m6gd.2xlarge", + "m6gd.4xlarge", + "m6gd.8xlarge", + "m6gd.large", + "m6gd.medium", + "m6gd.metal", + "m6gd.xlarge", + "m6i.12xlarge", + "m6i.16xlarge", + "m6i.24xlarge", + "m6i.2xlarge", + "m6i.32xlarge", + "m6i.4xlarge", + "m6i.8xlarge", + "m6i.large", + "m6i.metal", + "m6i.xlarge", + "m6id.12xlarge", + "m6id.16xlarge", + "m6id.24xlarge", + "m6id.2xlarge", + "m6id.32xlarge", + "m6id.4xlarge", + "m6id.8xlarge", + "m6id.large", + "m6id.metal", + "m6id.xlarge", + "m6idn.12xlarge", + "m6idn.16xlarge", + "m6idn.24xlarge", + "m6idn.2xlarge", + "m6idn.32xlarge", + "m6idn.4xlarge", + "m6idn.8xlarge", + "m6idn.large", + "m6idn.metal", + "m6idn.xlarge", + "m6in.12xlarge", + "m6in.16xlarge", + "m6in.24xlarge", + "m6in.2xlarge", + "m6in.32xlarge", + "m6in.4xlarge", + "m6in.8xlarge", + "m6in.large", + "m6in.metal", + "m6in.xlarge", + "m7a.12xlarge", + "m7a.16xlarge", + "m7a.24xlarge", + "m7a.2xlarge", + "m7a.32xlarge", + "m7a.48xlarge", + "m7a.4xlarge", + "m7a.8xlarge", + "m7a.large", + "m7a.medium", + "m7a.metal-48xl", + "m7a.xlarge", + "m7g.12xlarge", + "m7g.16xlarge", + "m7g.2xlarge", + "m7g.4xlarge", + "m7g.8xlarge", + "m7g.large", + "m7g.medium", + "m7g.metal", + "m7g.xlarge", + "m7gd.12xlarge", + "m7gd.16xlarge", + "m7gd.2xlarge", + "m7gd.4xlarge", + "m7gd.8xlarge", + "m7gd.large", + "m7gd.medium", + "m7gd.xlarge", + "m7i-flex.2xlarge", + "m7i-flex.4xlarge", + "m7i-flex.8xlarge", + "m7i-flex.large", + "m7i-flex.xlarge", + "m7i.12xlarge", + "m7i.16xlarge", + "m7i.24xlarge", + "m7i.2xlarge", + "m7i.48xlarge", + "m7i.4xlarge", + "m7i.8xlarge", + "m7i.large", + "m7i.metal-24xl", + "m7i.metal-48xl", + "m7i.xlarge", + "p2.16xlarge", + "p2.8xlarge", + "p2.xlarge", + "p3.16xlarge", + "p3.2xlarge", + "p3.8xlarge", + "p3dn.24xlarge", + "p4d.24xlarge", + "p4de.24xlarge", + "p5.48xlarge", + "r3.2xlarge", + "r3.4xlarge", + "r3.8xlarge", + "r3.large", + "r3.xlarge", + "r4.16xlarge", + "r4.2xlarge", + "r4.4xlarge", + "r4.8xlarge", + "r4.large", + "r4.xlarge", + "r5.12xlarge", + "r5.16xlarge", + "r5.24xlarge", + "r5.2xlarge", + "r5.4xlarge", + "r5.8xlarge", + "r5.large", + "r5.metal", + "r5.xlarge", + "r5a.12xlarge", + "r5a.16xlarge", + "r5a.24xlarge", + "r5a.2xlarge", + "r5a.4xlarge", + "r5a.8xlarge", + "r5a.large", + "r5a.xlarge", + "r5ad.12xlarge", + "r5ad.16xlarge", + "r5ad.24xlarge", + "r5ad.2xlarge", + "r5ad.4xlarge", + "r5ad.8xlarge", + "r5ad.large", + "r5ad.xlarge", + "r5b.12xlarge", + "r5b.16xlarge", + "r5b.24xlarge", + "r5b.2xlarge", + "r5b.4xlarge", + "r5b.8xlarge", + "r5b.large", + "r5b.metal", + "r5b.xlarge", + "r5d.12xlarge", + "r5d.16xlarge", + "r5d.24xlarge", + "r5d.2xlarge", + "r5d.4xlarge", + "r5d.8xlarge", + "r5d.large", + "r5d.metal", + "r5d.xlarge", + "r5dn.12xlarge", + "r5dn.16xlarge", + "r5dn.24xlarge", + "r5dn.2xlarge", + "r5dn.4xlarge", + "r5dn.8xlarge", + "r5dn.large", + "r5dn.metal", + "r5dn.xlarge", + "r5n.12xlarge", + "r5n.16xlarge", + "r5n.24xlarge", + "r5n.2xlarge", + "r5n.4xlarge", + "r5n.8xlarge", + "r5n.large", + "r5n.metal", + "r5n.xlarge", + "r6a.12xlarge", + "r6a.16xlarge", + "r6a.24xlarge", + "r6a.2xlarge", + "r6a.32xlarge", + "r6a.48xlarge", + "r6a.4xlarge", + "r6a.8xlarge", + "r6a.large", + "r6a.metal", + "r6a.xlarge", + "r6g.12xlarge", + "r6g.16xlarge", + "r6g.2xlarge", + "r6g.4xlarge", + "r6g.8xlarge", + "r6g.large", + "r6g.medium", + "r6g.metal", + "r6g.xlarge", + "r6gd.12xlarge", + "r6gd.16xlarge", + "r6gd.2xlarge", + "r6gd.4xlarge", + "r6gd.8xlarge", + "r6gd.large", + "r6gd.medium", + "r6gd.metal", + "r6gd.xlarge", + "r6i.12xlarge", + "r6i.16xlarge", + "r6i.24xlarge", + "r6i.2xlarge", + "r6i.32xlarge", + "r6i.4xlarge", + "r6i.8xlarge", + "r6i.large", + "r6i.metal", + "r6i.xlarge", + "r6id.12xlarge", + "r6id.16xlarge", + "r6id.24xlarge", + "r6id.2xlarge", + "r6id.32xlarge", + "r6id.4xlarge", + "r6id.8xlarge", + "r6id.large", + "r6id.metal", + "r6id.xlarge", + "r6idn.12xlarge", + "r6idn.16xlarge", + "r6idn.24xlarge", + "r6idn.2xlarge", + "r6idn.32xlarge", + "r6idn.4xlarge", + "r6idn.8xlarge", + "r6idn.large", + "r6idn.metal", + "r6idn.xlarge", + "r6in.12xlarge", + "r6in.16xlarge", + "r6in.24xlarge", + "r6in.2xlarge", + "r6in.32xlarge", + "r6in.4xlarge", + "r6in.8xlarge", + "r6in.large", + "r6in.metal", + "r6in.xlarge", + "r7a.12xlarge", + "r7a.16xlarge", + "r7a.24xlarge", + "r7a.2xlarge", + "r7a.32xlarge", + "r7a.48xlarge", + "r7a.4xlarge", + "r7a.8xlarge", + "r7a.large", + "r7a.medium", + "r7a.metal-48xl", + "r7a.xlarge", + "r7g.12xlarge", + "r7g.16xlarge", + "r7g.2xlarge", + "r7g.4xlarge", + "r7g.8xlarge", + "r7g.large", + "r7g.medium", + "r7g.metal", + "r7g.xlarge", + "r7gd.12xlarge", + "r7gd.16xlarge", + "r7gd.2xlarge", + "r7gd.4xlarge", + "r7gd.8xlarge", + "r7gd.large", + "r7gd.medium", + "r7gd.xlarge", + "r7i.12xlarge", + "r7i.16xlarge", + "r7i.24xlarge", + "r7i.2xlarge", + "r7i.48xlarge", + "r7i.4xlarge", + "r7i.8xlarge", + "r7i.large", + "r7i.metal-24xl", + "r7i.metal-48xl", + "r7i.xlarge", + "r7iz.12xlarge", + "r7iz.16xlarge", + "r7iz.2xlarge", + "r7iz.32xlarge", + "r7iz.4xlarge", + "r7iz.8xlarge", + "r7iz.large", + "r7iz.metal-16xl", + "r7iz.metal-32xl", + "r7iz.xlarge", + "t1.micro", + "t2.2xlarge", + "t2.large", + "t2.medium", + "t2.micro", + "t2.nano", + "t2.small", + "t2.xlarge", + "t3.2xlarge", + "t3.large", + "t3.medium", + "t3.micro", + "t3.nano", + "t3.small", + "t3.xlarge", + "t3a.2xlarge", + "t3a.large", + "t3a.medium", + "t3a.micro", + "t3a.nano", + "t3a.small", + "t3a.xlarge", + "t4g.2xlarge", + "t4g.large", + "t4g.medium", + "t4g.micro", + "t4g.nano", + "t4g.small", + "t4g.xlarge", + "trn1.2xlarge", + "trn1.32xlarge", + "trn1n.32xlarge", + "u-12tb1.112xlarge", + "u-18tb1.112xlarge", + "u-24tb1.112xlarge", + "u-3tb1.56xlarge", + "u-6tb1.112xlarge", + "u-6tb1.56xlarge", + "u-9tb1.112xlarge", + "vt1.24xlarge", + "vt1.3xlarge", + "vt1.6xlarge", + "x1.16xlarge", + "x1.32xlarge", + "x1e.16xlarge", + "x1e.2xlarge", + "x1e.32xlarge", + "x1e.4xlarge", + "x1e.8xlarge", + "x1e.xlarge", + "x2gd.12xlarge", + "x2gd.16xlarge", + "x2gd.2xlarge", + "x2gd.4xlarge", + "x2gd.8xlarge", + "x2gd.large", + "x2gd.medium", + "x2gd.metal", + "x2gd.xlarge", + "x2idn.16xlarge", + "x2idn.24xlarge", + "x2idn.32xlarge", + "x2idn.metal", + "x2iedn.16xlarge", + "x2iedn.24xlarge", + "x2iedn.2xlarge", + "x2iedn.32xlarge", + "x2iedn.4xlarge", + "x2iedn.8xlarge", + "x2iedn.metal", + "x2iedn.xlarge", + "x2iezn.12xlarge", + "x2iezn.2xlarge", + "x2iezn.4xlarge", + "x2iezn.6xlarge", + "x2iezn.8xlarge", + "x2iezn.metal", + "z1d.12xlarge", + "z1d.2xlarge", + "z1d.3xlarge", + "z1d.6xlarge", + "z1d.large", + "z1d.metal", + "z1d.xlarge", + ], + "us-east-2": [ + "a1.2xlarge", + "a1.4xlarge", + "a1.large", + "a1.medium", + "a1.metal", + "a1.xlarge", + "c4.2xlarge", + "c4.4xlarge", + "c4.8xlarge", + "c4.large", + "c4.xlarge", + "c5.12xlarge", + "c5.18xlarge", + "c5.24xlarge", + "c5.2xlarge", + "c5.4xlarge", + "c5.9xlarge", + "c5.large", + "c5.metal", + "c5.xlarge", + "c5a.12xlarge", + "c5a.16xlarge", + "c5a.24xlarge", + "c5a.2xlarge", + "c5a.4xlarge", + "c5a.8xlarge", + "c5a.large", + "c5a.xlarge", + "c5ad.12xlarge", + "c5ad.16xlarge", + "c5ad.24xlarge", + "c5ad.2xlarge", + "c5ad.4xlarge", + "c5ad.8xlarge", + "c5ad.large", + "c5ad.xlarge", + "c5d.12xlarge", + "c5d.18xlarge", + "c5d.24xlarge", + "c5d.2xlarge", + "c5d.4xlarge", + "c5d.9xlarge", + "c5d.large", + "c5d.metal", + "c5d.xlarge", + "c5n.18xlarge", + "c5n.2xlarge", + "c5n.4xlarge", + "c5n.9xlarge", + "c5n.large", + "c5n.metal", + "c5n.xlarge", + "c6a.12xlarge", + "c6a.16xlarge", + "c6a.24xlarge", + "c6a.2xlarge", + "c6a.32xlarge", + "c6a.48xlarge", + "c6a.4xlarge", + "c6a.8xlarge", + "c6a.large", + "c6a.metal", + "c6a.xlarge", + "c6g.12xlarge", + "c6g.16xlarge", + "c6g.2xlarge", + "c6g.4xlarge", + "c6g.8xlarge", + "c6g.large", + "c6g.medium", + "c6g.metal", + "c6g.xlarge", + "c6gd.12xlarge", + "c6gd.16xlarge", + "c6gd.2xlarge", + "c6gd.4xlarge", + "c6gd.8xlarge", + "c6gd.large", + "c6gd.medium", + "c6gd.metal", + "c6gd.xlarge", + "c6gn.12xlarge", + "c6gn.16xlarge", + "c6gn.2xlarge", + "c6gn.4xlarge", + "c6gn.8xlarge", + "c6gn.large", + "c6gn.medium", + "c6gn.xlarge", + "c6i.12xlarge", + "c6i.16xlarge", + "c6i.24xlarge", + "c6i.2xlarge", + "c6i.32xlarge", + "c6i.4xlarge", + "c6i.8xlarge", + "c6i.large", + "c6i.metal", + "c6i.xlarge", + "c6id.12xlarge", + "c6id.16xlarge", + "c6id.24xlarge", + "c6id.2xlarge", + "c6id.32xlarge", + "c6id.4xlarge", + "c6id.8xlarge", + "c6id.large", + "c6id.metal", + "c6id.xlarge", + "c6in.12xlarge", + "c6in.16xlarge", + "c6in.24xlarge", + "c6in.2xlarge", + "c6in.32xlarge", + "c6in.4xlarge", + "c6in.8xlarge", + "c6in.large", + "c6in.metal", + "c6in.xlarge", + "c7a.12xlarge", + "c7a.16xlarge", + "c7a.24xlarge", + "c7a.2xlarge", + "c7a.32xlarge", + "c7a.48xlarge", + "c7a.4xlarge", + "c7a.8xlarge", + "c7a.large", + "c7a.medium", + "c7a.metal-48xl", + "c7a.xlarge", + "c7g.12xlarge", + "c7g.16xlarge", + "c7g.2xlarge", + "c7g.4xlarge", + "c7g.8xlarge", + "c7g.large", + "c7g.medium", + "c7g.metal", + "c7g.xlarge", + "c7gd.12xlarge", + "c7gd.16xlarge", + "c7gd.2xlarge", + "c7gd.4xlarge", + "c7gd.8xlarge", + "c7gd.large", + "c7gd.medium", + "c7gd.xlarge", + "c7gn.12xlarge", + "c7gn.16xlarge", + "c7gn.2xlarge", + "c7gn.4xlarge", + "c7gn.8xlarge", + "c7gn.large", + "c7gn.medium", + "c7gn.xlarge", + "c7i.12xlarge", + "c7i.16xlarge", + "c7i.24xlarge", + "c7i.2xlarge", + "c7i.48xlarge", + "c7i.4xlarge", + "c7i.8xlarge", + "c7i.large", + "c7i.metal-24xl", + "c7i.metal-48xl", + "c7i.xlarge", + "d2.2xlarge", + "d2.4xlarge", + "d2.8xlarge", + "d2.xlarge", + "d3.2xlarge", + "d3.4xlarge", + "d3.8xlarge", + "d3.xlarge", + "g3.16xlarge", + "g3.4xlarge", + "g3.8xlarge", + "g3s.xlarge", + "g4ad.16xlarge", + "g4ad.2xlarge", + "g4ad.4xlarge", + "g4ad.8xlarge", + "g4ad.xlarge", + "g4dn.12xlarge", + "g4dn.16xlarge", + "g4dn.2xlarge", + "g4dn.4xlarge", + "g4dn.8xlarge", + "g4dn.metal", + "g4dn.xlarge", + "g5.12xlarge", + "g5.16xlarge", + "g5.24xlarge", + "g5.2xlarge", + "g5.48xlarge", + "g5.4xlarge", + "g5.8xlarge", + "g5.xlarge", + "h1.16xlarge", + "h1.2xlarge", + "h1.4xlarge", + "h1.8xlarge", + "hpc6a.48xlarge", + "hpc6id.32xlarge", + "hpc7a.12xlarge", + "hpc7a.24xlarge", + "hpc7a.48xlarge", + "hpc7a.96xlarge", + "i2.2xlarge", + "i2.4xlarge", + "i2.8xlarge", + "i2.xlarge", + "i3.16xlarge", + "i3.2xlarge", + "i3.4xlarge", + "i3.8xlarge", + "i3.large", + "i3.metal", + "i3.xlarge", + "i3en.12xlarge", + "i3en.24xlarge", + "i3en.2xlarge", + "i3en.3xlarge", + "i3en.6xlarge", + "i3en.large", + "i3en.metal", + "i3en.xlarge", + "i4g.16xlarge", + "i4g.2xlarge", + "i4g.4xlarge", + "i4g.8xlarge", + "i4g.large", + "i4g.xlarge", + "i4i.12xlarge", + "i4i.16xlarge", + "i4i.24xlarge", + "i4i.2xlarge", + "i4i.32xlarge", + "i4i.4xlarge", + "i4i.8xlarge", + "i4i.large", + "i4i.metal", + "i4i.xlarge", + "im4gn.16xlarge", + "im4gn.2xlarge", + "im4gn.4xlarge", + "im4gn.8xlarge", + "im4gn.large", + "im4gn.xlarge", + "inf1.24xlarge", + "inf1.2xlarge", + "inf1.6xlarge", + "inf1.xlarge", + "inf2.24xlarge", + "inf2.48xlarge", + "inf2.8xlarge", + "inf2.xlarge", + "is4gen.2xlarge", + "is4gen.4xlarge", + "is4gen.8xlarge", + "is4gen.large", + "is4gen.medium", + "is4gen.xlarge", + "m4.10xlarge", + "m4.16xlarge", + "m4.2xlarge", + "m4.4xlarge", + "m4.large", + "m4.xlarge", + "m5.12xlarge", + "m5.16xlarge", + "m5.24xlarge", + "m5.2xlarge", + "m5.4xlarge", + "m5.8xlarge", + "m5.large", + "m5.metal", + "m5.xlarge", + "m5a.12xlarge", + "m5a.16xlarge", + "m5a.24xlarge", + "m5a.2xlarge", + "m5a.4xlarge", + "m5a.8xlarge", + "m5a.large", + "m5a.xlarge", + "m5ad.12xlarge", + "m5ad.16xlarge", + "m5ad.24xlarge", + "m5ad.2xlarge", + "m5ad.4xlarge", + "m5ad.8xlarge", + "m5ad.large", + "m5ad.xlarge", + "m5d.12xlarge", + "m5d.16xlarge", + "m5d.24xlarge", + "m5d.2xlarge", + "m5d.4xlarge", + "m5d.8xlarge", + "m5d.large", + "m5d.metal", + "m5d.xlarge", + "m5dn.12xlarge", + "m5dn.16xlarge", + "m5dn.24xlarge", + "m5dn.2xlarge", + "m5dn.4xlarge", + "m5dn.8xlarge", + "m5dn.large", + "m5dn.metal", + "m5dn.xlarge", + "m5n.12xlarge", + "m5n.16xlarge", + "m5n.24xlarge", + "m5n.2xlarge", + "m5n.4xlarge", + "m5n.8xlarge", + "m5n.large", + "m5n.metal", + "m5n.xlarge", + "m5zn.12xlarge", + "m5zn.2xlarge", + "m5zn.3xlarge", + "m5zn.6xlarge", + "m5zn.large", + "m5zn.metal", + "m5zn.xlarge", + "m6a.12xlarge", + "m6a.16xlarge", + "m6a.24xlarge", + "m6a.2xlarge", + "m6a.32xlarge", + "m6a.48xlarge", + "m6a.4xlarge", + "m6a.8xlarge", + "m6a.large", + "m6a.metal", + "m6a.xlarge", + "m6g.12xlarge", + "m6g.16xlarge", + "m6g.2xlarge", + "m6g.4xlarge", + "m6g.8xlarge", + "m6g.large", + "m6g.medium", + "m6g.metal", + "m6g.xlarge", + "m6gd.12xlarge", + "m6gd.16xlarge", + "m6gd.2xlarge", + "m6gd.4xlarge", + "m6gd.8xlarge", + "m6gd.large", + "m6gd.medium", + "m6gd.metal", + "m6gd.xlarge", + "m6i.12xlarge", + "m6i.16xlarge", + "m6i.24xlarge", + "m6i.2xlarge", + "m6i.32xlarge", + "m6i.4xlarge", + "m6i.8xlarge", + "m6i.large", + "m6i.metal", + "m6i.xlarge", + "m6id.12xlarge", + "m6id.16xlarge", + "m6id.24xlarge", + "m6id.2xlarge", + "m6id.32xlarge", + "m6id.4xlarge", + "m6id.8xlarge", + "m6id.large", + "m6id.metal", + "m6id.xlarge", + "m6idn.12xlarge", + "m6idn.16xlarge", + "m6idn.24xlarge", + "m6idn.2xlarge", + "m6idn.32xlarge", + "m6idn.4xlarge", + "m6idn.8xlarge", + "m6idn.large", + "m6idn.metal", + "m6idn.xlarge", + "m6in.12xlarge", + "m6in.16xlarge", + "m6in.24xlarge", + "m6in.2xlarge", + "m6in.32xlarge", + "m6in.4xlarge", + "m6in.8xlarge", + "m6in.large", + "m6in.metal", + "m6in.xlarge", + "m7a.12xlarge", + "m7a.16xlarge", + "m7a.24xlarge", + "m7a.2xlarge", + "m7a.32xlarge", + "m7a.48xlarge", + "m7a.4xlarge", + "m7a.8xlarge", + "m7a.large", + "m7a.medium", + "m7a.metal-48xl", + "m7a.xlarge", + "m7g.12xlarge", + "m7g.16xlarge", + "m7g.2xlarge", + "m7g.4xlarge", + "m7g.8xlarge", + "m7g.large", + "m7g.medium", + "m7g.metal", + "m7g.xlarge", + "m7gd.12xlarge", + "m7gd.16xlarge", + "m7gd.2xlarge", + "m7gd.4xlarge", + "m7gd.8xlarge", + "m7gd.large", + "m7gd.medium", + "m7gd.xlarge", + "m7i-flex.2xlarge", + "m7i-flex.4xlarge", + "m7i-flex.8xlarge", + "m7i-flex.large", + "m7i-flex.xlarge", + "m7i.12xlarge", + "m7i.16xlarge", + "m7i.24xlarge", + "m7i.2xlarge", + "m7i.48xlarge", + "m7i.4xlarge", + "m7i.8xlarge", + "m7i.large", + "m7i.metal-24xl", + "m7i.metal-48xl", + "m7i.xlarge", + "p2.16xlarge", + "p2.8xlarge", + "p2.xlarge", + "p3.16xlarge", + "p3.2xlarge", + "p3.8xlarge", + "p4d.24xlarge", + "p5.48xlarge", + "r3.2xlarge", + "r3.4xlarge", + "r3.8xlarge", + "r3.large", + "r3.xlarge", + "r4.16xlarge", + "r4.2xlarge", + "r4.4xlarge", + "r4.8xlarge", + "r4.large", + "r4.xlarge", + "r5.12xlarge", + "r5.16xlarge", + "r5.24xlarge", + "r5.2xlarge", + "r5.4xlarge", + "r5.8xlarge", + "r5.large", + "r5.metal", + "r5.xlarge", + "r5a.12xlarge", + "r5a.16xlarge", + "r5a.24xlarge", + "r5a.2xlarge", + "r5a.4xlarge", + "r5a.8xlarge", + "r5a.large", + "r5a.xlarge", + "r5ad.12xlarge", + "r5ad.16xlarge", + "r5ad.24xlarge", + "r5ad.2xlarge", + "r5ad.4xlarge", + "r5ad.8xlarge", + "r5ad.large", + "r5ad.xlarge", + "r5b.12xlarge", + "r5b.16xlarge", + "r5b.24xlarge", + "r5b.2xlarge", + "r5b.4xlarge", + "r5b.8xlarge", + "r5b.large", + "r5b.metal", + "r5b.xlarge", + "r5d.12xlarge", + "r5d.16xlarge", + "r5d.24xlarge", + "r5d.2xlarge", + "r5d.4xlarge", + "r5d.8xlarge", + "r5d.large", + "r5d.metal", + "r5d.xlarge", + "r5dn.12xlarge", + "r5dn.16xlarge", + "r5dn.24xlarge", + "r5dn.2xlarge", + "r5dn.4xlarge", + "r5dn.8xlarge", + "r5dn.large", + "r5dn.metal", + "r5dn.xlarge", + "r5n.12xlarge", + "r5n.16xlarge", + "r5n.24xlarge", + "r5n.2xlarge", + "r5n.4xlarge", + "r5n.8xlarge", + "r5n.large", + "r5n.metal", + "r5n.xlarge", + "r6a.12xlarge", + "r6a.16xlarge", + "r6a.24xlarge", + "r6a.2xlarge", + "r6a.32xlarge", + "r6a.48xlarge", + "r6a.4xlarge", + "r6a.8xlarge", + "r6a.large", + "r6a.metal", + "r6a.xlarge", + "r6g.12xlarge", + "r6g.16xlarge", + "r6g.2xlarge", + "r6g.4xlarge", + "r6g.8xlarge", + "r6g.large", + "r6g.medium", + "r6g.metal", + "r6g.xlarge", + "r6gd.12xlarge", + "r6gd.16xlarge", + "r6gd.2xlarge", + "r6gd.4xlarge", + "r6gd.8xlarge", + "r6gd.large", + "r6gd.medium", + "r6gd.metal", + "r6gd.xlarge", + "r6i.12xlarge", + "r6i.16xlarge", + "r6i.24xlarge", + "r6i.2xlarge", + "r6i.32xlarge", + "r6i.4xlarge", + "r6i.8xlarge", + "r6i.large", + "r6i.metal", + "r6i.xlarge", + "r6id.12xlarge", + "r6id.16xlarge", + "r6id.24xlarge", + "r6id.2xlarge", + "r6id.32xlarge", + "r6id.4xlarge", + "r6id.8xlarge", + "r6id.large", + "r6id.metal", + "r6id.xlarge", + "r6idn.12xlarge", + "r6idn.16xlarge", + "r6idn.24xlarge", + "r6idn.2xlarge", + "r6idn.32xlarge", + "r6idn.4xlarge", + "r6idn.8xlarge", + "r6idn.large", + "r6idn.metal", + "r6idn.xlarge", + "r6in.12xlarge", + "r6in.16xlarge", + "r6in.24xlarge", + "r6in.2xlarge", + "r6in.32xlarge", + "r6in.4xlarge", + "r6in.8xlarge", + "r6in.large", + "r6in.metal", + "r6in.xlarge", + "r7a.12xlarge", + "r7a.16xlarge", + "r7a.24xlarge", + "r7a.2xlarge", + "r7a.32xlarge", + "r7a.48xlarge", + "r7a.4xlarge", + "r7a.8xlarge", + "r7a.large", + "r7a.medium", + "r7a.metal-48xl", + "r7a.xlarge", + "r7g.12xlarge", + "r7g.16xlarge", + "r7g.2xlarge", + "r7g.4xlarge", + "r7g.8xlarge", + "r7g.large", + "r7g.medium", + "r7g.metal", + "r7g.xlarge", + "r7gd.12xlarge", + "r7gd.16xlarge", + "r7gd.2xlarge", + "r7gd.4xlarge", + "r7gd.8xlarge", + "r7gd.large", + "r7gd.medium", + "r7gd.xlarge", + "r7i.12xlarge", + "r7i.16xlarge", + "r7i.24xlarge", + "r7i.2xlarge", + "r7i.48xlarge", + "r7i.4xlarge", + "r7i.8xlarge", + "r7i.large", + "r7i.metal-24xl", + "r7i.metal-48xl", + "r7i.xlarge", + "r7iz.12xlarge", + "r7iz.16xlarge", + "r7iz.2xlarge", + "r7iz.32xlarge", + "r7iz.4xlarge", + "r7iz.8xlarge", + "r7iz.large", + "r7iz.metal-16xl", + "r7iz.metal-32xl", + "r7iz.xlarge", + "t2.2xlarge", + "t2.large", + "t2.medium", + "t2.micro", + "t2.nano", + "t2.small", + "t2.xlarge", + "t3.2xlarge", + "t3.large", + "t3.medium", + "t3.micro", + "t3.nano", + "t3.small", + "t3.xlarge", + "t3a.2xlarge", + "t3a.large", + "t3a.medium", + "t3a.micro", + "t3a.nano", + "t3a.small", + "t3a.xlarge", + "t4g.2xlarge", + "t4g.large", + "t4g.medium", + "t4g.micro", + "t4g.nano", + "t4g.small", + "t4g.xlarge", + "trn1.2xlarge", + "trn1.32xlarge", + "trn1n.32xlarge", + "u-12tb1.112xlarge", + "u-3tb1.56xlarge", + "u-6tb1.112xlarge", + "u-6tb1.56xlarge", + "u-9tb1.112xlarge", + "x1.16xlarge", + "x1.32xlarge", + "x1e.16xlarge", + "x1e.2xlarge", + "x1e.32xlarge", + "x1e.4xlarge", + "x1e.8xlarge", + "x1e.xlarge", + "x2gd.12xlarge", + "x2gd.16xlarge", + "x2gd.2xlarge", + "x2gd.4xlarge", + "x2gd.8xlarge", + "x2gd.large", + "x2gd.medium", + "x2gd.metal", + "x2gd.xlarge", + "x2idn.16xlarge", + "x2idn.24xlarge", + "x2idn.32xlarge", + "x2idn.metal", + "x2iedn.16xlarge", + "x2iedn.24xlarge", + "x2iedn.2xlarge", + "x2iedn.32xlarge", + "x2iedn.4xlarge", + "x2iedn.8xlarge", + "x2iedn.metal", + "x2iedn.xlarge", + "z1d.12xlarge", + "z1d.2xlarge", + "z1d.3xlarge", + "z1d.6xlarge", + "z1d.large", + "z1d.metal", + "z1d.xlarge", + ], + "us-gov-west-1": [ + "c1.medium", + "c1.xlarge", + "c3.2xlarge", + "c3.4xlarge", + "c3.8xlarge", + "c3.large", + "c3.xlarge", + "c4.2xlarge", + "c4.4xlarge", + "c4.8xlarge", + "c4.large", + "c4.xlarge", + "c5.12xlarge", + "c5.18xlarge", + "c5.24xlarge", + "c5.2xlarge", + "c5.4xlarge", + "c5.9xlarge", + "c5.large", + "c5.metal", + "c5.xlarge", + "c5a.12xlarge", + "c5a.16xlarge", + "c5a.24xlarge", + "c5a.2xlarge", + "c5a.4xlarge", + "c5a.8xlarge", + "c5a.large", + "c5a.xlarge", + "c5d.12xlarge", + "c5d.18xlarge", + "c5d.24xlarge", + "c5d.2xlarge", + "c5d.4xlarge", + "c5d.9xlarge", + "c5d.large", + "c5d.metal", + "c5d.xlarge", + "c5n.18xlarge", + "c5n.2xlarge", + "c5n.4xlarge", + "c5n.9xlarge", + "c5n.large", + "c5n.metal", + "c5n.xlarge", + "c6g.12xlarge", + "c6g.16xlarge", + "c6g.2xlarge", + "c6g.4xlarge", + "c6g.8xlarge", + "c6g.large", + "c6g.medium", + "c6g.metal", + "c6g.xlarge", + "c6gd.12xlarge", + "c6gd.16xlarge", + "c6gd.2xlarge", + "c6gd.4xlarge", + "c6gd.8xlarge", + "c6gd.large", + "c6gd.medium", + "c6gd.metal", + "c6gd.xlarge", + "c6gn.12xlarge", + "c6gn.16xlarge", + "c6gn.2xlarge", + "c6gn.4xlarge", + "c6gn.8xlarge", + "c6gn.large", + "c6gn.medium", + "c6gn.xlarge", + "c6i.12xlarge", + "c6i.16xlarge", + "c6i.24xlarge", + "c6i.2xlarge", + "c6i.32xlarge", + "c6i.4xlarge", + "c6i.8xlarge", + "c6i.large", + "c6i.metal", + "c6i.xlarge", + "c6id.12xlarge", + "c6id.16xlarge", + "c6id.24xlarge", + "c6id.2xlarge", + "c6id.32xlarge", + "c6id.4xlarge", + "c6id.8xlarge", + "c6id.large", + "c6id.metal", + "c6id.xlarge", + "c6in.12xlarge", + "c6in.16xlarge", + "c6in.24xlarge", + "c6in.2xlarge", + "c6in.32xlarge", + "c6in.4xlarge", + "c6in.8xlarge", + "c6in.large", + "c6in.metal", + "c6in.xlarge", + "cc2.8xlarge", + "d2.2xlarge", + "d2.4xlarge", + "d2.8xlarge", + "d2.xlarge", + "d3.2xlarge", + "d3.4xlarge", + "d3.8xlarge", + "d3.xlarge", + "f1.16xlarge", + "f1.2xlarge", + "f1.4xlarge", + "g3.16xlarge", + "g3.4xlarge", + "g3.8xlarge", + "g3s.xlarge", + "g4dn.12xlarge", + "g4dn.16xlarge", + "g4dn.2xlarge", + "g4dn.4xlarge", + "g4dn.8xlarge", + "g4dn.metal", + "g4dn.xlarge", + "hpc6a.48xlarge", + "hpc6id.32xlarge", + "hpc7a.12xlarge", + "hpc7a.24xlarge", + "hpc7a.48xlarge", + "hpc7a.96xlarge", + "hpc7g.16xlarge", + "hpc7g.4xlarge", + "hpc7g.8xlarge", + "hs1.8xlarge", + "i2.2xlarge", + "i2.4xlarge", + "i2.8xlarge", + "i2.xlarge", + "i3.16xlarge", + "i3.2xlarge", + "i3.4xlarge", + "i3.8xlarge", + "i3.large", + "i3.metal", + "i3.xlarge", + "i3en.12xlarge", + "i3en.24xlarge", + "i3en.2xlarge", + "i3en.3xlarge", + "i3en.6xlarge", + "i3en.large", + "i3en.metal", + "i3en.xlarge", + "i3p.16xlarge", + "i4i.12xlarge", + "i4i.16xlarge", + "i4i.24xlarge", + "i4i.2xlarge", + "i4i.32xlarge", + "i4i.4xlarge", + "i4i.8xlarge", + "i4i.large", + "i4i.metal", + "i4i.xlarge", + "inf1.24xlarge", + "inf1.2xlarge", + "inf1.6xlarge", + "inf1.xlarge", + "m1.large", + "m1.medium", + "m1.xlarge", + "m2.2xlarge", + "m2.4xlarge", + "m2.xlarge", + "m3.2xlarge", + "m3.large", + "m3.medium", + "m3.xlarge", + "m4.10xlarge", + "m4.16xlarge", + "m4.2xlarge", + "m4.4xlarge", + "m4.large", + "m4.xlarge", + "m5.12xlarge", + "m5.16xlarge", + "m5.24xlarge", + "m5.2xlarge", + "m5.4xlarge", + "m5.8xlarge", + "m5.large", + "m5.metal", + "m5.xlarge", + "m5a.12xlarge", + "m5a.16xlarge", + "m5a.24xlarge", + "m5a.2xlarge", + "m5a.4xlarge", + "m5a.8xlarge", + "m5a.large", + "m5a.xlarge", + "m5ad.12xlarge", + "m5ad.16xlarge", + "m5ad.24xlarge", + "m5ad.2xlarge", + "m5ad.4xlarge", + "m5ad.8xlarge", + "m5ad.large", + "m5ad.xlarge", + "m5d.12xlarge", + "m5d.16xlarge", + "m5d.24xlarge", + "m5d.2xlarge", + "m5d.4xlarge", + "m5d.8xlarge", + "m5d.large", + "m5d.metal", + "m5d.xlarge", + "m5dn.12xlarge", + "m5dn.16xlarge", + "m5dn.24xlarge", + "m5dn.2xlarge", + "m5dn.4xlarge", + "m5dn.8xlarge", + "m5dn.large", + "m5dn.metal", + "m5dn.xlarge", + "m5n.12xlarge", + "m5n.16xlarge", + "m5n.24xlarge", + "m5n.2xlarge", + "m5n.4xlarge", + "m5n.8xlarge", + "m5n.large", + "m5n.metal", + "m5n.xlarge", + "m6g.12xlarge", + "m6g.16xlarge", + "m6g.2xlarge", + "m6g.4xlarge", + "m6g.8xlarge", + "m6g.large", + "m6g.medium", + "m6g.metal", + "m6g.xlarge", + "m6i.12xlarge", + "m6i.16xlarge", + "m6i.24xlarge", + "m6i.2xlarge", + "m6i.32xlarge", + "m6i.4xlarge", + "m6i.8xlarge", + "m6i.large", + "m6i.metal", + "m6i.xlarge", + "m6id.12xlarge", + "m6id.16xlarge", + "m6id.24xlarge", + "m6id.2xlarge", + "m6id.32xlarge", + "m6id.4xlarge", + "m6id.8xlarge", + "m6id.large", + "m6id.metal", + "m6id.xlarge", + "m6idn.12xlarge", + "m6idn.16xlarge", + "m6idn.24xlarge", + "m6idn.2xlarge", + "m6idn.32xlarge", + "m6idn.4xlarge", + "m6idn.8xlarge", + "m6idn.large", + "m6idn.metal", + "m6idn.xlarge", + "m6in.12xlarge", + "m6in.16xlarge", + "m6in.24xlarge", + "m6in.2xlarge", + "m6in.32xlarge", + "m6in.4xlarge", + "m6in.8xlarge", + "m6in.large", + "m6in.metal", + "m6in.xlarge", + "p2.16xlarge", + "p2.8xlarge", + "p2.xlarge", + "p3.16xlarge", + "p3.2xlarge", + "p3.8xlarge", + "p3dn.24xlarge", + "p4d.24xlarge", + "r3.2xlarge", + "r3.4xlarge", + "r3.8xlarge", + "r3.large", + "r3.xlarge", + "r4.16xlarge", + "r4.2xlarge", + "r4.4xlarge", + "r4.8xlarge", + "r4.large", + "r4.xlarge", + "r5.12xlarge", + "r5.16xlarge", + "r5.24xlarge", + "r5.2xlarge", + "r5.4xlarge", + "r5.8xlarge", + "r5.large", + "r5.metal", + "r5.xlarge", + "r5a.12xlarge", + "r5a.16xlarge", + "r5a.24xlarge", + "r5a.2xlarge", + "r5a.4xlarge", + "r5a.8xlarge", + "r5a.large", + "r5a.xlarge", + "r5ad.12xlarge", + "r5ad.16xlarge", + "r5ad.24xlarge", + "r5ad.2xlarge", + "r5ad.4xlarge", + "r5ad.8xlarge", + "r5ad.large", + "r5ad.xlarge", + "r5d.12xlarge", + "r5d.16xlarge", + "r5d.24xlarge", + "r5d.2xlarge", + "r5d.4xlarge", + "r5d.8xlarge", + "r5d.large", + "r5d.metal", + "r5d.xlarge", + "r5dn.12xlarge", + "r5dn.16xlarge", + "r5dn.24xlarge", + "r5dn.2xlarge", + "r5dn.4xlarge", + "r5dn.8xlarge", + "r5dn.large", + "r5dn.metal", + "r5dn.xlarge", + "r5n.12xlarge", + "r5n.16xlarge", + "r5n.24xlarge", + "r5n.2xlarge", + "r5n.4xlarge", + "r5n.8xlarge", + "r5n.large", + "r5n.metal", + "r5n.xlarge", + "r6g.12xlarge", + "r6g.16xlarge", + "r6g.2xlarge", + "r6g.4xlarge", + "r6g.8xlarge", + "r6g.large", + "r6g.medium", + "r6g.metal", + "r6g.xlarge", + "r6gd.12xlarge", + "r6gd.16xlarge", + "r6gd.2xlarge", + "r6gd.4xlarge", + "r6gd.8xlarge", + "r6gd.large", + "r6gd.medium", + "r6gd.metal", + "r6gd.xlarge", + "r6i.12xlarge", + "r6i.16xlarge", + "r6i.24xlarge", + "r6i.2xlarge", + "r6i.32xlarge", + "r6i.4xlarge", + "r6i.8xlarge", + "r6i.large", + "r6i.metal", + "r6i.xlarge", + "r6id.12xlarge", + "r6id.16xlarge", + "r6id.24xlarge", + "r6id.2xlarge", + "r6id.32xlarge", + "r6id.4xlarge", + "r6id.8xlarge", + "r6id.large", + "r6id.metal", + "r6id.xlarge", + "r6idn.12xlarge", + "r6idn.16xlarge", + "r6idn.24xlarge", + "r6idn.2xlarge", + "r6idn.32xlarge", + "r6idn.4xlarge", + "r6idn.8xlarge", + "r6idn.large", + "r6idn.metal", + "r6idn.xlarge", + "r6in.12xlarge", + "r6in.16xlarge", + "r6in.24xlarge", + "r6in.2xlarge", + "r6in.32xlarge", + "r6in.4xlarge", + "r6in.8xlarge", + "r6in.large", + "r6in.metal", + "r6in.xlarge", + "t1.micro", + "t2.2xlarge", + "t2.large", + "t2.medium", + "t2.micro", + "t2.nano", + "t2.small", + "t2.xlarge", + "t3.2xlarge", + "t3.large", + "t3.medium", + "t3.micro", + "t3.nano", + "t3.small", + "t3.xlarge", + "t3a.2xlarge", + "t3a.large", + "t3a.medium", + "t3a.micro", + "t3a.nano", + "t3a.small", + "t3a.xlarge", + "t4g.2xlarge", + "t4g.large", + "t4g.medium", + "t4g.micro", + "t4g.nano", + "t4g.small", + "t4g.xlarge", + "u-12tb1.112xlarge", + "u-24tb1.112xlarge", + "u-3tb1.56xlarge", + "u-6tb1.112xlarge", + "u-6tb1.56xlarge", + "u-9tb1.112xlarge", + "x1.16xlarge", + "x1.32xlarge", + "x1e.16xlarge", + "x1e.2xlarge", + "x1e.32xlarge", + "x1e.4xlarge", + "x1e.8xlarge", + "x1e.xlarge", + "x2idn.16xlarge", + "x2idn.24xlarge", + "x2idn.32xlarge", + "x2idn.metal", + "x2iedn.16xlarge", + "x2iedn.24xlarge", + "x2iedn.2xlarge", + "x2iedn.32xlarge", + "x2iedn.4xlarge", + "x2iedn.8xlarge", + "x2iedn.metal", + "x2iedn.xlarge", + ], + "ca-central-1": [ + "c4.2xlarge", + "c4.4xlarge", + "c4.8xlarge", + "c4.large", + "c4.xlarge", + "c5.12xlarge", + "c5.18xlarge", + "c5.24xlarge", + "c5.2xlarge", + "c5.4xlarge", + "c5.9xlarge", + "c5.large", + "c5.metal", + "c5.xlarge", + "c5a.12xlarge", + "c5a.16xlarge", + "c5a.24xlarge", + "c5a.2xlarge", + "c5a.4xlarge", + "c5a.8xlarge", + "c5a.large", + "c5a.xlarge", + "c5d.12xlarge", + "c5d.18xlarge", + "c5d.24xlarge", + "c5d.2xlarge", + "c5d.4xlarge", + "c5d.9xlarge", + "c5d.large", + "c5d.metal", + "c5d.xlarge", + "c5n.18xlarge", + "c5n.2xlarge", + "c5n.4xlarge", + "c5n.9xlarge", + "c5n.large", + "c5n.metal", + "c5n.xlarge", + "c6a.12xlarge", + "c6a.16xlarge", + "c6a.24xlarge", + "c6a.2xlarge", + "c6a.32xlarge", + "c6a.48xlarge", + "c6a.4xlarge", + "c6a.8xlarge", + "c6a.large", + "c6a.metal", + "c6a.xlarge", + "c6g.12xlarge", + "c6g.16xlarge", + "c6g.2xlarge", + "c6g.4xlarge", + "c6g.8xlarge", + "c6g.large", + "c6g.medium", + "c6g.metal", + "c6g.xlarge", + "c6gd.12xlarge", + "c6gd.16xlarge", + "c6gd.2xlarge", + "c6gd.4xlarge", + "c6gd.8xlarge", + "c6gd.large", + "c6gd.medium", + "c6gd.metal", + "c6gd.xlarge", + "c6gn.12xlarge", + "c6gn.16xlarge", + "c6gn.2xlarge", + "c6gn.4xlarge", + "c6gn.8xlarge", + "c6gn.large", + "c6gn.medium", + "c6gn.metal", + "c6gn.xlarge", + "c6i.12xlarge", + "c6i.16xlarge", + "c6i.24xlarge", + "c6i.2xlarge", + "c6i.32xlarge", + "c6i.4xlarge", + "c6i.8xlarge", + "c6i.large", + "c6i.metal", + "c6i.xlarge", + "c6in.12xlarge", + "c6in.16xlarge", + "c6in.24xlarge", + "c6in.2xlarge", + "c6in.32xlarge", + "c6in.4xlarge", + "c6in.8xlarge", + "c6in.large", + "c6in.metal", + "c6in.xlarge", + "c7g.12xlarge", + "c7g.16xlarge", + "c7g.2xlarge", + "c7g.4xlarge", + "c7g.8xlarge", + "c7g.large", + "c7g.medium", + "c7g.metal", + "c7g.xlarge", + "c7i.12xlarge", + "c7i.16xlarge", + "c7i.24xlarge", + "c7i.2xlarge", + "c7i.48xlarge", + "c7i.4xlarge", + "c7i.8xlarge", + "c7i.large", + "c7i.metal-24xl", + "c7i.metal-48xl", + "c7i.xlarge", + "d2.2xlarge", + "d2.4xlarge", + "d2.8xlarge", + "d2.xlarge", + "d3.2xlarge", + "d3.4xlarge", + "d3.8xlarge", + "d3.xlarge", + "g3.16xlarge", + "g3.4xlarge", + "g3.8xlarge", + "g4ad.16xlarge", + "g4ad.2xlarge", + "g4ad.4xlarge", + "g4ad.8xlarge", + "g4ad.xlarge", + "g4dn.12xlarge", + "g4dn.16xlarge", + "g4dn.2xlarge", + "g4dn.4xlarge", + "g4dn.8xlarge", + "g4dn.metal", + "g4dn.xlarge", + "g5.12xlarge", + "g5.16xlarge", + "g5.24xlarge", + "g5.2xlarge", + "g5.48xlarge", + "g5.4xlarge", + "g5.8xlarge", + "g5.xlarge", + "i3.16xlarge", + "i3.2xlarge", + "i3.4xlarge", + "i3.8xlarge", + "i3.large", + "i3.metal", + "i3.xlarge", + "i3en.12xlarge", + "i3en.24xlarge", + "i3en.2xlarge", + "i3en.3xlarge", + "i3en.6xlarge", + "i3en.large", + "i3en.metal", + "i3en.xlarge", + "i4g.16xlarge", + "i4g.2xlarge", + "i4g.4xlarge", + "i4g.8xlarge", + "i4g.large", + "i4g.xlarge", + "i4i.12xlarge", + "i4i.16xlarge", + "i4i.24xlarge", + "i4i.2xlarge", + "i4i.32xlarge", + "i4i.4xlarge", + "i4i.8xlarge", + "i4i.large", + "i4i.metal", + "i4i.xlarge", + "im4gn.16xlarge", + "im4gn.2xlarge", + "im4gn.4xlarge", + "im4gn.8xlarge", + "im4gn.large", + "im4gn.xlarge", + "inf1.24xlarge", + "inf1.2xlarge", + "inf1.6xlarge", + "inf1.xlarge", + "is4gen.2xlarge", + "is4gen.4xlarge", + "is4gen.8xlarge", + "is4gen.large", + "is4gen.medium", + "is4gen.xlarge", + "m4.10xlarge", + "m4.16xlarge", + "m4.2xlarge", + "m4.4xlarge", + "m4.large", + "m4.xlarge", + "m5.12xlarge", + "m5.16xlarge", + "m5.24xlarge", + "m5.2xlarge", + "m5.4xlarge", + "m5.8xlarge", + "m5.large", + "m5.metal", + "m5.xlarge", + "m5a.12xlarge", + "m5a.16xlarge", + "m5a.24xlarge", + "m5a.2xlarge", + "m5a.4xlarge", + "m5a.8xlarge", + "m5a.large", + "m5a.xlarge", + "m5ad.12xlarge", + "m5ad.16xlarge", + "m5ad.24xlarge", + "m5ad.2xlarge", + "m5ad.4xlarge", + "m5ad.8xlarge", + "m5ad.large", + "m5ad.xlarge", + "m5d.12xlarge", + "m5d.16xlarge", + "m5d.24xlarge", + "m5d.2xlarge", + "m5d.4xlarge", + "m5d.8xlarge", + "m5d.large", + "m5d.metal", + "m5d.xlarge", + "m6a.12xlarge", + "m6a.16xlarge", + "m6a.24xlarge", + "m6a.2xlarge", + "m6a.32xlarge", + "m6a.48xlarge", + "m6a.4xlarge", + "m6a.8xlarge", + "m6a.large", + "m6a.metal", + "m6a.xlarge", + "m6g.12xlarge", + "m6g.16xlarge", + "m6g.2xlarge", + "m6g.4xlarge", + "m6g.8xlarge", + "m6g.large", + "m6g.medium", + "m6g.metal", + "m6g.xlarge", + "m6gd.12xlarge", + "m6gd.16xlarge", + "m6gd.2xlarge", + "m6gd.4xlarge", + "m6gd.8xlarge", + "m6gd.large", + "m6gd.medium", + "m6gd.metal", + "m6gd.xlarge", + "m6i.12xlarge", + "m6i.16xlarge", + "m6i.24xlarge", + "m6i.2xlarge", + "m6i.32xlarge", + "m6i.4xlarge", + "m6i.8xlarge", + "m6i.large", + "m6i.metal", + "m6i.xlarge", + "m6id.12xlarge", + "m6id.16xlarge", + "m6id.24xlarge", + "m6id.2xlarge", + "m6id.32xlarge", + "m6id.4xlarge", + "m6id.8xlarge", + "m6id.large", + "m6id.metal", + "m6id.xlarge", + "m7g.12xlarge", + "m7g.16xlarge", + "m7g.2xlarge", + "m7g.4xlarge", + "m7g.8xlarge", + "m7g.large", + "m7g.medium", + "m7g.metal", + "m7g.xlarge", + "m7i-flex.2xlarge", + "m7i-flex.4xlarge", + "m7i-flex.8xlarge", + "m7i-flex.large", + "m7i-flex.xlarge", + "m7i.12xlarge", + "m7i.16xlarge", + "m7i.24xlarge", + "m7i.2xlarge", + "m7i.48xlarge", + "m7i.4xlarge", + "m7i.8xlarge", + "m7i.large", + "m7i.metal-24xl", + "m7i.metal-48xl", + "m7i.xlarge", + "p3.16xlarge", + "p3.2xlarge", + "p3.8xlarge", + "r4.16xlarge", + "r4.2xlarge", + "r4.4xlarge", + "r4.8xlarge", + "r4.large", + "r4.xlarge", + "r5.12xlarge", + "r5.16xlarge", + "r5.24xlarge", + "r5.2xlarge", + "r5.4xlarge", + "r5.8xlarge", + "r5.large", + "r5.metal", + "r5.xlarge", + "r5a.12xlarge", + "r5a.16xlarge", + "r5a.24xlarge", + "r5a.2xlarge", + "r5a.4xlarge", + "r5a.8xlarge", + "r5a.large", + "r5a.xlarge", + "r5ad.12xlarge", + "r5ad.16xlarge", + "r5ad.24xlarge", + "r5ad.2xlarge", + "r5ad.4xlarge", + "r5ad.8xlarge", + "r5ad.large", + "r5ad.xlarge", + "r5b.12xlarge", + "r5b.16xlarge", + "r5b.24xlarge", + "r5b.2xlarge", + "r5b.4xlarge", + "r5b.8xlarge", + "r5b.large", + "r5b.metal", + "r5b.xlarge", + "r5d.12xlarge", + "r5d.16xlarge", + "r5d.24xlarge", + "r5d.2xlarge", + "r5d.4xlarge", + "r5d.8xlarge", + "r5d.large", + "r5d.metal", + "r5d.xlarge", + "r5n.12xlarge", + "r5n.16xlarge", + "r5n.24xlarge", + "r5n.2xlarge", + "r5n.4xlarge", + "r5n.8xlarge", + "r5n.large", + "r5n.metal", + "r5n.xlarge", + "r6g.12xlarge", + "r6g.16xlarge", + "r6g.2xlarge", + "r6g.4xlarge", + "r6g.8xlarge", + "r6g.large", + "r6g.medium", + "r6g.metal", + "r6g.xlarge", + "r6gd.12xlarge", + "r6gd.16xlarge", + "r6gd.2xlarge", + "r6gd.4xlarge", + "r6gd.8xlarge", + "r6gd.large", + "r6gd.medium", + "r6gd.metal", + "r6gd.xlarge", + "r6i.12xlarge", + "r6i.16xlarge", + "r6i.24xlarge", + "r6i.2xlarge", + "r6i.32xlarge", + "r6i.4xlarge", + "r6i.8xlarge", + "r6i.large", + "r6i.metal", + "r6i.xlarge", + "r7g.12xlarge", + "r7g.16xlarge", + "r7g.2xlarge", + "r7g.4xlarge", + "r7g.8xlarge", + "r7g.large", + "r7g.medium", + "r7g.metal", + "r7g.xlarge", + "r7i.12xlarge", + "r7i.16xlarge", + "r7i.24xlarge", + "r7i.2xlarge", + "r7i.48xlarge", + "r7i.4xlarge", + "r7i.8xlarge", + "r7i.large", + "r7i.metal-24xl", + "r7i.metal-48xl", + "r7i.xlarge", + "t2.2xlarge", + "t2.large", + "t2.medium", + "t2.micro", + "t2.nano", + "t2.small", + "t2.xlarge", + "t3.2xlarge", + "t3.large", + "t3.medium", + "t3.micro", + "t3.nano", + "t3.small", + "t3.xlarge", + "t3a.2xlarge", + "t3a.large", + "t3a.medium", + "t3a.micro", + "t3a.nano", + "t3a.small", + "t3a.xlarge", + "t4g.2xlarge", + "t4g.large", + "t4g.medium", + "t4g.micro", + "t4g.nano", + "t4g.small", + "t4g.xlarge", + "u-3tb1.56xlarge", + "u-6tb1.112xlarge", + "u-6tb1.56xlarge", + "x1.16xlarge", + "x1.32xlarge", + "x1e.16xlarge", + "x1e.2xlarge", + "x1e.32xlarge", + "x1e.4xlarge", + "x1e.8xlarge", + "x1e.xlarge", + "x2idn.16xlarge", + "x2idn.24xlarge", + "x2idn.32xlarge", + "x2idn.metal", + "x2iedn.16xlarge", + "x2iedn.24xlarge", + "x2iedn.2xlarge", + "x2iedn.32xlarge", + "x2iedn.4xlarge", + "x2iedn.8xlarge", + "x2iedn.metal", + "x2iedn.xlarge", + ], + "ap-northeast-1": [ + "a1.2xlarge", + "a1.4xlarge", + "a1.large", + "a1.medium", + "a1.metal", + "a1.xlarge", + "c1.medium", + "c1.xlarge", + "c3.2xlarge", + "c3.4xlarge", + "c3.8xlarge", + "c3.large", + "c3.xlarge", + "c4.2xlarge", + "c4.4xlarge", + "c4.8xlarge", + "c4.large", + "c4.xlarge", + "c5.12xlarge", + "c5.18xlarge", + "c5.24xlarge", + "c5.2xlarge", + "c5.4xlarge", + "c5.9xlarge", + "c5.large", + "c5.metal", + "c5.xlarge", + "c5a.12xlarge", + "c5a.16xlarge", + "c5a.24xlarge", + "c5a.2xlarge", + "c5a.4xlarge", + "c5a.8xlarge", + "c5a.large", + "c5a.xlarge", + "c5d.12xlarge", + "c5d.18xlarge", + "c5d.24xlarge", + "c5d.2xlarge", + "c5d.4xlarge", + "c5d.9xlarge", + "c5d.large", + "c5d.metal", + "c5d.xlarge", + "c5n.18xlarge", + "c5n.2xlarge", + "c5n.4xlarge", + "c5n.9xlarge", + "c5n.large", + "c5n.metal", + "c5n.xlarge", + "c6a.12xlarge", + "c6a.16xlarge", + "c6a.24xlarge", + "c6a.2xlarge", + "c6a.32xlarge", + "c6a.48xlarge", + "c6a.4xlarge", + "c6a.8xlarge", + "c6a.large", + "c6a.metal", + "c6a.xlarge", + "c6g.12xlarge", + "c6g.16xlarge", + "c6g.2xlarge", + "c6g.4xlarge", + "c6g.8xlarge", + "c6g.large", + "c6g.medium", + "c6g.metal", + "c6g.xlarge", + "c6gd.12xlarge", + "c6gd.16xlarge", + "c6gd.2xlarge", + "c6gd.4xlarge", + "c6gd.8xlarge", + "c6gd.large", + "c6gd.medium", + "c6gd.metal", + "c6gd.xlarge", + "c6gn.12xlarge", + "c6gn.16xlarge", + "c6gn.2xlarge", + "c6gn.4xlarge", + "c6gn.8xlarge", + "c6gn.large", + "c6gn.medium", + "c6gn.xlarge", + "c6i.12xlarge", + "c6i.16xlarge", + "c6i.24xlarge", + "c6i.2xlarge", + "c6i.32xlarge", + "c6i.4xlarge", + "c6i.8xlarge", + "c6i.large", + "c6i.metal", + "c6i.xlarge", + "c6id.12xlarge", + "c6id.16xlarge", + "c6id.24xlarge", + "c6id.2xlarge", + "c6id.32xlarge", + "c6id.4xlarge", + "c6id.8xlarge", + "c6id.large", + "c6id.metal", + "c6id.xlarge", + "c6in.12xlarge", + "c6in.16xlarge", + "c6in.24xlarge", + "c6in.2xlarge", + "c6in.32xlarge", + "c6in.4xlarge", + "c6in.8xlarge", + "c6in.large", + "c6in.metal", + "c6in.xlarge", + "c7g.12xlarge", + "c7g.16xlarge", + "c7g.2xlarge", + "c7g.4xlarge", + "c7g.8xlarge", + "c7g.large", + "c7g.medium", + "c7g.metal", + "c7g.xlarge", + "c7gd.12xlarge", + "c7gd.16xlarge", + "c7gd.2xlarge", + "c7gd.4xlarge", + "c7gd.8xlarge", + "c7gd.large", + "c7gd.medium", + "c7gd.xlarge", + "c7gn.12xlarge", + "c7gn.16xlarge", + "c7gn.2xlarge", + "c7gn.4xlarge", + "c7gn.8xlarge", + "c7gn.large", + "c7gn.medium", + "c7gn.xlarge", + "c7i.12xlarge", + "c7i.16xlarge", + "c7i.24xlarge", + "c7i.2xlarge", + "c7i.48xlarge", + "c7i.4xlarge", + "c7i.8xlarge", + "c7i.large", + "c7i.metal-24xl", + "c7i.metal-48xl", + "c7i.xlarge", + "cr1.8xlarge", + "d2.2xlarge", + "d2.4xlarge", + "d2.8xlarge", + "d2.xlarge", + "d3.2xlarge", + "d3.4xlarge", + "d3.8xlarge", + "d3.xlarge", + "d3en.12xlarge", + "d3en.2xlarge", + "d3en.4xlarge", + "d3en.6xlarge", + "d3en.8xlarge", + "d3en.xlarge", + "g2.2xlarge", + "g2.8xlarge", + "g3.16xlarge", + "g3.4xlarge", + "g3.8xlarge", + "g3s.xlarge", + "g4ad.16xlarge", + "g4ad.2xlarge", + "g4ad.4xlarge", + "g4ad.8xlarge", + "g4ad.xlarge", + "g4dn.12xlarge", + "g4dn.16xlarge", + "g4dn.2xlarge", + "g4dn.4xlarge", + "g4dn.8xlarge", + "g4dn.metal", + "g4dn.xlarge", + "g5.12xlarge", + "g5.16xlarge", + "g5.24xlarge", + "g5.2xlarge", + "g5.48xlarge", + "g5.4xlarge", + "g5.8xlarge", + "g5.xlarge", + "g5g.16xlarge", + "g5g.2xlarge", + "g5g.4xlarge", + "g5g.8xlarge", + "g5g.metal", + "g5g.xlarge", + "hpc7g.16xlarge", + "hpc7g.4xlarge", + "hpc7g.8xlarge", + "i2.2xlarge", + "i2.4xlarge", + "i2.8xlarge", + "i2.xlarge", + "i3.16xlarge", + "i3.2xlarge", + "i3.4xlarge", + "i3.8xlarge", + "i3.large", + "i3.metal", + "i3.xlarge", + "i3en.12xlarge", + "i3en.24xlarge", + "i3en.2xlarge", + "i3en.3xlarge", + "i3en.6xlarge", + "i3en.large", + "i3en.metal", + "i3en.xlarge", + "i4i.12xlarge", + "i4i.16xlarge", + "i4i.24xlarge", + "i4i.2xlarge", + "i4i.32xlarge", + "i4i.4xlarge", + "i4i.8xlarge", + "i4i.large", + "i4i.metal", + "i4i.xlarge", + "im4gn.16xlarge", + "im4gn.2xlarge", + "im4gn.4xlarge", + "im4gn.8xlarge", + "im4gn.large", + "im4gn.xlarge", + "inf1.24xlarge", + "inf1.2xlarge", + "inf1.6xlarge", + "inf1.xlarge", + "inf2.24xlarge", + "inf2.48xlarge", + "inf2.8xlarge", + "inf2.xlarge", + "is4gen.2xlarge", + "is4gen.4xlarge", + "is4gen.8xlarge", + "is4gen.large", + "is4gen.medium", + "is4gen.xlarge", + "m1.large", + "m1.medium", + "m1.xlarge", + "m2.2xlarge", + "m2.4xlarge", + "m2.xlarge", + "m3.2xlarge", + "m3.large", + "m3.medium", + "m3.xlarge", + "m4.10xlarge", + "m4.16xlarge", + "m4.2xlarge", + "m4.4xlarge", + "m4.large", + "m4.xlarge", + "m5.12xlarge", + "m5.16xlarge", + "m5.24xlarge", + "m5.2xlarge", + "m5.4xlarge", + "m5.8xlarge", + "m5.large", + "m5.metal", + "m5.xlarge", + "m5a.12xlarge", + "m5a.16xlarge", + "m5a.24xlarge", + "m5a.2xlarge", + "m5a.4xlarge", + "m5a.8xlarge", + "m5a.large", + "m5a.xlarge", + "m5ad.12xlarge", + "m5ad.16xlarge", + "m5ad.24xlarge", + "m5ad.2xlarge", + "m5ad.4xlarge", + "m5ad.8xlarge", + "m5ad.large", + "m5ad.xlarge", + "m5d.12xlarge", + "m5d.16xlarge", + "m5d.24xlarge", + "m5d.2xlarge", + "m5d.4xlarge", + "m5d.8xlarge", + "m5d.large", + "m5d.metal", + "m5d.xlarge", + "m5dn.12xlarge", + "m5dn.16xlarge", + "m5dn.24xlarge", + "m5dn.2xlarge", + "m5dn.4xlarge", + "m5dn.8xlarge", + "m5dn.large", + "m5dn.metal", + "m5dn.xlarge", + "m5n.12xlarge", + "m5n.16xlarge", + "m5n.24xlarge", + "m5n.2xlarge", + "m5n.4xlarge", + "m5n.8xlarge", + "m5n.large", + "m5n.metal", + "m5n.xlarge", + "m5zn.12xlarge", + "m5zn.2xlarge", + "m5zn.3xlarge", + "m5zn.6xlarge", + "m5zn.large", + "m5zn.metal", + "m5zn.xlarge", + "m6a.12xlarge", + "m6a.16xlarge", + "m6a.24xlarge", + "m6a.2xlarge", + "m6a.32xlarge", + "m6a.48xlarge", + "m6a.4xlarge", + "m6a.8xlarge", + "m6a.large", + "m6a.metal", + "m6a.xlarge", + "m6g.12xlarge", + "m6g.16xlarge", + "m6g.2xlarge", + "m6g.4xlarge", + "m6g.8xlarge", + "m6g.large", + "m6g.medium", + "m6g.metal", + "m6g.xlarge", + "m6gd.12xlarge", + "m6gd.16xlarge", + "m6gd.2xlarge", + "m6gd.4xlarge", + "m6gd.8xlarge", + "m6gd.large", + "m6gd.medium", + "m6gd.metal", + "m6gd.xlarge", + "m6i.12xlarge", + "m6i.16xlarge", + "m6i.24xlarge", + "m6i.2xlarge", + "m6i.32xlarge", + "m6i.4xlarge", + "m6i.8xlarge", + "m6i.large", + "m6i.metal", + "m6i.xlarge", + "m6id.12xlarge", + "m6id.16xlarge", + "m6id.24xlarge", + "m6id.2xlarge", + "m6id.32xlarge", + "m6id.4xlarge", + "m6id.8xlarge", + "m6id.large", + "m6id.metal", + "m6id.xlarge", + "m6idn.12xlarge", + "m6idn.16xlarge", + "m6idn.24xlarge", + "m6idn.2xlarge", + "m6idn.32xlarge", + "m6idn.4xlarge", + "m6idn.8xlarge", + "m6idn.large", + "m6idn.metal", + "m6idn.xlarge", + "m6in.12xlarge", + "m6in.16xlarge", + "m6in.24xlarge", + "m6in.2xlarge", + "m6in.32xlarge", + "m6in.4xlarge", + "m6in.8xlarge", + "m6in.large", + "m6in.metal", + "m6in.xlarge", + "m7a.12xlarge", + "m7a.16xlarge", + "m7a.24xlarge", + "m7a.2xlarge", + "m7a.32xlarge", + "m7a.48xlarge", + "m7a.4xlarge", + "m7a.8xlarge", + "m7a.large", + "m7a.medium", + "m7a.metal-48xl", + "m7a.xlarge", + "m7g.12xlarge", + "m7g.16xlarge", + "m7g.2xlarge", + "m7g.4xlarge", + "m7g.8xlarge", + "m7g.large", + "m7g.medium", + "m7g.metal", + "m7g.xlarge", + "m7gd.12xlarge", + "m7gd.16xlarge", + "m7gd.2xlarge", + "m7gd.4xlarge", + "m7gd.8xlarge", + "m7gd.large", + "m7gd.medium", + "m7gd.xlarge", + "m7i-flex.2xlarge", + "m7i-flex.4xlarge", + "m7i-flex.8xlarge", + "m7i-flex.large", + "m7i-flex.xlarge", + "m7i.12xlarge", + "m7i.16xlarge", + "m7i.24xlarge", + "m7i.2xlarge", + "m7i.48xlarge", + "m7i.4xlarge", + "m7i.8xlarge", + "m7i.large", + "m7i.metal-24xl", + "m7i.metal-48xl", + "m7i.xlarge", + "p2.16xlarge", + "p2.8xlarge", + "p2.xlarge", + "p3.16xlarge", + "p3.2xlarge", + "p3.8xlarge", + "p3dn.24xlarge", + "p4d.24xlarge", + "r3.2xlarge", + "r3.4xlarge", + "r3.8xlarge", + "r3.large", + "r3.xlarge", + "r4.16xlarge", + "r4.2xlarge", + "r4.4xlarge", + "r4.8xlarge", + "r4.large", + "r4.xlarge", + "r5.12xlarge", + "r5.16xlarge", + "r5.24xlarge", + "r5.2xlarge", + "r5.4xlarge", + "r5.8xlarge", + "r5.large", + "r5.metal", + "r5.xlarge", + "r5a.12xlarge", + "r5a.16xlarge", + "r5a.24xlarge", + "r5a.2xlarge", + "r5a.4xlarge", + "r5a.8xlarge", + "r5a.large", + "r5a.xlarge", + "r5ad.12xlarge", + "r5ad.16xlarge", + "r5ad.24xlarge", + "r5ad.2xlarge", + "r5ad.4xlarge", + "r5ad.8xlarge", + "r5ad.large", + "r5ad.xlarge", + "r5b.12xlarge", + "r5b.16xlarge", + "r5b.24xlarge", + "r5b.2xlarge", + "r5b.4xlarge", + "r5b.8xlarge", + "r5b.large", + "r5b.metal", + "r5b.xlarge", + "r5d.12xlarge", + "r5d.16xlarge", + "r5d.24xlarge", + "r5d.2xlarge", + "r5d.4xlarge", + "r5d.8xlarge", + "r5d.large", + "r5d.metal", + "r5d.xlarge", + "r5dn.12xlarge", + "r5dn.16xlarge", + "r5dn.24xlarge", + "r5dn.2xlarge", + "r5dn.4xlarge", + "r5dn.8xlarge", + "r5dn.large", + "r5dn.metal", + "r5dn.xlarge", + "r5n.12xlarge", + "r5n.16xlarge", + "r5n.24xlarge", + "r5n.2xlarge", + "r5n.4xlarge", + "r5n.8xlarge", + "r5n.large", + "r5n.metal", + "r5n.xlarge", + "r6a.12xlarge", + "r6a.16xlarge", + "r6a.24xlarge", + "r6a.2xlarge", + "r6a.32xlarge", + "r6a.48xlarge", + "r6a.4xlarge", + "r6a.8xlarge", + "r6a.large", + "r6a.metal", + "r6a.xlarge", + "r6g.12xlarge", + "r6g.16xlarge", + "r6g.2xlarge", + "r6g.4xlarge", + "r6g.8xlarge", + "r6g.large", + "r6g.medium", + "r6g.metal", + "r6g.xlarge", + "r6gd.12xlarge", + "r6gd.16xlarge", + "r6gd.2xlarge", + "r6gd.4xlarge", + "r6gd.8xlarge", + "r6gd.large", + "r6gd.medium", + "r6gd.metal", + "r6gd.xlarge", + "r6i.12xlarge", + "r6i.16xlarge", + "r6i.24xlarge", + "r6i.2xlarge", + "r6i.32xlarge", + "r6i.4xlarge", + "r6i.8xlarge", + "r6i.large", + "r6i.metal", + "r6i.xlarge", + "r6id.12xlarge", + "r6id.16xlarge", + "r6id.24xlarge", + "r6id.2xlarge", + "r6id.32xlarge", + "r6id.4xlarge", + "r6id.8xlarge", + "r6id.large", + "r6id.metal", + "r6id.xlarge", + "r6idn.12xlarge", + "r6idn.16xlarge", + "r6idn.24xlarge", + "r6idn.2xlarge", + "r6idn.32xlarge", + "r6idn.4xlarge", + "r6idn.8xlarge", + "r6idn.large", + "r6idn.metal", + "r6idn.xlarge", + "r6in.12xlarge", + "r6in.16xlarge", + "r6in.24xlarge", + "r6in.2xlarge", + "r6in.32xlarge", + "r6in.4xlarge", + "r6in.8xlarge", + "r6in.large", + "r6in.metal", + "r6in.xlarge", + "r7a.12xlarge", + "r7a.16xlarge", + "r7a.24xlarge", + "r7a.2xlarge", + "r7a.32xlarge", + "r7a.48xlarge", + "r7a.4xlarge", + "r7a.8xlarge", + "r7a.large", + "r7a.medium", + "r7a.metal-48xl", + "r7a.xlarge", + "r7g.12xlarge", + "r7g.16xlarge", + "r7g.2xlarge", + "r7g.4xlarge", + "r7g.8xlarge", + "r7g.large", + "r7g.medium", + "r7g.metal", + "r7g.xlarge", + "r7gd.12xlarge", + "r7gd.16xlarge", + "r7gd.2xlarge", + "r7gd.4xlarge", + "r7gd.8xlarge", + "r7gd.large", + "r7gd.medium", + "r7gd.xlarge", + "r7i.12xlarge", + "r7i.16xlarge", + "r7i.24xlarge", + "r7i.2xlarge", + "r7i.48xlarge", + "r7i.4xlarge", + "r7i.8xlarge", + "r7i.large", + "r7i.metal-24xl", + "r7i.metal-48xl", + "r7i.xlarge", + "r7iz.12xlarge", + "r7iz.16xlarge", + "r7iz.2xlarge", + "r7iz.32xlarge", + "r7iz.4xlarge", + "r7iz.8xlarge", + "r7iz.large", + "r7iz.metal-16xl", + "r7iz.metal-32xl", + "r7iz.xlarge", + "t1.micro", + "t2.2xlarge", + "t2.large", + "t2.medium", + "t2.micro", + "t2.nano", + "t2.small", + "t2.xlarge", + "t3.2xlarge", + "t3.large", + "t3.medium", + "t3.micro", + "t3.nano", + "t3.small", + "t3.xlarge", + "t3a.2xlarge", + "t3a.large", + "t3a.medium", + "t3a.micro", + "t3a.nano", + "t3a.small", + "t3a.xlarge", + "t4g.2xlarge", + "t4g.large", + "t4g.medium", + "t4g.micro", + "t4g.nano", + "t4g.small", + "t4g.xlarge", + "u-12tb1.112xlarge", + "u-3tb1.56xlarge", + "u-6tb1.112xlarge", + "u-6tb1.56xlarge", + "u-9tb1.112xlarge", + "vt1.24xlarge", + "vt1.3xlarge", + "vt1.6xlarge", + "x1.16xlarge", + "x1.32xlarge", + "x1e.16xlarge", + "x1e.2xlarge", + "x1e.32xlarge", + "x1e.4xlarge", + "x1e.8xlarge", + "x1e.xlarge", + "x2idn.16xlarge", + "x2idn.24xlarge", + "x2idn.32xlarge", + "x2idn.metal", + "x2iedn.16xlarge", + "x2iedn.24xlarge", + "x2iedn.2xlarge", + "x2iedn.32xlarge", + "x2iedn.4xlarge", + "x2iedn.8xlarge", + "x2iedn.metal", + "x2iedn.xlarge", + "x2iezn.12xlarge", + "x2iezn.2xlarge", + "x2iezn.4xlarge", + "x2iezn.6xlarge", + "x2iezn.8xlarge", + "x2iezn.metal", + "z1d.12xlarge", + "z1d.2xlarge", + "z1d.3xlarge", + "z1d.6xlarge", + "z1d.large", + "z1d.metal", + "z1d.xlarge", + ], + "ap-northeast-2": [ + "c3.2xlarge", + "c3.4xlarge", + "c3.8xlarge", + "c3.large", + "c3.xlarge", + "c4.2xlarge", + "c4.4xlarge", + "c4.8xlarge", + "c4.large", + "c4.xlarge", + "c5.12xlarge", + "c5.18xlarge", + "c5.24xlarge", + "c5.2xlarge", + "c5.4xlarge", + "c5.9xlarge", + "c5.large", + "c5.metal", + "c5.xlarge", + "c5a.12xlarge", + "c5a.16xlarge", + "c5a.24xlarge", + "c5a.2xlarge", + "c5a.4xlarge", + "c5a.8xlarge", + "c5a.large", + "c5a.xlarge", + "c5d.12xlarge", + "c5d.18xlarge", + "c5d.24xlarge", + "c5d.2xlarge", + "c5d.4xlarge", + "c5d.9xlarge", + "c5d.large", + "c5d.metal", + "c5d.xlarge", + "c5n.18xlarge", + "c5n.2xlarge", + "c5n.4xlarge", + "c5n.9xlarge", + "c5n.large", + "c5n.metal", + "c5n.xlarge", + "c6g.12xlarge", + "c6g.16xlarge", + "c6g.2xlarge", + "c6g.4xlarge", + "c6g.8xlarge", + "c6g.large", + "c6g.medium", + "c6g.metal", + "c6g.xlarge", + "c6gd.12xlarge", + "c6gd.16xlarge", + "c6gd.2xlarge", + "c6gd.4xlarge", + "c6gd.8xlarge", + "c6gd.large", + "c6gd.medium", + "c6gd.metal", + "c6gd.xlarge", + "c6gn.12xlarge", + "c6gn.16xlarge", + "c6gn.2xlarge", + "c6gn.4xlarge", + "c6gn.8xlarge", + "c6gn.large", + "c6gn.medium", + "c6gn.xlarge", + "c6i.12xlarge", + "c6i.16xlarge", + "c6i.24xlarge", + "c6i.2xlarge", + "c6i.32xlarge", + "c6i.4xlarge", + "c6i.8xlarge", + "c6i.large", + "c6i.metal", + "c6i.xlarge", + "c6id.12xlarge", + "c6id.16xlarge", + "c6id.24xlarge", + "c6id.2xlarge", + "c6id.32xlarge", + "c6id.4xlarge", + "c6id.8xlarge", + "c6id.large", + "c6id.metal", + "c6id.xlarge", + "c6in.12xlarge", + "c6in.16xlarge", + "c6in.24xlarge", + "c6in.2xlarge", + "c6in.32xlarge", + "c6in.4xlarge", + "c6in.8xlarge", + "c6in.large", + "c6in.metal", + "c6in.xlarge", + "c7g.12xlarge", + "c7g.16xlarge", + "c7g.2xlarge", + "c7g.4xlarge", + "c7g.8xlarge", + "c7g.large", + "c7g.medium", + "c7g.metal", + "c7g.xlarge", + "c7i.12xlarge", + "c7i.16xlarge", + "c7i.24xlarge", + "c7i.2xlarge", + "c7i.48xlarge", + "c7i.4xlarge", + "c7i.8xlarge", + "c7i.large", + "c7i.metal-24xl", + "c7i.metal-48xl", + "c7i.xlarge", + "d2.2xlarge", + "d2.4xlarge", + "d2.8xlarge", + "d2.xlarge", + "g2.2xlarge", + "g2.8xlarge", + "g3.16xlarge", + "g3.4xlarge", + "g3.8xlarge", + "g3s.xlarge", + "g4dn.12xlarge", + "g4dn.16xlarge", + "g4dn.2xlarge", + "g4dn.4xlarge", + "g4dn.8xlarge", + "g4dn.metal", + "g4dn.xlarge", + "g5.12xlarge", + "g5.16xlarge", + "g5.24xlarge", + "g5.2xlarge", + "g5.48xlarge", + "g5.4xlarge", + "g5.8xlarge", + "g5.xlarge", + "g5g.16xlarge", + "g5g.2xlarge", + "g5g.4xlarge", + "g5g.8xlarge", + "g5g.metal", + "g5g.xlarge", + "i2.2xlarge", + "i2.4xlarge", + "i2.8xlarge", + "i2.xlarge", + "i3.16xlarge", + "i3.2xlarge", + "i3.4xlarge", + "i3.8xlarge", + "i3.large", + "i3.metal", + "i3.xlarge", + "i3en.12xlarge", + "i3en.24xlarge", + "i3en.2xlarge", + "i3en.3xlarge", + "i3en.6xlarge", + "i3en.large", + "i3en.metal", + "i3en.xlarge", + "i4i.12xlarge", + "i4i.16xlarge", + "i4i.24xlarge", + "i4i.2xlarge", + "i4i.32xlarge", + "i4i.4xlarge", + "i4i.8xlarge", + "i4i.large", + "i4i.metal", + "i4i.xlarge", + "inf1.24xlarge", + "inf1.2xlarge", + "inf1.6xlarge", + "inf1.xlarge", + "m3.2xlarge", + "m3.large", + "m3.medium", + "m3.xlarge", + "m4.10xlarge", + "m4.16xlarge", + "m4.2xlarge", + "m4.4xlarge", + "m4.large", + "m4.xlarge", + "m5.12xlarge", + "m5.16xlarge", + "m5.24xlarge", + "m5.2xlarge", + "m5.4xlarge", + "m5.8xlarge", + "m5.large", + "m5.metal", + "m5.xlarge", + "m5a.12xlarge", + "m5a.16xlarge", + "m5a.24xlarge", + "m5a.2xlarge", + "m5a.4xlarge", + "m5a.8xlarge", + "m5a.large", + "m5a.xlarge", + "m5ad.12xlarge", + "m5ad.16xlarge", + "m5ad.24xlarge", + "m5ad.2xlarge", + "m5ad.4xlarge", + "m5ad.8xlarge", + "m5ad.large", + "m5ad.xlarge", + "m5d.12xlarge", + "m5d.16xlarge", + "m5d.24xlarge", + "m5d.2xlarge", + "m5d.4xlarge", + "m5d.8xlarge", + "m5d.large", + "m5d.metal", + "m5d.xlarge", + "m5zn.12xlarge", + "m5zn.2xlarge", + "m5zn.3xlarge", + "m5zn.6xlarge", + "m5zn.large", + "m5zn.metal", + "m5zn.xlarge", + "m6g.12xlarge", + "m6g.16xlarge", + "m6g.2xlarge", + "m6g.4xlarge", + "m6g.8xlarge", + "m6g.large", + "m6g.medium", + "m6g.metal", + "m6g.xlarge", + "m6gd.12xlarge", + "m6gd.16xlarge", + "m6gd.2xlarge", + "m6gd.4xlarge", + "m6gd.8xlarge", + "m6gd.large", + "m6gd.medium", + "m6gd.metal", + "m6gd.xlarge", + "m6i.12xlarge", + "m6i.16xlarge", + "m6i.24xlarge", + "m6i.2xlarge", + "m6i.32xlarge", + "m6i.4xlarge", + "m6i.8xlarge", + "m6i.large", + "m6i.metal", + "m6i.xlarge", + "m6id.12xlarge", + "m6id.16xlarge", + "m6id.24xlarge", + "m6id.2xlarge", + "m6id.32xlarge", + "m6id.4xlarge", + "m6id.8xlarge", + "m6id.large", + "m6id.metal", + "m6id.xlarge", + "m7g.12xlarge", + "m7g.16xlarge", + "m7g.2xlarge", + "m7g.4xlarge", + "m7g.8xlarge", + "m7g.large", + "m7g.medium", + "m7g.metal", + "m7g.xlarge", + "m7i-flex.2xlarge", + "m7i-flex.4xlarge", + "m7i-flex.8xlarge", + "m7i-flex.large", + "m7i-flex.xlarge", + "m7i.12xlarge", + "m7i.16xlarge", + "m7i.24xlarge", + "m7i.2xlarge", + "m7i.48xlarge", + "m7i.4xlarge", + "m7i.8xlarge", + "m7i.large", + "m7i.metal-24xl", + "m7i.metal-48xl", + "m7i.xlarge", + "p2.16xlarge", + "p2.8xlarge", + "p2.xlarge", + "p3.16xlarge", + "p3.2xlarge", + "p3.8xlarge", + "p4d.24xlarge", + "r3.2xlarge", + "r3.4xlarge", + "r3.8xlarge", + "r3.large", + "r3.xlarge", + "r4.16xlarge", + "r4.2xlarge", + "r4.4xlarge", + "r4.8xlarge", + "r4.large", + "r4.xlarge", + "r5.12xlarge", + "r5.16xlarge", + "r5.24xlarge", + "r5.2xlarge", + "r5.4xlarge", + "r5.8xlarge", + "r5.large", + "r5.metal", + "r5.xlarge", + "r5a.12xlarge", + "r5a.16xlarge", + "r5a.24xlarge", + "r5a.2xlarge", + "r5a.4xlarge", + "r5a.8xlarge", + "r5a.large", + "r5a.xlarge", + "r5ad.12xlarge", + "r5ad.16xlarge", + "r5ad.24xlarge", + "r5ad.2xlarge", + "r5ad.4xlarge", + "r5ad.8xlarge", + "r5ad.large", + "r5ad.xlarge", + "r5b.12xlarge", + "r5b.16xlarge", + "r5b.24xlarge", + "r5b.2xlarge", + "r5b.4xlarge", + "r5b.8xlarge", + "r5b.large", + "r5b.metal", + "r5b.xlarge", + "r5d.12xlarge", + "r5d.16xlarge", + "r5d.24xlarge", + "r5d.2xlarge", + "r5d.4xlarge", + "r5d.8xlarge", + "r5d.large", + "r5d.metal", + "r5d.xlarge", + "r5dn.12xlarge", + "r5dn.16xlarge", + "r5dn.24xlarge", + "r5dn.2xlarge", + "r5dn.4xlarge", + "r5dn.8xlarge", + "r5dn.large", + "r5dn.metal", + "r5dn.xlarge", + "r5n.12xlarge", + "r5n.16xlarge", + "r5n.24xlarge", + "r5n.2xlarge", + "r5n.4xlarge", + "r5n.8xlarge", + "r5n.large", + "r5n.metal", + "r5n.xlarge", + "r6g.12xlarge", + "r6g.16xlarge", + "r6g.2xlarge", + "r6g.4xlarge", + "r6g.8xlarge", + "r6g.large", + "r6g.medium", + "r6g.metal", + "r6g.xlarge", + "r6gd.12xlarge", + "r6gd.16xlarge", + "r6gd.2xlarge", + "r6gd.4xlarge", + "r6gd.8xlarge", + "r6gd.large", + "r6gd.medium", + "r6gd.metal", + "r6gd.xlarge", + "r6i.12xlarge", + "r6i.16xlarge", + "r6i.24xlarge", + "r6i.2xlarge", + "r6i.32xlarge", + "r6i.4xlarge", + "r6i.8xlarge", + "r6i.large", + "r6i.metal", + "r6i.xlarge", + "r6id.12xlarge", + "r6id.16xlarge", + "r6id.24xlarge", + "r6id.2xlarge", + "r6id.32xlarge", + "r6id.4xlarge", + "r6id.8xlarge", + "r6id.large", + "r6id.metal", + "r6id.xlarge", + "r7g.12xlarge", + "r7g.16xlarge", + "r7g.2xlarge", + "r7g.4xlarge", + "r7g.8xlarge", + "r7g.large", + "r7g.medium", + "r7g.metal", + "r7g.xlarge", + "r7i.12xlarge", + "r7i.16xlarge", + "r7i.24xlarge", + "r7i.2xlarge", + "r7i.48xlarge", + "r7i.4xlarge", + "r7i.8xlarge", + "r7i.large", + "r7i.metal-24xl", + "r7i.metal-48xl", + "r7i.xlarge", + "t2.2xlarge", + "t2.large", + "t2.medium", + "t2.micro", + "t2.nano", + "t2.small", + "t2.xlarge", + "t3.2xlarge", + "t3.large", + "t3.medium", + "t3.micro", + "t3.nano", + "t3.small", + "t3.xlarge", + "t3a.2xlarge", + "t3a.large", + "t3a.medium", + "t3a.micro", + "t3a.nano", + "t3a.small", + "t3a.xlarge", + "t4g.2xlarge", + "t4g.large", + "t4g.medium", + "t4g.micro", + "t4g.nano", + "t4g.small", + "t4g.xlarge", + "u-12tb1.112xlarge", + "u-24tb1.112xlarge", + "u-6tb1.112xlarge", + "u-6tb1.56xlarge", + "u-9tb1.112xlarge", + "x1.16xlarge", + "x1.32xlarge", + "x1e.16xlarge", + "x1e.2xlarge", + "x1e.32xlarge", + "x1e.4xlarge", + "x1e.8xlarge", + "x1e.xlarge", + "x2idn.16xlarge", + "x2idn.24xlarge", + "x2idn.32xlarge", + "x2idn.metal", + "x2iedn.16xlarge", + "x2iedn.24xlarge", + "x2iedn.2xlarge", + "x2iedn.32xlarge", + "x2iedn.4xlarge", + "x2iedn.8xlarge", + "x2iedn.metal", + "x2iedn.xlarge", + "z1d.12xlarge", + "z1d.2xlarge", + "z1d.3xlarge", + "z1d.6xlarge", + "z1d.large", + "z1d.metal", + "z1d.xlarge", + ], + "ap-northeast-3": [ + "c4.2xlarge", + "c4.4xlarge", + "c4.8xlarge", + "c4.large", + "c4.xlarge", + "c5.12xlarge", + "c5.18xlarge", + "c5.24xlarge", + "c5.2xlarge", + "c5.4xlarge", + "c5.9xlarge", + "c5.large", + "c5.metal", + "c5.xlarge", + "c5d.12xlarge", + "c5d.18xlarge", + "c5d.24xlarge", + "c5d.2xlarge", + "c5d.4xlarge", + "c5d.9xlarge", + "c5d.large", + "c5d.metal", + "c5d.xlarge", + "c5n.18xlarge", + "c5n.2xlarge", + "c5n.4xlarge", + "c5n.9xlarge", + "c5n.large", + "c5n.metal", + "c5n.xlarge", + "c6g.12xlarge", + "c6g.16xlarge", + "c6g.2xlarge", + "c6g.4xlarge", + "c6g.8xlarge", + "c6g.large", + "c6g.medium", + "c6g.metal", + "c6g.xlarge", + "c6gd.12xlarge", + "c6gd.16xlarge", + "c6gd.2xlarge", + "c6gd.4xlarge", + "c6gd.8xlarge", + "c6gd.large", + "c6gd.medium", + "c6gd.metal", + "c6gd.xlarge", + "c6gn.12xlarge", + "c6gn.16xlarge", + "c6gn.2xlarge", + "c6gn.4xlarge", + "c6gn.8xlarge", + "c6gn.large", + "c6gn.medium", + "c6gn.xlarge", + "c6i.12xlarge", + "c6i.16xlarge", + "c6i.24xlarge", + "c6i.2xlarge", + "c6i.32xlarge", + "c6i.4xlarge", + "c6i.8xlarge", + "c6i.large", + "c6i.metal", + "c6i.xlarge", + "d2.2xlarge", + "d2.4xlarge", + "d2.8xlarge", + "d2.xlarge", + "g4dn.12xlarge", + "g4dn.16xlarge", + "g4dn.2xlarge", + "g4dn.4xlarge", + "g4dn.8xlarge", + "g4dn.metal", + "g4dn.xlarge", + "i3.16xlarge", + "i3.2xlarge", + "i3.4xlarge", + "i3.8xlarge", + "i3.large", + "i3.metal", + "i3.xlarge", + "i3en.12xlarge", + "i3en.24xlarge", + "i3en.2xlarge", + "i3en.3xlarge", + "i3en.6xlarge", + "i3en.large", + "i3en.metal", + "i3en.xlarge", + "i4i.12xlarge", + "i4i.16xlarge", + "i4i.24xlarge", + "i4i.2xlarge", + "i4i.32xlarge", + "i4i.4xlarge", + "i4i.8xlarge", + "i4i.large", + "i4i.metal", + "i4i.xlarge", + "m4.16xlarge", + "m4.2xlarge", + "m4.4xlarge", + "m4.large", + "m4.xlarge", + "m5.12xlarge", + "m5.16xlarge", + "m5.24xlarge", + "m5.2xlarge", + "m5.4xlarge", + "m5.8xlarge", + "m5.large", + "m5.metal", + "m5.xlarge", + "m5d.12xlarge", + "m5d.16xlarge", + "m5d.24xlarge", + "m5d.2xlarge", + "m5d.4xlarge", + "m5d.8xlarge", + "m5d.large", + "m5d.metal", + "m5d.xlarge", + "m6g.12xlarge", + "m6g.16xlarge", + "m6g.2xlarge", + "m6g.4xlarge", + "m6g.8xlarge", + "m6g.large", + "m6g.medium", + "m6g.metal", + "m6g.xlarge", + "m6gd.12xlarge", + "m6gd.16xlarge", + "m6gd.2xlarge", + "m6gd.4xlarge", + "m6gd.8xlarge", + "m6gd.large", + "m6gd.medium", + "m6gd.metal", + "m6gd.xlarge", + "m6i.12xlarge", + "m6i.16xlarge", + "m6i.24xlarge", + "m6i.2xlarge", + "m6i.32xlarge", + "m6i.4xlarge", + "m6i.8xlarge", + "m6i.large", + "m6i.metal", + "m6i.xlarge", + "r4.16xlarge", + "r4.2xlarge", + "r4.4xlarge", + "r4.8xlarge", + "r4.large", + "r4.xlarge", + "r5.12xlarge", + "r5.16xlarge", + "r5.24xlarge", + "r5.2xlarge", + "r5.4xlarge", + "r5.8xlarge", + "r5.large", + "r5.metal", + "r5.xlarge", + "r5d.12xlarge", + "r5d.16xlarge", + "r5d.24xlarge", + "r5d.2xlarge", + "r5d.4xlarge", + "r5d.8xlarge", + "r5d.large", + "r5d.metal", + "r5d.xlarge", + "r6g.12xlarge", + "r6g.16xlarge", + "r6g.2xlarge", + "r6g.4xlarge", + "r6g.8xlarge", + "r6g.large", + "r6g.medium", + "r6g.metal", + "r6g.xlarge", + "r6gd.12xlarge", + "r6gd.16xlarge", + "r6gd.2xlarge", + "r6gd.4xlarge", + "r6gd.8xlarge", + "r6gd.large", + "r6gd.medium", + "r6gd.metal", + "r6gd.xlarge", + "r6i.12xlarge", + "r6i.16xlarge", + "r6i.24xlarge", + "r6i.2xlarge", + "r6i.32xlarge", + "r6i.4xlarge", + "r6i.8xlarge", + "r6i.large", + "r6i.metal", + "r6i.xlarge", + "t2.2xlarge", + "t2.large", + "t2.medium", + "t2.micro", + "t2.nano", + "t2.small", + "t2.xlarge", + "t3.2xlarge", + "t3.large", + "t3.medium", + "t3.micro", + "t3.nano", + "t3.small", + "t3.xlarge", + "t4g.2xlarge", + "t4g.large", + "t4g.medium", + "t4g.micro", + "t4g.nano", + "t4g.small", + "t4g.xlarge", + "x1.16xlarge", + "x1.32xlarge", + "x1e.16xlarge", + "x1e.2xlarge", + "x1e.32xlarge", + "x1e.4xlarge", + "x1e.8xlarge", + "x1e.xlarge", + "x2idn.16xlarge", + "x2idn.24xlarge", + "x2idn.32xlarge", + "x2idn.metal", + "x2iedn.16xlarge", + "x2iedn.24xlarge", + "x2iedn.2xlarge", + "x2iedn.32xlarge", + "x2iedn.4xlarge", + "x2iedn.8xlarge", + "x2iedn.metal", + "x2iedn.xlarge", + ], + "ap-southeast-1": [ + "a1.2xlarge", + "a1.4xlarge", + "a1.large", + "a1.medium", + "a1.metal", + "a1.xlarge", + "c1.medium", + "c1.xlarge", + "c3.2xlarge", + "c3.4xlarge", + "c3.8xlarge", + "c3.large", + "c3.xlarge", + "c4.2xlarge", + "c4.4xlarge", + "c4.8xlarge", + "c4.large", + "c4.xlarge", + "c5.12xlarge", + "c5.18xlarge", + "c5.24xlarge", + "c5.2xlarge", + "c5.4xlarge", + "c5.9xlarge", + "c5.large", + "c5.metal", + "c5.xlarge", + "c5a.12xlarge", + "c5a.16xlarge", + "c5a.24xlarge", + "c5a.2xlarge", + "c5a.4xlarge", + "c5a.8xlarge", + "c5a.large", + "c5a.xlarge", + "c5ad.12xlarge", + "c5ad.16xlarge", + "c5ad.24xlarge", + "c5ad.2xlarge", + "c5ad.4xlarge", + "c5ad.8xlarge", + "c5ad.large", + "c5ad.xlarge", + "c5d.12xlarge", + "c5d.18xlarge", + "c5d.24xlarge", + "c5d.2xlarge", + "c5d.4xlarge", + "c5d.9xlarge", + "c5d.large", + "c5d.metal", + "c5d.xlarge", + "c5n.18xlarge", + "c5n.2xlarge", + "c5n.4xlarge", + "c5n.9xlarge", + "c5n.large", + "c5n.metal", + "c5n.xlarge", + "c6a.12xlarge", + "c6a.16xlarge", + "c6a.24xlarge", + "c6a.2xlarge", + "c6a.32xlarge", + "c6a.48xlarge", + "c6a.4xlarge", + "c6a.8xlarge", + "c6a.large", + "c6a.metal", + "c6a.xlarge", + "c6g.12xlarge", + "c6g.16xlarge", + "c6g.2xlarge", + "c6g.4xlarge", + "c6g.8xlarge", + "c6g.large", + "c6g.medium", + "c6g.metal", + "c6g.xlarge", + "c6gd.12xlarge", + "c6gd.16xlarge", + "c6gd.2xlarge", + "c6gd.4xlarge", + "c6gd.8xlarge", + "c6gd.large", + "c6gd.medium", + "c6gd.metal", + "c6gd.xlarge", + "c6gn.12xlarge", + "c6gn.16xlarge", + "c6gn.2xlarge", + "c6gn.4xlarge", + "c6gn.8xlarge", + "c6gn.large", + "c6gn.medium", + "c6gn.xlarge", + "c6i.12xlarge", + "c6i.16xlarge", + "c6i.24xlarge", + "c6i.2xlarge", + "c6i.32xlarge", + "c6i.4xlarge", + "c6i.8xlarge", + "c6i.large", + "c6i.metal", + "c6i.xlarge", + "c6id.12xlarge", + "c6id.16xlarge", + "c6id.24xlarge", + "c6id.2xlarge", + "c6id.32xlarge", + "c6id.4xlarge", + "c6id.8xlarge", + "c6id.large", + "c6id.metal", + "c6id.xlarge", + "c6in.12xlarge", + "c6in.16xlarge", + "c6in.24xlarge", + "c6in.2xlarge", + "c6in.32xlarge", + "c6in.4xlarge", + "c6in.8xlarge", + "c6in.large", + "c6in.metal", + "c6in.xlarge", + "c7g.12xlarge", + "c7g.16xlarge", + "c7g.2xlarge", + "c7g.4xlarge", + "c7g.8xlarge", + "c7g.large", + "c7g.medium", + "c7g.metal", + "c7g.xlarge", + "c7gd.12xlarge", + "c7gd.16xlarge", + "c7gd.2xlarge", + "c7gd.4xlarge", + "c7gd.8xlarge", + "c7gd.large", + "c7gd.medium", + "c7gd.xlarge", + "c7i.12xlarge", + "c7i.16xlarge", + "c7i.24xlarge", + "c7i.2xlarge", + "c7i.48xlarge", + "c7i.4xlarge", + "c7i.8xlarge", + "c7i.large", + "c7i.metal-24xl", + "c7i.metal-48xl", + "c7i.xlarge", + "d2.2xlarge", + "d2.4xlarge", + "d2.8xlarge", + "d2.xlarge", + "d3.2xlarge", + "d3.4xlarge", + "d3.8xlarge", + "d3.xlarge", + "d3en.12xlarge", + "d3en.2xlarge", + "d3en.4xlarge", + "d3en.6xlarge", + "d3en.8xlarge", + "d3en.xlarge", + "g2.2xlarge", + "g2.8xlarge", + "g3.16xlarge", + "g3.4xlarge", + "g3.8xlarge", + "g4dn.12xlarge", + "g4dn.16xlarge", + "g4dn.2xlarge", + "g4dn.4xlarge", + "g4dn.8xlarge", + "g4dn.metal", + "g4dn.xlarge", + "g5g.16xlarge", + "g5g.2xlarge", + "g5g.4xlarge", + "g5g.8xlarge", + "g5g.metal", + "g5g.xlarge", + "hpc6a.48xlarge", + "i2.2xlarge", + "i2.4xlarge", + "i2.8xlarge", + "i2.xlarge", + "i3.16xlarge", + "i3.2xlarge", + "i3.4xlarge", + "i3.8xlarge", + "i3.large", + "i3.metal", + "i3.xlarge", + "i3en.12xlarge", + "i3en.24xlarge", + "i3en.2xlarge", + "i3en.3xlarge", + "i3en.6xlarge", + "i3en.large", + "i3en.metal", + "i3en.xlarge", + "i4g.16xlarge", + "i4g.2xlarge", + "i4g.4xlarge", + "i4g.8xlarge", + "i4g.large", + "i4g.xlarge", + "i4i.12xlarge", + "i4i.16xlarge", + "i4i.24xlarge", + "i4i.2xlarge", + "i4i.32xlarge", + "i4i.4xlarge", + "i4i.8xlarge", + "i4i.large", + "i4i.metal", + "i4i.xlarge", + "im4gn.16xlarge", + "im4gn.2xlarge", + "im4gn.4xlarge", + "im4gn.8xlarge", + "im4gn.large", + "im4gn.xlarge", + "inf1.24xlarge", + "inf1.2xlarge", + "inf1.6xlarge", + "inf1.xlarge", + "inf2.24xlarge", + "inf2.48xlarge", + "inf2.8xlarge", + "inf2.xlarge", + "is4gen.2xlarge", + "is4gen.4xlarge", + "is4gen.8xlarge", + "is4gen.large", + "is4gen.medium", + "is4gen.xlarge", + "m1.large", + "m1.medium", + "m1.xlarge", + "m2.2xlarge", + "m2.4xlarge", + "m2.xlarge", + "m3.2xlarge", + "m3.large", + "m3.medium", + "m3.xlarge", + "m4.10xlarge", + "m4.16xlarge", + "m4.2xlarge", + "m4.4xlarge", + "m4.large", + "m4.xlarge", + "m5.12xlarge", + "m5.16xlarge", + "m5.24xlarge", + "m5.2xlarge", + "m5.4xlarge", + "m5.8xlarge", + "m5.large", + "m5.metal", + "m5.xlarge", + "m5a.12xlarge", + "m5a.16xlarge", + "m5a.24xlarge", + "m5a.2xlarge", + "m5a.4xlarge", + "m5a.8xlarge", + "m5a.large", + "m5a.xlarge", + "m5ad.12xlarge", + "m5ad.16xlarge", + "m5ad.24xlarge", + "m5ad.2xlarge", + "m5ad.4xlarge", + "m5ad.8xlarge", + "m5ad.large", + "m5ad.xlarge", + "m5d.12xlarge", + "m5d.16xlarge", + "m5d.24xlarge", + "m5d.2xlarge", + "m5d.4xlarge", + "m5d.8xlarge", + "m5d.large", + "m5d.metal", + "m5d.xlarge", + "m5dn.12xlarge", + "m5dn.16xlarge", + "m5dn.24xlarge", + "m5dn.2xlarge", + "m5dn.4xlarge", + "m5dn.8xlarge", + "m5dn.large", + "m5dn.metal", + "m5dn.xlarge", + "m5n.12xlarge", + "m5n.16xlarge", + "m5n.24xlarge", + "m5n.2xlarge", + "m5n.4xlarge", + "m5n.8xlarge", + "m5n.large", + "m5n.metal", + "m5n.xlarge", + "m5zn.12xlarge", + "m5zn.2xlarge", + "m5zn.3xlarge", + "m5zn.6xlarge", + "m5zn.large", + "m5zn.metal", + "m5zn.xlarge", + "m6a.12xlarge", + "m6a.16xlarge", + "m6a.24xlarge", + "m6a.2xlarge", + "m6a.32xlarge", + "m6a.48xlarge", + "m6a.4xlarge", + "m6a.8xlarge", + "m6a.large", + "m6a.metal", + "m6a.xlarge", + "m6g.12xlarge", + "m6g.16xlarge", + "m6g.2xlarge", + "m6g.4xlarge", + "m6g.8xlarge", + "m6g.large", + "m6g.medium", + "m6g.metal", + "m6g.xlarge", + "m6gd.12xlarge", + "m6gd.16xlarge", + "m6gd.2xlarge", + "m6gd.4xlarge", + "m6gd.8xlarge", + "m6gd.large", + "m6gd.medium", + "m6gd.metal", + "m6gd.xlarge", + "m6i.12xlarge", + "m6i.16xlarge", + "m6i.24xlarge", + "m6i.2xlarge", + "m6i.32xlarge", + "m6i.4xlarge", + "m6i.8xlarge", + "m6i.large", + "m6i.metal", + "m6i.xlarge", + "m6id.12xlarge", + "m6id.16xlarge", + "m6id.24xlarge", + "m6id.2xlarge", + "m6id.32xlarge", + "m6id.4xlarge", + "m6id.8xlarge", + "m6id.large", + "m6id.metal", + "m6id.xlarge", + "m6idn.12xlarge", + "m6idn.16xlarge", + "m6idn.24xlarge", + "m6idn.2xlarge", + "m6idn.32xlarge", + "m6idn.4xlarge", + "m6idn.8xlarge", + "m6idn.large", + "m6idn.metal", + "m6idn.xlarge", + "m6in.12xlarge", + "m6in.16xlarge", + "m6in.24xlarge", + "m6in.2xlarge", + "m6in.32xlarge", + "m6in.4xlarge", + "m6in.8xlarge", + "m6in.large", + "m6in.metal", + "m6in.xlarge", + "m7g.12xlarge", + "m7g.16xlarge", + "m7g.2xlarge", + "m7g.4xlarge", + "m7g.8xlarge", + "m7g.large", + "m7g.medium", + "m7g.metal", + "m7g.xlarge", + "m7gd.12xlarge", + "m7gd.16xlarge", + "m7gd.2xlarge", + "m7gd.4xlarge", + "m7gd.8xlarge", + "m7gd.large", + "m7gd.medium", + "m7gd.xlarge", + "m7i-flex.2xlarge", + "m7i-flex.4xlarge", + "m7i-flex.8xlarge", + "m7i-flex.large", + "m7i-flex.xlarge", + "m7i.12xlarge", + "m7i.16xlarge", + "m7i.24xlarge", + "m7i.2xlarge", + "m7i.48xlarge", + "m7i.4xlarge", + "m7i.8xlarge", + "m7i.large", + "m7i.metal-24xl", + "m7i.metal-48xl", + "m7i.xlarge", + "p2.16xlarge", + "p2.8xlarge", + "p2.xlarge", + "p3.16xlarge", + "p3.2xlarge", + "p3.8xlarge", + "p4d.24xlarge", + "p4de.24xlarge", + "r3.2xlarge", + "r3.4xlarge", + "r3.8xlarge", + "r3.large", + "r3.xlarge", + "r4.16xlarge", + "r4.2xlarge", + "r4.4xlarge", + "r4.8xlarge", + "r4.large", + "r4.xlarge", + "r5.12xlarge", + "r5.16xlarge", + "r5.24xlarge", + "r5.2xlarge", + "r5.4xlarge", + "r5.8xlarge", + "r5.large", + "r5.metal", + "r5.xlarge", + "r5a.12xlarge", + "r5a.16xlarge", + "r5a.24xlarge", + "r5a.2xlarge", + "r5a.4xlarge", + "r5a.8xlarge", + "r5a.large", + "r5a.xlarge", + "r5ad.12xlarge", + "r5ad.16xlarge", + "r5ad.24xlarge", + "r5ad.2xlarge", + "r5ad.4xlarge", + "r5ad.8xlarge", + "r5ad.large", + "r5ad.xlarge", + "r5b.12xlarge", + "r5b.16xlarge", + "r5b.24xlarge", + "r5b.2xlarge", + "r5b.4xlarge", + "r5b.8xlarge", + "r5b.large", + "r5b.metal", + "r5b.xlarge", + "r5d.12xlarge", + "r5d.16xlarge", + "r5d.24xlarge", + "r5d.2xlarge", + "r5d.4xlarge", + "r5d.8xlarge", + "r5d.large", + "r5d.metal", + "r5d.xlarge", + "r5dn.12xlarge", + "r5dn.16xlarge", + "r5dn.24xlarge", + "r5dn.2xlarge", + "r5dn.4xlarge", + "r5dn.8xlarge", + "r5dn.large", + "r5dn.metal", + "r5dn.xlarge", + "r5n.12xlarge", + "r5n.16xlarge", + "r5n.24xlarge", + "r5n.2xlarge", + "r5n.4xlarge", + "r5n.8xlarge", + "r5n.large", + "r5n.metal", + "r5n.xlarge", + "r6a.12xlarge", + "r6a.16xlarge", + "r6a.24xlarge", + "r6a.2xlarge", + "r6a.32xlarge", + "r6a.48xlarge", + "r6a.4xlarge", + "r6a.8xlarge", + "r6a.large", + "r6a.metal", + "r6a.xlarge", + "r6g.12xlarge", + "r6g.16xlarge", + "r6g.2xlarge", + "r6g.4xlarge", + "r6g.8xlarge", + "r6g.large", + "r6g.medium", + "r6g.metal", + "r6g.xlarge", + "r6gd.12xlarge", + "r6gd.16xlarge", + "r6gd.2xlarge", + "r6gd.4xlarge", + "r6gd.8xlarge", + "r6gd.large", + "r6gd.medium", + "r6gd.metal", + "r6gd.xlarge", + "r6i.12xlarge", + "r6i.16xlarge", + "r6i.24xlarge", + "r6i.2xlarge", + "r6i.32xlarge", + "r6i.4xlarge", + "r6i.8xlarge", + "r6i.large", + "r6i.metal", + "r6i.xlarge", + "r6id.12xlarge", + "r6id.16xlarge", + "r6id.24xlarge", + "r6id.2xlarge", + "r6id.32xlarge", + "r6id.4xlarge", + "r6id.8xlarge", + "r6id.large", + "r6id.metal", + "r6id.xlarge", + "r6idn.12xlarge", + "r6idn.16xlarge", + "r6idn.24xlarge", + "r6idn.2xlarge", + "r6idn.32xlarge", + "r6idn.4xlarge", + "r6idn.8xlarge", + "r6idn.large", + "r6idn.metal", + "r6idn.xlarge", + "r6in.12xlarge", + "r6in.16xlarge", + "r6in.24xlarge", + "r6in.2xlarge", + "r6in.32xlarge", + "r6in.4xlarge", + "r6in.8xlarge", + "r6in.large", + "r6in.metal", + "r6in.xlarge", + "r7g.12xlarge", + "r7g.16xlarge", + "r7g.2xlarge", + "r7g.4xlarge", + "r7g.8xlarge", + "r7g.large", + "r7g.medium", + "r7g.metal", + "r7g.xlarge", + "r7gd.12xlarge", + "r7gd.16xlarge", + "r7gd.2xlarge", + "r7gd.4xlarge", + "r7gd.8xlarge", + "r7gd.large", + "r7gd.medium", + "r7gd.xlarge", + "r7i.12xlarge", + "r7i.16xlarge", + "r7i.24xlarge", + "r7i.2xlarge", + "r7i.48xlarge", + "r7i.4xlarge", + "r7i.8xlarge", + "r7i.large", + "r7i.metal-24xl", + "r7i.metal-48xl", + "r7i.xlarge", + "t1.micro", + "t2.2xlarge", + "t2.large", + "t2.medium", + "t2.micro", + "t2.nano", + "t2.small", + "t2.xlarge", + "t3.2xlarge", + "t3.large", + "t3.medium", + "t3.micro", + "t3.nano", + "t3.small", + "t3.xlarge", + "t3a.2xlarge", + "t3a.large", + "t3a.medium", + "t3a.micro", + "t3a.nano", + "t3a.small", + "t3a.xlarge", + "t4g.2xlarge", + "t4g.large", + "t4g.medium", + "t4g.micro", + "t4g.nano", + "t4g.small", + "t4g.xlarge", + "u-12tb1.112xlarge", + "u-3tb1.56xlarge", + "u-6tb1.112xlarge", + "u-6tb1.56xlarge", + "u-9tb1.112xlarge", + "x1.16xlarge", + "x1.32xlarge", + "x1e.16xlarge", + "x1e.2xlarge", + "x1e.32xlarge", + "x1e.4xlarge", + "x1e.8xlarge", + "x1e.xlarge", + "x2idn.16xlarge", + "x2idn.24xlarge", + "x2idn.32xlarge", + "x2idn.metal", + "x2iedn.16xlarge", + "x2iedn.24xlarge", + "x2iedn.2xlarge", + "x2iedn.32xlarge", + "x2iedn.4xlarge", + "x2iedn.8xlarge", + "x2iedn.metal", + "x2iedn.xlarge", + "z1d.12xlarge", + "z1d.2xlarge", + "z1d.3xlarge", + "z1d.6xlarge", + "z1d.large", + "z1d.metal", + "z1d.xlarge", + ], + "ap-southeast-2": [ + "a1.2xlarge", + "a1.4xlarge", + "a1.large", + "a1.medium", + "a1.metal", + "a1.xlarge", + "c1.medium", + "c1.xlarge", + "c3.2xlarge", + "c3.4xlarge", + "c3.8xlarge", + "c3.large", + "c3.xlarge", + "c4.2xlarge", + "c4.4xlarge", + "c4.8xlarge", + "c4.large", + "c4.xlarge", + "c5.12xlarge", + "c5.18xlarge", + "c5.24xlarge", + "c5.2xlarge", + "c5.4xlarge", + "c5.9xlarge", + "c5.large", + "c5.metal", + "c5.xlarge", + "c5a.12xlarge", + "c5a.16xlarge", + "c5a.24xlarge", + "c5a.2xlarge", + "c5a.4xlarge", + "c5a.8xlarge", + "c5a.large", + "c5a.xlarge", + "c5ad.12xlarge", + "c5ad.16xlarge", + "c5ad.24xlarge", + "c5ad.2xlarge", + "c5ad.4xlarge", + "c5ad.8xlarge", + "c5ad.large", + "c5ad.xlarge", + "c5d.12xlarge", + "c5d.18xlarge", + "c5d.24xlarge", + "c5d.2xlarge", + "c5d.4xlarge", + "c5d.9xlarge", + "c5d.large", + "c5d.metal", + "c5d.xlarge", + "c5n.18xlarge", + "c5n.2xlarge", + "c5n.4xlarge", + "c5n.9xlarge", + "c5n.large", + "c5n.metal", + "c5n.xlarge", + "c6a.12xlarge", + "c6a.16xlarge", + "c6a.24xlarge", + "c6a.2xlarge", + "c6a.32xlarge", + "c6a.48xlarge", + "c6a.4xlarge", + "c6a.8xlarge", + "c6a.large", + "c6a.metal", + "c6a.xlarge", + "c6g.12xlarge", + "c6g.16xlarge", + "c6g.2xlarge", + "c6g.4xlarge", + "c6g.8xlarge", + "c6g.large", + "c6g.medium", + "c6g.metal", + "c6g.xlarge", + "c6gd.12xlarge", + "c6gd.16xlarge", + "c6gd.2xlarge", + "c6gd.4xlarge", + "c6gd.8xlarge", + "c6gd.large", + "c6gd.medium", + "c6gd.metal", + "c6gd.xlarge", + "c6gn.12xlarge", + "c6gn.16xlarge", + "c6gn.2xlarge", + "c6gn.4xlarge", + "c6gn.8xlarge", + "c6gn.large", + "c6gn.medium", + "c6gn.xlarge", + "c6i.12xlarge", + "c6i.16xlarge", + "c6i.24xlarge", + "c6i.2xlarge", + "c6i.32xlarge", + "c6i.4xlarge", + "c6i.8xlarge", + "c6i.large", + "c6i.metal", + "c6i.xlarge", + "c6id.12xlarge", + "c6id.16xlarge", + "c6id.24xlarge", + "c6id.2xlarge", + "c6id.32xlarge", + "c6id.4xlarge", + "c6id.8xlarge", + "c6id.large", + "c6id.metal", + "c6id.xlarge", + "c6in.12xlarge", + "c6in.16xlarge", + "c6in.24xlarge", + "c6in.2xlarge", + "c6in.32xlarge", + "c6in.4xlarge", + "c6in.8xlarge", + "c6in.large", + "c6in.metal", + "c6in.xlarge", + "c7g.12xlarge", + "c7g.16xlarge", + "c7g.2xlarge", + "c7g.4xlarge", + "c7g.8xlarge", + "c7g.large", + "c7g.medium", + "c7g.metal", + "c7g.xlarge", + "c7gd.12xlarge", + "c7gd.16xlarge", + "c7gd.2xlarge", + "c7gd.4xlarge", + "c7gd.8xlarge", + "c7gd.large", + "c7gd.medium", + "c7gd.xlarge", + "c7i.12xlarge", + "c7i.16xlarge", + "c7i.24xlarge", + "c7i.2xlarge", + "c7i.48xlarge", + "c7i.4xlarge", + "c7i.8xlarge", + "c7i.large", + "c7i.metal-24xl", + "c7i.metal-48xl", + "c7i.xlarge", + "d2.2xlarge", + "d2.4xlarge", + "d2.8xlarge", + "d2.xlarge", + "d3.2xlarge", + "d3.4xlarge", + "d3.8xlarge", + "d3.xlarge", + "f1.16xlarge", + "f1.2xlarge", + "f1.4xlarge", + "g2.2xlarge", + "g2.8xlarge", + "g3.16xlarge", + "g3.4xlarge", + "g3.8xlarge", + "g3s.xlarge", + "g4dn.12xlarge", + "g4dn.16xlarge", + "g4dn.2xlarge", + "g4dn.4xlarge", + "g4dn.8xlarge", + "g4dn.metal", + "g4dn.xlarge", + "g5.12xlarge", + "g5.16xlarge", + "g5.24xlarge", + "g5.2xlarge", + "g5.48xlarge", + "g5.4xlarge", + "g5.8xlarge", + "g5.xlarge", + "hpc6a.48xlarge", + "i2.2xlarge", + "i2.4xlarge", + "i2.8xlarge", + "i2.xlarge", + "i3.16xlarge", + "i3.2xlarge", + "i3.4xlarge", + "i3.8xlarge", + "i3.large", + "i3.metal", + "i3.xlarge", + "i3en.12xlarge", + "i3en.24xlarge", + "i3en.2xlarge", + "i3en.3xlarge", + "i3en.6xlarge", + "i3en.large", + "i3en.metal", + "i3en.xlarge", + "i4i.12xlarge", + "i4i.16xlarge", + "i4i.24xlarge", + "i4i.2xlarge", + "i4i.32xlarge", + "i4i.4xlarge", + "i4i.8xlarge", + "i4i.large", + "i4i.metal", + "i4i.xlarge", + "im4gn.16xlarge", + "im4gn.2xlarge", + "im4gn.4xlarge", + "im4gn.8xlarge", + "im4gn.large", + "im4gn.xlarge", + "inf1.24xlarge", + "inf1.2xlarge", + "inf1.6xlarge", + "inf1.xlarge", + "is4gen.2xlarge", + "is4gen.4xlarge", + "is4gen.8xlarge", + "is4gen.large", + "is4gen.medium", + "is4gen.xlarge", + "m1.large", + "m1.medium", + "m1.xlarge", + "m2.2xlarge", + "m2.4xlarge", + "m2.xlarge", + "m3.2xlarge", + "m3.large", + "m3.medium", + "m3.xlarge", + "m4.10xlarge", + "m4.16xlarge", + "m4.2xlarge", + "m4.4xlarge", + "m4.large", + "m4.xlarge", + "m5.12xlarge", + "m5.16xlarge", + "m5.24xlarge", + "m5.2xlarge", + "m5.4xlarge", + "m5.8xlarge", + "m5.large", + "m5.metal", + "m5.xlarge", + "m5a.12xlarge", + "m5a.16xlarge", + "m5a.24xlarge", + "m5a.2xlarge", + "m5a.4xlarge", + "m5a.8xlarge", + "m5a.large", + "m5a.xlarge", + "m5ad.12xlarge", + "m5ad.16xlarge", + "m5ad.24xlarge", + "m5ad.2xlarge", + "m5ad.4xlarge", + "m5ad.8xlarge", + "m5ad.large", + "m5ad.xlarge", + "m5d.12xlarge", + "m5d.16xlarge", + "m5d.24xlarge", + "m5d.2xlarge", + "m5d.4xlarge", + "m5d.8xlarge", + "m5d.large", + "m5d.metal", + "m5d.xlarge", + "m5zn.12xlarge", + "m5zn.2xlarge", + "m5zn.3xlarge", + "m5zn.6xlarge", + "m5zn.large", + "m5zn.metal", + "m5zn.xlarge", + "m6a.12xlarge", + "m6a.16xlarge", + "m6a.24xlarge", + "m6a.2xlarge", + "m6a.32xlarge", + "m6a.48xlarge", + "m6a.4xlarge", + "m6a.8xlarge", + "m6a.large", + "m6a.metal", + "m6a.xlarge", + "m6g.12xlarge", + "m6g.16xlarge", + "m6g.2xlarge", + "m6g.4xlarge", + "m6g.8xlarge", + "m6g.large", + "m6g.medium", + "m6g.metal", + "m6g.xlarge", + "m6gd.12xlarge", + "m6gd.16xlarge", + "m6gd.2xlarge", + "m6gd.4xlarge", + "m6gd.8xlarge", + "m6gd.large", + "m6gd.medium", + "m6gd.metal", + "m6gd.xlarge", + "m6i.12xlarge", + "m6i.16xlarge", + "m6i.24xlarge", + "m6i.2xlarge", + "m6i.32xlarge", + "m6i.4xlarge", + "m6i.8xlarge", + "m6i.large", + "m6i.metal", + "m6i.xlarge", + "m6id.12xlarge", + "m6id.16xlarge", + "m6id.24xlarge", + "m6id.2xlarge", + "m6id.32xlarge", + "m6id.4xlarge", + "m6id.8xlarge", + "m6id.large", + "m6id.metal", + "m6id.xlarge", + "m6idn.12xlarge", + "m6idn.16xlarge", + "m6idn.24xlarge", + "m6idn.2xlarge", + "m6idn.32xlarge", + "m6idn.4xlarge", + "m6idn.8xlarge", + "m6idn.large", + "m6idn.metal", + "m6idn.xlarge", + "m6in.12xlarge", + "m6in.16xlarge", + "m6in.24xlarge", + "m6in.2xlarge", + "m6in.32xlarge", + "m6in.4xlarge", + "m6in.8xlarge", + "m6in.large", + "m6in.metal", + "m6in.xlarge", + "m7g.12xlarge", + "m7g.16xlarge", + "m7g.2xlarge", + "m7g.4xlarge", + "m7g.8xlarge", + "m7g.large", + "m7g.medium", + "m7g.metal", + "m7g.xlarge", + "m7gd.12xlarge", + "m7gd.16xlarge", + "m7gd.2xlarge", + "m7gd.4xlarge", + "m7gd.8xlarge", + "m7gd.large", + "m7gd.medium", + "m7gd.xlarge", + "m7i-flex.2xlarge", + "m7i-flex.4xlarge", + "m7i-flex.8xlarge", + "m7i-flex.large", + "m7i-flex.xlarge", + "m7i.12xlarge", + "m7i.16xlarge", + "m7i.24xlarge", + "m7i.2xlarge", + "m7i.48xlarge", + "m7i.4xlarge", + "m7i.8xlarge", + "m7i.large", + "m7i.metal-24xl", + "m7i.metal-48xl", + "m7i.xlarge", + "p2.16xlarge", + "p2.8xlarge", + "p2.xlarge", + "p3.16xlarge", + "p3.2xlarge", + "p3.8xlarge", + "r3.2xlarge", + "r3.4xlarge", + "r3.8xlarge", + "r3.large", + "r3.xlarge", + "r4.16xlarge", + "r4.2xlarge", + "r4.4xlarge", + "r4.8xlarge", + "r4.large", + "r4.xlarge", + "r5.12xlarge", + "r5.16xlarge", + "r5.24xlarge", + "r5.2xlarge", + "r5.4xlarge", + "r5.8xlarge", + "r5.large", + "r5.metal", + "r5.xlarge", + "r5a.12xlarge", + "r5a.16xlarge", + "r5a.24xlarge", + "r5a.2xlarge", + "r5a.4xlarge", + "r5a.8xlarge", + "r5a.large", + "r5a.xlarge", + "r5ad.12xlarge", + "r5ad.16xlarge", + "r5ad.24xlarge", + "r5ad.2xlarge", + "r5ad.4xlarge", + "r5ad.8xlarge", + "r5ad.large", + "r5ad.xlarge", + "r5b.12xlarge", + "r5b.16xlarge", + "r5b.24xlarge", + "r5b.2xlarge", + "r5b.4xlarge", + "r5b.8xlarge", + "r5b.large", + "r5b.metal", + "r5b.xlarge", + "r5d.12xlarge", + "r5d.16xlarge", + "r5d.24xlarge", + "r5d.2xlarge", + "r5d.4xlarge", + "r5d.8xlarge", + "r5d.large", + "r5d.metal", + "r5d.xlarge", + "r5dn.12xlarge", + "r5dn.16xlarge", + "r5dn.24xlarge", + "r5dn.2xlarge", + "r5dn.4xlarge", + "r5dn.8xlarge", + "r5dn.large", + "r5dn.metal", + "r5dn.xlarge", + "r5n.12xlarge", + "r5n.16xlarge", + "r5n.24xlarge", + "r5n.2xlarge", + "r5n.4xlarge", + "r5n.8xlarge", + "r5n.large", + "r5n.metal", + "r5n.xlarge", + "r6a.12xlarge", + "r6a.16xlarge", + "r6a.24xlarge", + "r6a.2xlarge", + "r6a.32xlarge", + "r6a.48xlarge", + "r6a.4xlarge", + "r6a.8xlarge", + "r6a.large", + "r6a.metal", + "r6a.xlarge", + "r6g.12xlarge", + "r6g.16xlarge", + "r6g.2xlarge", + "r6g.4xlarge", + "r6g.8xlarge", + "r6g.large", + "r6g.medium", + "r6g.metal", + "r6g.xlarge", + "r6gd.12xlarge", + "r6gd.16xlarge", + "r6gd.2xlarge", + "r6gd.4xlarge", + "r6gd.8xlarge", + "r6gd.large", + "r6gd.medium", + "r6gd.metal", + "r6gd.xlarge", + "r6i.12xlarge", + "r6i.16xlarge", + "r6i.24xlarge", + "r6i.2xlarge", + "r6i.32xlarge", + "r6i.4xlarge", + "r6i.8xlarge", + "r6i.large", + "r6i.metal", + "r6i.xlarge", + "r6id.12xlarge", + "r6id.16xlarge", + "r6id.24xlarge", + "r6id.2xlarge", + "r6id.32xlarge", + "r6id.4xlarge", + "r6id.8xlarge", + "r6id.large", + "r6id.metal", + "r6id.xlarge", + "r7g.12xlarge", + "r7g.16xlarge", + "r7g.2xlarge", + "r7g.4xlarge", + "r7g.8xlarge", + "r7g.large", + "r7g.medium", + "r7g.metal", + "r7g.xlarge", + "r7gd.12xlarge", + "r7gd.16xlarge", + "r7gd.2xlarge", + "r7gd.4xlarge", + "r7gd.8xlarge", + "r7gd.large", + "r7gd.medium", + "r7gd.xlarge", + "r7i.12xlarge", + "r7i.16xlarge", + "r7i.24xlarge", + "r7i.2xlarge", + "r7i.48xlarge", + "r7i.4xlarge", + "r7i.8xlarge", + "r7i.large", + "r7i.metal-24xl", + "r7i.metal-48xl", + "r7i.xlarge", + "t1.micro", + "t2.2xlarge", + "t2.large", + "t2.medium", + "t2.micro", + "t2.nano", + "t2.small", + "t2.xlarge", + "t3.2xlarge", + "t3.large", + "t3.medium", + "t3.micro", + "t3.nano", + "t3.small", + "t3.xlarge", + "t3a.2xlarge", + "t3a.large", + "t3a.medium", + "t3a.micro", + "t3a.nano", + "t3a.small", + "t3a.xlarge", + "t4g.2xlarge", + "t4g.large", + "t4g.medium", + "t4g.micro", + "t4g.nano", + "t4g.small", + "t4g.xlarge", + "u-12tb1.112xlarge", + "u-3tb1.56xlarge", + "u-6tb1.112xlarge", + "u-6tb1.56xlarge", + "x1.16xlarge", + "x1.32xlarge", + "x1e.16xlarge", + "x1e.2xlarge", + "x1e.32xlarge", + "x1e.4xlarge", + "x1e.8xlarge", + "x1e.xlarge", + "x2idn.16xlarge", + "x2idn.24xlarge", + "x2idn.32xlarge", + "x2idn.metal", + "x2iedn.16xlarge", + "x2iedn.24xlarge", + "x2iedn.2xlarge", + "x2iedn.32xlarge", + "x2iedn.4xlarge", + "x2iedn.8xlarge", + "x2iedn.metal", + "x2iedn.xlarge", + "z1d.12xlarge", + "z1d.2xlarge", + "z1d.3xlarge", + "z1d.6xlarge", + "z1d.large", + "z1d.metal", + "z1d.xlarge", + ], + "ap-south-1": [ + "a1.2xlarge", + "a1.4xlarge", + "a1.large", + "a1.medium", + "a1.metal", + "a1.xlarge", + "c4.2xlarge", + "c4.4xlarge", + "c4.8xlarge", + "c4.large", + "c4.xlarge", + "c5.12xlarge", + "c5.18xlarge", + "c5.24xlarge", + "c5.2xlarge", + "c5.4xlarge", + "c5.9xlarge", + "c5.large", + "c5.metal", + "c5.xlarge", + "c5a.12xlarge", + "c5a.16xlarge", + "c5a.24xlarge", + "c5a.2xlarge", + "c5a.4xlarge", + "c5a.8xlarge", + "c5a.large", + "c5a.xlarge", + "c5d.12xlarge", + "c5d.18xlarge", + "c5d.24xlarge", + "c5d.2xlarge", + "c5d.4xlarge", + "c5d.9xlarge", + "c5d.large", + "c5d.metal", + "c5d.xlarge", + "c5n.18xlarge", + "c5n.2xlarge", + "c5n.4xlarge", + "c5n.9xlarge", + "c5n.large", + "c5n.metal", + "c5n.xlarge", + "c6a.12xlarge", + "c6a.16xlarge", + "c6a.24xlarge", + "c6a.2xlarge", + "c6a.32xlarge", + "c6a.48xlarge", + "c6a.4xlarge", + "c6a.8xlarge", + "c6a.large", + "c6a.metal", + "c6a.xlarge", + "c6g.12xlarge", + "c6g.16xlarge", + "c6g.2xlarge", + "c6g.4xlarge", + "c6g.8xlarge", + "c6g.large", + "c6g.medium", + "c6g.metal", + "c6g.xlarge", + "c6gd.12xlarge", + "c6gd.16xlarge", + "c6gd.2xlarge", + "c6gd.4xlarge", + "c6gd.8xlarge", + "c6gd.large", + "c6gd.medium", + "c6gd.metal", + "c6gd.xlarge", + "c6gn.12xlarge", + "c6gn.16xlarge", + "c6gn.2xlarge", + "c6gn.4xlarge", + "c6gn.8xlarge", + "c6gn.large", + "c6gn.medium", + "c6gn.xlarge", + "c6i.12xlarge", + "c6i.16xlarge", + "c6i.24xlarge", + "c6i.2xlarge", + "c6i.32xlarge", + "c6i.4xlarge", + "c6i.8xlarge", + "c6i.large", + "c6i.metal", + "c6i.xlarge", + "c6in.12xlarge", + "c6in.16xlarge", + "c6in.24xlarge", + "c6in.2xlarge", + "c6in.32xlarge", + "c6in.4xlarge", + "c6in.8xlarge", + "c6in.large", + "c6in.metal", + "c6in.xlarge", + "c7g.12xlarge", + "c7g.16xlarge", + "c7g.2xlarge", + "c7g.4xlarge", + "c7g.8xlarge", + "c7g.large", + "c7g.medium", + "c7g.metal", + "c7g.xlarge", + "c7gd.12xlarge", + "c7gd.16xlarge", + "c7gd.2xlarge", + "c7gd.4xlarge", + "c7gd.8xlarge", + "c7gd.large", + "c7gd.medium", + "c7gd.xlarge", + "c7i.12xlarge", + "c7i.16xlarge", + "c7i.24xlarge", + "c7i.2xlarge", + "c7i.48xlarge", + "c7i.4xlarge", + "c7i.8xlarge", + "c7i.large", + "c7i.metal-24xl", + "c7i.metal-48xl", + "c7i.xlarge", + "d2.2xlarge", + "d2.4xlarge", + "d2.8xlarge", + "d2.xlarge", + "d3.2xlarge", + "d3.4xlarge", + "d3.8xlarge", + "d3.xlarge", + "g4dn.12xlarge", + "g4dn.16xlarge", + "g4dn.2xlarge", + "g4dn.4xlarge", + "g4dn.8xlarge", + "g4dn.metal", + "g4dn.xlarge", + "g5.12xlarge", + "g5.16xlarge", + "g5.24xlarge", + "g5.2xlarge", + "g5.48xlarge", + "g5.4xlarge", + "g5.8xlarge", + "g5.xlarge", + "i2.2xlarge", + "i2.4xlarge", + "i2.8xlarge", + "i2.xlarge", + "i3.16xlarge", + "i3.2xlarge", + "i3.4xlarge", + "i3.8xlarge", + "i3.large", + "i3.metal", + "i3.xlarge", + "i3en.12xlarge", + "i3en.24xlarge", + "i3en.2xlarge", + "i3en.3xlarge", + "i3en.6xlarge", + "i3en.large", + "i3en.metal", + "i3en.xlarge", + "i4i.12xlarge", + "i4i.16xlarge", + "i4i.24xlarge", + "i4i.2xlarge", + "i4i.32xlarge", + "i4i.4xlarge", + "i4i.8xlarge", + "i4i.large", + "i4i.metal", + "i4i.xlarge", + "inf1.24xlarge", + "inf1.2xlarge", + "inf1.6xlarge", + "inf1.xlarge", + "inf2.24xlarge", + "inf2.48xlarge", + "inf2.8xlarge", + "inf2.xlarge", + "is4gen.2xlarge", + "is4gen.4xlarge", + "is4gen.8xlarge", + "is4gen.large", + "is4gen.medium", + "is4gen.xlarge", + "m4.10xlarge", + "m4.16xlarge", + "m4.2xlarge", + "m4.4xlarge", + "m4.large", + "m4.xlarge", + "m5.12xlarge", + "m5.16xlarge", + "m5.24xlarge", + "m5.2xlarge", + "m5.4xlarge", + "m5.8xlarge", + "m5.large", + "m5.metal", + "m5.xlarge", + "m5a.12xlarge", + "m5a.16xlarge", + "m5a.24xlarge", + "m5a.2xlarge", + "m5a.4xlarge", + "m5a.8xlarge", + "m5a.large", + "m5a.xlarge", + "m5ad.12xlarge", + "m5ad.16xlarge", + "m5ad.24xlarge", + "m5ad.2xlarge", + "m5ad.4xlarge", + "m5ad.8xlarge", + "m5ad.large", + "m5ad.xlarge", + "m5d.12xlarge", + "m5d.16xlarge", + "m5d.24xlarge", + "m5d.2xlarge", + "m5d.4xlarge", + "m5d.8xlarge", + "m5d.large", + "m5d.metal", + "m5d.xlarge", + "m6a.12xlarge", + "m6a.16xlarge", + "m6a.24xlarge", + "m6a.2xlarge", + "m6a.32xlarge", + "m6a.48xlarge", + "m6a.4xlarge", + "m6a.8xlarge", + "m6a.large", + "m6a.metal", + "m6a.xlarge", + "m6g.12xlarge", + "m6g.16xlarge", + "m6g.2xlarge", + "m6g.4xlarge", + "m6g.8xlarge", + "m6g.large", + "m6g.medium", + "m6g.metal", + "m6g.xlarge", + "m6gd.12xlarge", + "m6gd.16xlarge", + "m6gd.2xlarge", + "m6gd.4xlarge", + "m6gd.8xlarge", + "m6gd.large", + "m6gd.medium", + "m6gd.metal", + "m6gd.xlarge", + "m6i.12xlarge", + "m6i.16xlarge", + "m6i.24xlarge", + "m6i.2xlarge", + "m6i.32xlarge", + "m6i.4xlarge", + "m6i.8xlarge", + "m6i.large", + "m6i.metal", + "m6i.xlarge", + "m6id.12xlarge", + "m6id.16xlarge", + "m6id.24xlarge", + "m6id.2xlarge", + "m6id.32xlarge", + "m6id.4xlarge", + "m6id.8xlarge", + "m6id.large", + "m6id.metal", + "m6id.xlarge", + "m7g.12xlarge", + "m7g.16xlarge", + "m7g.2xlarge", + "m7g.4xlarge", + "m7g.8xlarge", + "m7g.large", + "m7g.medium", + "m7g.metal", + "m7g.xlarge", + "m7gd.12xlarge", + "m7gd.16xlarge", + "m7gd.2xlarge", + "m7gd.4xlarge", + "m7gd.8xlarge", + "m7gd.large", + "m7gd.medium", + "m7gd.xlarge", + "m7i-flex.2xlarge", + "m7i-flex.4xlarge", + "m7i-flex.8xlarge", + "m7i-flex.large", + "m7i-flex.xlarge", + "m7i.12xlarge", + "m7i.16xlarge", + "m7i.24xlarge", + "m7i.2xlarge", + "m7i.48xlarge", + "m7i.4xlarge", + "m7i.8xlarge", + "m7i.large", + "m7i.metal-24xl", + "m7i.metal-48xl", + "m7i.xlarge", + "p2.16xlarge", + "p2.8xlarge", + "p2.xlarge", + "r3.2xlarge", + "r3.4xlarge", + "r3.8xlarge", + "r3.large", + "r3.xlarge", + "r4.16xlarge", + "r4.2xlarge", + "r4.4xlarge", + "r4.8xlarge", + "r4.large", + "r4.xlarge", + "r5.12xlarge", + "r5.16xlarge", + "r5.24xlarge", + "r5.2xlarge", + "r5.4xlarge", + "r5.8xlarge", + "r5.large", + "r5.metal", + "r5.xlarge", + "r5a.12xlarge", + "r5a.16xlarge", + "r5a.24xlarge", + "r5a.2xlarge", + "r5a.4xlarge", + "r5a.8xlarge", + "r5a.large", + "r5a.xlarge", + "r5ad.12xlarge", + "r5ad.16xlarge", + "r5ad.24xlarge", + "r5ad.2xlarge", + "r5ad.4xlarge", + "r5ad.8xlarge", + "r5ad.large", + "r5ad.xlarge", + "r5d.12xlarge", + "r5d.16xlarge", + "r5d.24xlarge", + "r5d.2xlarge", + "r5d.4xlarge", + "r5d.8xlarge", + "r5d.large", + "r5d.metal", + "r5d.xlarge", + "r5n.12xlarge", + "r5n.16xlarge", + "r5n.24xlarge", + "r5n.2xlarge", + "r5n.4xlarge", + "r5n.8xlarge", + "r5n.large", + "r5n.metal", + "r5n.xlarge", + "r6a.12xlarge", + "r6a.16xlarge", + "r6a.24xlarge", + "r6a.2xlarge", + "r6a.32xlarge", + "r6a.48xlarge", + "r6a.4xlarge", + "r6a.8xlarge", + "r6a.large", + "r6a.metal", + "r6a.xlarge", + "r6g.12xlarge", + "r6g.16xlarge", + "r6g.2xlarge", + "r6g.4xlarge", + "r6g.8xlarge", + "r6g.large", + "r6g.medium", + "r6g.metal", + "r6g.xlarge", + "r6gd.12xlarge", + "r6gd.16xlarge", + "r6gd.2xlarge", + "r6gd.4xlarge", + "r6gd.8xlarge", + "r6gd.large", + "r6gd.medium", + "r6gd.metal", + "r6gd.xlarge", + "r6i.12xlarge", + "r6i.16xlarge", + "r6i.24xlarge", + "r6i.2xlarge", + "r6i.32xlarge", + "r6i.4xlarge", + "r6i.8xlarge", + "r6i.large", + "r6i.metal", + "r6i.xlarge", + "r6id.12xlarge", + "r6id.16xlarge", + "r6id.24xlarge", + "r6id.2xlarge", + "r6id.32xlarge", + "r6id.4xlarge", + "r6id.8xlarge", + "r6id.large", + "r6id.metal", + "r6id.xlarge", + "r7g.12xlarge", + "r7g.16xlarge", + "r7g.2xlarge", + "r7g.4xlarge", + "r7g.8xlarge", + "r7g.large", + "r7g.medium", + "r7g.metal", + "r7g.xlarge", + "r7gd.12xlarge", + "r7gd.16xlarge", + "r7gd.2xlarge", + "r7gd.4xlarge", + "r7gd.8xlarge", + "r7gd.large", + "r7gd.medium", + "r7gd.xlarge", + "r7i.12xlarge", + "r7i.16xlarge", + "r7i.24xlarge", + "r7i.2xlarge", + "r7i.48xlarge", + "r7i.4xlarge", + "r7i.8xlarge", + "r7i.large", + "r7i.metal-24xl", + "r7i.metal-48xl", + "r7i.xlarge", + "t2.2xlarge", + "t2.large", + "t2.medium", + "t2.micro", + "t2.nano", + "t2.small", + "t2.xlarge", + "t3.2xlarge", + "t3.large", + "t3.medium", + "t3.micro", + "t3.nano", + "t3.small", + "t3.xlarge", + "t3a.2xlarge", + "t3a.large", + "t3a.medium", + "t3a.micro", + "t3a.nano", + "t3a.small", + "t3a.xlarge", + "t4g.2xlarge", + "t4g.large", + "t4g.medium", + "t4g.micro", + "t4g.nano", + "t4g.small", + "t4g.xlarge", + "u-12tb1.112xlarge", + "u-6tb1.112xlarge", + "u-6tb1.56xlarge", + "x1.16xlarge", + "x1.32xlarge", + "x1e.16xlarge", + "x1e.2xlarge", + "x1e.32xlarge", + "x1e.4xlarge", + "x1e.8xlarge", + "x1e.xlarge", + "x2idn.16xlarge", + "x2idn.24xlarge", + "x2idn.32xlarge", + "x2idn.metal", + "x2iedn.16xlarge", + "x2iedn.24xlarge", + "x2iedn.2xlarge", + "x2iedn.32xlarge", + "x2iedn.4xlarge", + "x2iedn.8xlarge", + "x2iedn.metal", + "x2iedn.xlarge", + "z1d.12xlarge", + "z1d.2xlarge", + "z1d.3xlarge", + "z1d.6xlarge", + "z1d.large", + "z1d.metal", + "z1d.xlarge", + ], + "eu-west-1": [ + "a1.2xlarge", + "a1.4xlarge", + "a1.large", + "a1.medium", + "a1.metal", + "a1.xlarge", + "c1.medium", + "c1.xlarge", + "c3.2xlarge", + "c3.4xlarge", + "c3.8xlarge", + "c3.large", + "c3.xlarge", + "c4.2xlarge", + "c4.4xlarge", + "c4.8xlarge", + "c4.large", + "c4.xlarge", + "c5.12xlarge", + "c5.18xlarge", + "c5.24xlarge", + "c5.2xlarge", + "c5.4xlarge", + "c5.9xlarge", + "c5.large", + "c5.metal", + "c5.xlarge", + "c5a.12xlarge", + "c5a.16xlarge", + "c5a.24xlarge", + "c5a.2xlarge", + "c5a.4xlarge", + "c5a.8xlarge", + "c5a.large", + "c5a.xlarge", + "c5ad.12xlarge", + "c5ad.16xlarge", + "c5ad.24xlarge", + "c5ad.2xlarge", + "c5ad.4xlarge", + "c5ad.8xlarge", + "c5ad.large", + "c5ad.xlarge", + "c5d.12xlarge", + "c5d.18xlarge", + "c5d.24xlarge", + "c5d.2xlarge", + "c5d.4xlarge", + "c5d.9xlarge", + "c5d.large", + "c5d.metal", + "c5d.xlarge", + "c5n.18xlarge", + "c5n.2xlarge", + "c5n.4xlarge", + "c5n.9xlarge", + "c5n.large", + "c5n.metal", + "c5n.xlarge", + "c6a.12xlarge", + "c6a.16xlarge", + "c6a.24xlarge", + "c6a.2xlarge", + "c6a.32xlarge", + "c6a.48xlarge", + "c6a.4xlarge", + "c6a.8xlarge", + "c6a.large", + "c6a.metal", + "c6a.xlarge", + "c6g.12xlarge", + "c6g.16xlarge", + "c6g.2xlarge", + "c6g.4xlarge", + "c6g.8xlarge", + "c6g.large", + "c6g.medium", + "c6g.metal", + "c6g.xlarge", + "c6gd.12xlarge", + "c6gd.16xlarge", + "c6gd.2xlarge", + "c6gd.4xlarge", + "c6gd.8xlarge", + "c6gd.large", + "c6gd.medium", + "c6gd.metal", + "c6gd.xlarge", + "c6gn.12xlarge", + "c6gn.16xlarge", + "c6gn.2xlarge", + "c6gn.4xlarge", + "c6gn.8xlarge", + "c6gn.large", + "c6gn.medium", + "c6gn.xlarge", + "c6i.12xlarge", + "c6i.16xlarge", + "c6i.24xlarge", + "c6i.2xlarge", + "c6i.32xlarge", + "c6i.4xlarge", + "c6i.8xlarge", + "c6i.large", + "c6i.metal", + "c6i.xlarge", + "c6id.12xlarge", + "c6id.16xlarge", + "c6id.24xlarge", + "c6id.2xlarge", + "c6id.32xlarge", + "c6id.4xlarge", + "c6id.8xlarge", + "c6id.large", + "c6id.metal", + "c6id.xlarge", + "c6in.12xlarge", + "c6in.16xlarge", + "c6in.24xlarge", + "c6in.2xlarge", + "c6in.32xlarge", + "c6in.4xlarge", + "c6in.8xlarge", + "c6in.large", + "c6in.metal", + "c6in.xlarge", + "c7a.12xlarge", + "c7a.16xlarge", + "c7a.24xlarge", + "c7a.2xlarge", + "c7a.32xlarge", + "c7a.48xlarge", + "c7a.4xlarge", + "c7a.8xlarge", + "c7a.large", + "c7a.medium", + "c7a.metal-48xl", + "c7a.xlarge", + "c7g.12xlarge", + "c7g.16xlarge", + "c7g.2xlarge", + "c7g.4xlarge", + "c7g.8xlarge", + "c7g.large", + "c7g.medium", + "c7g.metal", + "c7g.xlarge", + "c7gd.12xlarge", + "c7gd.16xlarge", + "c7gd.2xlarge", + "c7gd.4xlarge", + "c7gd.8xlarge", + "c7gd.large", + "c7gd.medium", + "c7gd.xlarge", + "c7gn.12xlarge", + "c7gn.16xlarge", + "c7gn.2xlarge", + "c7gn.4xlarge", + "c7gn.8xlarge", + "c7gn.large", + "c7gn.medium", + "c7gn.xlarge", + "c7i.12xlarge", + "c7i.16xlarge", + "c7i.24xlarge", + "c7i.2xlarge", + "c7i.48xlarge", + "c7i.4xlarge", + "c7i.8xlarge", + "c7i.large", + "c7i.metal-24xl", + "c7i.metal-48xl", + "c7i.xlarge", + "cr1.8xlarge", + "d2.2xlarge", + "d2.4xlarge", + "d2.8xlarge", + "d2.xlarge", + "d3.2xlarge", + "d3.4xlarge", + "d3.8xlarge", + "d3.xlarge", + "d3en.12xlarge", + "d3en.2xlarge", + "d3en.4xlarge", + "d3en.6xlarge", + "d3en.8xlarge", + "d3en.xlarge", + "f1.16xlarge", + "f1.2xlarge", + "f1.4xlarge", + "g2.2xlarge", + "g2.8xlarge", + "g3.16xlarge", + "g3.4xlarge", + "g3.8xlarge", + "g3s.xlarge", + "g4ad.16xlarge", + "g4ad.2xlarge", + "g4ad.4xlarge", + "g4ad.8xlarge", + "g4ad.xlarge", + "g4dn.12xlarge", + "g4dn.16xlarge", + "g4dn.2xlarge", + "g4dn.4xlarge", + "g4dn.8xlarge", + "g4dn.metal", + "g4dn.xlarge", + "g5.12xlarge", + "g5.16xlarge", + "g5.24xlarge", + "g5.2xlarge", + "g5.48xlarge", + "g5.4xlarge", + "g5.8xlarge", + "g5.xlarge", + "h1.16xlarge", + "h1.2xlarge", + "h1.4xlarge", + "h1.8xlarge", + "hpc7a.12xlarge", + "hpc7a.24xlarge", + "hpc7a.48xlarge", + "hpc7a.96xlarge", + "hpc7g.16xlarge", + "hpc7g.4xlarge", + "hpc7g.8xlarge", + "i2.2xlarge", + "i2.4xlarge", + "i2.8xlarge", + "i2.xlarge", + "i3.16xlarge", + "i3.2xlarge", + "i3.4xlarge", + "i3.8xlarge", + "i3.large", + "i3.metal", + "i3.xlarge", + "i3en.12xlarge", + "i3en.24xlarge", + "i3en.2xlarge", + "i3en.3xlarge", + "i3en.6xlarge", + "i3en.large", + "i3en.metal", + "i3en.xlarge", + "i4g.16xlarge", + "i4g.2xlarge", + "i4g.4xlarge", + "i4g.8xlarge", + "i4g.large", + "i4g.xlarge", + "i4i.12xlarge", + "i4i.16xlarge", + "i4i.24xlarge", + "i4i.2xlarge", + "i4i.32xlarge", + "i4i.4xlarge", + "i4i.8xlarge", + "i4i.large", + "i4i.metal", + "i4i.xlarge", + "im4gn.16xlarge", + "im4gn.2xlarge", + "im4gn.4xlarge", + "im4gn.8xlarge", + "im4gn.large", + "im4gn.xlarge", + "inf1.24xlarge", + "inf1.2xlarge", + "inf1.6xlarge", + "inf1.xlarge", + "inf2.24xlarge", + "inf2.48xlarge", + "inf2.8xlarge", + "inf2.xlarge", + "is4gen.2xlarge", + "is4gen.4xlarge", + "is4gen.8xlarge", + "is4gen.large", + "is4gen.medium", + "is4gen.xlarge", + "m1.large", + "m1.medium", + "m1.xlarge", + "m2.2xlarge", + "m2.4xlarge", + "m2.xlarge", + "m3.2xlarge", + "m3.large", + "m3.medium", + "m3.xlarge", + "m4.10xlarge", + "m4.16xlarge", + "m4.2xlarge", + "m4.4xlarge", + "m4.large", + "m4.xlarge", + "m5.12xlarge", + "m5.16xlarge", + "m5.24xlarge", + "m5.2xlarge", + "m5.4xlarge", + "m5.8xlarge", + "m5.large", + "m5.metal", + "m5.xlarge", + "m5a.12xlarge", + "m5a.16xlarge", + "m5a.24xlarge", + "m5a.2xlarge", + "m5a.4xlarge", + "m5a.8xlarge", + "m5a.large", + "m5a.xlarge", + "m5ad.12xlarge", + "m5ad.16xlarge", + "m5ad.24xlarge", + "m5ad.2xlarge", + "m5ad.4xlarge", + "m5ad.8xlarge", + "m5ad.large", + "m5ad.xlarge", + "m5d.12xlarge", + "m5d.16xlarge", + "m5d.24xlarge", + "m5d.2xlarge", + "m5d.4xlarge", + "m5d.8xlarge", + "m5d.large", + "m5d.metal", + "m5d.xlarge", + "m5dn.12xlarge", + "m5dn.16xlarge", + "m5dn.24xlarge", + "m5dn.2xlarge", + "m5dn.4xlarge", + "m5dn.8xlarge", + "m5dn.large", + "m5dn.metal", + "m5dn.xlarge", + "m5n.12xlarge", + "m5n.16xlarge", + "m5n.24xlarge", + "m5n.2xlarge", + "m5n.4xlarge", + "m5n.8xlarge", + "m5n.large", + "m5n.xlarge", + "m5zn.12xlarge", + "m5zn.2xlarge", + "m5zn.3xlarge", + "m5zn.6xlarge", + "m5zn.large", + "m5zn.metal", + "m5zn.xlarge", + "m6a.12xlarge", + "m6a.16xlarge", + "m6a.24xlarge", + "m6a.2xlarge", + "m6a.32xlarge", + "m6a.48xlarge", + "m6a.4xlarge", + "m6a.8xlarge", + "m6a.large", + "m6a.metal", + "m6a.xlarge", + "m6g.12xlarge", + "m6g.16xlarge", + "m6g.2xlarge", + "m6g.4xlarge", + "m6g.8xlarge", + "m6g.large", + "m6g.medium", + "m6g.metal", + "m6g.xlarge", + "m6gd.12xlarge", + "m6gd.16xlarge", + "m6gd.2xlarge", + "m6gd.4xlarge", + "m6gd.8xlarge", + "m6gd.large", + "m6gd.medium", + "m6gd.metal", + "m6gd.xlarge", + "m6i.12xlarge", + "m6i.16xlarge", + "m6i.24xlarge", + "m6i.2xlarge", + "m6i.32xlarge", + "m6i.4xlarge", + "m6i.8xlarge", + "m6i.large", + "m6i.metal", + "m6i.xlarge", + "m6id.12xlarge", + "m6id.16xlarge", + "m6id.24xlarge", + "m6id.2xlarge", + "m6id.32xlarge", + "m6id.4xlarge", + "m6id.8xlarge", + "m6id.large", + "m6id.metal", + "m6id.xlarge", + "m6idn.12xlarge", + "m6idn.16xlarge", + "m6idn.24xlarge", + "m6idn.2xlarge", + "m6idn.32xlarge", + "m6idn.4xlarge", + "m6idn.8xlarge", + "m6idn.large", + "m6idn.metal", + "m6idn.xlarge", + "m6in.12xlarge", + "m6in.16xlarge", + "m6in.24xlarge", + "m6in.2xlarge", + "m6in.32xlarge", + "m6in.4xlarge", + "m6in.8xlarge", + "m6in.large", + "m6in.metal", + "m6in.xlarge", + "m7a.12xlarge", + "m7a.16xlarge", + "m7a.24xlarge", + "m7a.2xlarge", + "m7a.32xlarge", + "m7a.48xlarge", + "m7a.4xlarge", + "m7a.8xlarge", + "m7a.large", + "m7a.medium", + "m7a.metal-48xl", + "m7a.xlarge", + "m7g.12xlarge", + "m7g.16xlarge", + "m7g.2xlarge", + "m7g.4xlarge", + "m7g.8xlarge", + "m7g.large", + "m7g.medium", + "m7g.metal", + "m7g.xlarge", + "m7gd.12xlarge", + "m7gd.16xlarge", + "m7gd.2xlarge", + "m7gd.4xlarge", + "m7gd.8xlarge", + "m7gd.large", + "m7gd.medium", + "m7gd.xlarge", + "m7i-flex.2xlarge", + "m7i-flex.4xlarge", + "m7i-flex.8xlarge", + "m7i-flex.large", + "m7i-flex.xlarge", + "m7i.12xlarge", + "m7i.16xlarge", + "m7i.24xlarge", + "m7i.2xlarge", + "m7i.48xlarge", + "m7i.4xlarge", + "m7i.8xlarge", + "m7i.large", + "m7i.metal-24xl", + "m7i.metal-48xl", + "m7i.xlarge", + "p2.16xlarge", + "p2.8xlarge", + "p2.xlarge", + "p3.16xlarge", + "p3.2xlarge", + "p3.8xlarge", + "p3dn.24xlarge", + "p4d.24xlarge", + "r3.2xlarge", + "r3.4xlarge", + "r3.8xlarge", + "r3.large", + "r3.xlarge", + "r4.16xlarge", + "r4.2xlarge", + "r4.4xlarge", + "r4.8xlarge", + "r4.large", + "r4.xlarge", + "r5.12xlarge", + "r5.16xlarge", + "r5.24xlarge", + "r5.2xlarge", + "r5.4xlarge", + "r5.8xlarge", + "r5.large", + "r5.metal", + "r5.xlarge", + "r5a.12xlarge", + "r5a.16xlarge", + "r5a.24xlarge", + "r5a.2xlarge", + "r5a.4xlarge", + "r5a.8xlarge", + "r5a.large", + "r5a.xlarge", + "r5ad.12xlarge", + "r5ad.16xlarge", + "r5ad.24xlarge", + "r5ad.2xlarge", + "r5ad.4xlarge", + "r5ad.8xlarge", + "r5ad.large", + "r5ad.xlarge", + "r5b.12xlarge", + "r5b.16xlarge", + "r5b.24xlarge", + "r5b.2xlarge", + "r5b.4xlarge", + "r5b.8xlarge", + "r5b.large", + "r5b.metal", + "r5b.xlarge", + "r5d.12xlarge", + "r5d.16xlarge", + "r5d.24xlarge", + "r5d.2xlarge", + "r5d.4xlarge", + "r5d.8xlarge", + "r5d.large", + "r5d.metal", + "r5d.xlarge", + "r5dn.12xlarge", + "r5dn.16xlarge", + "r5dn.24xlarge", + "r5dn.2xlarge", + "r5dn.4xlarge", + "r5dn.8xlarge", + "r5dn.large", + "r5dn.metal", + "r5dn.xlarge", + "r5n.12xlarge", + "r5n.16xlarge", + "r5n.24xlarge", + "r5n.2xlarge", + "r5n.4xlarge", + "r5n.8xlarge", + "r5n.large", + "r5n.metal", + "r5n.xlarge", + "r6a.12xlarge", + "r6a.16xlarge", + "r6a.24xlarge", + "r6a.2xlarge", + "r6a.32xlarge", + "r6a.48xlarge", + "r6a.4xlarge", + "r6a.8xlarge", + "r6a.large", + "r6a.metal", + "r6a.xlarge", + "r6g.12xlarge", + "r6g.16xlarge", + "r6g.2xlarge", + "r6g.4xlarge", + "r6g.8xlarge", + "r6g.large", + "r6g.medium", + "r6g.metal", + "r6g.xlarge", + "r6gd.12xlarge", + "r6gd.16xlarge", + "r6gd.2xlarge", + "r6gd.4xlarge", + "r6gd.8xlarge", + "r6gd.large", + "r6gd.medium", + "r6gd.metal", + "r6gd.xlarge", + "r6i.12xlarge", + "r6i.16xlarge", + "r6i.24xlarge", + "r6i.2xlarge", + "r6i.32xlarge", + "r6i.4xlarge", + "r6i.8xlarge", + "r6i.large", + "r6i.metal", + "r6i.xlarge", + "r6id.12xlarge", + "r6id.16xlarge", + "r6id.24xlarge", + "r6id.2xlarge", + "r6id.32xlarge", + "r6id.4xlarge", + "r6id.8xlarge", + "r6id.large", + "r6id.metal", + "r6id.xlarge", + "r6idn.12xlarge", + "r6idn.16xlarge", + "r6idn.24xlarge", + "r6idn.2xlarge", + "r6idn.32xlarge", + "r6idn.4xlarge", + "r6idn.8xlarge", + "r6idn.large", + "r6idn.metal", + "r6idn.xlarge", + "r6in.12xlarge", + "r6in.16xlarge", + "r6in.24xlarge", + "r6in.2xlarge", + "r6in.32xlarge", + "r6in.4xlarge", + "r6in.8xlarge", + "r6in.large", + "r6in.metal", + "r6in.xlarge", + "r7a.12xlarge", + "r7a.16xlarge", + "r7a.24xlarge", + "r7a.2xlarge", + "r7a.32xlarge", + "r7a.48xlarge", + "r7a.4xlarge", + "r7a.8xlarge", + "r7a.large", + "r7a.medium", + "r7a.metal-48xl", + "r7a.xlarge", + "r7g.12xlarge", + "r7g.16xlarge", + "r7g.2xlarge", + "r7g.4xlarge", + "r7g.8xlarge", + "r7g.large", + "r7g.medium", + "r7g.metal", + "r7g.xlarge", + "r7gd.12xlarge", + "r7gd.16xlarge", + "r7gd.2xlarge", + "r7gd.4xlarge", + "r7gd.8xlarge", + "r7gd.large", + "r7gd.medium", + "r7gd.xlarge", + "r7i.12xlarge", + "r7i.16xlarge", + "r7i.24xlarge", + "r7i.2xlarge", + "r7i.48xlarge", + "r7i.4xlarge", + "r7i.8xlarge", + "r7i.large", + "r7i.metal-24xl", + "r7i.metal-48xl", + "r7i.xlarge", + "r7iz.12xlarge", + "r7iz.16xlarge", + "r7iz.2xlarge", + "r7iz.32xlarge", + "r7iz.4xlarge", + "r7iz.8xlarge", + "r7iz.large", + "r7iz.metal-16xl", + "r7iz.metal-32xl", + "r7iz.xlarge", + "t1.micro", + "t2.2xlarge", + "t2.large", + "t2.medium", + "t2.micro", + "t2.nano", + "t2.small", + "t2.xlarge", + "t3.2xlarge", + "t3.large", + "t3.medium", + "t3.micro", + "t3.nano", + "t3.small", + "t3.xlarge", + "t3a.2xlarge", + "t3a.large", + "t3a.medium", + "t3a.micro", + "t3a.nano", + "t3a.small", + "t3a.xlarge", + "t4g.2xlarge", + "t4g.large", + "t4g.medium", + "t4g.micro", + "t4g.nano", + "t4g.small", + "t4g.xlarge", + "u-12tb1.112xlarge", + "u-18tb1.112xlarge", + "u-3tb1.56xlarge", + "u-6tb1.112xlarge", + "u-6tb1.56xlarge", + "u-9tb1.112xlarge", + "vt1.24xlarge", + "vt1.3xlarge", + "vt1.6xlarge", + "x1.16xlarge", + "x1.32xlarge", + "x1e.16xlarge", + "x1e.2xlarge", + "x1e.32xlarge", + "x1e.4xlarge", + "x1e.8xlarge", + "x1e.xlarge", + "x2gd.12xlarge", + "x2gd.16xlarge", + "x2gd.2xlarge", + "x2gd.4xlarge", + "x2gd.8xlarge", + "x2gd.large", + "x2gd.medium", + "x2gd.metal", + "x2gd.xlarge", + "x2idn.16xlarge", + "x2idn.24xlarge", + "x2idn.32xlarge", + "x2idn.metal", + "x2iedn.16xlarge", + "x2iedn.24xlarge", + "x2iedn.2xlarge", + "x2iedn.32xlarge", + "x2iedn.4xlarge", + "x2iedn.8xlarge", + "x2iedn.metal", + "x2iedn.xlarge", + "x2iezn.12xlarge", + "x2iezn.2xlarge", + "x2iezn.4xlarge", + "x2iezn.6xlarge", + "x2iezn.8xlarge", + "x2iezn.metal", + "z1d.12xlarge", + "z1d.2xlarge", + "z1d.3xlarge", + "z1d.6xlarge", + "z1d.large", + "z1d.metal", + "z1d.xlarge", + ], + "eu-west-2": [ + "c4.2xlarge", + "c4.4xlarge", + "c4.8xlarge", + "c4.large", + "c4.xlarge", + "c5.12xlarge", + "c5.18xlarge", + "c5.24xlarge", + "c5.2xlarge", + "c5.4xlarge", + "c5.9xlarge", + "c5.large", + "c5.metal", + "c5.xlarge", + "c5a.12xlarge", + "c5a.16xlarge", + "c5a.24xlarge", + "c5a.2xlarge", + "c5a.4xlarge", + "c5a.8xlarge", + "c5a.large", + "c5a.xlarge", + "c5d.12xlarge", + "c5d.18xlarge", + "c5d.24xlarge", + "c5d.2xlarge", + "c5d.4xlarge", + "c5d.9xlarge", + "c5d.large", + "c5d.metal", + "c5d.xlarge", + "c5n.18xlarge", + "c5n.2xlarge", + "c5n.4xlarge", + "c5n.9xlarge", + "c5n.large", + "c5n.metal", + "c5n.xlarge", + "c6a.12xlarge", + "c6a.16xlarge", + "c6a.24xlarge", + "c6a.2xlarge", + "c6a.32xlarge", + "c6a.48xlarge", + "c6a.4xlarge", + "c6a.8xlarge", + "c6a.large", + "c6a.metal", + "c6a.xlarge", + "c6g.12xlarge", + "c6g.16xlarge", + "c6g.2xlarge", + "c6g.4xlarge", + "c6g.8xlarge", + "c6g.large", + "c6g.medium", + "c6g.metal", + "c6g.xlarge", + "c6gd.12xlarge", + "c6gd.16xlarge", + "c6gd.2xlarge", + "c6gd.4xlarge", + "c6gd.8xlarge", + "c6gd.large", + "c6gd.medium", + "c6gd.metal", + "c6gd.xlarge", + "c6gn.12xlarge", + "c6gn.16xlarge", + "c6gn.2xlarge", + "c6gn.4xlarge", + "c6gn.8xlarge", + "c6gn.large", + "c6gn.medium", + "c6gn.xlarge", + "c6i.12xlarge", + "c6i.16xlarge", + "c6i.24xlarge", + "c6i.2xlarge", + "c6i.32xlarge", + "c6i.4xlarge", + "c6i.8xlarge", + "c6i.large", + "c6i.metal", + "c6i.xlarge", + "c6id.12xlarge", + "c6id.16xlarge", + "c6id.24xlarge", + "c6id.2xlarge", + "c6id.32xlarge", + "c6id.4xlarge", + "c6id.8xlarge", + "c6id.large", + "c6id.metal", + "c6id.xlarge", + "c6in.12xlarge", + "c6in.16xlarge", + "c6in.24xlarge", + "c6in.2xlarge", + "c6in.32xlarge", + "c6in.4xlarge", + "c6in.8xlarge", + "c6in.large", + "c6in.metal", + "c6in.xlarge", + "c7g.12xlarge", + "c7g.16xlarge", + "c7g.2xlarge", + "c7g.4xlarge", + "c7g.8xlarge", + "c7g.large", + "c7g.medium", + "c7g.metal", + "c7g.xlarge", + "c7i.12xlarge", + "c7i.16xlarge", + "c7i.24xlarge", + "c7i.2xlarge", + "c7i.48xlarge", + "c7i.4xlarge", + "c7i.8xlarge", + "c7i.large", + "c7i.metal-24xl", + "c7i.metal-48xl", + "c7i.xlarge", + "d2.2xlarge", + "d2.4xlarge", + "d2.8xlarge", + "d2.xlarge", + "d3.2xlarge", + "d3.4xlarge", + "d3.8xlarge", + "d3.xlarge", + "f1.2xlarge", + "f1.4xlarge", + "g3.16xlarge", + "g3.4xlarge", + "g3.8xlarge", + "g3s.xlarge", + "g4ad.16xlarge", + "g4ad.2xlarge", + "g4ad.4xlarge", + "g4ad.8xlarge", + "g4ad.xlarge", + "g4dn.12xlarge", + "g4dn.16xlarge", + "g4dn.2xlarge", + "g4dn.4xlarge", + "g4dn.8xlarge", + "g4dn.metal", + "g4dn.xlarge", + "g5.12xlarge", + "g5.16xlarge", + "g5.24xlarge", + "g5.2xlarge", + "g5.48xlarge", + "g5.4xlarge", + "g5.8xlarge", + "g5.xlarge", + "i3.16xlarge", + "i3.2xlarge", + "i3.4xlarge", + "i3.8xlarge", + "i3.large", + "i3.metal", + "i3.xlarge", + "i3en.12xlarge", + "i3en.24xlarge", + "i3en.2xlarge", + "i3en.3xlarge", + "i3en.6xlarge", + "i3en.large", + "i3en.metal", + "i3en.xlarge", + "i4i.12xlarge", + "i4i.16xlarge", + "i4i.24xlarge", + "i4i.2xlarge", + "i4i.32xlarge", + "i4i.4xlarge", + "i4i.8xlarge", + "i4i.large", + "i4i.metal", + "i4i.xlarge", + "im4gn.16xlarge", + "im4gn.2xlarge", + "im4gn.4xlarge", + "im4gn.8xlarge", + "im4gn.large", + "im4gn.xlarge", + "inf1.24xlarge", + "inf1.2xlarge", + "inf1.6xlarge", + "inf1.xlarge", + "is4gen.2xlarge", + "is4gen.4xlarge", + "is4gen.8xlarge", + "is4gen.large", + "is4gen.medium", + "is4gen.xlarge", + "m4.10xlarge", + "m4.16xlarge", + "m4.2xlarge", + "m4.4xlarge", + "m4.large", + "m4.xlarge", + "m5.12xlarge", + "m5.16xlarge", + "m5.24xlarge", + "m5.2xlarge", + "m5.4xlarge", + "m5.8xlarge", + "m5.large", + "m5.metal", + "m5.xlarge", + "m5a.12xlarge", + "m5a.16xlarge", + "m5a.24xlarge", + "m5a.2xlarge", + "m5a.4xlarge", + "m5a.8xlarge", + "m5a.large", + "m5a.xlarge", + "m5ad.12xlarge", + "m5ad.16xlarge", + "m5ad.24xlarge", + "m5ad.2xlarge", + "m5ad.4xlarge", + "m5ad.8xlarge", + "m5ad.large", + "m5ad.xlarge", + "m5d.12xlarge", + "m5d.16xlarge", + "m5d.24xlarge", + "m5d.2xlarge", + "m5d.4xlarge", + "m5d.8xlarge", + "m5d.large", + "m5d.metal", + "m5d.xlarge", + "m6a.12xlarge", + "m6a.16xlarge", + "m6a.24xlarge", + "m6a.2xlarge", + "m6a.32xlarge", + "m6a.48xlarge", + "m6a.4xlarge", + "m6a.8xlarge", + "m6a.large", + "m6a.metal", + "m6a.xlarge", + "m6g.12xlarge", + "m6g.16xlarge", + "m6g.2xlarge", + "m6g.4xlarge", + "m6g.8xlarge", + "m6g.large", + "m6g.medium", + "m6g.metal", + "m6g.xlarge", + "m6gd.12xlarge", + "m6gd.16xlarge", + "m6gd.2xlarge", + "m6gd.4xlarge", + "m6gd.8xlarge", + "m6gd.large", + "m6gd.medium", + "m6gd.metal", + "m6gd.xlarge", + "m6i.12xlarge", + "m6i.16xlarge", + "m6i.24xlarge", + "m6i.2xlarge", + "m6i.32xlarge", + "m6i.4xlarge", + "m6i.8xlarge", + "m6i.large", + "m6i.metal", + "m6i.xlarge", + "m7g.12xlarge", + "m7g.16xlarge", + "m7g.2xlarge", + "m7g.4xlarge", + "m7g.8xlarge", + "m7g.large", + "m7g.medium", + "m7g.metal", + "m7g.xlarge", + "m7i-flex.2xlarge", + "m7i-flex.4xlarge", + "m7i-flex.8xlarge", + "m7i-flex.large", + "m7i-flex.xlarge", + "m7i.12xlarge", + "m7i.16xlarge", + "m7i.24xlarge", + "m7i.2xlarge", + "m7i.48xlarge", + "m7i.4xlarge", + "m7i.8xlarge", + "m7i.large", + "m7i.metal-24xl", + "m7i.metal-48xl", + "m7i.xlarge", + "p3.16xlarge", + "p3.2xlarge", + "p3.8xlarge", + "r4.16xlarge", + "r4.2xlarge", + "r4.4xlarge", + "r4.8xlarge", + "r4.large", + "r4.xlarge", + "r5.12xlarge", + "r5.16xlarge", + "r5.24xlarge", + "r5.2xlarge", + "r5.4xlarge", + "r5.8xlarge", + "r5.large", + "r5.metal", + "r5.xlarge", + "r5a.12xlarge", + "r5a.16xlarge", + "r5a.24xlarge", + "r5a.2xlarge", + "r5a.4xlarge", + "r5a.8xlarge", + "r5a.large", + "r5a.xlarge", + "r5ad.12xlarge", + "r5ad.16xlarge", + "r5ad.24xlarge", + "r5ad.2xlarge", + "r5ad.4xlarge", + "r5ad.8xlarge", + "r5ad.large", + "r5ad.xlarge", + "r5b.12xlarge", + "r5b.16xlarge", + "r5b.24xlarge", + "r5b.2xlarge", + "r5b.4xlarge", + "r5b.8xlarge", + "r5b.large", + "r5b.metal", + "r5b.xlarge", + "r5d.12xlarge", + "r5d.16xlarge", + "r5d.24xlarge", + "r5d.2xlarge", + "r5d.4xlarge", + "r5d.8xlarge", + "r5d.large", + "r5d.metal", + "r5d.xlarge", + "r5n.12xlarge", + "r5n.16xlarge", + "r5n.24xlarge", + "r5n.2xlarge", + "r5n.4xlarge", + "r5n.8xlarge", + "r5n.large", + "r5n.metal", + "r5n.xlarge", + "r6g.12xlarge", + "r6g.16xlarge", + "r6g.2xlarge", + "r6g.4xlarge", + "r6g.8xlarge", + "r6g.large", + "r6g.medium", + "r6g.metal", + "r6g.xlarge", + "r6gd.12xlarge", + "r6gd.16xlarge", + "r6gd.2xlarge", + "r6gd.4xlarge", + "r6gd.8xlarge", + "r6gd.large", + "r6gd.medium", + "r6gd.metal", + "r6gd.xlarge", + "r6i.12xlarge", + "r6i.16xlarge", + "r6i.24xlarge", + "r6i.2xlarge", + "r6i.32xlarge", + "r6i.4xlarge", + "r6i.8xlarge", + "r6i.large", + "r6i.metal", + "r6i.xlarge", + "r6id.12xlarge", + "r6id.16xlarge", + "r6id.24xlarge", + "r6id.2xlarge", + "r6id.32xlarge", + "r6id.4xlarge", + "r6id.8xlarge", + "r6id.large", + "r6id.metal", + "r6id.xlarge", + "r7g.12xlarge", + "r7g.16xlarge", + "r7g.2xlarge", + "r7g.4xlarge", + "r7g.8xlarge", + "r7g.large", + "r7g.medium", + "r7g.metal", + "r7g.xlarge", + "r7i.12xlarge", + "r7i.16xlarge", + "r7i.24xlarge", + "r7i.2xlarge", + "r7i.48xlarge", + "r7i.4xlarge", + "r7i.8xlarge", + "r7i.large", + "r7i.metal-24xl", + "r7i.metal-48xl", + "r7i.xlarge", + "t2.2xlarge", + "t2.large", + "t2.medium", + "t2.micro", + "t2.nano", + "t2.small", + "t2.xlarge", + "t3.2xlarge", + "t3.large", + "t3.medium", + "t3.micro", + "t3.nano", + "t3.small", + "t3.xlarge", + "t3a.2xlarge", + "t3a.large", + "t3a.medium", + "t3a.micro", + "t3a.nano", + "t3a.small", + "t3a.xlarge", + "t4g.2xlarge", + "t4g.large", + "t4g.medium", + "t4g.micro", + "t4g.nano", + "t4g.small", + "t4g.xlarge", + "u-6tb1.112xlarge", + "u-6tb1.56xlarge", + "u-9tb1.112xlarge", + "x1.16xlarge", + "x1.32xlarge", + "x2idn.16xlarge", + "x2idn.24xlarge", + "x2idn.32xlarge", + "x2idn.metal", + "x2iedn.16xlarge", + "x2iedn.24xlarge", + "x2iedn.2xlarge", + "x2iedn.32xlarge", + "x2iedn.4xlarge", + "x2iedn.8xlarge", + "x2iedn.metal", + "x2iedn.xlarge", + "z1d.12xlarge", + "z1d.2xlarge", + "z1d.3xlarge", + "z1d.6xlarge", + "z1d.large", + "z1d.metal", + "z1d.xlarge", + ], + "eu-west-3": [ + "c5.12xlarge", + "c5.18xlarge", + "c5.24xlarge", + "c5.2xlarge", + "c5.4xlarge", + "c5.9xlarge", + "c5.large", + "c5.metal", + "c5.xlarge", + "c5a.12xlarge", + "c5a.16xlarge", + "c5a.24xlarge", + "c5a.2xlarge", + "c5a.4xlarge", + "c5a.8xlarge", + "c5a.large", + "c5a.xlarge", + "c5d.18xlarge", + "c5d.2xlarge", + "c5d.4xlarge", + "c5d.9xlarge", + "c5d.large", + "c5d.xlarge", + "c5n.18xlarge", + "c5n.2xlarge", + "c5n.4xlarge", + "c5n.9xlarge", + "c5n.large", + "c5n.metal", + "c5n.xlarge", + "c6g.12xlarge", + "c6g.16xlarge", + "c6g.2xlarge", + "c6g.4xlarge", + "c6g.8xlarge", + "c6g.large", + "c6g.medium", + "c6g.metal", + "c6g.xlarge", + "c6gd.12xlarge", + "c6gd.16xlarge", + "c6gd.2xlarge", + "c6gd.4xlarge", + "c6gd.8xlarge", + "c6gd.large", + "c6gd.medium", + "c6gd.metal", + "c6gd.xlarge", + "c6gn.12xlarge", + "c6gn.16xlarge", + "c6gn.2xlarge", + "c6gn.4xlarge", + "c6gn.8xlarge", + "c6gn.large", + "c6gn.medium", + "c6gn.metal", + "c6gn.xlarge", + "c6i.12xlarge", + "c6i.16xlarge", + "c6i.24xlarge", + "c6i.2xlarge", + "c6i.32xlarge", + "c6i.4xlarge", + "c6i.8xlarge", + "c6i.large", + "c6i.metal", + "c6i.xlarge", + "c6in.12xlarge", + "c6in.16xlarge", + "c6in.24xlarge", + "c6in.2xlarge", + "c6in.32xlarge", + "c6in.4xlarge", + "c6in.8xlarge", + "c6in.large", + "c6in.metal", + "c6in.xlarge", + "d2.2xlarge", + "d2.4xlarge", + "d2.8xlarge", + "d2.xlarge", + "g4dn.12xlarge", + "g4dn.16xlarge", + "g4dn.2xlarge", + "g4dn.4xlarge", + "g4dn.8xlarge", + "g4dn.metal", + "g4dn.xlarge", + "i3.16xlarge", + "i3.2xlarge", + "i3.4xlarge", + "i3.8xlarge", + "i3.large", + "i3.metal", + "i3.xlarge", + "i3en.12xlarge", + "i3en.24xlarge", + "i3en.2xlarge", + "i3en.3xlarge", + "i3en.6xlarge", + "i3en.large", + "i3en.metal", + "i3en.xlarge", + "i4i.12xlarge", + "i4i.16xlarge", + "i4i.24xlarge", + "i4i.2xlarge", + "i4i.32xlarge", + "i4i.4xlarge", + "i4i.8xlarge", + "i4i.large", + "i4i.metal", + "i4i.xlarge", + "im4gn.16xlarge", + "im4gn.2xlarge", + "im4gn.4xlarge", + "im4gn.8xlarge", + "im4gn.large", + "im4gn.xlarge", + "inf1.24xlarge", + "inf1.2xlarge", + "inf1.6xlarge", + "inf1.xlarge", + "is4gen.2xlarge", + "is4gen.4xlarge", + "is4gen.8xlarge", + "is4gen.large", + "is4gen.medium", + "is4gen.xlarge", + "m5.12xlarge", + "m5.16xlarge", + "m5.24xlarge", + "m5.2xlarge", + "m5.4xlarge", + "m5.8xlarge", + "m5.large", + "m5.metal", + "m5.xlarge", + "m5a.12xlarge", + "m5a.16xlarge", + "m5a.24xlarge", + "m5a.2xlarge", + "m5a.4xlarge", + "m5a.8xlarge", + "m5a.large", + "m5a.xlarge", + "m5ad.12xlarge", + "m5ad.16xlarge", + "m5ad.24xlarge", + "m5ad.2xlarge", + "m5ad.4xlarge", + "m5ad.8xlarge", + "m5ad.large", + "m5ad.xlarge", + "m5d.12xlarge", + "m5d.16xlarge", + "m5d.24xlarge", + "m5d.2xlarge", + "m5d.4xlarge", + "m5d.8xlarge", + "m5d.large", + "m5d.metal", + "m5d.xlarge", + "m6g.12xlarge", + "m6g.16xlarge", + "m6g.2xlarge", + "m6g.4xlarge", + "m6g.8xlarge", + "m6g.large", + "m6g.medium", + "m6g.metal", + "m6g.xlarge", + "m6gd.12xlarge", + "m6gd.16xlarge", + "m6gd.2xlarge", + "m6gd.4xlarge", + "m6gd.8xlarge", + "m6gd.large", + "m6gd.medium", + "m6gd.metal", + "m6gd.xlarge", + "m6i.12xlarge", + "m6i.16xlarge", + "m6i.24xlarge", + "m6i.2xlarge", + "m6i.32xlarge", + "m6i.4xlarge", + "m6i.8xlarge", + "m6i.large", + "m6i.metal", + "m6i.xlarge", + "m7g.12xlarge", + "m7g.16xlarge", + "m7g.2xlarge", + "m7g.4xlarge", + "m7g.8xlarge", + "m7g.large", + "m7g.medium", + "m7g.metal", + "m7g.xlarge", + "m7i-flex.2xlarge", + "m7i-flex.4xlarge", + "m7i-flex.8xlarge", + "m7i-flex.large", + "m7i-flex.xlarge", + "m7i.12xlarge", + "m7i.16xlarge", + "m7i.24xlarge", + "m7i.2xlarge", + "m7i.48xlarge", + "m7i.4xlarge", + "m7i.8xlarge", + "m7i.large", + "m7i.metal-24xl", + "m7i.metal-48xl", + "m7i.xlarge", + "r4.16xlarge", + "r4.2xlarge", + "r4.4xlarge", + "r4.8xlarge", + "r4.large", + "r4.xlarge", + "r5.12xlarge", + "r5.16xlarge", + "r5.24xlarge", + "r5.2xlarge", + "r5.4xlarge", + "r5.8xlarge", + "r5.large", + "r5.metal", + "r5.xlarge", + "r5a.12xlarge", + "r5a.16xlarge", + "r5a.24xlarge", + "r5a.2xlarge", + "r5a.4xlarge", + "r5a.8xlarge", + "r5a.large", + "r5a.xlarge", + "r5ad.12xlarge", + "r5ad.16xlarge", + "r5ad.24xlarge", + "r5ad.2xlarge", + "r5ad.4xlarge", + "r5ad.8xlarge", + "r5ad.large", + "r5ad.xlarge", + "r5d.12xlarge", + "r5d.16xlarge", + "r5d.24xlarge", + "r5d.2xlarge", + "r5d.4xlarge", + "r5d.8xlarge", + "r5d.large", + "r5d.metal", + "r5d.xlarge", + "r5dn.12xlarge", + "r5dn.16xlarge", + "r5dn.24xlarge", + "r5dn.2xlarge", + "r5dn.4xlarge", + "r5dn.8xlarge", + "r5dn.large", + "r5dn.metal", + "r5dn.xlarge", + "r5n.12xlarge", + "r5n.16xlarge", + "r5n.24xlarge", + "r5n.2xlarge", + "r5n.4xlarge", + "r5n.8xlarge", + "r5n.large", + "r5n.metal", + "r5n.xlarge", + "r6g.12xlarge", + "r6g.16xlarge", + "r6g.2xlarge", + "r6g.4xlarge", + "r6g.8xlarge", + "r6g.large", + "r6g.medium", + "r6g.metal", + "r6g.xlarge", + "r6gd.12xlarge", + "r6gd.16xlarge", + "r6gd.2xlarge", + "r6gd.4xlarge", + "r6gd.8xlarge", + "r6gd.large", + "r6gd.medium", + "r6gd.metal", + "r6gd.xlarge", + "r6i.12xlarge", + "r6i.16xlarge", + "r6i.24xlarge", + "r6i.2xlarge", + "r6i.32xlarge", + "r6i.4xlarge", + "r6i.8xlarge", + "r6i.large", + "r6i.metal", + "r6i.xlarge", + "t2.2xlarge", + "t2.large", + "t2.medium", + "t2.micro", + "t2.nano", + "t2.small", + "t2.xlarge", + "t3.2xlarge", + "t3.large", + "t3.medium", + "t3.micro", + "t3.nano", + "t3.small", + "t3.xlarge", + "t3a.2xlarge", + "t3a.large", + "t3a.medium", + "t3a.micro", + "t3a.nano", + "t3a.small", + "t3a.xlarge", + "t4g.2xlarge", + "t4g.large", + "t4g.medium", + "t4g.micro", + "t4g.nano", + "t4g.small", + "t4g.xlarge", + "u-6tb1.112xlarge", + "u-6tb1.56xlarge", + "x1.16xlarge", + "x1.32xlarge", + "x2idn.16xlarge", + "x2idn.24xlarge", + "x2idn.32xlarge", + "x2idn.metal", + "x2iedn.16xlarge", + "x2iedn.24xlarge", + "x2iedn.2xlarge", + "x2iedn.32xlarge", + "x2iedn.4xlarge", + "x2iedn.8xlarge", + "x2iedn.metal", + "x2iedn.xlarge", + ], + "eu-central-1": [ + "a1.2xlarge", + "a1.4xlarge", + "a1.large", + "a1.medium", + "a1.metal", + "a1.xlarge", + "c3.2xlarge", + "c3.4xlarge", + "c3.8xlarge", + "c3.large", + "c3.xlarge", + "c4.2xlarge", + "c4.4xlarge", + "c4.8xlarge", + "c4.large", + "c4.xlarge", + "c5.12xlarge", + "c5.18xlarge", + "c5.24xlarge", + "c5.2xlarge", + "c5.4xlarge", + "c5.9xlarge", + "c5.large", + "c5.metal", + "c5.xlarge", + "c5a.12xlarge", + "c5a.16xlarge", + "c5a.24xlarge", + "c5a.2xlarge", + "c5a.4xlarge", + "c5a.8xlarge", + "c5a.large", + "c5a.xlarge", + "c5ad.12xlarge", + "c5ad.16xlarge", + "c5ad.24xlarge", + "c5ad.2xlarge", + "c5ad.4xlarge", + "c5ad.8xlarge", + "c5ad.large", + "c5ad.xlarge", + "c5d.12xlarge", + "c5d.18xlarge", + "c5d.24xlarge", + "c5d.2xlarge", + "c5d.4xlarge", + "c5d.9xlarge", + "c5d.large", + "c5d.metal", + "c5d.xlarge", + "c5n.18xlarge", + "c5n.2xlarge", + "c5n.4xlarge", + "c5n.9xlarge", + "c5n.large", + "c5n.metal", + "c5n.xlarge", + "c6a.12xlarge", + "c6a.16xlarge", + "c6a.24xlarge", + "c6a.2xlarge", + "c6a.32xlarge", + "c6a.48xlarge", + "c6a.4xlarge", + "c6a.8xlarge", + "c6a.large", + "c6a.metal", + "c6a.xlarge", + "c6g.12xlarge", + "c6g.16xlarge", + "c6g.2xlarge", + "c6g.4xlarge", + "c6g.8xlarge", + "c6g.large", + "c6g.medium", + "c6g.metal", + "c6g.xlarge", + "c6gd.12xlarge", + "c6gd.16xlarge", + "c6gd.2xlarge", + "c6gd.4xlarge", + "c6gd.8xlarge", + "c6gd.large", + "c6gd.medium", + "c6gd.metal", + "c6gd.xlarge", + "c6gn.12xlarge", + "c6gn.16xlarge", + "c6gn.2xlarge", + "c6gn.4xlarge", + "c6gn.8xlarge", + "c6gn.large", + "c6gn.medium", + "c6gn.xlarge", + "c6i.12xlarge", + "c6i.16xlarge", + "c6i.24xlarge", + "c6i.2xlarge", + "c6i.32xlarge", + "c6i.4xlarge", + "c6i.8xlarge", + "c6i.large", + "c6i.metal", + "c6i.xlarge", + "c6id.12xlarge", + "c6id.16xlarge", + "c6id.24xlarge", + "c6id.2xlarge", + "c6id.32xlarge", + "c6id.4xlarge", + "c6id.8xlarge", + "c6id.large", + "c6id.metal", + "c6id.xlarge", + "c6in.12xlarge", + "c6in.16xlarge", + "c6in.24xlarge", + "c6in.2xlarge", + "c6in.32xlarge", + "c6in.4xlarge", + "c6in.8xlarge", + "c6in.large", + "c6in.metal", + "c6in.xlarge", + "c7g.12xlarge", + "c7g.16xlarge", + "c7g.2xlarge", + "c7g.4xlarge", + "c7g.8xlarge", + "c7g.large", + "c7g.medium", + "c7g.metal", + "c7g.xlarge", + "c7gd.12xlarge", + "c7gd.16xlarge", + "c7gd.2xlarge", + "c7gd.4xlarge", + "c7gd.8xlarge", + "c7gd.large", + "c7gd.medium", + "c7gd.xlarge", + "c7i.12xlarge", + "c7i.16xlarge", + "c7i.24xlarge", + "c7i.2xlarge", + "c7i.48xlarge", + "c7i.4xlarge", + "c7i.8xlarge", + "c7i.large", + "c7i.metal-24xl", + "c7i.metal-48xl", + "c7i.xlarge", + "d2.2xlarge", + "d2.4xlarge", + "d2.8xlarge", + "d2.xlarge", + "d3.2xlarge", + "d3.4xlarge", + "d3.8xlarge", + "d3.xlarge", + "d3en.12xlarge", + "d3en.2xlarge", + "d3en.4xlarge", + "d3en.6xlarge", + "d3en.8xlarge", + "d3en.xlarge", + "dl2q.24xlarge", + "f1.2xlarge", + "f1.4xlarge", + "g2.2xlarge", + "g2.8xlarge", + "g3.16xlarge", + "g3.4xlarge", + "g3.8xlarge", + "g3s.xlarge", + "g4ad.16xlarge", + "g4ad.2xlarge", + "g4ad.4xlarge", + "g4ad.8xlarge", + "g4ad.xlarge", + "g4dn.12xlarge", + "g4dn.16xlarge", + "g4dn.2xlarge", + "g4dn.4xlarge", + "g4dn.8xlarge", + "g4dn.metal", + "g4dn.xlarge", + "g5.12xlarge", + "g5.16xlarge", + "g5.24xlarge", + "g5.2xlarge", + "g5.48xlarge", + "g5.4xlarge", + "g5.8xlarge", + "g5.xlarge", + "g5g.16xlarge", + "g5g.2xlarge", + "g5g.4xlarge", + "g5g.8xlarge", + "g5g.metal", + "g5g.xlarge", + "i2.2xlarge", + "i2.4xlarge", + "i2.8xlarge", + "i2.xlarge", + "i3.16xlarge", + "i3.2xlarge", + "i3.4xlarge", + "i3.8xlarge", + "i3.large", + "i3.metal", + "i3.xlarge", + "i3en.12xlarge", + "i3en.24xlarge", + "i3en.2xlarge", + "i3en.3xlarge", + "i3en.6xlarge", + "i3en.large", + "i3en.metal", + "i3en.xlarge", + "i4i.12xlarge", + "i4i.16xlarge", + "i4i.24xlarge", + "i4i.2xlarge", + "i4i.32xlarge", + "i4i.4xlarge", + "i4i.8xlarge", + "i4i.large", + "i4i.metal", + "i4i.xlarge", + "im4gn.16xlarge", + "im4gn.2xlarge", + "im4gn.4xlarge", + "im4gn.8xlarge", + "im4gn.large", + "im4gn.xlarge", + "inf1.24xlarge", + "inf1.2xlarge", + "inf1.6xlarge", + "inf1.xlarge", + "inf2.24xlarge", + "inf2.48xlarge", + "inf2.8xlarge", + "inf2.xlarge", + "is4gen.2xlarge", + "is4gen.4xlarge", + "is4gen.8xlarge", + "is4gen.large", + "is4gen.medium", + "is4gen.xlarge", + "m3.2xlarge", + "m3.large", + "m3.medium", + "m3.xlarge", + "m4.10xlarge", + "m4.16xlarge", + "m4.2xlarge", + "m4.4xlarge", + "m4.large", + "m4.xlarge", + "m5.12xlarge", + "m5.16xlarge", + "m5.24xlarge", + "m5.2xlarge", + "m5.4xlarge", + "m5.8xlarge", + "m5.large", + "m5.metal", + "m5.xlarge", + "m5a.12xlarge", + "m5a.16xlarge", + "m5a.24xlarge", + "m5a.2xlarge", + "m5a.4xlarge", + "m5a.8xlarge", + "m5a.large", + "m5a.xlarge", + "m5ad.12xlarge", + "m5ad.16xlarge", + "m5ad.24xlarge", + "m5ad.2xlarge", + "m5ad.4xlarge", + "m5ad.8xlarge", + "m5ad.large", + "m5ad.xlarge", + "m5d.12xlarge", + "m5d.16xlarge", + "m5d.24xlarge", + "m5d.2xlarge", + "m5d.4xlarge", + "m5d.8xlarge", + "m5d.large", + "m5d.metal", + "m5d.xlarge", + "m5dn.12xlarge", + "m5dn.16xlarge", + "m5dn.24xlarge", + "m5dn.2xlarge", + "m5dn.4xlarge", + "m5dn.8xlarge", + "m5dn.large", + "m5dn.metal", + "m5dn.xlarge", + "m5n.12xlarge", + "m5n.16xlarge", + "m5n.24xlarge", + "m5n.2xlarge", + "m5n.4xlarge", + "m5n.8xlarge", + "m5n.large", + "m5n.metal", + "m5n.xlarge", + "m5zn.12xlarge", + "m5zn.2xlarge", + "m5zn.3xlarge", + "m5zn.6xlarge", + "m5zn.large", + "m5zn.metal", + "m5zn.xlarge", + "m6a.12xlarge", + "m6a.16xlarge", + "m6a.24xlarge", + "m6a.2xlarge", + "m6a.32xlarge", + "m6a.48xlarge", + "m6a.4xlarge", + "m6a.8xlarge", + "m6a.large", + "m6a.metal", + "m6a.xlarge", + "m6g.12xlarge", + "m6g.16xlarge", + "m6g.2xlarge", + "m6g.4xlarge", + "m6g.8xlarge", + "m6g.large", + "m6g.medium", + "m6g.metal", + "m6g.xlarge", + "m6gd.12xlarge", + "m6gd.16xlarge", + "m6gd.2xlarge", + "m6gd.4xlarge", + "m6gd.8xlarge", + "m6gd.large", + "m6gd.medium", + "m6gd.metal", + "m6gd.xlarge", + "m6i.12xlarge", + "m6i.16xlarge", + "m6i.24xlarge", + "m6i.2xlarge", + "m6i.32xlarge", + "m6i.4xlarge", + "m6i.8xlarge", + "m6i.large", + "m6i.metal", + "m6i.xlarge", + "m6id.12xlarge", + "m6id.16xlarge", + "m6id.24xlarge", + "m6id.2xlarge", + "m6id.32xlarge", + "m6id.4xlarge", + "m6id.8xlarge", + "m6id.large", + "m6id.metal", + "m6id.xlarge", + "m6idn.12xlarge", + "m6idn.16xlarge", + "m6idn.24xlarge", + "m6idn.2xlarge", + "m6idn.32xlarge", + "m6idn.4xlarge", + "m6idn.8xlarge", + "m6idn.large", + "m6idn.metal", + "m6idn.xlarge", + "m6in.12xlarge", + "m6in.16xlarge", + "m6in.24xlarge", + "m6in.2xlarge", + "m6in.32xlarge", + "m6in.4xlarge", + "m6in.8xlarge", + "m6in.large", + "m6in.metal", + "m6in.xlarge", + "m7g.12xlarge", + "m7g.16xlarge", + "m7g.2xlarge", + "m7g.4xlarge", + "m7g.8xlarge", + "m7g.large", + "m7g.medium", + "m7g.metal", + "m7g.xlarge", + "m7gd.12xlarge", + "m7gd.16xlarge", + "m7gd.2xlarge", + "m7gd.4xlarge", + "m7gd.8xlarge", + "m7gd.large", + "m7gd.medium", + "m7gd.xlarge", + "m7i-flex.2xlarge", + "m7i-flex.4xlarge", + "m7i-flex.8xlarge", + "m7i-flex.large", + "m7i-flex.xlarge", + "m7i.12xlarge", + "m7i.16xlarge", + "m7i.24xlarge", + "m7i.2xlarge", + "m7i.48xlarge", + "m7i.4xlarge", + "m7i.8xlarge", + "m7i.large", + "m7i.metal-24xl", + "m7i.metal-48xl", + "m7i.xlarge", + "p2.16xlarge", + "p2.8xlarge", + "p2.xlarge", + "p3.16xlarge", + "p3.2xlarge", + "p3.8xlarge", + "p4d.24xlarge", + "r3.2xlarge", + "r3.4xlarge", + "r3.8xlarge", + "r3.large", + "r3.xlarge", + "r4.16xlarge", + "r4.2xlarge", + "r4.4xlarge", + "r4.8xlarge", + "r4.large", + "r4.xlarge", + "r5.12xlarge", + "r5.16xlarge", + "r5.24xlarge", + "r5.2xlarge", + "r5.4xlarge", + "r5.8xlarge", + "r5.large", + "r5.metal", + "r5.xlarge", + "r5a.12xlarge", + "r5a.16xlarge", + "r5a.24xlarge", + "r5a.2xlarge", + "r5a.4xlarge", + "r5a.8xlarge", + "r5a.large", + "r5a.xlarge", + "r5ad.12xlarge", + "r5ad.16xlarge", + "r5ad.24xlarge", + "r5ad.2xlarge", + "r5ad.4xlarge", + "r5ad.8xlarge", + "r5ad.large", + "r5ad.xlarge", + "r5b.12xlarge", + "r5b.16xlarge", + "r5b.24xlarge", + "r5b.2xlarge", + "r5b.4xlarge", + "r5b.8xlarge", + "r5b.large", + "r5b.metal", + "r5b.xlarge", + "r5d.12xlarge", + "r5d.16xlarge", + "r5d.24xlarge", + "r5d.2xlarge", + "r5d.4xlarge", + "r5d.8xlarge", + "r5d.large", + "r5d.metal", + "r5d.xlarge", + "r5dn.12xlarge", + "r5dn.16xlarge", + "r5dn.24xlarge", + "r5dn.2xlarge", + "r5dn.4xlarge", + "r5dn.8xlarge", + "r5dn.large", + "r5dn.metal", + "r5dn.xlarge", + "r5n.12xlarge", + "r5n.16xlarge", + "r5n.24xlarge", + "r5n.2xlarge", + "r5n.4xlarge", + "r5n.8xlarge", + "r5n.large", + "r5n.metal", + "r5n.xlarge", + "r6a.12xlarge", + "r6a.16xlarge", + "r6a.24xlarge", + "r6a.2xlarge", + "r6a.32xlarge", + "r6a.48xlarge", + "r6a.4xlarge", + "r6a.8xlarge", + "r6a.large", + "r6a.metal", + "r6a.xlarge", + "r6g.12xlarge", + "r6g.16xlarge", + "r6g.2xlarge", + "r6g.4xlarge", + "r6g.8xlarge", + "r6g.large", + "r6g.medium", + "r6g.metal", + "r6g.xlarge", + "r6gd.12xlarge", + "r6gd.16xlarge", + "r6gd.2xlarge", + "r6gd.4xlarge", + "r6gd.8xlarge", + "r6gd.large", + "r6gd.medium", + "r6gd.metal", + "r6gd.xlarge", + "r6i.12xlarge", + "r6i.16xlarge", + "r6i.24xlarge", + "r6i.2xlarge", + "r6i.32xlarge", + "r6i.4xlarge", + "r6i.8xlarge", + "r6i.large", + "r6i.metal", + "r6i.xlarge", + "r6id.12xlarge", + "r6id.16xlarge", + "r6id.24xlarge", + "r6id.2xlarge", + "r6id.32xlarge", + "r6id.4xlarge", + "r6id.8xlarge", + "r6id.large", + "r6id.metal", + "r6id.xlarge", + "r6idn.12xlarge", + "r6idn.16xlarge", + "r6idn.24xlarge", + "r6idn.2xlarge", + "r6idn.32xlarge", + "r6idn.4xlarge", + "r6idn.8xlarge", + "r6idn.large", + "r6idn.metal", + "r6idn.xlarge", + "r6in.12xlarge", + "r6in.16xlarge", + "r6in.24xlarge", + "r6in.2xlarge", + "r6in.32xlarge", + "r6in.4xlarge", + "r6in.8xlarge", + "r6in.large", + "r6in.metal", + "r6in.xlarge", + "r7g.12xlarge", + "r7g.16xlarge", + "r7g.2xlarge", + "r7g.4xlarge", + "r7g.8xlarge", + "r7g.large", + "r7g.medium", + "r7g.metal", + "r7g.xlarge", + "r7gd.12xlarge", + "r7gd.16xlarge", + "r7gd.2xlarge", + "r7gd.4xlarge", + "r7gd.8xlarge", + "r7gd.large", + "r7gd.medium", + "r7gd.xlarge", + "r7i.12xlarge", + "r7i.16xlarge", + "r7i.24xlarge", + "r7i.2xlarge", + "r7i.48xlarge", + "r7i.4xlarge", + "r7i.8xlarge", + "r7i.large", + "r7i.metal-24xl", + "r7i.metal-48xl", + "r7i.xlarge", + "r7iz.12xlarge", + "r7iz.16xlarge", + "r7iz.2xlarge", + "r7iz.32xlarge", + "r7iz.4xlarge", + "r7iz.8xlarge", + "r7iz.large", + "r7iz.metal-16xl", + "r7iz.metal-32xl", + "r7iz.xlarge", + "t2.2xlarge", + "t2.large", + "t2.medium", + "t2.micro", + "t2.nano", + "t2.small", + "t2.xlarge", + "t3.2xlarge", + "t3.large", + "t3.medium", + "t3.micro", + "t3.nano", + "t3.small", + "t3.xlarge", + "t3a.2xlarge", + "t3a.large", + "t3a.medium", + "t3a.micro", + "t3a.nano", + "t3a.small", + "t3a.xlarge", + "t4g.2xlarge", + "t4g.large", + "t4g.medium", + "t4g.micro", + "t4g.nano", + "t4g.small", + "t4g.xlarge", + "u-12tb1.112xlarge", + "u-3tb1.56xlarge", + "u-6tb1.112xlarge", + "u-6tb1.56xlarge", + "u-9tb1.112xlarge", + "x1.16xlarge", + "x1.32xlarge", + "x1e.16xlarge", + "x1e.2xlarge", + "x1e.32xlarge", + "x1e.4xlarge", + "x1e.8xlarge", + "x1e.xlarge", + "x2idn.16xlarge", + "x2idn.24xlarge", + "x2idn.32xlarge", + "x2idn.metal", + "x2iedn.16xlarge", + "x2iedn.24xlarge", + "x2iedn.2xlarge", + "x2iedn.32xlarge", + "x2iedn.4xlarge", + "x2iedn.8xlarge", + "x2iedn.metal", + "x2iedn.xlarge", + "z1d.12xlarge", + "z1d.2xlarge", + "z1d.3xlarge", + "z1d.6xlarge", + "z1d.large", + "z1d.metal", + "z1d.xlarge", + ], + "sa-east-1": [ + "c1.medium", + "c1.xlarge", + "c3.2xlarge", + "c3.4xlarge", + "c3.8xlarge", + "c3.large", + "c3.xlarge", + "c4.2xlarge", + "c4.4xlarge", + "c4.8xlarge", + "c4.large", + "c4.xlarge", + "c5.12xlarge", + "c5.18xlarge", + "c5.24xlarge", + "c5.2xlarge", + "c5.4xlarge", + "c5.9xlarge", + "c5.large", + "c5.metal", + "c5.xlarge", + "c5a.12xlarge", + "c5a.16xlarge", + "c5a.24xlarge", + "c5a.2xlarge", + "c5a.4xlarge", + "c5a.8xlarge", + "c5a.large", + "c5a.xlarge", + "c5ad.12xlarge", + "c5ad.16xlarge", + "c5ad.24xlarge", + "c5ad.2xlarge", + "c5ad.4xlarge", + "c5ad.8xlarge", + "c5ad.large", + "c5ad.xlarge", + "c5d.12xlarge", + "c5d.18xlarge", + "c5d.24xlarge", + "c5d.2xlarge", + "c5d.4xlarge", + "c5d.9xlarge", + "c5d.large", + "c5d.metal", + "c5d.xlarge", + "c5n.18xlarge", + "c5n.2xlarge", + "c5n.4xlarge", + "c5n.9xlarge", + "c5n.large", + "c5n.metal", + "c5n.xlarge", + "c6a.12xlarge", + "c6a.16xlarge", + "c6a.24xlarge", + "c6a.2xlarge", + "c6a.32xlarge", + "c6a.48xlarge", + "c6a.4xlarge", + "c6a.8xlarge", + "c6a.large", + "c6a.metal", + "c6a.xlarge", + "c6g.12xlarge", + "c6g.16xlarge", + "c6g.2xlarge", + "c6g.4xlarge", + "c6g.8xlarge", + "c6g.large", + "c6g.medium", + "c6g.metal", + "c6g.xlarge", + "c6gd.12xlarge", + "c6gd.16xlarge", + "c6gd.2xlarge", + "c6gd.4xlarge", + "c6gd.8xlarge", + "c6gd.large", + "c6gd.medium", + "c6gd.metal", + "c6gd.xlarge", + "c6gn.12xlarge", + "c6gn.16xlarge", + "c6gn.2xlarge", + "c6gn.4xlarge", + "c6gn.8xlarge", + "c6gn.large", + "c6gn.medium", + "c6gn.xlarge", + "c6i.12xlarge", + "c6i.16xlarge", + "c6i.24xlarge", + "c6i.2xlarge", + "c6i.32xlarge", + "c6i.4xlarge", + "c6i.8xlarge", + "c6i.large", + "c6i.metal", + "c6i.xlarge", + "c6in.12xlarge", + "c6in.16xlarge", + "c6in.24xlarge", + "c6in.2xlarge", + "c6in.32xlarge", + "c6in.4xlarge", + "c6in.8xlarge", + "c6in.large", + "c6in.metal", + "c6in.xlarge", + "c7i.12xlarge", + "c7i.16xlarge", + "c7i.24xlarge", + "c7i.2xlarge", + "c7i.48xlarge", + "c7i.4xlarge", + "c7i.8xlarge", + "c7i.large", + "c7i.metal-24xl", + "c7i.metal-48xl", + "c7i.xlarge", + "g4dn.12xlarge", + "g4dn.16xlarge", + "g4dn.2xlarge", + "g4dn.4xlarge", + "g4dn.8xlarge", + "g4dn.metal", + "g4dn.xlarge", + "g5.12xlarge", + "g5.16xlarge", + "g5.24xlarge", + "g5.2xlarge", + "g5.48xlarge", + "g5.4xlarge", + "g5.8xlarge", + "g5.xlarge", + "i3.16xlarge", + "i3.2xlarge", + "i3.4xlarge", + "i3.8xlarge", + "i3.large", + "i3.metal", + "i3.xlarge", + "i3en.12xlarge", + "i3en.24xlarge", + "i3en.2xlarge", + "i3en.3xlarge", + "i3en.6xlarge", + "i3en.large", + "i3en.metal", + "i3en.xlarge", + "i4i.12xlarge", + "i4i.16xlarge", + "i4i.24xlarge", + "i4i.2xlarge", + "i4i.32xlarge", + "i4i.4xlarge", + "i4i.8xlarge", + "i4i.large", + "i4i.metal", + "i4i.xlarge", + "inf1.24xlarge", + "inf1.2xlarge", + "inf1.6xlarge", + "inf1.xlarge", + "m1.large", + "m1.medium", + "m1.xlarge", + "m2.2xlarge", + "m2.4xlarge", + "m2.xlarge", + "m3.2xlarge", + "m3.large", + "m3.medium", + "m3.xlarge", + "m4.10xlarge", + "m4.16xlarge", + "m4.2xlarge", + "m4.4xlarge", + "m4.large", + "m4.xlarge", + "m5.12xlarge", + "m5.16xlarge", + "m5.24xlarge", + "m5.2xlarge", + "m5.4xlarge", + "m5.8xlarge", + "m5.large", + "m5.metal", + "m5.xlarge", + "m5a.12xlarge", + "m5a.16xlarge", + "m5a.24xlarge", + "m5a.2xlarge", + "m5a.4xlarge", + "m5a.8xlarge", + "m5a.large", + "m5a.xlarge", + "m5ad.12xlarge", + "m5ad.16xlarge", + "m5ad.24xlarge", + "m5ad.2xlarge", + "m5ad.4xlarge", + "m5ad.8xlarge", + "m5ad.large", + "m5ad.xlarge", + "m5d.12xlarge", + "m5d.16xlarge", + "m5d.24xlarge", + "m5d.2xlarge", + "m5d.4xlarge", + "m5d.8xlarge", + "m5d.large", + "m5d.metal", + "m5d.xlarge", + "m5zn.12xlarge", + "m5zn.2xlarge", + "m5zn.3xlarge", + "m5zn.6xlarge", + "m5zn.large", + "m5zn.metal", + "m5zn.xlarge", + "m6a.12xlarge", + "m6a.16xlarge", + "m6a.24xlarge", + "m6a.2xlarge", + "m6a.32xlarge", + "m6a.48xlarge", + "m6a.4xlarge", + "m6a.8xlarge", + "m6a.large", + "m6a.metal", + "m6a.xlarge", + "m6g.12xlarge", + "m6g.16xlarge", + "m6g.2xlarge", + "m6g.4xlarge", + "m6g.8xlarge", + "m6g.large", + "m6g.medium", + "m6g.metal", + "m6g.xlarge", + "m6gd.12xlarge", + "m6gd.16xlarge", + "m6gd.2xlarge", + "m6gd.4xlarge", + "m6gd.8xlarge", + "m6gd.large", + "m6gd.medium", + "m6gd.metal", + "m6gd.xlarge", + "m6i.12xlarge", + "m6i.16xlarge", + "m6i.24xlarge", + "m6i.2xlarge", + "m6i.32xlarge", + "m6i.4xlarge", + "m6i.8xlarge", + "m6i.large", + "m6i.metal", + "m6i.xlarge", + "m6id.12xlarge", + "m6id.16xlarge", + "m6id.24xlarge", + "m6id.2xlarge", + "m6id.32xlarge", + "m6id.4xlarge", + "m6id.8xlarge", + "m6id.large", + "m6id.metal", + "m6id.xlarge", + "m7g.12xlarge", + "m7g.16xlarge", + "m7g.2xlarge", + "m7g.4xlarge", + "m7g.8xlarge", + "m7g.large", + "m7g.medium", + "m7g.metal", + "m7g.xlarge", + "m7i-flex.2xlarge", + "m7i-flex.4xlarge", + "m7i-flex.8xlarge", + "m7i-flex.large", + "m7i-flex.xlarge", + "m7i.12xlarge", + "m7i.16xlarge", + "m7i.24xlarge", + "m7i.2xlarge", + "m7i.48xlarge", + "m7i.4xlarge", + "m7i.8xlarge", + "m7i.large", + "m7i.metal-24xl", + "m7i.metal-48xl", + "m7i.xlarge", + "r3.2xlarge", + "r3.4xlarge", + "r3.8xlarge", + "r3.large", + "r3.xlarge", + "r4.16xlarge", + "r4.2xlarge", + "r4.4xlarge", + "r4.8xlarge", + "r4.large", + "r4.xlarge", + "r5.12xlarge", + "r5.16xlarge", + "r5.24xlarge", + "r5.2xlarge", + "r5.4xlarge", + "r5.8xlarge", + "r5.large", + "r5.metal", + "r5.xlarge", + "r5a.12xlarge", + "r5a.16xlarge", + "r5a.24xlarge", + "r5a.2xlarge", + "r5a.4xlarge", + "r5a.8xlarge", + "r5a.large", + "r5a.xlarge", + "r5ad.12xlarge", + "r5ad.16xlarge", + "r5ad.24xlarge", + "r5ad.2xlarge", + "r5ad.4xlarge", + "r5ad.8xlarge", + "r5ad.large", + "r5ad.xlarge", + "r5b.12xlarge", + "r5b.16xlarge", + "r5b.24xlarge", + "r5b.2xlarge", + "r5b.4xlarge", + "r5b.8xlarge", + "r5b.large", + "r5b.metal", + "r5b.xlarge", + "r5d.12xlarge", + "r5d.16xlarge", + "r5d.24xlarge", + "r5d.2xlarge", + "r5d.4xlarge", + "r5d.8xlarge", + "r5d.large", + "r5d.metal", + "r5d.xlarge", + "r5n.12xlarge", + "r5n.16xlarge", + "r5n.24xlarge", + "r5n.2xlarge", + "r5n.4xlarge", + "r5n.8xlarge", + "r5n.large", + "r5n.metal", + "r5n.xlarge", + "r6g.12xlarge", + "r6g.16xlarge", + "r6g.2xlarge", + "r6g.4xlarge", + "r6g.8xlarge", + "r6g.large", + "r6g.medium", + "r6g.metal", + "r6g.xlarge", + "r6gd.12xlarge", + "r6gd.16xlarge", + "r6gd.2xlarge", + "r6gd.4xlarge", + "r6gd.8xlarge", + "r6gd.large", + "r6gd.medium", + "r6gd.metal", + "r6gd.xlarge", + "r6i.12xlarge", + "r6i.16xlarge", + "r6i.24xlarge", + "r6i.2xlarge", + "r6i.32xlarge", + "r6i.4xlarge", + "r6i.8xlarge", + "r6i.large", + "r6i.metal", + "r6i.xlarge", + "r7i.12xlarge", + "r7i.16xlarge", + "r7i.24xlarge", + "r7i.2xlarge", + "r7i.48xlarge", + "r7i.4xlarge", + "r7i.8xlarge", + "r7i.large", + "r7i.metal-24xl", + "r7i.metal-48xl", + "r7i.xlarge", + "t1.micro", + "t2.2xlarge", + "t2.large", + "t2.medium", + "t2.micro", + "t2.nano", + "t2.small", + "t2.xlarge", + "t3.2xlarge", + "t3.large", + "t3.medium", + "t3.micro", + "t3.nano", + "t3.small", + "t3.xlarge", + "t3a.2xlarge", + "t3a.large", + "t3a.medium", + "t3a.micro", + "t3a.nano", + "t3a.small", + "t3a.xlarge", + "t4g.2xlarge", + "t4g.large", + "t4g.medium", + "t4g.micro", + "t4g.nano", + "t4g.small", + "t4g.xlarge", + "u-12tb1.112xlarge", + "u-3tb1.56xlarge", + "u-6tb1.112xlarge", + "u-6tb1.56xlarge", + "x1.16xlarge", + "x1.32xlarge", + "x1e.16xlarge", + "x1e.2xlarge", + "x1e.32xlarge", + "x1e.4xlarge", + "x1e.8xlarge", + "x1e.xlarge", + "x2idn.16xlarge", + "x2idn.24xlarge", + "x2idn.32xlarge", + "x2idn.metal", + "x2iedn.16xlarge", + "x2iedn.24xlarge", + "x2iedn.2xlarge", + "x2iedn.32xlarge", + "x2iedn.4xlarge", + "x2iedn.8xlarge", + "x2iedn.metal", + "x2iedn.xlarge", + ], +} -ec2InstancesByRegion = {region: [E2Instances[i] for i in instances] for region, instances in regionDict.items()} +ec2InstancesByRegion = { + region: [E2Instances[i] for i in instances] + for region, instances in regionDict.items() +} diff --git a/src/toil/lib/humanize.py b/src/toil/lib/humanize.py index 3c896ab17c..1e0c39bd10 100644 --- a/src/toil/lib/humanize.py +++ b/src/toil/lib/humanize.py @@ -25,7 +25,9 @@ def bytes2human(n: SupportsInt) -> str: """ Convert n bytes into a human readable string. """ - logger.warning('Deprecated toil method. Please use "toil.lib.conversions.bytes2human()" instead."') + logger.warning( + 'Deprecated toil method. Please use "toil.lib.conversions.bytes2human()" instead."' + ) return b2h(n) @@ -36,5 +38,7 @@ def human2bytes(s: str) -> int: When unable to recognize the format ValueError is raised. """ - logger.warning('Deprecated toil method. Please use "toil.lib.conversions.human2bytes()" instead."') + logger.warning( + 'Deprecated toil method. Please use "toil.lib.conversions.human2bytes()" instead."' + ) return h2b(s) diff --git a/src/toil/lib/io.py b/src/toil/lib/io.py index 1cb98f6e3e..fbf54668db 100644 --- a/src/toil/lib/io.py +++ b/src/toil/lib/io.py @@ -4,13 +4,19 @@ import stat import tempfile import uuid +from collections.abc import Iterator from contextlib import contextmanager from io import BytesIO -from typing import IO, Any, Callable, Iterator, Optional, Union +from typing import IO, Any, Callable, Optional, Union logger = logging.getLogger(__name__) -def mkdtemp(suffix: Optional[str] = None, prefix: Optional[str] = None, dir: Optional[str] = None) -> str: + +def mkdtemp( + suffix: Optional[str] = None, + prefix: Optional[str] = None, + dir: Optional[str] = None, +) -> str: """ Make a temporary directory like tempfile.mkdtemp, but with relaxed permissions. @@ -27,10 +33,13 @@ def mkdtemp(suffix: Optional[str] = None, prefix: Optional[str] = None, dir: Opt # Make the directory result = tempfile.mkdtemp(suffix=suffix, prefix=prefix, dir=dir) # Grant all the permissions: full control for user, and execute for group and other - os.chmod(result, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH) + os.chmod( + result, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH + ) # Return the path created return result + def robust_rmtree(path: Union[str, bytes]) -> None: """ Robustly tries to delete paths. @@ -45,7 +54,7 @@ def robust_rmtree(path: Union[str, bytes]) -> None: if not isinstance(path, bytes): # Internally we must work in bytes, in case we find an undecodeable # filename. - path = path.encode('utf-8') + path = path.encode("utf-8") if not os.path.exists(path): # Nothing to do! @@ -107,7 +116,7 @@ def atomic_tmp_file(final_path: str) -> str: as finalPath. It the final path is in /dev (/dev/null, /dev/stdout), it is returned unchanged and atomic_tmp_install will do nothing.""" final_dir = os.path.dirname(os.path.normpath(final_path)) # can be empty - if final_dir == '/dev': + if final_dir == "/dev": return final_path final_basename = os.path.basename(final_path) final_ext = os.path.splitext(final_path)[1] @@ -117,9 +126,10 @@ def atomic_tmp_file(final_path: str) -> str: def atomic_install(tmp_path, final_path) -> None: """atomic install of tmp_path as final_path""" - if os.path.dirname(os.path.normpath(final_path)) != '/dev': + if os.path.dirname(os.path.normpath(final_path)) != "/dev": os.rename(tmp_path, final_path) + @contextmanager def AtomicFileCreate(final_path: str, keep: bool = False) -> Iterator[str]: """Context manager to create a temporary file. Entering returns path to @@ -140,7 +150,9 @@ def AtomicFileCreate(final_path: str, keep: bool = False) -> Iterator[str]: raise -def atomic_copy(src_path: str, dest_path: str, executable: Optional[bool] = None) -> None: +def atomic_copy( + src_path: str, dest_path: str, executable: Optional[bool] = None +) -> None: """Copy a file using posix atomic creations semantics.""" if executable is None: executable = os.stat(src_path).st_mode & stat.S_IXUSR != 0 @@ -150,10 +162,12 @@ def atomic_copy(src_path: str, dest_path: str, executable: Optional[bool] = None os.chmod(dest_path_tmp, os.stat(dest_path_tmp).st_mode | stat.S_IXUSR) -def atomic_copyobj(src_fh: BytesIO, dest_path: str, length: int = 16384, executable: bool = False) -> None: +def atomic_copyobj( + src_fh: BytesIO, dest_path: str, length: int = 16384, executable: bool = False +) -> None: """Copy an open file using posix atomic creations semantics.""" with AtomicFileCreate(dest_path) as dest_path_tmp: - with open(dest_path_tmp, 'wb') as dest_path_fh: + with open(dest_path_tmp, "wb") as dest_path_fh: shutil.copyfileobj(src_fh, dest_path_fh, length=length) if executable: os.chmod(dest_path_tmp, os.stat(dest_path_tmp).st_mode | stat.S_IXUSR) @@ -179,9 +193,11 @@ def make_public_dir(in_directory: str, suggested_name: Optional[str] = None) -> return generated_dir_path except FileExistsError: pass - for i in range(4, 32 + 1): # make random uuids and truncate to lengths starting at 4 and working up to max 32 + for i in range( + 4, 32 + 1 + ): # make random uuids and truncate to lengths starting at 4 and working up to max 32 for _ in range(10): # make 10 attempts for each length - truncated_uuid: str = str(uuid.uuid4()).replace('-', '')[:i] + truncated_uuid: str = str(uuid.uuid4()).replace("-", "")[:i] generated_dir_path: str = os.path.join(in_directory, truncated_uuid) try: os.mkdir(generated_dir_path) @@ -194,6 +210,7 @@ def make_public_dir(in_directory: str, suggested_name: Optional[str] = None) -> os.chmod(this_should_never_happen, 0o777) return this_should_never_happen + def try_path(path: str, min_size: int = 100 * 1024 * 1024) -> Optional[str]: """ Try to use the given path. Return it if it exists or can be made, diff --git a/src/toil/lib/iterables.py b/src/toil/lib/iterables.py index 8dbbe12972..2ed1fca146 100644 --- a/src/toil/lib/iterables.py +++ b/src/toil/lib/iterables.py @@ -12,8 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +from collections.abc import Iterable, Iterator + # 5.14.2018: copied into Toil from https://github.com/BD2KGenomics/bd2k-python-lib -from typing import Any, Iterable, Iterator, TypeVar +from typing import Any, TypeVar IT = TypeVar("IT") @@ -102,7 +104,7 @@ def expand(x): try: i = x.__iter__() except AttributeError: - i = x, + i = (x,) else: i = x return i diff --git a/src/toil/lib/memoize.py b/src/toil/lib/memoize.py index e7e1ec21a4..5841d77f0d 100644 --- a/src/toil/lib/memoize.py +++ b/src/toil/lib/memoize.py @@ -17,7 +17,7 @@ import re from functools import lru_cache, wraps from threading import Lock -from typing import Any, Callable, Dict, Tuple, TypeVar +from typing import Any, Callable, TypeVar memoize = lru_cache(maxsize=None) """ @@ -31,13 +31,14 @@ MAT = TypeVar("MAT") MRT = TypeVar("MRT") + def sync_memoize(f: Callable[[MAT], MRT]) -> Callable[[MAT], MRT]: """ Like memoize, but guarantees that decorated function is only called once, even when multiple threads are calling the decorating function with multiple parameters. """ # TODO: Think about an f that is recursive - memory: Dict[Tuple[Any, ...], Any] = {} + memory: dict[tuple[Any, ...], Any] = {} lock = Lock() @wraps(f) @@ -53,13 +54,14 @@ def new_f(*args: Any) -> Any: r = f(*args) memory[args] = r return r + return new_f def parse_iso_utc(s: str) -> datetime.datetime: """ Parses an ISO time with a hard-coded Z for zulu-time (UTC) at the end. Other timezones are - not supported. Returns a timezone-naive datetime object. + not supported. Returns a timezone-naive datetime object. :param s: The ISO-formatted time @@ -74,20 +76,22 @@ def parse_iso_utc(s: str) -> datetime.datetime: ... ValueError: Not a valid ISO datetime in UTC: 2016-04-27T00:28:04X """ - rfc3339_datetime = re.compile(r'^(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})(?:\.(\d+))?(Z|[+-]\d{2}:\d{2})$') + rfc3339_datetime = re.compile( + r"^(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})(?:\.(\d+))?(Z|[+-]\d{2}:\d{2})$" + ) m = rfc3339_datetime.match(s) if not m: - raise ValueError(f'Not a valid ISO datetime in UTC: {s}') + raise ValueError(f"Not a valid ISO datetime in UTC: {s}") else: - fmt = '%Y-%m-%dT%H:%M:%S' + ('.%f' if m.group(7) else '') + 'Z' + fmt = "%Y-%m-%dT%H:%M:%S" + (".%f" if m.group(7) else "") + "Z" return datetime.datetime.strptime(s, fmt) def strict_bool(s: str) -> bool: """Variant of bool() that only accepts two possible string values.""" - if s == 'True': + if s == "True": return True - elif s == 'False': + elif s == "False": return False else: raise ValueError(s) diff --git a/src/toil/lib/misc.py b/src/toil/lib/misc.py index 486395364f..41d76a3b07 100644 --- a/src/toil/lib/misc.py +++ b/src/toil/lib/misc.py @@ -7,9 +7,9 @@ import subprocess import sys import time -import typing +from collections.abc import Iterator from contextlib import closing -from typing import Iterator, List, Optional +from typing import Optional logger = logging.getLogger(__name__) @@ -21,19 +21,20 @@ def get_public_ip() -> str: try: # Try to get the internet-facing IP by attempting a connection # to a non-existent server and reading what IP was used. - ip = '127.0.0.1' + ip = "127.0.0.1" with closing(socket.socket(socket.AF_INET, socket.SOCK_DGRAM)) as sock: # 203.0.113.0/24 is reserved as TEST-NET-3 by RFC 5737, so # there is guaranteed to be no one listening on the other # end (and we won't accidentally DOS anyone). - sock.connect(('203.0.113.1', 1)) + sock.connect(("203.0.113.1", 1)) ip = sock.getsockname()[0] return ip except: # Something went terribly wrong. Just give loopback rather # than killing everything, because this is often called just # to provide a default argument - return '127.0.0.1' + return "127.0.0.1" + def get_user_name() -> str: """ @@ -46,20 +47,23 @@ def get_user_name() -> str: except KeyError: # This is expected if the user isn't in /etc/passwd, such as in a # Docker container when running as a weird UID. Make something up. - return 'UnknownUser' + str(os.getuid()) + return "UnknownUser" + str(os.getuid()) except Exception as e: # We can't get the UID, or something weird has gone wrong. - logger.error('Unexpected error getting user name: %s', e) - return 'UnknownUser' + logger.error("Unexpected error getting user name: %s", e) + return "UnknownUser" + def utc_now() -> datetime.datetime: """Return a datetime in the UTC timezone corresponding to right now.""" return datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc) + def unix_now_ms() -> float: """Return the current time in milliseconds since the Unix epoch.""" return time.time() * 1000 + def slow_down(seconds: float) -> float: """ Toil jobs that have completed are not allowed to have taken 0 seconds, but @@ -118,12 +122,23 @@ def __str__(self) -> str: if (self.returncode < 0) or (self.stderr is None): return str(super()) else: - err = self.stderr if isinstance(self.stderr, str) else self.stderr.decode("ascii", errors="replace") + err = ( + self.stderr + if isinstance(self.stderr, str) + else self.stderr.decode("ascii", errors="replace") + ) return "Command '%s' exit status %d: %s" % (self.cmd, self.returncode, err) -def call_command(cmd: List[str], *args: str, input: Optional[str] = None, timeout: Optional[float] = None, - useCLocale: bool = True, env: Optional[typing.Dict[str, str]] = None, quiet: Optional[bool] = False) -> str: +def call_command( + cmd: list[str], + *args: str, + input: Optional[str] = None, + timeout: Optional[float] = None, + useCLocale: bool = True, + env: Optional[dict[str, str]] = None, + quiet: Optional[bool] = False +) -> str: """ Simplified calling of external commands. @@ -154,14 +169,30 @@ def call_command(cmd: List[str], *args: str, input: Optional[str] = None, timeou logger.debug("run command: {}".format(" ".join(cmd))) start_time = datetime.datetime.now() - proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, - encoding='utf-8', errors="replace", env=env) + proc = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + encoding="utf-8", + errors="replace", + env=env, + ) stdout, stderr = proc.communicate(input=input, timeout=timeout) end_time = datetime.datetime.now() runtime = (end_time - start_time).total_seconds() sys.stderr.write(stderr) if proc.returncode != 0: - logger.debug("command failed in {}s: {}: {}".format(runtime, " ".join(cmd), stderr.rstrip())) - raise CalledProcessErrorStderr(proc.returncode, cmd, output=stdout, stderr=stderr) - logger.debug("command succeeded in {}s: {}{}".format(runtime, " ".join(cmd), (': ' + stdout.rstrip()) if not quiet else '')) + logger.debug( + "command failed in {}s: {}: {}".format( + runtime, " ".join(cmd), stderr.rstrip() + ) + ) + raise CalledProcessErrorStderr( + proc.returncode, cmd, output=stdout, stderr=stderr + ) + logger.debug( + "command succeeded in {}s: {}{}".format( + runtime, " ".join(cmd), (": " + stdout.rstrip()) if not quiet else "" + ) + ) return stdout diff --git a/src/toil/lib/objects.py b/src/toil/lib/objects.py index 5db6bcef06..0ebfc121b6 100644 --- a/src/toil/lib/objects.py +++ b/src/toil/lib/objects.py @@ -126,10 +126,10 @@ def __get__(self, instance, owner): if instance is None: return self.inner_class else: - return self._bind( instance ) + return self._bind(instance) @sync_memoize - def _bind( self, _outer): + def _bind(self, _outer): class BoundInner(self.inner_class): outer = _outer diff --git a/src/toil/lib/resources.py b/src/toil/lib/resources.py index e4c3ef4380..35107dca90 100644 --- a/src/toil/lib/resources.py +++ b/src/toil/lib/resources.py @@ -12,11 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. import fnmatch -import os import math -import sys +import os import resource -from typing import List, Tuple +import sys + class ResourceMonitor: """ @@ -52,14 +52,20 @@ def record_extra_cpu(cls, seconds: float) -> None: cls._extra_cpu_seconds += seconds @classmethod - def get_total_cpu_time_and_memory_usage(cls) -> Tuple[float, int]: + def get_total_cpu_time_and_memory_usage(cls) -> tuple[float, int]: """ Gives the total cpu time of itself and all its children, and the maximum RSS memory usage of itself and its single largest child (in kibibytes). """ me = resource.getrusage(resource.RUSAGE_SELF) children = resource.getrusage(resource.RUSAGE_CHILDREN) - total_cpu_time = me.ru_utime + me.ru_stime + children.ru_utime + children.ru_stime + cls._extra_cpu_seconds + total_cpu_time = ( + me.ru_utime + + me.ru_stime + + children.ru_utime + + children.ru_stime + + cls._extra_cpu_seconds + ) total_memory_usage = me.ru_maxrss + children.ru_maxrss if sys.platform == "darwin": # On Linux, getrusage works in "kilobytes" (really kibibytes), but on @@ -74,10 +80,16 @@ def get_total_cpu_time(cls) -> float: """Gives the total cpu time, including the children.""" me = resource.getrusage(resource.RUSAGE_SELF) childs = resource.getrusage(resource.RUSAGE_CHILDREN) - return me.ru_utime + me.ru_stime + childs.ru_utime + childs.ru_stime + cls._extra_cpu_seconds + return ( + me.ru_utime + + me.ru_stime + + childs.ru_utime + + childs.ru_stime + + cls._extra_cpu_seconds + ) -def glob(glob_pattern: str, directoryname: str) -> List[str]: +def glob(glob_pattern: str, directoryname: str) -> list[str]: """ Walks through a directory and its subdirectories looking for files matching the glob_pattern and returns a list=[]. diff --git a/src/toil/lib/retry.py b/src/toil/lib/retry.py index f645724d52..5c15e8b8d6 100644 --- a/src/toil/lib/retry.py +++ b/src/toil/lib/retry.py @@ -131,35 +131,30 @@ def boto_bucket(bucket_name): import time import traceback import urllib.error +from collections.abc import Generator, Iterable, Sequence from contextlib import contextmanager -from typing import (Any, - Callable, - ContextManager, - Generator, - Iterable, - List, - Optional, - Sequence, - Tuple, - Type, - Union, TypeVar) +from typing import Any, Callable, ContextManager, Optional, TypeVar, Union import requests.exceptions import urllib3.exceptions -SUPPORTED_HTTP_ERRORS = [http.client.HTTPException, - urllib.error.HTTPError, - urllib3.exceptions.HTTPError, - requests.exceptions.HTTPError] +SUPPORTED_HTTP_ERRORS = [ + http.client.HTTPException, + urllib.error.HTTPError, + urllib3.exceptions.HTTPError, + requests.exceptions.HTTPError, +] try: import kubernetes.client.rest + SUPPORTED_HTTP_ERRORS.append(kubernetes.client.rest.ApiException) except ModuleNotFoundError: kubernetes = None try: import botocore.exceptions + SUPPORTED_HTTP_ERRORS.append(botocore.exceptions.ClientError) except ModuleNotFoundError: botocore = None @@ -175,12 +170,14 @@ class ErrorCondition: whether to retry. """ - def __init__(self, - error: Optional[Any] = None, - error_codes: List[int] = None, - boto_error_codes: List[str] = None, - error_message_must_include: str = None, - retry_on_this_condition: bool = True): + def __init__( + self, + error: Optional[Any] = None, + error_codes: list[int] = None, + boto_error_codes: list[str] = None, + error_message_must_include: str = None, + retry_on_this_condition: bool = True, + ): """ Initialize this ErrorCondition. @@ -227,12 +224,14 @@ def __init__(self, # There is a better way to type hint this with python 3.10 # https://stackoverflow.com/a/68290080 RT = TypeVar("RT") + + def retry( - intervals: Optional[List] = None, + intervals: Optional[list] = None, infinite_retries: bool = False, - errors: Optional[Sequence[Union[ErrorCondition, Type[Exception]]]] = None, - log_message: Optional[Tuple[Callable, str]] = None, - prepare: Optional[List[Callable]] = None, + errors: Optional[Sequence[Union[ErrorCondition, type[Exception]]]] = None, + log_message: Optional[tuple[Callable, str]] = None, + prepare: Optional[list[Callable]] = None, ) -> Callable[[Callable[..., RT]], Callable[..., RT]]: """ Retry a function if it fails with any Exception defined in "errors". @@ -266,7 +265,9 @@ def retry( errors = errors if errors else [Exception] error_conditions = {error for error in errors if isinstance(error, ErrorCondition)} - retriable_errors = {error for error in errors if not isinstance(error, ErrorCondition)} + retriable_errors = { + error for error in errors if not isinstance(error, ErrorCondition) + } if log_message: post_message_function = log_message[0] @@ -275,7 +276,10 @@ def retry( # if a generic error exists (with no restrictions), # delete more specific error_condition instances of it for error_condition in error_conditions: - if error_condition.retry_on_this_condition and error_condition.error in retriable_errors: + if ( + error_condition.retry_on_this_condition + and error_condition.error in retriable_errors + ): error_conditions.remove(error_condition) # if a more specific error exists that isn't in the general set, @@ -306,13 +310,17 @@ def call(*args, **kwargs) -> RT: raise interval = intervals_remaining.pop(0) - logger.warning(f"Error in {func}: {e}. Retrying after {interval} s...") + logger.warning( + f"Error in {func}: {e}. Retrying after {interval} s..." + ) time.sleep(interval) if prepare is not None: for prep_function in prepare: # Reset state for next attempt prep_function(*args, **kwargs) + return call + return decorate @@ -323,17 +331,18 @@ def return_status_code(e): if botocore: if isinstance(e, botocore.exceptions.ClientError): - return e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') + return e.response.get("ResponseMetadata", {}).get("HTTPStatusCode") if isinstance(e, requests.exceptions.HTTPError): return e.response.status_code - elif isinstance(e, http.client.HTTPException) or \ - isinstance(e, urllib3.exceptions.HTTPError): + elif isinstance(e, http.client.HTTPException) or isinstance( + e, urllib3.exceptions.HTTPError + ): return e.status elif isinstance(e, urllib.error.HTTPError): return e.code else: - raise ValueError(f'Unsupported error type; cannot grok status code: {e}.') + raise ValueError(f"Unsupported error type; cannot grok status code: {e}.") def get_error_code(e: Exception) -> str: @@ -342,21 +351,21 @@ def get_error_code(e: Exception) -> str: Returns empty string for other errors. """ - if hasattr(e, 'error_code') and isinstance(e.error_code, str): + if hasattr(e, "error_code") and isinstance(e.error_code, str): # A Boto 2 error return e.error_code - if hasattr(e, 'code') and isinstance(e.code, str): + if hasattr(e, "code") and isinstance(e.code, str): # A (different?) Boto 2 error return e.code - elif hasattr(e, 'response') and hasattr(e.response, 'get'): + elif hasattr(e, "response") and hasattr(e.response, "get"): # A Boto 3 error - code = e.response.get('Error', {}).get('Code') + code = e.response.get("Error", {}).get("Code") if isinstance(code, str): return code else: - return '' + return "" else: - return '' + return "" def get_error_message(e: Exception) -> str: @@ -366,18 +375,18 @@ def get_error_message(e: Exception) -> str: Note that error message conditions also check more than this; this function does not fall back to the traceback for incompatible types. """ - if hasattr(e, 'error_message') and isinstance(e.error_message, str): + if hasattr(e, "error_message") and isinstance(e.error_message, str): # A Boto 2 error return e.error_message - elif hasattr(e, 'response') and hasattr(e.response, 'get'): + elif hasattr(e, "response") and hasattr(e.response, "get"): # A Boto 3 error - message = e.response.get('Error', {}).get('Message') + message = e.response.get("Error", {}).get("Message") if isinstance(message, str): return message else: - return '' + return "" else: - return '' + return "" def get_error_status(e: Exception) -> int: @@ -391,22 +400,23 @@ def get_error_status(e: Exception) -> int: Returns 0 from other errors. """ + def numify(x): """Make sure a value is an integer.""" return int(str(x).strip()) - if hasattr(e, 'status'): + if hasattr(e, "status"): # A Boto 2 error, kubernetes.client.rest.ApiException, # http.client.HTTPException, or urllib3.exceptions.HTTPError return numify(e.status) - elif hasattr(e, 'response'): - if hasattr(e.response, 'status_code'): + elif hasattr(e, "response"): + if hasattr(e.response, "status_code"): # A requests.exceptions.HTTPError return numify(e.response.status_code) - elif hasattr(e.response, 'get'): + elif hasattr(e.response, "get"): # A Boto 3 error - return numify(e.response.get('ResponseMetadata', {}).get('HTTPStatusCode')) - elif hasattr(e, 'code'): + return numify(e.response.get("ResponseMetadata", {}).get("HTTPStatusCode")) + elif hasattr(e, "code"): # A urllib.error.HTTPError return numify(e.code) else: @@ -419,16 +429,17 @@ def get_error_body(e: Exception) -> str: Returns the code and message if the error does not have a body. """ - if hasattr(e, 'body'): + if hasattr(e, "body"): # A Boto 2 error if isinstance(e.body, bytes): # Decode the body first - return e.body.decode('utf-8') + return e.body.decode("utf-8") elif isinstance(e.body, str): return e.body # Anything else - return f'{get_error_code(e)}: {get_error_message(e)}' + return f"{get_error_code(e)}: {get_error_message(e)}" + def meets_error_message_condition(e: Exception, error_message: Optional[str]): if error_message: @@ -440,7 +451,9 @@ def meets_error_message_condition(e: Exception, error_message: Optional[str]): if isinstance(e, botocore.exceptions.ClientError): return error_message in str(e) - if isinstance(e, http.client.HTTPException) or isinstance(e, urllib3.exceptions.HTTPError): + if isinstance(e, http.client.HTTPException) or isinstance( + e, urllib3.exceptions.HTTPError + ): return error_message in e.reason elif isinstance(e, sqlite3.OperationalError): return error_message in str(e) @@ -448,7 +461,7 @@ def meets_error_message_condition(e: Exception, error_message: Optional[str]): return error_message in e.msg elif isinstance(e, requests.exceptions.HTTPError): return error_message in e.raw - elif hasattr(e, 'msg'): + elif hasattr(e, "msg"): return error_message in e.msg else: return error_message in traceback.format_exc() @@ -456,7 +469,7 @@ def meets_error_message_condition(e: Exception, error_message: Optional[str]): return True -def meets_error_code_condition(e: Exception, error_codes: Optional[List[int]]): +def meets_error_code_condition(e: Exception, error_codes: Optional[list[int]]): """These are expected to be normal HTTP error codes, like 404 or 500.""" if error_codes: status_code = get_error_status(e) @@ -465,7 +478,9 @@ def meets_error_code_condition(e: Exception, error_codes: Optional[List[int]]): return True -def meets_boto_error_code_condition(e: Exception, boto_error_codes: Optional[List[str]]): +def meets_boto_error_code_condition( + e: Exception, boto_error_codes: Optional[list[str]] +): """These are expected to be AWS's custom error aliases, like 'BucketNotFound' or 'AccessDenied'.""" if boto_error_codes: status_code = get_error_code(e) @@ -478,21 +493,37 @@ def error_meets_conditions(e, error_conditions): condition_met = False for error in error_conditions: if isinstance(e, error.error): - if error.error_codes or error.boto_error_codes or error.error_message_must_include: - error_message_condition_met = meets_error_message_condition(e, error.error_message_must_include) - error_code_condition_met = meets_error_code_condition(e, error.error_codes) - boto_error_code_condition_met = meets_boto_error_code_condition(e, error.boto_error_codes) - if error_message_condition_met and error_code_condition_met and boto_error_code_condition_met: + if ( + error.error_codes + or error.boto_error_codes + or error.error_message_must_include + ): + error_message_condition_met = meets_error_message_condition( + e, error.error_message_must_include + ) + error_code_condition_met = meets_error_code_condition( + e, error.error_codes + ) + boto_error_code_condition_met = meets_boto_error_code_condition( + e, error.boto_error_codes + ) + if ( + error_message_condition_met + and error_code_condition_met + and boto_error_code_condition_met + ): if not error.retry_on_this_condition: return False condition_met = True return condition_met + DEFAULT_DELAYS = (0, 1, 1, 4, 16, 64) DEFAULT_TIMEOUT = 300 E = TypeVar("E", bound=Exception) # so mypy understands passed through types + # TODO: Replace the use of this with retry() # The aws provisioner and jobstore need a large refactoring to be boto3 compliant, so this is # still used there to avoid the duplication of future work @@ -575,38 +606,45 @@ def old_retry( if timeout is None: timeout = DEFAULT_TIMEOUT if timeout > 0: - go = [ None ] + go = [None] @contextmanager - def repeated_attempt( delay ): + def repeated_attempt(delay): try: yield except Exception as e: - if time.time( ) + delay < expiration: - if predicate( e ): - logger.info('Got %s, trying again in %is.', e, delay) - time.sleep( delay ) + if time.time() + delay < expiration: + if predicate(e): + logger.info("Got %s, trying again in %is.", e, delay) + time.sleep(delay) else: - logger.error('Got a %s: %s which is not retriable according to %s', type(e), e, predicate) + logger.error( + "Got a %s: %s which is not retriable according to %s", + type(e), + e, + predicate, + ) raise else: - logger.error('Got %s and no time is left to retry', e) + logger.error("Got %s and no time is left to retry", e) raise else: - go.pop( ) + go.pop() - delays = iter( delays ) - expiration = time.time( ) + timeout - delay = next( delays ) + delays = iter(delays) + expiration = time.time() + timeout + delay = next(delays) while go: - yield repeated_attempt( delay ) - delay = next( delays, delay ) + yield repeated_attempt(delay) + delay = next(delays, delay) else: + @contextmanager - def single_attempt( ): + def single_attempt(): yield - yield single_attempt( ) + yield single_attempt() + # Decorator to retry tests that fail. Needs to be called with # prepare=[tearDown, setUp] if the test class has tear down and set up that diff --git a/src/toil/lib/threading.py b/src/toil/lib/threading.py index 02da76d585..27a74d7a17 100644 --- a/src/toil/lib/threading.py +++ b/src/toil/lib/threading.py @@ -25,21 +25,24 @@ import subprocess import sys import tempfile -import time import threading +import time import traceback +from collections.abc import Iterator from contextlib import contextmanager -from typing import Dict, Iterator, Optional, Union, cast +from typing import Optional, Union, cast import psutil from toil.lib.exceptions import raise_ from toil.lib.io import robust_rmtree -from toil.lib.memoize import memoize logger = logging.getLogger(__name__) -def ensure_filesystem_lockable(path: str, timeout: float = 30, hint: Optional[str] = None) -> None: + +def ensure_filesystem_lockable( + path: str, timeout: float = 30, hint: Optional[str] = None +) -> None: """ Make sure that the filesystem used at the given path is one where locks are safe to use. @@ -50,7 +53,7 @@ def ensure_filesystem_lockable(path: str, timeout: float = 30, hint: Optional[st is known to trigger bugs in the filesystem implementation. Also raises an exception if the given path does not exist, or if attempting to determine the filesystem type takes more than the timeout in seconds. - + If the filesystem type cannot be determined, does nothing. :param hint: Extra text to include in an error, if raised, telling the user @@ -63,17 +66,28 @@ def ensure_filesystem_lockable(path: str, timeout: float = 30, hint: Optional[st if platform.system() == "Linux": # We know how to find the filesystem here. - + try: # Start a child process to stat the path. See . # We really should call statfs but no bindings for it are in PyPI. - completed = subprocess.run(["stat", "-f", "-c", "%T", path], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=timeout) + completed = subprocess.run( + ["stat", "-f", "-c", "%T", path], + check=True, + capture_output=True, + timeout=timeout, + ) except subprocess.TimeoutExpired as e: # The subprocess itself is Too Slow - raise RuntimeError(f"Polling filesystem type at {path} took more than {timeout} seconds; is your filesystem working?") from e + raise RuntimeError( + f"Polling filesystem type at {path} took more than {timeout} seconds; is your filesystem working?" + ) from e except subprocess.CalledProcessError as e: # Stat didn't work. Maybe we don't have the right version of stat installed? - logger.warning("Could not determine filesystem type at %s because of: %s", path, e.stderr.decode("utf-8", errors="replace").strip()) + logger.warning( + "Could not determine filesystem type at %s because of: %s", + path, + e.stderr.decode("utf-8", errors="replace").strip(), + ) # If we don't know the filesystem type, keep going anyway. return @@ -81,19 +95,26 @@ def ensure_filesystem_lockable(path: str, timeout: float = 30, hint: Optional[st if filesystem_type == "ceph": # Ceph is known to deadlock the MDS and break the parent directory when locking. - message = [f"Refusing to use {path} because file locks are known to break {filesystem_type} filesystems."] + message = [ + f"Refusing to use {path} because file locks are known to break {filesystem_type} filesystems." + ] if hint: # Hint the user how to fix this. message.append(hint) - raise RuntimeError(' '.join(message)) + raise RuntimeError(" ".join(message)) else: # Other filesystem types are fine (even though NFS is sometimes # flaky with regard to locks actually locking anything). - logger.debug("Detected that %s has lockable filesystem type: %s", path, filesystem_type) + logger.debug( + "Detected that %s has lockable filesystem type: %s", + path, + filesystem_type, + ) # Other platforms (Mac) probably aren't mounting Ceph and also don't # usually use the same stat binary implementation. + def safe_lock(fd: int, block: bool = True, shared: bool = False) -> None: """ Get an fcntl lock, while retrying on IO errors. @@ -110,7 +131,9 @@ def safe_lock(fd: int, block: bool = True, shared: bool = False) -> None: while True: try: # Wait until we can exclusively lock it. - lock_mode = (fcntl.LOCK_SH if shared else fcntl.LOCK_EX) | (fcntl.LOCK_NB if not block else 0) + lock_mode = (fcntl.LOCK_SH if shared else fcntl.LOCK_EX) | ( + fcntl.LOCK_NB if not block else 0 + ) fcntl.flock(fd, lock_mode) return except OSError as e: @@ -123,17 +146,23 @@ def safe_lock(fd: int, block: bool = True, shared: bool = False) -> None: # TODO: Should we eventually give up if the disk really is # broken? If so we should use the retry system. if error_tries < MAX_ERROR_TRIES: - logger.error("IO error talking to lock file. Retrying after %s seconds.", error_backoff) + logger.error( + "IO error talking to lock file. Retrying after %s seconds.", + error_backoff, + ) time.sleep(error_backoff) error_backoff = min(60, error_backoff * 2) error_tries += 1 continue else: - logger.critical("Too many IO errors talking to lock file. If using Ceph, check for MDS deadlocks. See .") + logger.critical( + "Too many IO errors talking to lock file. If using Ceph, check for MDS deadlocks. See ." + ) raise else: raise + def safe_unlock_and_close(fd: int) -> None: """ Release an fcntl lock and close the file descriptor, while handling fcntl IO errors. @@ -148,6 +177,7 @@ def safe_unlock_and_close(fd: int) -> None: # locked by us. os.close(fd) + class ExceptionalThread(threading.Thread): """ A thread whose join() method re-raises exceptions raised during run(). While join() is @@ -177,6 +207,7 @@ class ExceptionalThread(threading.Thread): AssertionError """ + exc_info = None def run(self) -> None: @@ -215,7 +246,7 @@ def cpu_count() -> int: :rtype: int """ - cached = getattr(cpu_count, 'result', None) + cached = getattr(cpu_count, "result", None) if cached is not None: # We already got a CPU count. return cast(int, cached) @@ -223,13 +254,15 @@ def cpu_count() -> int: # Get the fallback answer of all the CPUs on the machine psutil_cpu_count = cast(Optional[int], psutil.cpu_count(logical=True)) if psutil_cpu_count is None: - logger.debug('Could not retrieve the logical CPU count.') + logger.debug("Could not retrieve the logical CPU count.") - total_machine_size: Union[float, int] = psutil_cpu_count if psutil_cpu_count is not None else float('inf') - logger.debug('Total machine size: %s core(s)', total_machine_size) + total_machine_size: Union[float, int] = ( + psutil_cpu_count if psutil_cpu_count is not None else float("inf") + ) + logger.debug("Total machine size: %s core(s)", total_machine_size) # cgroups may limit the size - cgroup_size: Union[float, int] = float('inf') + cgroup_size: Union[float, int] = float("inf") try: # See if we can fetch these and use them @@ -237,13 +270,13 @@ def cpu_count() -> int: period: Optional[int] = None # CGroups v1 keeps quota and period separate - CGROUP1_QUOTA_FILE = '/sys/fs/cgroup/cpu/cpu.cfs_quota_us' - CGROUP1_PERIOD_FILE = '/sys/fs/cgroup/cpu/cpu.cfs_period_us' + CGROUP1_QUOTA_FILE = "/sys/fs/cgroup/cpu/cpu.cfs_quota_us" + CGROUP1_PERIOD_FILE = "/sys/fs/cgroup/cpu/cpu.cfs_period_us" # CGroups v2 keeps both in one file, space-separated, quota first - CGROUP2_COMBINED_FILE = '/sys/fs/cgroup/cpu.max' + CGROUP2_COMBINED_FILE = "/sys/fs/cgroup/cpu.max" if os.path.exists(CGROUP1_QUOTA_FILE) and os.path.exists(CGROUP1_PERIOD_FILE): - logger.debug('CPU quota and period available from cgroups v1') + logger.debug("CPU quota and period available from cgroups v1") with open(CGROUP1_QUOTA_FILE) as stream: # Read the quota quota = int(stream.read()) @@ -252,56 +285,58 @@ def cpu_count() -> int: # Read the period in which we are allowed to burn the quota period = int(stream.read()) elif os.path.exists(CGROUP2_COMBINED_FILE): - logger.debug('CPU quota and period available from cgroups v2') + logger.debug("CPU quota and period available from cgroups v2") with open(CGROUP2_COMBINED_FILE) as stream: # Read the quota and the period together - quota, period = (int(part) for part in stream.read().split(' ')) + quota, period = (int(part) for part in stream.read().split(" ")) else: - logger.debug('CPU quota/period not available from cgroups v1 or cgroups v2') + logger.debug("CPU quota/period not available from cgroups v1 or cgroups v2") if quota is not None and period is not None: # We got a quota and a period. - logger.debug('CPU quota: %d period: %d', quota, period) + logger.debug("CPU quota: %d period: %d", quota, period) if quota == -1: # But the quota can be -1 for unset. # Assume we can use the whole machine. - cgroup_size = float('inf') + cgroup_size = float("inf") else: # The thread count is how many multiples of a wall clock period we # can burn in that period. - cgroup_size = int(math.ceil(float(quota)/float(period))) + cgroup_size = int(math.ceil(float(quota) / float(period))) - logger.debug('Control group size in cores: %s', cgroup_size) + logger.debug("Control group size in cores: %s", cgroup_size) except: # We can't actually read these cgroup fields. Maybe we are a mac or something. - logger.debug('Could not inspect cgroup: %s', traceback.format_exc()) + logger.debug("Could not inspect cgroup: %s", traceback.format_exc()) # CPU affinity may limit the size - affinity_size: Union[float, int] = float('inf') - if hasattr(os, 'sched_getaffinity'): + affinity_size: Union[float, int] = float("inf") + if hasattr(os, "sched_getaffinity"): try: - logger.debug('CPU affinity available') + logger.debug("CPU affinity available") affinity_size = len(os.sched_getaffinity(0)) - logger.debug('CPU affinity is restricted to %d cores', affinity_size) + logger.debug("CPU affinity is restricted to %d cores", affinity_size) except: - # We can't actually read this even though it exists. - logger.debug('Could not inspect scheduling affinity: %s', traceback.format_exc()) + # We can't actually read this even though it exists. + logger.debug( + "Could not inspect scheduling affinity: %s", traceback.format_exc() + ) else: - logger.debug('CPU affinity not available') + logger.debug("CPU affinity not available") - limit: Union[float, int] = float('inf') + limit: Union[float, int] = float("inf") # Apply all the limits to take the smallest limit = min(limit, total_machine_size) limit = min(limit, cgroup_size) limit = min(limit, affinity_size) - if limit < 1 or limit == float('inf'): + if limit < 1 or limit == float("inf"): # Fall back to 1 if we can't get a size limit = 1 result = int(limit) - logger.debug('cpu_count: %s', result) + logger.debug("cpu_count: %s", result) # Make sure to remember it for the next call - setattr(cpu_count, 'result', result) + setattr(cpu_count, "result", result) return result @@ -323,7 +358,8 @@ def cpu_count() -> int: current_process_name_lock = threading.Lock() # And a global dict from work directory to name in that work directory. # We also have a file descriptor per work directory but it is just leaked. -current_process_name_for: Dict[str, str] = {} +current_process_name_for: dict[str, str] = {} + def collect_process_name_garbage() -> None: """ @@ -347,6 +383,7 @@ def collect_process_name_garbage() -> None: for base_dir in missing: del current_process_name_for[base_dir] + def destroy_all_process_names() -> None: """ Delete all our process name files because our process is going away. @@ -361,9 +398,11 @@ def destroy_all_process_names() -> None: for base_dir, name in current_process_name_for.items(): robust_rmtree(os.path.join(base_dir, name)) + # Run the cleanup at exit atexit.register(destroy_all_process_names) + def get_process_name(base_dir: str) -> str: """ Return the name of the current process. Like a PID but visible between @@ -396,7 +435,9 @@ def get_process_name(base_dir: str) -> str: except OSError as e: if e.errno in (errno.EACCES, errno.EAGAIN): # Someone else locked it even though they should not have. - raise RuntimeError(f"Could not lock process name file {nameFileName}") from e + raise RuntimeError( + f"Could not lock process name file {nameFileName}" + ) from e else: # Something else is wrong raise @@ -437,7 +478,6 @@ def process_name_exists(base_dir: str, name: str) -> bool: # If the file is gone, the process can't exist. return False - nameFD = None try: try: @@ -474,6 +514,7 @@ def process_name_exists(base_dir: str, name: str) -> bool: except: pass + # Similar to the process naming system above, we define a global mutex system # for critical sections, based just around file locks. @contextmanager @@ -496,14 +537,13 @@ def global_mutex(base_dir: str, mutex: str) -> Iterator[None]: # TODO: We don't know what CLI option controls where to put this mutex, so # we aren't very helpful if the location is bad. ensure_filesystem_lockable( - base_dir, - hint=f"Specify a different place to put the {mutex} mutex." + base_dir, hint=f"Specify a different place to put the {mutex} mutex." ) # Define a filename - lock_filename = os.path.join(base_dir, 'toil-mutex-' + mutex) + lock_filename = os.path.join(base_dir, "toil-mutex-" + mutex) - logger.debug('PID %d acquiring mutex %s', os.getpid(), lock_filename) + logger.debug("PID %d acquiring mutex %s", os.getpid(), lock_filename) # We can't just create/open and lock a file, because when we clean up # there's a race where someone can open the file before we unlink it and @@ -545,7 +585,11 @@ def global_mutex(base_dir: str, mutex: str) -> Iterator[None]: except FileNotFoundError: path_stats = None - if path_stats is None or fd_stats.st_dev != path_stats.st_dev or fd_stats.st_ino != path_stats.st_ino: + if ( + path_stats is None + or fd_stats.st_dev != path_stats.st_dev + or fd_stats.st_ino != path_stats.st_ino + ): # The file we have a lock on is not the file linked to the name (if # any). This usually happens, because before someone releases a # lock, they delete the file. Go back and contend again. TODO: This @@ -560,12 +604,12 @@ def global_mutex(base_dir: str, mutex: str) -> Iterator[None]: try: # When we have it, do the thing we are protecting. - logger.debug('PID %d now holds mutex %s', os.getpid(), lock_filename) + logger.debug("PID %d now holds mutex %s", os.getpid(), lock_filename) yield finally: # Delete it while we still own it, so we can't delete it from out from # under someone else who thinks they are holding it. - logger.debug('PID %d releasing mutex %s', os.getpid(), lock_filename) + logger.debug("PID %d releasing mutex %s", os.getpid(), lock_filename) # We have had observations in the wild of the lock file not exisiting # when we go to unlink it, causing a crash on mutex release. See @@ -583,16 +627,30 @@ def global_mutex(base_dir: str, mutex: str) -> Iterator[None]: # Check to make sure it still looks locked before we unlink. if path_stats is None: - logger.error('PID %d had mutex %s disappear while locked! Mutex system is not working!', os.getpid(), lock_filename) - elif fd_stats.st_dev != path_stats.st_dev or fd_stats.st_ino != path_stats.st_ino: - logger.error('PID %d had mutex %s get replaced while locked! Mutex system is not working!', os.getpid(), lock_filename) + logger.error( + "PID %d had mutex %s disappear while locked! Mutex system is not working!", + os.getpid(), + lock_filename, + ) + elif ( + fd_stats.st_dev != path_stats.st_dev or fd_stats.st_ino != path_stats.st_ino + ): + logger.error( + "PID %d had mutex %s get replaced while locked! Mutex system is not working!", + os.getpid(), + lock_filename, + ) if path_stats is not None: try: # Unlink the file os.unlink(lock_filename) except FileNotFoundError: - logger.error('PID %d had mutex %s disappear between stat and unlink while unlocking! Mutex system is not working!', os.getpid(), lock_filename) + logger.error( + "PID %d had mutex %s disappear between stat and unlink while unlocking! Mutex system is not working!", + os.getpid(), + lock_filename, + ) # Note that we are unlinking it and then unlocking it; a lot of people # might have opened it before we unlinked it and will wake up when they @@ -634,13 +692,13 @@ def __init__(self, base_dir: str, name: str) -> None: # We need a mutex name to allow only one process to be entering or # leaving at a time. - self.mutex = name + '-arena-lock' + self.mutex = name + "-arena-lock" # We need a way to track who is actually in, and who was in but died. # So everybody gets a locked file (again). # TODO: deduplicate with the similar logic for process names, and also # deferred functions. - self.lockfileDir = os.path.join(base_dir, name + '-arena-members') + self.lockfileDir = os.path.join(base_dir, name + "-arena-members") # When we enter the arena, we fill this in with the FD of the locked # file that represents our presence. @@ -656,7 +714,7 @@ def enter(self) -> None: You may not enter the arena again before leaving it. """ - logger.debug('Joining arena %s', self.lockfileDir) + logger.debug("Joining arena %s", self.lockfileDir) # Make sure we're not in it already. if self.lockfileName is not None or self.lockfileFD is not None: @@ -671,19 +729,23 @@ def enter(self) -> None: except FileExistsError: pass except Exception as e: - raise RuntimeError("Could not make lock file directory " + self.lockfileDir) from e + raise RuntimeError( + "Could not make lock file directory " + self.lockfileDir + ) from e # Make ourselves a file in it and lock it to prove we are alive. try: - self.lockfileFD, self.lockfileName = tempfile.mkstemp(dir=self.lockfileDir) # type: ignore + self.lockfileFD, self.lockfileName = tempfile.mkstemp(dir=self.lockfileDir) # type: ignore except Exception as e: - raise RuntimeError("Could not make lock file in " + self.lockfileDir) from e + raise RuntimeError( + "Could not make lock file in " + self.lockfileDir + ) from e # Nobody can see it yet, so lock it right away - safe_lock(self.lockfileFD) # type: ignore + safe_lock(self.lockfileFD) # type: ignore # Now we're properly in, so release the global mutex - logger.debug('Now in arena %s', self.lockfileDir) + logger.debug("Now in arena %s", self.lockfileDir) def leave(self) -> Iterator[bool]: """ @@ -703,7 +765,7 @@ def leave(self) -> Iterator[bool]: if self.lockfileName is None or self.lockfileFD is None: raise RuntimeError("This process is not in the arena.") - logger.debug('Leaving arena %s', self.lockfileDir) + logger.debug("Leaving arena %s", self.lockfileDir) with global_mutex(self.base_dir, self.mutex): # Now nobody else should also be trying to join or leave. @@ -748,17 +810,22 @@ def leave(self) -> Iterator[bool]: else: # Nothing alive was found. Nobody will come in while we hold # the global mutex, so we are the Last Process Standing. - logger.debug('We are the Last Process Standing in arena %s', self.lockfileDir) + logger.debug( + "We are the Last Process Standing in arena %s", self.lockfileDir + ) yield True try: # Delete the arena directory so as to leave nothing behind. os.rmdir(self.lockfileDir) except: - logger.warning('Could not clean up arena %s completely: %s', - self.lockfileDir, traceback.format_exc()) + logger.warning( + "Could not clean up arena %s completely: %s", + self.lockfileDir, + traceback.format_exc(), + ) # Now we're done, whether we were the last one or not, and can # release the mutex. - logger.debug('Now out of arena %s', self.lockfileDir) + logger.debug("Now out of arena %s", self.lockfileDir) diff --git a/src/toil/lib/throttle.py b/src/toil/lib/throttle.py index 0ca0b51eb6..6389052e6f 100644 --- a/src/toil/lib/throttle.py +++ b/src/toil/lib/throttle.py @@ -47,23 +47,23 @@ def throttle(self, wait: bool = True) -> bool: configured minimum interval has passed since the last time this method returned True in the current thread) or False otherwise. """ - now = time.time( ) + now = time.time() last_invocation = self.per_thread.last_invocation if last_invocation is not None: interval = now - last_invocation if interval < self.min_interval: if wait: remainder = self.min_interval - interval - time.sleep( remainder ) + time.sleep(remainder) else: return False self.per_thread.last_invocation = now return True - def __call__( self, function ): - def wrapper( *args, **kwargs ): - self.throttle( ) - return function( *args, **kwargs ) + def __call__(self, function): + def wrapper(*args, **kwargs): + self.throttle() + return function(*args, **kwargs) return wrapper @@ -146,18 +146,19 @@ class throttle: def __init__(self, min_interval: Union[int, float]) -> None: self.min_interval = min_interval - def __enter__( self ): - self.start = time.time( ) + def __enter__(self): + self.start = time.time() - def __exit__( self, exc_type, exc_val, exc_tb ): + def __exit__(self, exc_type, exc_val, exc_tb): if exc_type is None: - duration = time.time( ) - self.start + duration = time.time() - self.start remainder = self.min_interval - duration if remainder > 0: - time.sleep( remainder ) + time.sleep(remainder) - def __call__( self, function ): - def wrapper( *args, **kwargs ): + def __call__(self, function): + def wrapper(*args, **kwargs): with self: - return function( *args, **kwargs ) + return function(*args, **kwargs) + return wrapper diff --git a/src/toil/options/common.py b/src/toil/options/common.py index 845655d4bb..b32bfba1c0 100644 --- a/src/toil/options/common.py +++ b/src/toil/options/common.py @@ -1,17 +1,16 @@ +import logging import os -from argparse import ArgumentParser, Action, _AppendAction -from typing import Any, Optional, Union, Type, Callable, List, Dict, TYPE_CHECKING +from argparse import Action, ArgumentParser, _AppendAction +from typing import TYPE_CHECKING, Any, Callable, Optional, Union from configargparse import SUPPRESS -import logging - from ruamel.yaml import YAML -from toil.lib.conversions import bytes2human, human2bytes, strtobool, opt_strtobool - from toil.batchSystems.options import add_all_batchsystem_options +from toil.lib.conversions import bytes2human, human2bytes, opt_strtobool, strtobool from toil.provisioners import parse_node_types from toil.statsAndLogging import add_logging_options + if TYPE_CHECKING: from toil.job import AcceleratorRequirement @@ -23,7 +22,8 @@ # sys.max_size on 64 bit systems is 9223372036854775807, so that 32-bit systems # use the same number -def parse_set_env(l: List[str]) -> Dict[str, Optional[str]]: + +def parse_set_env(l: list[str]) -> dict[str, Optional[str]]: """ Parse a list of strings of the form "NAME=VALUE" or just "NAME" into a dictionary. @@ -56,20 +56,20 @@ def parse_set_env(l: List[str]) -> Dict[str, Optional[str]]: v: Optional[str] = None for i in l: try: - k, v = i.split('=', 1) + k, v = i.split("=", 1) except ValueError: k, v = i, None if not k: - raise ValueError('Empty name') + raise ValueError("Empty name") d[k] = v return d -def parse_str_list(s: str) -> List[str]: +def parse_str_list(s: str) -> list[str]: return [str(x) for x in s.split(",")] -def parse_int_list(s: str) -> List[int]: +def parse_int_list(s: str) -> list[int]: return [int(x) for x in s.split(",")] @@ -91,7 +91,7 @@ def fC(minValue: float, maxValue: Optional[float] = None) -> Callable[[float], b return lambda x: minValue <= x < maxValue -def parse_accelerator_list(specs: Optional[str]) -> List['AcceleratorRequirement']: +def parse_accelerator_list(specs: Optional[str]) -> list["AcceleratorRequirement"]: """ Parse a string description of one or more accelerator requirements. """ @@ -102,20 +102,22 @@ def parse_accelerator_list(specs: Optional[str]) -> List['AcceleratorRequirement # Otherwise parse each requirement. from toil.job import parse_accelerator - return [parse_accelerator(r) for r in specs.split(',')] + return [parse_accelerator(r) for r in specs.split(",")] def parseBool(val: str) -> bool: - if val.lower() in ['true', 't', 'yes', 'y', 'on', '1']: + if val.lower() in ["true", "t", "yes", "y", "on", "1"]: return True - elif val.lower() in ['false', 'f', 'no', 'n', 'off', '0']: + elif val.lower() in ["false", "f", "no", "n", "off", "0"]: return False else: - raise RuntimeError("Could not interpret \"%s\" as a boolean value" % val) + raise RuntimeError('Could not interpret "%s" as a boolean value' % val) # This is kept in the outer scope as multiple batchsystem files use this -def make_open_interval_action(min: Union[int, float], max: Optional[Union[int, float]] = None) -> Type[Action]: +def make_open_interval_action( + min: Union[int, float], max: Optional[Union[int, float]] = None +) -> type[Action]: """ Returns an argparse action class to check if the input is within the given half-open interval. ex: @@ -128,7 +130,9 @@ def make_open_interval_action(min: Union[int, float], max: Optional[Union[int, f """ class IntOrFloatOpenAction(Action): - def __call__(self, parser: Any, namespace: Any, values: Any, option_string: Any = None) -> None: + def __call__( + self, parser: Any, namespace: Any, values: Any, option_string: Any = None + ) -> None: if isinstance(min, int): if max is not None: # for mypy assert isinstance(max, int) @@ -146,7 +150,9 @@ def __call__(self, parser: Any, namespace: Any, values: Any, option_string: Any f"{option_string} ({values}) must be at least {min} and strictly less than {max})" ) except AssertionError: - raise RuntimeError(f"The {option_string} option has an invalid value: {values}") + raise RuntimeError( + f"The {option_string} option has an invalid value: {values}" + ) setattr(namespace, self.dest, values) return IntOrFloatOpenAction @@ -164,8 +170,9 @@ def parse_jobstore(jobstore_uri: str) -> str: :return: URI of the jobstore """ from toil.common import Toil + name, rest = Toil.parseLocator(jobstore_uri) - if name == 'file': + if name == "file": # We need to resolve relative paths early, on the leader, because the worker process # may have a different working directory than the leader, e.g. under Mesos. return Toil.buildLocator(name, os.path.abspath(rest)) @@ -173,22 +180,26 @@ def parse_jobstore(jobstore_uri: str) -> str: return jobstore_uri -JOBSTORE_HELP = ("The location of the job store for the workflow. " - "A job store holds persistent information about the jobs, stats, and files in a " - "workflow. If the workflow is run with a distributed batch system, the job " - "store must be accessible by all worker nodes. Depending on the desired " - "job store implementation, the location should be formatted according to " - "one of the following schemes:\n\n" - "file: where points to a directory on the file system\n\n" - "aws:: where is the name of an AWS region like " - "us-west-2 and will be prepended to the names of any top-level " - "AWS resources in use by job store, e.g. S3 buckets.\n\n " - "google:: TODO: explain\n\n" - "For backwards compatibility, you may also specify ./foo (equivalent to " - "file:./foo or just file:foo) or /bar (equivalent to file:/bar).") - - -def add_base_toil_options(parser: ArgumentParser, jobstore_as_flag: bool = False, cwl: bool = False) -> None: +JOBSTORE_HELP = ( + "The location of the job store for the workflow. " + "A job store holds persistent information about the jobs, stats, and files in a " + "workflow. If the workflow is run with a distributed batch system, the job " + "store must be accessible by all worker nodes. Depending on the desired " + "job store implementation, the location should be formatted according to " + "one of the following schemes:\n\n" + "file: where points to a directory on the file system\n\n" + "aws:: where is the name of an AWS region like " + "us-west-2 and will be prepended to the names of any top-level " + "AWS resources in use by job store, e.g. S3 buckets.\n\n " + "google:: TODO: explain\n\n" + "For backwards compatibility, you may also specify ./foo (equivalent to " + "file:./foo or just file:foo) or /bar (equivalent to file:/bar)." +) + + +def add_base_toil_options( + parser: ArgumentParser, jobstore_as_flag: bool = False, cwl: bool = False +) -> None: """ Add base Toil command line options to the parser. :param parser: Argument parser to add options to @@ -203,8 +214,14 @@ def add_base_toil_options(parser: ArgumentParser, jobstore_as_flag: bool = False # If using argparse instead of configargparse, this should just not parse when calling parse_args() # default config value is set to none as defaults should already be populated at config init - config.add_argument('--config', dest='config', is_config_file_arg=True, default=None, metavar="PATH", - help="Get options from a config file.") + config.add_argument( + "--config", + dest="config", + is_config_file_arg=True, + default=None, + metavar="PATH", + help="Get options from a config file.", + ) add_logging_options(parser) parser.register("type", "bool", parseBool) # Custom type for arg=True/False. @@ -213,30 +230,42 @@ def add_base_toil_options(parser: ArgumentParser, jobstore_as_flag: bool = False core_options = parser.add_argument_group( title="Toil core options", description="Options to specify the location of the Toil workflow and " - "turn on stats collation about the performance of jobs." + "turn on stats collation about the performance of jobs.", ) if jobstore_as_flag: - core_options.add_argument('--jobstore', '--jobStore', dest='jobStore', type=parse_jobstore, default=None, - help=JOBSTORE_HELP) + core_options.add_argument( + "--jobstore", + "--jobStore", + dest="jobStore", + type=parse_jobstore, + default=None, + help=JOBSTORE_HELP, + ) else: - core_options.add_argument('jobStore', type=parse_jobstore, help=JOBSTORE_HELP) + core_options.add_argument("jobStore", type=parse_jobstore, help=JOBSTORE_HELP) class WorkDirAction(Action): """ Argparse action class to check that the provided --workDir exists """ - def __call__(self, parser: Any, namespace: Any, values: Any, option_string: Any = None) -> None: + def __call__( + self, parser: Any, namespace: Any, values: Any, option_string: Any = None + ) -> None: workDir = values if workDir is not None: workDir = os.path.abspath(workDir) if not os.path.exists(workDir): - raise RuntimeError(f"The path provided to --workDir ({workDir}) does not exist.") + raise RuntimeError( + f"The path provided to --workDir ({workDir}) does not exist." + ) if len(workDir) > 80: - logger.warning(f'Length of workDir path "{workDir}" is {len(workDir)} characters. ' - f'Consider setting a shorter path with --workPath or setting TMPDIR to something ' - f'like "/tmp" to avoid overly long paths.') + logger.warning( + f'Length of workDir path "{workDir}" is {len(workDir)} characters. ' + f"Consider setting a shorter path with --workPath or setting TMPDIR to something " + f'like "/tmp" to avoid overly long paths.' + ) setattr(namespace, self.dest, workDir) class CoordinationDirAction(Action): @@ -244,16 +273,21 @@ class CoordinationDirAction(Action): Argparse action class to check that the provided --coordinationDir exists """ - def __call__(self, parser: Any, namespace: Any, values: Any, option_string: Any = None) -> None: + def __call__( + self, parser: Any, namespace: Any, values: Any, option_string: Any = None + ) -> None: coordination_dir = values if coordination_dir is not None: coordination_dir = os.path.abspath(coordination_dir) if not os.path.exists(coordination_dir): raise RuntimeError( - f"The path provided to --coordinationDir ({coordination_dir}) does not exist.") + f"The path provided to --coordinationDir ({coordination_dir}) does not exist." + ) setattr(namespace, self.dest, coordination_dir) - def make_closed_interval_action(min: Union[int, float], max: Optional[Union[int, float]] = None) -> Type[Action]: + def make_closed_interval_action( + min: Union[int, float], max: Optional[Union[int, float]] = None + ) -> type[Action]: """ Returns an argparse action class to check if the input is within the given half-open interval. ex: @@ -265,7 +299,13 @@ def make_closed_interval_action(min: Union[int, float], max: Optional[Union[int, """ class ClosedIntOrFloatAction(Action): - def __call__(self, parser: Any, namespace: Any, values: Any, option_string: Any = None) -> None: + def __call__( + self, + parser: Any, + namespace: Any, + values: Any, + option_string: Any = None, + ) -> None: def is_within(x: Union[int, float]) -> bool: if max is None: return min <= x @@ -275,137 +315,221 @@ def is_within(x: Union[int, float]) -> bool: try: if not is_within(values): raise parser.error( - f"{option_string} ({values}) must be within the range: [{min}, {'infinity' if max is None else max}]") + f"{option_string} ({values}) must be within the range: [{min}, {'infinity' if max is None else max}]" + ) except AssertionError: - raise RuntimeError(f"The {option_string} option has an invalid value: {values}") + raise RuntimeError( + f"The {option_string} option has an invalid value: {values}" + ) setattr(namespace, self.dest, values) return ClosedIntOrFloatAction - core_options.add_argument("--workDir", dest="workDir", default=None, env_var="TOIL_WORKDIR", action=WorkDirAction, - metavar="PATH", - help="Absolute path to directory where temporary files generated during the Toil " - "run should be placed. Standard output and error from batch system jobs " - "(unless --noStdOutErr is set) will be placed in this directory. A cache directory " - "may be placed in this directory. Temp files and folders will be placed in a " - "directory toil- within workDir. The workflowID is generated by " - "Toil and will be reported in the workflow logs. Default is determined by the " - "variables (TMPDIR, TEMP, TMP) via mkdtemp. This directory needs to exist on " - "all machines running jobs; if capturing standard output and error from batch " - "system jobs is desired, it will generally need to be on a shared file system. " - "When sharing a cache between containers on a host, this directory must be " - "shared between the containers.") - core_options.add_argument("--coordinationDir", dest="coordination_dir", default=None, - env_var="TOIL_COORDINATION_DIR", action=CoordinationDirAction, metavar="PATH", - help="Absolute path to directory where Toil will keep state and lock files. " - "When sharing a cache between containers on a host, this directory must be " - "shared between the containers.") - core_options.add_argument("--noStdOutErr", dest="noStdOutErr", default=False, action="store_true", - help="Do not capture standard output and error from batch system jobs.") - core_options.add_argument("--stats", dest="stats", default=False, action="store_true", - help="Records statistics about the toil workflow to be used by 'toil stats'.") - clean_choices = ['always', 'onError', 'never', 'onSuccess'] - core_options.add_argument("--clean", dest="clean", choices=clean_choices, default="onSuccess", - help=f"Determines the deletion of the jobStore upon completion of the program. " - f"Choices: {clean_choices}. The --stats option requires information from the " - f"jobStore upon completion so the jobStore will never be deleted with that flag. " - f"If you wish to be able to restart the run, choose \'never\' or \'onSuccess\'. " - f"Default is \'never\' if stats is enabled, and \'onSuccess\' otherwise.") - core_options.add_argument("--cleanWorkDir", dest="cleanWorkDir", choices=clean_choices, default='always', - help=f"Determines deletion of temporary worker directory upon completion of a job. " - f"Choices: {clean_choices}. Default = always. WARNING: This option should be " - f"changed for debugging only. Running a full pipeline with this option could " - f"fill your disk with excessive intermediate data.") - core_options.add_argument("--clusterStats", dest="clusterStats", nargs='?', action='store', default=None, - metavar="OPT_PATH", const=os.getcwd(), - help="If enabled, writes out JSON resource usage statistics to a file. " - "The default location for this file is the current working directory, but an " - "absolute path can also be passed to specify where this file should be written. " - "This options only applies when using scalable batch systems.") + core_options.add_argument( + "--workDir", + dest="workDir", + default=None, + env_var="TOIL_WORKDIR", + action=WorkDirAction, + metavar="PATH", + help="Absolute path to directory where temporary files generated during the Toil " + "run should be placed. Standard output and error from batch system jobs " + "(unless --noStdOutErr is set) will be placed in this directory. A cache directory " + "may be placed in this directory. Temp files and folders will be placed in a " + "directory toil- within workDir. The workflowID is generated by " + "Toil and will be reported in the workflow logs. Default is determined by the " + "variables (TMPDIR, TEMP, TMP) via mkdtemp. This directory needs to exist on " + "all machines running jobs; if capturing standard output and error from batch " + "system jobs is desired, it will generally need to be on a shared file system. " + "When sharing a cache between containers on a host, this directory must be " + "shared between the containers.", + ) + core_options.add_argument( + "--coordinationDir", + dest="coordination_dir", + default=None, + env_var="TOIL_COORDINATION_DIR", + action=CoordinationDirAction, + metavar="PATH", + help="Absolute path to directory where Toil will keep state and lock files. " + "When sharing a cache between containers on a host, this directory must be " + "shared between the containers.", + ) + core_options.add_argument( + "--noStdOutErr", + dest="noStdOutErr", + default=False, + action="store_true", + help="Do not capture standard output and error from batch system jobs.", + ) + core_options.add_argument( + "--stats", + dest="stats", + default=False, + action="store_true", + help="Records statistics about the toil workflow to be used by 'toil stats'.", + ) + clean_choices = ["always", "onError", "never", "onSuccess"] + core_options.add_argument( + "--clean", + dest="clean", + choices=clean_choices, + default="onSuccess", + help=f"Determines the deletion of the jobStore upon completion of the program. " + f"Choices: {clean_choices}. The --stats option requires information from the " + f"jobStore upon completion so the jobStore will never be deleted with that flag. " + f"If you wish to be able to restart the run, choose 'never' or 'onSuccess'. " + f"Default is 'never' if stats is enabled, and 'onSuccess' otherwise.", + ) + core_options.add_argument( + "--cleanWorkDir", + dest="cleanWorkDir", + choices=clean_choices, + default="always", + help=f"Determines deletion of temporary worker directory upon completion of a job. " + f"Choices: {clean_choices}. Default = always. WARNING: This option should be " + f"changed for debugging only. Running a full pipeline with this option could " + f"fill your disk with excessive intermediate data.", + ) + core_options.add_argument( + "--clusterStats", + dest="clusterStats", + nargs="?", + action="store", + default=None, + metavar="OPT_PATH", + const=os.getcwd(), + help="If enabled, writes out JSON resource usage statistics to a file. " + "The default location for this file is the current working directory, but an " + "absolute path can also be passed to specify where this file should be written. " + "This options only applies when using scalable batch systems.", + ) # Restarting the workflow options restart_options = parser.add_argument_group( title="Toil options for restarting an existing workflow", - description="Allows the restart of an existing workflow" + description="Allows the restart of an existing workflow", + ) + restart_options.add_argument( + "--restart", + dest="restart", + default=False, + action="store_true", + help="If --restart is specified then will attempt to restart existing workflow " + "at the location pointed to by the --jobStore option. Will raise an exception " + "if the workflow does not exist", ) - restart_options.add_argument("--restart", dest="restart", default=False, action="store_true", - help="If --restart is specified then will attempt to restart existing workflow " - "at the location pointed to by the --jobStore option. Will raise an exception " - "if the workflow does not exist") # Batch system options batchsystem_options = parser.add_argument_group( title="Toil options for specifying the batch system", - description="Allows the specification of the batch system." + description="Allows the specification of the batch system.", ) add_all_batchsystem_options(batchsystem_options) # File store options file_store_options = parser.add_argument_group( title="Toil options for configuring storage", - description="Allows configuring Toil's data storage." + description="Allows configuring Toil's data storage.", ) link_imports = file_store_options.add_mutually_exclusive_group() - link_imports_help = ("When using a filesystem based job store, CWL input files are by default symlinked in. " - "Setting this option to True instead copies the files into the job store, which may protect " - "them from being modified externally. When set to False, as long as caching is enabled, " - "Toil will protect the file automatically by changing the permissions to read-only. " - "default=%(default)s") - link_imports.add_argument("--symlinkImports", dest="symlinkImports", type=strtobool, default=True, - metavar="BOOL", help=link_imports_help) + link_imports_help = ( + "When using a filesystem based job store, CWL input files are by default symlinked in. " + "Setting this option to True instead copies the files into the job store, which may protect " + "them from being modified externally. When set to False, as long as caching is enabled, " + "Toil will protect the file automatically by changing the permissions to read-only. " + "default=%(default)s" + ) + link_imports.add_argument( + "--symlinkImports", + dest="symlinkImports", + type=strtobool, + default=True, + metavar="BOOL", + help=link_imports_help, + ) move_exports = file_store_options.add_mutually_exclusive_group() - move_exports_help = ('When using a filesystem based job store, output files are by default moved to the ' - 'output directory, and a symlink to the moved exported file is created at the initial ' - 'location. Setting this option to True instead copies the files into the output directory. ' - 'Applies to filesystem-based job stores only. ' - 'default=%(default)s') - move_exports.add_argument("--moveOutputs", dest="moveOutputs", type=strtobool, default=False, metavar="BOOL", - help=move_exports_help) + move_exports_help = ( + "When using a filesystem based job store, output files are by default moved to the " + "output directory, and a symlink to the moved exported file is created at the initial " + "location. Setting this option to True instead copies the files into the output directory. " + "Applies to filesystem-based job stores only. " + "default=%(default)s" + ) + move_exports.add_argument( + "--moveOutputs", + dest="moveOutputs", + type=strtobool, + default=False, + metavar="BOOL", + help=move_exports_help, + ) caching = file_store_options.add_mutually_exclusive_group() caching_help = "Enable or disable caching for your workflow, specifying this overrides default from job store" - caching.add_argument('--caching', dest='caching', type=opt_strtobool, default=None, metavar="BOOL", - help=caching_help) + caching.add_argument( + "--caching", + dest="caching", + type=opt_strtobool, + default=None, + metavar="BOOL", + help=caching_help, + ) # default is None according to PR 4299, seems to be generated at runtime - file_store_options.add_argument("--symlinkJobStoreReads", dest="symlink_job_store_reads", type=strtobool, default=True, - metavar="BOOL", - help="Allow reads and container mounts from a JobStore's shared filesystem directly " - "via symlink. default=%(default)s") + file_store_options.add_argument( + "--symlinkJobStoreReads", + dest="symlink_job_store_reads", + type=strtobool, + default=True, + metavar="BOOL", + help="Allow reads and container mounts from a JobStore's shared filesystem directly " + "via symlink. default=%(default)s", + ) # Auto scaling options autoscaling_options = parser.add_argument_group( title="Toil options for autoscaling the cluster of worker nodes", description="Allows the specification of the minimum and maximum number of nodes in an autoscaled cluster, " - "as well as parameters to control the level of provisioning." + "as well as parameters to control the level of provisioning.", ) - provisioner_choices = ['aws', 'gce', None] + provisioner_choices = ["aws", "gce", None] # TODO: Better consolidate this provisioner arg and the one in provisioners/__init__.py? - autoscaling_options.add_argument('--provisioner', '-p', dest="provisioner", choices=provisioner_choices, - default=None, - help=f"The provisioner for cluster auto-scaling. This is the main Toil " - f"'--provisioner' option, and defaults to None for running on single " - f"machine and non-auto-scaling batch systems. The currently supported " - f"choices are {provisioner_choices}. The default is %(default)s.") - autoscaling_options.add_argument('--nodeTypes', default=[], dest="nodeTypes", type=parse_node_types, - action="extend", - help="Specifies a list of comma-separated node types, each of which is " - "composed of slash-separated instance types, and an optional spot " - "bid set off by a colon, making the node type preemptible. Instance " - "types may appear in multiple node types, and the same node type " - "may appear as both preemptible and non-preemptible.\n" - "Valid argument specifying two node types:\n" - "\tc5.4xlarge/c5a.4xlarge:0.42,t2.large\n" - "Node types:\n" - "\tc5.4xlarge/c5a.4xlarge:0.42 and t2.large\n" - "Instance types:\n" - "\tc5.4xlarge, c5a.4xlarge, and t2.large\n" - "Semantics:\n" - "\tBid $0.42/hour for either c5.4xlarge or c5a.4xlarge instances,\n" - "\ttreated interchangeably, while they are available at that price,\n" - "\tand buy t2.large instances at full price.\n" - "default=%(default)s") + autoscaling_options.add_argument( + "--provisioner", + "-p", + dest="provisioner", + choices=provisioner_choices, + default=None, + help=f"The provisioner for cluster auto-scaling. This is the main Toil " + f"'--provisioner' option, and defaults to None for running on single " + f"machine and non-auto-scaling batch systems. The currently supported " + f"choices are {provisioner_choices}. The default is %(default)s.", + ) + autoscaling_options.add_argument( + "--nodeTypes", + default=[], + dest="nodeTypes", + type=parse_node_types, + action="extend", + help="Specifies a list of comma-separated node types, each of which is " + "composed of slash-separated instance types, and an optional spot " + "bid set off by a colon, making the node type preemptible. Instance " + "types may appear in multiple node types, and the same node type " + "may appear as both preemptible and non-preemptible.\n" + "Valid argument specifying two node types:\n" + "\tc5.4xlarge/c5a.4xlarge:0.42,t2.large\n" + "Node types:\n" + "\tc5.4xlarge/c5a.4xlarge:0.42 and t2.large\n" + "Instance types:\n" + "\tc5.4xlarge, c5a.4xlarge, and t2.large\n" + "Semantics:\n" + "\tBid $0.42/hour for either c5.4xlarge or c5a.4xlarge instances,\n" + "\ttreated interchangeably, while they are available at that price,\n" + "\tand buy t2.large instances at full price.\n" + "default=%(default)s", + ) class NodeExtendAction(_AppendAction): """ @@ -418,252 +542,495 @@ def __init__(self, option_strings: Any, dest: Any, **kwargs: Any): super().__init__(option_strings, dest, **kwargs) self.is_default = True - def __call__(self, parser: Any, namespace: Any, values: Any, option_string: Any = None) -> None: + def __call__( + self, parser: Any, namespace: Any, values: Any, option_string: Any = None + ) -> None: if self.is_default: setattr(namespace, self.dest, values) self.is_default = False else: super().__call__(parser, namespace, values, option_string) - autoscaling_options.add_argument('--maxNodes', default=[10], dest="maxNodes", type=parse_int_list, - action=NodeExtendAction, metavar="INT[,INT...]", - help=f"Maximum number of nodes of each type in the cluster, if using autoscaling, " - f"provided as a comma-separated list. The first value is used as a default " - f"if the list length is less than the number of nodeTypes. " - f"default=%(default)s") - autoscaling_options.add_argument('--minNodes', default=[0], dest="minNodes", type=parse_int_list, - action=NodeExtendAction, metavar="INT[,INT...]", - help="Mininum number of nodes of each type in the cluster, if using " - "auto-scaling. This should be provided as a comma-separated list of the " - "same length as the list of node types. default=%(default)s") - autoscaling_options.add_argument("--targetTime", dest="targetTime", default=defaultTargetTime, type=int, - action=make_closed_interval_action(0), metavar="INT", - help=f"Sets how rapidly you aim to complete jobs in seconds. Shorter times mean " - f"more aggressive parallelization. The autoscaler attempts to scale up/down " - f"so that it expects all queued jobs will complete within targetTime " - f"seconds. default=%(default)s") - autoscaling_options.add_argument("--betaInertia", dest="betaInertia", default=0.1, type=float, - action=make_closed_interval_action(0.0, 0.9), metavar="FLOAT", - help=f"A smoothing parameter to prevent unnecessary oscillations in the number " - f"of provisioned nodes. This controls an exponentially weighted moving " - f"average of the estimated number of nodes. A value of 0.0 disables any " - f"smoothing, and a value of 0.9 will smooth so much that few changes will " - f"ever be made. Must be between 0.0 and 0.9. default=%(default)s") - autoscaling_options.add_argument("--scaleInterval", dest="scaleInterval", default=60, type=int, metavar="INT", - help=f"The interval (seconds) between assessing if the scale of " - f"the cluster needs to change. default=%(default)s") - autoscaling_options.add_argument("--preemptibleCompensation", "--preemptableCompensation", - dest="preemptibleCompensation", default=0.0, type=float, - action=make_closed_interval_action(0.0, 1.0), metavar="FLOAT", - help=f"The preference of the autoscaler to replace preemptible nodes with " - f"non-preemptible nodes, when preemptible nodes cannot be started for some " - f"reason. This value must be between 0.0 and 1.0, inclusive. " - f"A value of 0.0 disables such " - f"compensation, a value of 0.5 compensates two missing preemptible nodes " - f"with a non-preemptible one. A value of 1.0 replaces every missing " - f"pre-emptable node with a non-preemptible one. default=%(default)s") - autoscaling_options.add_argument("--nodeStorage", dest="nodeStorage", default=50, type=int, metavar="INT", - help="Specify the size of the root volume of worker nodes when they are launched " - "in gigabytes. You may want to set this if your jobs require a lot of disk " - f"space. (default=%(default)s).") - autoscaling_options.add_argument('--nodeStorageOverrides', dest="nodeStorageOverrides", default=[], - type=parse_str_list, action="extend", - metavar="NODETYPE:NODESTORAGE[,NODETYPE:NODESTORAGE...]", - help="Comma-separated list of nodeType:nodeStorage that are used to override " - "the default value from --nodeStorage for the specified nodeType(s). " - "This is useful for heterogeneous jobs where some tasks require much more " - "disk than others.") - - autoscaling_options.add_argument("--metrics", dest="metrics", default=False, type=strtobool, metavar="BOOL", - help="Enable the prometheus/grafana dashboard for monitoring CPU/RAM usage, " - "queue size, and issued jobs.") - autoscaling_options.add_argument("--assumeZeroOverhead", dest="assume_zero_overhead", default=False, - type=strtobool, metavar="BOOL", - help="Ignore scheduler and OS overhead and assume jobs can use every last byte " - "of memory and disk on a node when autoscaling.") + autoscaling_options.add_argument( + "--maxNodes", + default=[10], + dest="maxNodes", + type=parse_int_list, + action=NodeExtendAction, + metavar="INT[,INT...]", + help=f"Maximum number of nodes of each type in the cluster, if using autoscaling, " + f"provided as a comma-separated list. The first value is used as a default " + f"if the list length is less than the number of nodeTypes. " + f"default=%(default)s", + ) + autoscaling_options.add_argument( + "--minNodes", + default=[0], + dest="minNodes", + type=parse_int_list, + action=NodeExtendAction, + metavar="INT[,INT...]", + help="Mininum number of nodes of each type in the cluster, if using " + "auto-scaling. This should be provided as a comma-separated list of the " + "same length as the list of node types. default=%(default)s", + ) + autoscaling_options.add_argument( + "--targetTime", + dest="targetTime", + default=defaultTargetTime, + type=int, + action=make_closed_interval_action(0), + metavar="INT", + help=f"Sets how rapidly you aim to complete jobs in seconds. Shorter times mean " + f"more aggressive parallelization. The autoscaler attempts to scale up/down " + f"so that it expects all queued jobs will complete within targetTime " + f"seconds. default=%(default)s", + ) + autoscaling_options.add_argument( + "--betaInertia", + dest="betaInertia", + default=0.1, + type=float, + action=make_closed_interval_action(0.0, 0.9), + metavar="FLOAT", + help=f"A smoothing parameter to prevent unnecessary oscillations in the number " + f"of provisioned nodes. This controls an exponentially weighted moving " + f"average of the estimated number of nodes. A value of 0.0 disables any " + f"smoothing, and a value of 0.9 will smooth so much that few changes will " + f"ever be made. Must be between 0.0 and 0.9. default=%(default)s", + ) + autoscaling_options.add_argument( + "--scaleInterval", + dest="scaleInterval", + default=60, + type=int, + metavar="INT", + help=f"The interval (seconds) between assessing if the scale of " + f"the cluster needs to change. default=%(default)s", + ) + autoscaling_options.add_argument( + "--preemptibleCompensation", + "--preemptableCompensation", + dest="preemptibleCompensation", + default=0.0, + type=float, + action=make_closed_interval_action(0.0, 1.0), + metavar="FLOAT", + help=f"The preference of the autoscaler to replace preemptible nodes with " + f"non-preemptible nodes, when preemptible nodes cannot be started for some " + f"reason. This value must be between 0.0 and 1.0, inclusive. " + f"A value of 0.0 disables such " + f"compensation, a value of 0.5 compensates two missing preemptible nodes " + f"with a non-preemptible one. A value of 1.0 replaces every missing " + f"pre-emptable node with a non-preemptible one. default=%(default)s", + ) + autoscaling_options.add_argument( + "--nodeStorage", + dest="nodeStorage", + default=50, + type=int, + metavar="INT", + help="Specify the size of the root volume of worker nodes when they are launched " + "in gigabytes. You may want to set this if your jobs require a lot of disk " + f"space. (default=%(default)s).", + ) + autoscaling_options.add_argument( + "--nodeStorageOverrides", + dest="nodeStorageOverrides", + default=[], + type=parse_str_list, + action="extend", + metavar="NODETYPE:NODESTORAGE[,NODETYPE:NODESTORAGE...]", + help="Comma-separated list of nodeType:nodeStorage that are used to override " + "the default value from --nodeStorage for the specified nodeType(s). " + "This is useful for heterogeneous jobs where some tasks require much more " + "disk than others.", + ) + + autoscaling_options.add_argument( + "--metrics", + dest="metrics", + default=False, + type=strtobool, + metavar="BOOL", + help="Enable the prometheus/grafana dashboard for monitoring CPU/RAM usage, " + "queue size, and issued jobs.", + ) + autoscaling_options.add_argument( + "--assumeZeroOverhead", + dest="assume_zero_overhead", + default=False, + type=strtobool, + metavar="BOOL", + help="Ignore scheduler and OS overhead and assume jobs can use every last byte " + "of memory and disk on a node when autoscaling.", + ) # Parameters to limit service jobs / detect service deadlocks service_options = parser.add_argument_group( title="Toil options for limiting the number of service jobs and detecting service deadlocks", - description=SUPPRESS if cwl else "Allows the specification of the maximum number of service jobs in a cluster. " - "By keeping this limited we can avoid nodes occupied with services causing " - "deadlocks." - ) - service_options.add_argument("--maxServiceJobs", dest="maxServiceJobs", default=SYS_MAX_SIZE, type=int, - metavar="INT", - help=SUPPRESS if cwl else f"The maximum number of service jobs that can be run " - f"concurrently, excluding service jobs running on " - f"preemptible nodes. default=%(default)s") - service_options.add_argument("--maxPreemptibleServiceJobs", dest="maxPreemptibleServiceJobs", - default=SYS_MAX_SIZE, - type=int, metavar="INT", - help=SUPPRESS if cwl else "The maximum number of service jobs that can run " - "concurrently on preemptible nodes. default=%(default)s") - service_options.add_argument("--deadlockWait", dest="deadlockWait", default=60, type=int, metavar="INT", - help=SUPPRESS if cwl else f"Time, in seconds, to tolerate the workflow running only " - f"the same service jobs, with no jobs to use them, " - f"before declaring the workflow to be deadlocked and " - f"stopping. default=%(default)s") - service_options.add_argument("--deadlockCheckInterval", dest="deadlockCheckInterval", default=30, type=int, - metavar="INT", - help=SUPPRESS if cwl else "Time, in seconds, to wait between checks to see if the " - "workflow is stuck running only service jobs, with no jobs " - "to use them. Should be shorter than --deadlockWait. May " - "need to be increased if the batch system cannot enumerate " - "running jobs quickly enough, or if polling for running " - "jobs is placing an unacceptable load on a shared cluster." - f"default=%(default)s") + description=( + SUPPRESS + if cwl + else "Allows the specification of the maximum number of service jobs in a cluster. " + "By keeping this limited we can avoid nodes occupied with services causing " + "deadlocks." + ), + ) + service_options.add_argument( + "--maxServiceJobs", + dest="maxServiceJobs", + default=SYS_MAX_SIZE, + type=int, + metavar="INT", + help=( + SUPPRESS + if cwl + else f"The maximum number of service jobs that can be run " + f"concurrently, excluding service jobs running on " + f"preemptible nodes. default=%(default)s" + ), + ) + service_options.add_argument( + "--maxPreemptibleServiceJobs", + dest="maxPreemptibleServiceJobs", + default=SYS_MAX_SIZE, + type=int, + metavar="INT", + help=( + SUPPRESS + if cwl + else "The maximum number of service jobs that can run " + "concurrently on preemptible nodes. default=%(default)s" + ), + ) + service_options.add_argument( + "--deadlockWait", + dest="deadlockWait", + default=60, + type=int, + metavar="INT", + help=( + SUPPRESS + if cwl + else f"Time, in seconds, to tolerate the workflow running only " + f"the same service jobs, with no jobs to use them, " + f"before declaring the workflow to be deadlocked and " + f"stopping. default=%(default)s" + ), + ) + service_options.add_argument( + "--deadlockCheckInterval", + dest="deadlockCheckInterval", + default=30, + type=int, + metavar="INT", + help=( + SUPPRESS + if cwl + else "Time, in seconds, to wait between checks to see if the " + "workflow is stuck running only service jobs, with no jobs " + "to use them. Should be shorter than --deadlockWait. May " + "need to be increased if the batch system cannot enumerate " + "running jobs quickly enough, or if polling for running " + "jobs is placing an unacceptable load on a shared cluster." + f"default=%(default)s" + ), + ) # Resource requirements resource_options = parser.add_argument_group( title="Toil options for cores/memory requirements", description="The options to specify default cores/memory requirements (if not specified by the jobs " - "themselves), and to limit the total amount of memory/cores requested from the batch system." - ) - resource_help_msg = ('The {} amount of {} to request for a job. ' - 'Only applicable to jobs that do not specify an explicit value for this requirement. ' - '{}. ' - 'Default is {}.') - cpu_note = 'Fractions of a core (for example 0.1) are supported on some batch systems [mesos, single_machine]' - disk_mem_note = 'Standard suffixes like K, Ki, M, Mi, G or Gi are supported' + "themselves), and to limit the total amount of memory/cores requested from the batch system.", + ) + resource_help_msg = ( + "The {} amount of {} to request for a job. " + "Only applicable to jobs that do not specify an explicit value for this requirement. " + "{}. " + "Default is {}." + ) + cpu_note = "Fractions of a core (for example 0.1) are supported on some batch systems [mesos, single_machine]" + disk_mem_note = "Standard suffixes like K, Ki, M, Mi, G or Gi are supported" accelerators_note = ( - 'Each accelerator specification can have a type (gpu [default], nvidia, amd, cuda, rocm, opencl, ' - 'or a specific model like nvidia-tesla-k80), and a count [default: 1]. If both a type and a count ' - 'are used, they must be separated by a colon. If multiple types of accelerators are ' - 'used, the specifications are separated by commas') + "Each accelerator specification can have a type (gpu [default], nvidia, amd, cuda, rocm, opencl, " + "or a specific model like nvidia-tesla-k80), and a count [default: 1]. If both a type and a count " + "are used, they must be separated by a colon. If multiple types of accelerators are " + "used, the specifications are separated by commas" + ) h2b = lambda x: human2bytes(str(x)) - resource_options.add_argument('--defaultMemory', dest='defaultMemory', default="2.0 Gi", type=h2b, - action=make_open_interval_action(1), - help=resource_help_msg.format('default', 'memory', disk_mem_note, - bytes2human(2147483648))) - resource_options.add_argument('--defaultCores', dest='defaultCores', default=1, metavar='FLOAT', type=float, - action=make_open_interval_action(1.0), - help=resource_help_msg.format('default', 'cpu', cpu_note, str(1))) - resource_options.add_argument('--defaultDisk', dest='defaultDisk', default="2.0 Gi", metavar='INT', type=h2b, - action=make_open_interval_action(1), - help=resource_help_msg.format('default', 'disk', disk_mem_note, - bytes2human(2147483648))) - resource_options.add_argument('--defaultAccelerators', dest='defaultAccelerators', default=[], - metavar='ACCELERATOR[,ACCELERATOR...]', type=parse_accelerator_list, action="extend", - help=resource_help_msg.format('default', 'accelerators', accelerators_note, [])) - resource_options.add_argument('--defaultPreemptible', '--defaultPreemptable', dest='defaultPreemptible', - metavar='BOOL', - type=strtobool, nargs='?', const=True, default=False, - help='Make all jobs able to run on preemptible (spot) nodes by default.') - resource_options.add_argument('--maxCores', dest='maxCores', default=SYS_MAX_SIZE, metavar='INT', type=int, - action=make_open_interval_action(1), - help=resource_help_msg.format('max', 'cpu', cpu_note, str(SYS_MAX_SIZE))) - resource_options.add_argument('--maxMemory', dest='maxMemory', default=SYS_MAX_SIZE, metavar='INT', type=h2b, - action=make_open_interval_action(1), - help=resource_help_msg.format('max', 'memory', disk_mem_note, - bytes2human(SYS_MAX_SIZE))) - resource_options.add_argument('--maxDisk', dest='maxDisk', default=SYS_MAX_SIZE, metavar='INT', type=h2b, - action=make_open_interval_action(1), - help=resource_help_msg.format('max', 'disk', disk_mem_note, - bytes2human(SYS_MAX_SIZE))) + resource_options.add_argument( + "--defaultMemory", + dest="defaultMemory", + default="2.0 Gi", + type=h2b, + action=make_open_interval_action(1), + help=resource_help_msg.format( + "default", "memory", disk_mem_note, bytes2human(2147483648) + ), + ) + resource_options.add_argument( + "--defaultCores", + dest="defaultCores", + default=1, + metavar="FLOAT", + type=float, + action=make_open_interval_action(1.0), + help=resource_help_msg.format("default", "cpu", cpu_note, str(1)), + ) + resource_options.add_argument( + "--defaultDisk", + dest="defaultDisk", + default="2.0 Gi", + metavar="INT", + type=h2b, + action=make_open_interval_action(1), + help=resource_help_msg.format( + "default", "disk", disk_mem_note, bytes2human(2147483648) + ), + ) + resource_options.add_argument( + "--defaultAccelerators", + dest="defaultAccelerators", + default=[], + metavar="ACCELERATOR[,ACCELERATOR...]", + type=parse_accelerator_list, + action="extend", + help=resource_help_msg.format("default", "accelerators", accelerators_note, []), + ) + resource_options.add_argument( + "--defaultPreemptible", + "--defaultPreemptable", + dest="defaultPreemptible", + metavar="BOOL", + type=strtobool, + nargs="?", + const=True, + default=False, + help="Make all jobs able to run on preemptible (spot) nodes by default.", + ) + resource_options.add_argument( + "--maxCores", + dest="maxCores", + default=SYS_MAX_SIZE, + metavar="INT", + type=int, + action=make_open_interval_action(1), + help=resource_help_msg.format("max", "cpu", cpu_note, str(SYS_MAX_SIZE)), + ) + resource_options.add_argument( + "--maxMemory", + dest="maxMemory", + default=SYS_MAX_SIZE, + metavar="INT", + type=h2b, + action=make_open_interval_action(1), + help=resource_help_msg.format( + "max", "memory", disk_mem_note, bytes2human(SYS_MAX_SIZE) + ), + ) + resource_options.add_argument( + "--maxDisk", + dest="maxDisk", + default=SYS_MAX_SIZE, + metavar="INT", + type=h2b, + action=make_open_interval_action(1), + help=resource_help_msg.format( + "max", "disk", disk_mem_note, bytes2human(SYS_MAX_SIZE) + ), + ) # Retrying/rescuing jobs job_options = parser.add_argument_group( title="Toil options for rescuing/killing/restarting jobs", - description="The options for jobs that either run too long/fail or get lost (some batch systems have issues!)." - ) - job_options.add_argument("--retryCount", dest="retryCount", default=1, type=int, - action=make_open_interval_action(0), metavar="INT", - help=f"Number of times to retry a failing job before giving up and " - f"labeling job failed. default={1}") - job_options.add_argument("--enableUnlimitedPreemptibleRetries", "--enableUnlimitedPreemptableRetries", - dest="enableUnlimitedPreemptibleRetries", - type=strtobool, default=False, metavar="BOOL", - help="If set, preemptible failures (or any failure due to an instance getting " - "unexpectedly terminated) will not count towards job failures and --retryCount.") - job_options.add_argument("--doubleMem", dest="doubleMem", type=strtobool, default=False, metavar="BOOL", - help="If set, batch jobs which die to reaching memory limit on batch schedulers " - "will have their memory doubled and they will be retried. The remaining " - "retry count will be reduced by 1. Currently supported by LSF.") - job_options.add_argument("--maxJobDuration", dest="maxJobDuration", default=SYS_MAX_SIZE, type=int, - action=make_open_interval_action(1), metavar="INT", - help=f"Maximum runtime of a job (in seconds) before we kill it (this is a lower bound, " - f"and the actual time before killing the job may be longer). " - f"default=%(default)s") - job_options.add_argument("--rescueJobsFrequency", dest="rescueJobsFrequency", default=60, type=int, - action=make_open_interval_action(1), metavar="INT", - help=f"Period of time to wait (in seconds) between checking for missing/overlong jobs, " - f"that is jobs which get lost by the batch system. Expert parameter. " - f"default=%(default)s") - job_options.add_argument("--jobStoreTimeout", dest="job_store_timeout", default=30, type=float, - action=make_open_interval_action(0), metavar="FLOAT", - help=f"Maximum time (in seconds) to wait for a job's update to the job store " - f"before declaring it failed. default=%(default)s") - + description="The options for jobs that either run too long/fail or get lost (some batch systems have issues!).", + ) + job_options.add_argument( + "--retryCount", + dest="retryCount", + default=1, + type=int, + action=make_open_interval_action(0), + metavar="INT", + help=f"Number of times to retry a failing job before giving up and " + f"labeling job failed. default={1}", + ) + job_options.add_argument( + "--enableUnlimitedPreemptibleRetries", + "--enableUnlimitedPreemptableRetries", + dest="enableUnlimitedPreemptibleRetries", + type=strtobool, + default=False, + metavar="BOOL", + help="If set, preemptible failures (or any failure due to an instance getting " + "unexpectedly terminated) will not count towards job failures and --retryCount.", + ) + job_options.add_argument( + "--doubleMem", + dest="doubleMem", + type=strtobool, + default=False, + metavar="BOOL", + help="If set, batch jobs which die to reaching memory limit on batch schedulers " + "will have their memory doubled and they will be retried. The remaining " + "retry count will be reduced by 1. Currently supported by LSF.", + ) + job_options.add_argument( + "--maxJobDuration", + dest="maxJobDuration", + default=SYS_MAX_SIZE, + type=int, + action=make_open_interval_action(1), + metavar="INT", + help=f"Maximum runtime of a job (in seconds) before we kill it (this is a lower bound, " + f"and the actual time before killing the job may be longer). " + f"default=%(default)s", + ) + job_options.add_argument( + "--rescueJobsFrequency", + dest="rescueJobsFrequency", + default=60, + type=int, + action=make_open_interval_action(1), + metavar="INT", + help=f"Period of time to wait (in seconds) between checking for missing/overlong jobs, " + f"that is jobs which get lost by the batch system. Expert parameter. " + f"default=%(default)s", + ) + job_options.add_argument( + "--jobStoreTimeout", + dest="job_store_timeout", + default=30, + type=float, + action=make_open_interval_action(0), + metavar="FLOAT", + help=f"Maximum time (in seconds) to wait for a job's update to the job store " + f"before declaring it failed. default=%(default)s", + ) # Log management options log_options = parser.add_argument_group( title="Toil log management options", - description="Options for how Toil should manage its logs." - ) - log_options.add_argument("--maxLogFileSize", dest="maxLogFileSize", default=100 * 1024 * 1024, type=h2b, - action=make_open_interval_action(1), - help=f"The maximum size of a job log file to keep (in bytes), log files larger than " - f"this will be truncated to the last X bytes. Setting this option to zero will " - f"prevent any truncation. Setting this option to a negative value will truncate " - f"from the beginning. Default={bytes2human(100 * 1024 * 1024)}") - log_options.add_argument("--writeLogs", dest="writeLogs", nargs='?', action='store', default=None, - const=os.getcwd(), metavar="OPT_PATH", - help="Write worker logs received by the leader into their own files at the specified " - "path. Any non-empty standard output and error from failed batch system jobs will " - "also be written into files at this path. The current working directory will be " - "used if a path is not specified explicitly. Note: By default only the logs of " - "failed jobs are returned to leader. Set log level to 'debug' or enable " - "'--writeLogsFromAllJobs' to get logs back from successful jobs, and adjust " - "'maxLogFileSize' to control the truncation limit for worker logs.") - log_options.add_argument("--writeLogsGzip", dest="writeLogsGzip", nargs='?', action='store', default=None, - const=os.getcwd(), metavar="OPT_PATH", - help="Identical to --writeLogs except the logs files are gzipped on the leader.") - log_options.add_argument("--writeLogsFromAllJobs", dest="writeLogsFromAllJobs", type=strtobool, - default=False, metavar="BOOL", - help="Whether to write logs from all jobs (including the successful ones) without " - "necessarily setting the log level to 'debug'. Ensure that either --writeLogs " - "or --writeLogsGzip is set if enabling this option.") - log_options.add_argument("--writeMessages", dest="write_messages", default=None, - type=lambda x: None if x is None else os.path.abspath(x), metavar="PATH", - help="File to send messages from the leader's message bus to.") - log_options.add_argument("--realTimeLogging", dest="realTimeLogging", type=strtobool, default=False, metavar="BOOL", - help="Enable real-time logging from workers to leader") + description="Options for how Toil should manage its logs.", + ) + log_options.add_argument( + "--maxLogFileSize", + dest="maxLogFileSize", + default=100 * 1024 * 1024, + type=h2b, + action=make_open_interval_action(1), + help=f"The maximum size of a job log file to keep (in bytes), log files larger than " + f"this will be truncated to the last X bytes. Setting this option to zero will " + f"prevent any truncation. Setting this option to a negative value will truncate " + f"from the beginning. Default={bytes2human(100 * 1024 * 1024)}", + ) + log_options.add_argument( + "--writeLogs", + dest="writeLogs", + nargs="?", + action="store", + default=None, + const=os.getcwd(), + metavar="OPT_PATH", + help="Write worker logs received by the leader into their own files at the specified " + "path. Any non-empty standard output and error from failed batch system jobs will " + "also be written into files at this path. The current working directory will be " + "used if a path is not specified explicitly. Note: By default only the logs of " + "failed jobs are returned to leader. Set log level to 'debug' or enable " + "'--writeLogsFromAllJobs' to get logs back from successful jobs, and adjust " + "'maxLogFileSize' to control the truncation limit for worker logs.", + ) + log_options.add_argument( + "--writeLogsGzip", + dest="writeLogsGzip", + nargs="?", + action="store", + default=None, + const=os.getcwd(), + metavar="OPT_PATH", + help="Identical to --writeLogs except the logs files are gzipped on the leader.", + ) + log_options.add_argument( + "--writeLogsFromAllJobs", + dest="writeLogsFromAllJobs", + type=strtobool, + default=False, + metavar="BOOL", + help="Whether to write logs from all jobs (including the successful ones) without " + "necessarily setting the log level to 'debug'. Ensure that either --writeLogs " + "or --writeLogsGzip is set if enabling this option.", + ) + log_options.add_argument( + "--writeMessages", + dest="write_messages", + default=None, + type=lambda x: None if x is None else os.path.abspath(x), + metavar="PATH", + help="File to send messages from the leader's message bus to.", + ) + log_options.add_argument( + "--realTimeLogging", + dest="realTimeLogging", + type=strtobool, + default=False, + metavar="BOOL", + help="Enable real-time logging from workers to leader", + ) # Misc options misc_options = parser.add_argument_group( - title="Toil miscellaneous options", - description="Everything else." - ) - misc_options.add_argument('--disableChaining', dest='disableChaining', type=strtobool, default=False, - metavar="BOOL", - help="Disables chaining of jobs (chaining uses one job's resource allocation " - "for its successor job if possible).") - misc_options.add_argument("--disableJobStoreChecksumVerification", dest="disableJobStoreChecksumVerification", - default=False, type=strtobool, metavar="BOOL", - help="Disables checksum verification for files transferred to/from the job store. " - "Checksum verification is a safety check to ensure the data is not corrupted " - "during transfer. Currently only supported for non-streaming AWS files.") + title="Toil miscellaneous options", description="Everything else." + ) + misc_options.add_argument( + "--disableChaining", + dest="disableChaining", + type=strtobool, + default=False, + metavar="BOOL", + help="Disables chaining of jobs (chaining uses one job's resource allocation " + "for its successor job if possible).", + ) + misc_options.add_argument( + "--disableJobStoreChecksumVerification", + dest="disableJobStoreChecksumVerification", + default=False, + type=strtobool, + metavar="BOOL", + help="Disables checksum verification for files transferred to/from the job store. " + "Checksum verification is a safety check to ensure the data is not corrupted " + "during transfer. Currently only supported for non-streaming AWS files.", + ) class SSEKeyAction(Action): - def __call__(self, parser: Any, namespace: Any, values: Any, option_string: Any = None) -> None: + def __call__( + self, parser: Any, namespace: Any, values: Any, option_string: Any = None + ) -> None: if values is not None: sse_key = values if sse_key is None: return with open(sse_key) as f: - assert len(f.readline().rstrip()) == 32, 'SSE key appears to be invalid.' + assert ( + len(f.readline().rstrip()) == 32 + ), "SSE key appears to be invalid." setattr(namespace, self.dest, values) - misc_options.add_argument("--sseKey", dest="sseKey", default=None, action=SSEKeyAction, metavar="PATH", - help="Path to file containing 32 character key to be used for server-side encryption on " - "awsJobStore or googleJobStore. SSE will not be used if this flag is not passed.") + misc_options.add_argument( + "--sseKey", + dest="sseKey", + default=None, + action=SSEKeyAction, + metavar="PATH", + help="Path to file containing 32 character key to be used for server-side encryption on " + "awsJobStore or googleJobStore. SSE will not be used if this flag is not passed.", + ) # yaml.safe_load is being deprecated, this is the suggested workaround def yaml_safe_load(stream: Any) -> Any: - yaml = YAML(typ='safe', pure=True) + yaml = YAML(typ="safe", pure=True) d = yaml.load(stream) if isinstance(d, dict): # this means the argument was a dictionary and is valid yaml (for configargparse) @@ -677,69 +1044,129 @@ class ExtendActionDict(Action): Argparse action class to implement the action="extend" functionality on dictionaries """ - def __call__(self, parser: Any, namespace: Any, values: Any, option_string: Any = None) -> None: + def __call__( + self, parser: Any, namespace: Any, values: Any, option_string: Any = None + ) -> None: items = getattr(namespace, self.dest, None) - assert items is not None # for mypy. This should never be None, esp. if called in setEnv + assert ( + items is not None + ) # for mypy. This should never be None, esp. if called in setEnv # note: this will overwrite existing entries items.update(values) - misc_options.add_argument("--setEnv", '-e', metavar='NAME=VALUE or NAME', dest="environment", - default={}, type=yaml_safe_load, action=ExtendActionDict, - help="Set an environment variable early on in the worker. If VALUE is null, it will " - "be looked up in the current environment. Independently of this option, the worker " - "will try to emulate the leader's environment before running a job, except for " - "some variables known to vary across systems. Using this option, a variable can " - "be injected into the worker process itself before it is started.") - misc_options.add_argument("--servicePollingInterval", dest="servicePollingInterval", default=60.0, type=float, - action=make_open_interval_action(0.0), metavar="FLOAT", - help=f"Interval of time service jobs wait between polling for the existence of the " - f"keep-alive flag. Default: {60.0}") - misc_options.add_argument('--forceDockerAppliance', dest='forceDockerAppliance', type=strtobool, default=False, - metavar="BOOL", - help='Disables sanity checking the existence of the docker image specified by ' - 'TOIL_APPLIANCE_SELF, which Toil uses to provision mesos for autoscaling.') - misc_options.add_argument('--statusWait', dest='statusWait', type=int, default=3600, metavar="INT", - help="Seconds to wait between reports of running jobs.") - misc_options.add_argument('--disableProgress', dest='disableProgress', action="store_true", default=False, - help="Disables the progress bar shown when standard error is a terminal.") + misc_options.add_argument( + "--setEnv", + "-e", + metavar="NAME=VALUE or NAME", + dest="environment", + default={}, + type=yaml_safe_load, + action=ExtendActionDict, + help="Set an environment variable early on in the worker. If VALUE is null, it will " + "be looked up in the current environment. Independently of this option, the worker " + "will try to emulate the leader's environment before running a job, except for " + "some variables known to vary across systems. Using this option, a variable can " + "be injected into the worker process itself before it is started.", + ) + misc_options.add_argument( + "--servicePollingInterval", + dest="servicePollingInterval", + default=60.0, + type=float, + action=make_open_interval_action(0.0), + metavar="FLOAT", + help=f"Interval of time service jobs wait between polling for the existence of the " + f"keep-alive flag. Default: {60.0}", + ) + misc_options.add_argument( + "--forceDockerAppliance", + dest="forceDockerAppliance", + type=strtobool, + default=False, + metavar="BOOL", + help="Disables sanity checking the existence of the docker image specified by " + "TOIL_APPLIANCE_SELF, which Toil uses to provision mesos for autoscaling.", + ) + misc_options.add_argument( + "--statusWait", + dest="statusWait", + type=int, + default=3600, + metavar="INT", + help="Seconds to wait between reports of running jobs.", + ) + misc_options.add_argument( + "--disableProgress", + dest="disableProgress", + action="store_true", + default=False, + help="Disables the progress bar shown when standard error is a terminal.", + ) # Debug options debug_options = parser.add_argument_group( title="Toil debug options", - description="Debug options for finding problems or helping with testing." - ) - debug_options.add_argument("--debugWorker", dest="debugWorker", default=False, action="store_true", - help="Experimental no forking mode for local debugging. Specifically, workers " - "are not forked and stderr/stdout are not redirected to the log.") - debug_options.add_argument("--disableWorkerOutputCapture", dest="disableWorkerOutputCapture", default=False, - action="store_true", - help="Let worker output go to worker's standard out/error instead of per-job logs.") - debug_options.add_argument("--badWorker", dest="badWorker", default=0.0, type=float, - action=make_closed_interval_action(0.0, 1.0), metavar="FLOAT", - help=f"For testing purposes randomly kill --badWorker proportion of jobs using " - f"SIGKILL. default={0.0}") - debug_options.add_argument("--badWorkerFailInterval", dest="badWorkerFailInterval", default=0.01, type=float, - action=make_open_interval_action(0.0), metavar="FLOAT", # might be cyclical? - help=f"When killing the job pick uniformly within the interval from 0.0 to " - f"--badWorkerFailInterval seconds after the worker starts. " - f"default={0.01}") + description="Debug options for finding problems or helping with testing.", + ) + debug_options.add_argument( + "--debugWorker", + dest="debugWorker", + default=False, + action="store_true", + help="Experimental no forking mode for local debugging. Specifically, workers " + "are not forked and stderr/stdout are not redirected to the log.", + ) + debug_options.add_argument( + "--disableWorkerOutputCapture", + dest="disableWorkerOutputCapture", + default=False, + action="store_true", + help="Let worker output go to worker's standard out/error instead of per-job logs.", + ) + debug_options.add_argument( + "--badWorker", + dest="badWorker", + default=0.0, + type=float, + action=make_closed_interval_action(0.0, 1.0), + metavar="FLOAT", + help=f"For testing purposes randomly kill --badWorker proportion of jobs using " + f"SIGKILL. default={0.0}", + ) + debug_options.add_argument( + "--badWorkerFailInterval", + dest="badWorkerFailInterval", + default=0.01, + type=float, + action=make_open_interval_action(0.0), + metavar="FLOAT", # might be cyclical? + help=f"When killing the job pick uniformly within the interval from 0.0 to " + f"--badWorkerFailInterval seconds after the worker starts. " + f"default={0.01}", + ) # All deprecated options: # These are deprecated in favor of a simpler option # ex: noLinkImports and linkImports can be simplified into a single link_imports argument - link_imports.add_argument("--noLinkImports", dest="linkImports", action="store_false", - help=SUPPRESS) - link_imports.add_argument("--linkImports", dest="linkImports", action="store_true", - help=SUPPRESS) + link_imports.add_argument( + "--noLinkImports", dest="linkImports", action="store_false", help=SUPPRESS + ) + link_imports.add_argument( + "--linkImports", dest="linkImports", action="store_true", help=SUPPRESS + ) link_imports.set_defaults(linkImports=None) - move_exports.add_argument("--moveExports", dest="moveExports", action="store_true", - help=SUPPRESS) - move_exports.add_argument("--noMoveExports", dest="moveExports", action="store_false", - help=SUPPRESS) + move_exports.add_argument( + "--moveExports", dest="moveExports", action="store_true", help=SUPPRESS + ) + move_exports.add_argument( + "--noMoveExports", dest="moveExports", action="store_false", help=SUPPRESS + ) link_imports.set_defaults(moveExports=None) # dest is set to enableCaching to not conflict with the current --caching destination - caching.add_argument('--disableCaching', dest='enableCaching', action='store_false', help=SUPPRESS) + caching.add_argument( + "--disableCaching", dest="enableCaching", action="store_false", help=SUPPRESS + ) caching.set_defaults(enableCaching=None) diff --git a/src/toil/options/cwl.py b/src/toil/options/cwl.py index 1e5ee480b0..6304e75745 100644 --- a/src/toil/options/cwl.py +++ b/src/toil/options/cwl.py @@ -22,41 +22,59 @@ def add_cwl_options(parser: ArgumentParser, suppress: bool = True) -> None: parser.add_argument( "--enable-dev", action="store_true", - help=suppress_help or suppress_help or "Enable loading and running development versions of CWL", + help=suppress_help + or suppress_help + or "Enable loading and running development versions of CWL", ) parser.add_argument( "--enable-ext", action="store_true", - help=suppress_help or "Enable loading and running 'cwltool:' extensions to the CWL standards.", + help=suppress_help + or "Enable loading and running 'cwltool:' extensions to the CWL standards.", default=False, ) - parser.add_argument("--quiet", dest="quiet", action="store_true", default=False, help=suppress_help) - parser.add_argument("--basedir", type=str, help=suppress_help) # TODO: Might be hard-coded? + parser.add_argument( + "--quiet", dest="quiet", action="store_true", default=False, help=suppress_help + ) + parser.add_argument( + "--basedir", type=str, help=suppress_help + ) # TODO: Might be hard-coded? parser.add_argument("--outdir", type=str, default=None, help=suppress_help) - parser.add_argument("--version", action="version", version=baseVersion, - help=suppress_help or "show program's version number and exit") + parser.add_argument( + "--version", + action="version", + version=baseVersion, + help=suppress_help or "show program's version number and exit", + ) parser.add_argument( "--log-dir", type=str, default="", - help=suppress_help or "Log your tools stdout/stderr to this location outside of container", + help=suppress_help + or "Log your tools stdout/stderr to this location outside of container", ) # this is as a result of suppressed help statements not working well with mutually_exclusive_groups, which will # cause an assertion error # https://github.com/python/cpython/issues/62090 - dockergroup = parser.add_mutually_exclusive_group() if not suppress_help else parser.add_argument_group() + dockergroup = ( + parser.add_mutually_exclusive_group() + if not suppress_help + else parser.add_argument_group() + ) dockergroup.add_argument( "--user-space-docker-cmd", - help=suppress_help or "(Linux/OS X only) Specify a user space docker command (like " - "udocker or dx-docker) that will be used to call 'pull' and 'run'", + help=suppress_help + or "(Linux/OS X only) Specify a user space docker command (like " + "udocker or dx-docker) that will be used to call 'pull' and 'run'", ) dockergroup.add_argument( "--singularity", action="store_true", default=False, - help=suppress_help or "Use Singularity runtime for running containers. " - "Requires Singularity v2.6.1+ and Linux with kernel version v3.18+ or " - "with overlayfs support backported.", + help=suppress_help + or "Use Singularity runtime for running containers. " + "Requires Singularity v2.6.1+ and Linux with kernel version v3.18+ or " + "with overlayfs support backported.", ) dockergroup.add_argument( "--podman", @@ -67,20 +85,23 @@ def add_cwl_options(parser: ArgumentParser, suppress: bool = True) -> None: dockergroup.add_argument( "--no-container", action="store_true", - help=suppress_help or "Do not execute jobs in a " - "Docker container, even when `DockerRequirement` " - "is specified under `hints`.", + help=suppress_help + or "Do not execute jobs in a " + "Docker container, even when `DockerRequirement` " + "is specified under `hints`.", ) dockergroup.add_argument( "--leave-container", action="store_false", default=True, - help=suppress_help or "Do not delete Docker container used by jobs after they exit", + help=suppress_help + or "Do not delete Docker container used by jobs after they exit", dest="rm_container", ) parser.add_argument( "--custom-net", - help=suppress_help or "Specify docker network name to pass to docker run command", + help=suppress_help + or "Specify docker network name to pass to docker run command", ) cidgroup = parser.add_argument_group( "Options for recording the Docker container identifier into a file." @@ -97,7 +118,8 @@ def add_cwl_options(parser: ArgumentParser, suppress: bool = True) -> None: cidgroup.add_argument( "--cidfile-dir", type=str, - help=suppress_help or "Store the Docker container ID into a file in the specified directory.", + help=suppress_help + or "Store the Docker container ID into a file in the specified directory.", default=None, dest="cidfile_dir", ) @@ -105,9 +127,10 @@ def add_cwl_options(parser: ArgumentParser, suppress: bool = True) -> None: cidgroup.add_argument( "--cidfile-prefix", type=str, - help=suppress_help or "Specify a prefix to the container ID filename. " - "Final file name will be followed by a timestamp. " - "The default is no prefix.", + help=suppress_help + or "Specify a prefix to the container ID filename. " + "Final file name will be followed by a timestamp. " + "The default is no prefix.", default=None, dest="cidfile_prefix", ) @@ -116,15 +139,16 @@ def add_cwl_options(parser: ArgumentParser, suppress: bool = True) -> None: "--no-prepull", action="store_true", default=False, - help=suppress_help or "Do not prepull the container prior to running the workflow", + help=suppress_help + or "Do not prepull the container prior to running the workflow", ) parser.add_argument( "--preserve-environment", type=str, nargs="+", - help=suppress_help or "Preserve specified environment variables when running" - " CommandLineTools", + help=suppress_help + or "Preserve specified environment variables when running" " CommandLineTools", metavar=("VAR1 VAR2"), default=("PATH",), dest="preserve_environment", @@ -132,14 +156,29 @@ def add_cwl_options(parser: ArgumentParser, suppress: bool = True) -> None: parser.add_argument( "--preserve-entire-environment", action="store_true", - help=suppress_help or "Preserve all environment variable when running CommandLineTools.", + help=suppress_help + or "Preserve all environment variable when running CommandLineTools.", default=False, dest="preserve_entire_environment", ) - parser.add_argument("--beta-dependency-resolvers-configuration", default=None, help=suppress_help) - parser.add_argument("--beta-dependencies-directory", default=None, help=suppress_help) - parser.add_argument("--beta-use-biocontainers", default=None, action="store_true", help=suppress_help) - parser.add_argument("--beta-conda-dependencies", default=None, action="store_true", help=suppress_help) + parser.add_argument( + "--beta-dependency-resolvers-configuration", default=None, help=suppress_help + ) + parser.add_argument( + "--beta-dependencies-directory", default=None, help=suppress_help + ) + parser.add_argument( + "--beta-use-biocontainers", + default=None, + action="store_true", + help=suppress_help, + ) + parser.add_argument( + "--beta-conda-dependencies", + default=None, + action="store_true", + help=suppress_help, + ) parser.add_argument( "--tmpdir-prefix", type=str, @@ -174,32 +213,35 @@ def add_cwl_options(parser: ArgumentParser, suppress: bool = True) -> None: parser.add_argument( "--strict-memory-limit", action="store_true", - help=suppress_help or "When running with " - "software containers and the Docker engine, pass either the " - "calculated memory allocation from ResourceRequirements or the " - "default of 1 gigabyte to Docker's --memory option.", + help=suppress_help + or "When running with " + "software containers and the Docker engine, pass either the " + "calculated memory allocation from ResourceRequirements or the " + "default of 1 gigabyte to Docker's --memory option.", ) parser.add_argument( "--strict-cpu-limit", action="store_true", - help=suppress_help or "When running with " - "software containers and the Docker engine, pass either the " - "calculated cpu allocation from ResourceRequirements or the " - "default of 1 core to Docker's --cpu option. " - "Requires docker version >= v1.13.", + help=suppress_help + or "When running with " + "software containers and the Docker engine, pass either the " + "calculated cpu allocation from ResourceRequirements or the " + "default of 1 core to Docker's --cpu option. " + "Requires docker version >= v1.13.", ) parser.add_argument( "--relax-path-checks", action="store_true", default=False, - help=suppress_help or "Relax requirements on path names to permit " - "spaces and hash characters.", + help=suppress_help + or "Relax requirements on path names to permit " "spaces and hash characters.", dest="relax_path_checks", ) parser.add_argument( "--default-container", - help=suppress_help or "Specify a default docker container that will be " - "used if the workflow fails to specify one.", + help=suppress_help + or "Specify a default docker container that will be " + "used if the workflow fails to specify one.", ) parser.add_argument( "--disable-validate", @@ -216,7 +258,11 @@ def add_cwl_options(parser: ArgumentParser, suppress: bool = True) -> None: help=suppress_help or SUPPRESS, ) # same workaround as dockergroup - checkgroup = parser.add_mutually_exclusive_group() if not suppress_help else parser.add_argument_group() + checkgroup = ( + parser.add_mutually_exclusive_group() + if not suppress_help + else parser.add_argument_group() + ) checkgroup.add_argument( "--compute-checksum", action="store_true", @@ -227,14 +273,16 @@ def add_cwl_options(parser: ArgumentParser, suppress: bool = True) -> None: checkgroup.add_argument( "--no-compute-checksum", action="store_false", - help=suppress_help or "Do not compute checksum of contents while collecting outputs", + help=suppress_help + or "Do not compute checksum of contents while collecting outputs", dest="compute_checksum", ) parser.add_argument( "--eval-timeout", - help=suppress_help or "Time to wait for a Javascript expression to evaluate before giving " - "an error, default 20s.", + help=suppress_help + or "Time to wait for a Javascript expression to evaluate before giving " + "an error, default 20s.", type=float, default=20, ) @@ -249,10 +297,11 @@ def add_cwl_options(parser: ArgumentParser, suppress: bool = True) -> None: "--mpi-config-file", type=str, default=None, - help=suppress_help or "Platform specific configuration for MPI (parallel " - "launcher, its flag etc). See the cwltool README " - "section 'Running MPI-based tools' for details of the format: " - "https://github.com/common-workflow-language/cwltool#running-mpi-based-tools-that-need-to-be-launched", + help=suppress_help + or "Platform specific configuration for MPI (parallel " + "launcher, its flag etc). See the cwltool README " + "section 'Running MPI-based tools' for details of the format: " + "https://github.com/common-workflow-language/cwltool#running-mpi-based-tools-that-need-to-be-launched", ) provgroup = parser.add_argument_group( @@ -260,9 +309,10 @@ def add_cwl_options(parser: ArgumentParser, suppress: bool = True) -> None: ) provgroup.add_argument( "--provenance", - help=suppress_help or "Save provenance to specified folder as a " - "Research Object that captures and aggregates " - "workflow execution and data products.", + help=suppress_help + or "Save provenance to specified folder as a " + "Research Object that captures and aggregates " + "workflow execution and data products.", type=str, ) @@ -296,20 +346,22 @@ def add_cwl_options(parser: ArgumentParser, suppress: bool = True) -> None: ) provgroup.add_argument( "--orcid", - help=suppress_help or "Record user ORCID identifier as part of " - "provenance, e.g. https://orcid.org/0000-0002-1825-0097 " - "or 0000-0002-1825-0097. Alternatively the environment variable " - "ORCID may be set.", + help=suppress_help + or "Record user ORCID identifier as part of " + "provenance, e.g. https://orcid.org/0000-0002-1825-0097 " + "or 0000-0002-1825-0097. Alternatively the environment variable " + "ORCID may be set.", dest="orcid", default=os.environ.get("ORCID", ""), type=str, ) provgroup.add_argument( "--full-name", - help=suppress_help or "Record full name of user as part of provenance, " - "e.g. Josiah Carberry. You may need to use shell quotes to preserve " - "spaces. Alternatively the environment variable CWL_FULL_NAME may " - "be set.", + help=suppress_help + or "Record full name of user as part of provenance, " + "e.g. Josiah Carberry. You may need to use shell quotes to preserve " + "spaces. Alternatively the environment variable CWL_FULL_NAME may " + "be set.", dest="cwl_full_name", default=os.environ.get("CWL_FULL_NAME", ""), type=str, @@ -320,15 +372,17 @@ def add_cwl_options(parser: ArgumentParser, suppress: bool = True) -> None: "--bypass-file-store", action="store_true", default=False, - help=suppress_help or "Do not use Toil's file store and assume all " - "paths are accessible in place from all nodes.", + help=suppress_help + or "Do not use Toil's file store and assume all " + "paths are accessible in place from all nodes.", dest="bypass_file_store", ) parser.add_argument( "--reference-inputs", action="store_true", default=False, - help=suppress_help or "Do not copy remote inputs into Toil's file " + help=suppress_help + or "Do not copy remote inputs into Toil's file " "store and assume they are accessible in place from " "all nodes.", dest="reference_inputs", @@ -337,10 +391,15 @@ def add_cwl_options(parser: ArgumentParser, suppress: bool = True) -> None: "--disable-streaming", action="store_true", default=False, - help=suppress_help or "Disable file streaming for files that have 'streamable' flag True.", + help=suppress_help + or "Disable file streaming for files that have 'streamable' flag True.", dest="disable_streaming", ) - ram_group = parser.add_mutually_exclusive_group() if not suppress_help else parser.add_argument_group() + ram_group = ( + parser.add_mutually_exclusive_group() + if not suppress_help + else parser.add_argument_group() + ) ram_group.add_argument( "--cwl-default-ram", action="store_true", @@ -351,7 +410,8 @@ def add_cwl_options(parser: ArgumentParser, suppress: bool = True) -> None: ram_group.add_argument( "--no-cwl-default-ram", action="store_false", - help=suppress_help or "Do not apply CWL specification default ramMin, so that Toil --defaultMemory applies.", + help=suppress_help + or "Do not apply CWL specification default ramMin, so that Toil --defaultMemory applies.", dest="cwl_default_ram", ) parser.add_argument( diff --git a/src/toil/options/runner.py b/src/toil/options/runner.py index bb82cdab02..88550360d4 100644 --- a/src/toil/options/runner.py +++ b/src/toil/options/runner.py @@ -1,8 +1,11 @@ from argparse import ArgumentParser + from toil.lib.conversions import human2bytes -def add_runner_options(parser: ArgumentParser, cwl: bool = False, wdl: bool = False) -> None: +def add_runner_options( + parser: ArgumentParser, cwl: bool = False, wdl: bool = False +) -> None: """ Add to the WDL or CWL runners options that are shared or the same between runners :param parser: parser to add arguments to @@ -14,12 +17,22 @@ def add_runner_options(parser: ArgumentParser, cwl: bool = False, wdl: bool = Fa run_imports_on_workers_arguments = ["--runImportsOnWorkers"] if cwl: run_imports_on_workers_arguments.append("--run-imports-on-workers") - parser.add_argument(*run_imports_on_workers_arguments, action="store_true", default=False, dest="run_imports_on_workers", - help="Run the file imports on a worker instead of the leader. This is useful if the leader is not optimized for high network performance. " - "If set to true, the argument --importWorkersDisk must also be set.") + parser.add_argument( + *run_imports_on_workers_arguments, + action="store_true", + default=False, + dest="run_imports_on_workers", + help="Run the file imports on a worker instead of the leader. This is useful if the leader is not optimized for high network performance. " + "If set to true, the argument --importWorkersDisk must also be set." + ) import_workers_disk_arguments = ["--importWorkersDisk"] if cwl: import_workers_disk_arguments.append("--import-workers-disk") - parser.add_argument(*import_workers_disk_arguments, dest="import_workers_disk", type=lambda x: human2bytes(str(x)), default=None, - help="Specify the amount of disk space an import worker will use. If file streaming for input files is not available, " - "this should be set to the size of the largest input file. This must be set in conjunction with the argument runImportsOnWorkers.") + parser.add_argument( + *import_workers_disk_arguments, + dest="import_workers_disk", + type=lambda x: human2bytes(str(x)), + default=None, + help="Specify the amount of disk space an import worker will use. If file streaming for input files is not available, " + "this should be set to the size of the largest input file. This must be set in conjunction with the argument runImportsOnWorkers." + ) diff --git a/src/toil/options/wdl.py b/src/toil/options/wdl.py index 6c40cf0e07..064a09186d 100644 --- a/src/toil/options/wdl.py +++ b/src/toil/options/wdl.py @@ -17,26 +17,72 @@ def add_wdl_options(parser: ArgumentParser, suppress: bool = True) -> None: # this is to avoid possible duplicate options in custom toil scripts, ex outputFile can be a common argument name # TODO: Why do we even need them at all in other Toil scripts? Do we have to worry about dest= collisions? # TODO: Can the better option name be first? - output_dialect_arguments = ["--wdlOutputDialect"] + (["--outputDialect"] if not suppress else []) - parser.add_argument(*output_dialect_arguments, dest="output_dialect", type=str, default='cromwell', - choices=['cromwell', 'miniwdl'], - help=suppress_help or ("JSON output format dialect. 'cromwell' just returns the workflow's " - "output values as JSON, while 'miniwdl' nests that under an 'outputs' " - "key, and includes a 'dir' key where files are written.")) - output_directory_arguments = ["--wdlOutputDirectory"] + (["--outputDirectory", "-o"] if not suppress else []) - parser.add_argument(*output_directory_arguments, dest="output_directory", type=str, default=None, - help=suppress_help or ( - "Directory or URI prefix to save output files at. By default a new directory is created " - "in the current directory.")) - output_file_arguments = ["--wdlOutputFile"] + (["--outputFile", "-m"] if not suppress else []) - parser.add_argument(*output_file_arguments, dest="output_file", type=str, default=None, - help=suppress_help or "File or URI to save output JSON to.") - reference_inputs_arguments = ["--wdlReferenceInputs"] + (["--referenceInputs"] if not suppress else []) - parser.add_argument(*reference_inputs_arguments, dest="reference_inputs", type=strtobool, default=False, - help=suppress_help or "Pass input files by URL") + output_dialect_arguments = ["--wdlOutputDialect"] + ( + ["--outputDialect"] if not suppress else [] + ) + parser.add_argument( + *output_dialect_arguments, + dest="output_dialect", + type=str, + default="cromwell", + choices=["cromwell", "miniwdl"], + help=suppress_help + or ( + "JSON output format dialect. 'cromwell' just returns the workflow's " + "output values as JSON, while 'miniwdl' nests that under an 'outputs' " + "key, and includes a 'dir' key where files are written." + ) + ) + output_directory_arguments = ["--wdlOutputDirectory"] + ( + ["--outputDirectory", "-o"] if not suppress else [] + ) + parser.add_argument( + *output_directory_arguments, + dest="output_directory", + type=str, + default=None, + help=suppress_help + or ( + "Directory or URI prefix to save output files at. By default a new directory is created " + "in the current directory." + ) + ) + output_file_arguments = ["--wdlOutputFile"] + ( + ["--outputFile", "-m"] if not suppress else [] + ) + parser.add_argument( + *output_file_arguments, + dest="output_file", + type=str, + default=None, + help=suppress_help or "File or URI to save output JSON to." + ) + reference_inputs_arguments = ["--wdlReferenceInputs"] + ( + ["--referenceInputs"] if not suppress else [] + ) + parser.add_argument( + *reference_inputs_arguments, + dest="reference_inputs", + type=strtobool, + default=False, + help=suppress_help or "Pass input files by URL" + ) container_arguments = ["--wdlContainer"] + (["--container"] if not suppress else []) - parser.add_argument(*container_arguments, dest="container", type=str, choices=["singularity", "docker", "auto"], default="auto", - help=suppress_help or "Container engine to use to run WDL tasks") - all_call_outputs_arguments = ["--wdlAllCallOutputs"] + (["--allCallOutputs"] if not suppress else []) - parser.add_argument(*all_call_outputs_arguments, dest="all_call_outputs", type=strtobool, default=None, - help=suppress_help or "Keep and return all call outputs as workflow outputs") + parser.add_argument( + *container_arguments, + dest="container", + type=str, + choices=["singularity", "docker", "auto"], + default="auto", + help=suppress_help or "Container engine to use to run WDL tasks" + ) + all_call_outputs_arguments = ["--wdlAllCallOutputs"] + ( + ["--allCallOutputs"] if not suppress else [] + ) + parser.add_argument( + *all_call_outputs_arguments, + dest="all_call_outputs", + type=strtobool, + default=None, + help=suppress_help or "Keep and return all call outputs as workflow outputs" + ) diff --git a/src/toil/provisioners/__init__.py b/src/toil/provisioners/__init__.py index 9f93174d8c..2f2abf9e39 100644 --- a/src/toil/provisioners/__init__.py +++ b/src/toil/provisioners/__init__.py @@ -14,7 +14,7 @@ import argparse import logging from difflib import get_close_matches -from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Type, Union +from typing import TYPE_CHECKING, Optional, Union if TYPE_CHECKING: from toil.provisioners.aws.awsProvisioner import AWSProvisioner @@ -30,9 +30,9 @@ def cluster_factory( clusterType: str = "mesos", zone: Optional[str] = None, nodeStorage: int = 50, - nodeStorageOverrides: Optional[List[str]] = None, + nodeStorageOverrides: Optional[list[str]] = None, sseKey: Optional[str] = None, - enable_fuse: bool = False + enable_fuse: bool = False, ) -> Union["AWSProvisioner", "GCEProvisioner"]: """ Find and instantiate the appropriate provisioner instance to make clusters in the given cloud. @@ -46,20 +46,36 @@ def cluster_factory( :param zone: The cloud zone :return: A cluster object for the the cloud type. """ - if provisioner == 'aws': + if provisioner == "aws": try: from toil.provisioners.aws.awsProvisioner import AWSProvisioner except ImportError: - logger.error('The aws extra must be installed to use this provisioner') + logger.error("The aws extra must be installed to use this provisioner") raise - return AWSProvisioner(clusterName, clusterType, zone, nodeStorage, nodeStorageOverrides, sseKey, enable_fuse) - elif provisioner == 'gce': + return AWSProvisioner( + clusterName, + clusterType, + zone, + nodeStorage, + nodeStorageOverrides, + sseKey, + enable_fuse, + ) + elif provisioner == "gce": try: from toil.provisioners.gceProvisioner import GCEProvisioner except ImportError: - logger.error('The google extra must be installed to use this provisioner') + logger.error("The google extra must be installed to use this provisioner") raise - return GCEProvisioner(clusterName, clusterType, zone, nodeStorage, nodeStorageOverrides, sseKey, enable_fuse) + return GCEProvisioner( + clusterName, + clusterType, + zone, + nodeStorage, + nodeStorageOverrides, + sseKey, + enable_fuse, + ) else: raise RuntimeError("Invalid provisioner '%s'" % provisioner) @@ -67,22 +83,39 @@ def cluster_factory( def add_provisioner_options(parser: argparse.ArgumentParser) -> None: group = parser.add_argument_group("Provisioner Options.") - provisioner_choices = ['aws', 'gce'] + provisioner_choices = ["aws", "gce"] # TODO: Better consolidate this provisioner arg and the one in common.py? - group.add_argument('--provisioner', '-p', dest="provisioner", choices=provisioner_choices, default='aws', - help=f"The provisioner for cluster auto-scaling. This is the '--provisioner' option set for " - f"Toil utils like launch-cluster and destroy-cluster, which always require a provisioner, " - f"and so this defaults to: %(default)s. Choices: {provisioner_choices}.") - group.add_argument('-z', '--zone', dest='zone', required=False, default=None, - help="The availability zone of the leader. This parameter can also be set via the 'TOIL_X_ZONE' " - "environment variable, where X is AWS or GCE, or by the ec2_region_name parameter " - "in your .boto file, or derived from the instance metadata if using this utility on an " - "existing EC2 instance.") - group.add_argument("clusterName", help="The name that the cluster will be identifiable by. " - "Must be lowercase and may not contain the '_' character.") - - -def parse_node_types(node_type_specs: Optional[str]) -> List[Tuple[Set[str], Optional[float]]]: + group.add_argument( + "--provisioner", + "-p", + dest="provisioner", + choices=provisioner_choices, + default="aws", + help=f"The provisioner for cluster auto-scaling. This is the '--provisioner' option set for " + f"Toil utils like launch-cluster and destroy-cluster, which always require a provisioner, " + f"and so this defaults to: %(default)s. Choices: {provisioner_choices}.", + ) + group.add_argument( + "-z", + "--zone", + dest="zone", + required=False, + default=None, + help="The availability zone of the leader. This parameter can also be set via the 'TOIL_X_ZONE' " + "environment variable, where X is AWS or GCE, or by the ec2_region_name parameter " + "in your .boto file, or derived from the instance metadata if using this utility on an " + "existing EC2 instance.", + ) + group.add_argument( + "clusterName", + help="The name that the cluster will be identifiable by. " + "Must be lowercase and may not contain the '_' character.", + ) + + +def parse_node_types( + node_type_specs: Optional[str], +) -> list[tuple[set[str], Optional[float]]]: """ Parse a specification for zero or more node types. @@ -107,27 +140,33 @@ def parse_node_types(node_type_specs: Optional[str]) -> List[Tuple[Set[str], Opt if node_type_specs: # Some node types were actually specified - for node_type_spec in node_type_specs.split(','): + for node_type_spec in node_type_specs.split(","): try: # Types are comma-separated # Then we have the colon and the bid - parts = node_type_spec.split(':') + parts = node_type_spec.split(":") if len(parts) > 2: # Only one bid allowed - raise ValueError(f'Cound not parse node type "{node_type_spec}": multiple bids') + raise ValueError( + f'Cound not parse node type "{node_type_spec}": multiple bids' + ) # Instance types are slash-separated within an equivalence # class - instance_types = set(parts[0].split('/')) + instance_types = set(parts[0].split("/")) for instance_type in instance_types: - if instance_type == '': + if instance_type == "": # No empty instance types allowed - raise ValueError(f'Cound not parse node type "{node_type_spec}": empty instance type') + raise ValueError( + f'Cound not parse node type "{node_type_spec}": empty instance type' + ) # Build the node type tuple - parsed.append((instance_types, float(parts[1]) if len(parts) > 1 else None)) + parsed.append( + (instance_types, float(parts[1]) if len(parts) > 1 else None) + ) except Exception as e: if isinstance(e, ValueError): raise @@ -137,7 +176,9 @@ def parse_node_types(node_type_specs: Optional[str]) -> List[Tuple[Set[str], Opt return parsed -def check_valid_node_types(provisioner, node_types: List[Tuple[Set[str], Optional[float]]]): +def check_valid_node_types( + provisioner, node_types: list[tuple[set[str], Optional[float]]] +): """ Raises if an invalid nodeType is specified for aws or gce. @@ -148,52 +189,76 @@ def check_valid_node_types(provisioner, node_types: List[Tuple[Set[str], Optiona # check if a valid node type for aws from toil.lib.generatedEC2Lists import E2Instances, regionDict - if provisioner == 'aws': + + if provisioner == "aws": from toil.lib.aws import get_current_aws_region - current_region = get_current_aws_region() or 'us-west-2' + + current_region = get_current_aws_region() or "us-west-2" # check if instance type exists in this region for node_type in node_types: for instance_type_name in node_type[0]: if instance_type_name not in regionDict[current_region]: # They probably misspelled it and can't tell. - close = get_close_matches(instance_type_name, regionDict[current_region], 1) + close = get_close_matches( + instance_type_name, regionDict[current_region], 1 + ) if len(close) > 0: - helpText = ' Did you mean ' + close[0] + '?' + helpText = " Did you mean " + close[0] + "?" else: - helpText = '' - raise RuntimeError(f'Invalid instance type ({instance_type_name}) specified for AWS in ' - f'region: {current_region}.{helpText}') - elif provisioner == 'gce': + helpText = "" + raise RuntimeError( + f"Invalid instance type ({instance_type_name}) specified for AWS in " + f"region: {current_region}.{helpText}" + ) + elif provisioner == "gce": for node_type in node_types: for instance_type_name in node_type[0]: if instance_type_name in E2Instances: - raise RuntimeError(f"It looks like you've specified an AWS nodeType with the {provisioner} " - f"provisioner. Please specify a nodeType for {provisioner}.") + raise RuntimeError( + f"It looks like you've specified an AWS nodeType with the {provisioner} " + f"provisioner. Please specify a nodeType for {provisioner}." + ) else: raise RuntimeError(f"Invalid provisioner: {provisioner}") class NoSuchClusterException(Exception): """Indicates that the specified cluster does not exist.""" + def __init__(self, cluster_name: str) -> None: super().__init__(f"The cluster '{cluster_name}' could not be found") + class NoSuchZoneException(Exception): """Indicates that a valid zone could not be found.""" + def __init__(self) -> None: super().__init__(f"No valid zone could be found!") class ClusterTypeNotSupportedException(Exception): """Indicates that a provisioner does not support a given cluster type.""" + def __init__(self, provisioner_class, cluster_type): - super().__init__(f"The {provisioner_class} provisioner does not support making {cluster_type} clusters") + super().__init__( + f"The {provisioner_class} provisioner does not support making {cluster_type} clusters" + ) + class ClusterCombinationNotSupportedException(Exception): """Indicates that a provisioner does not support making a given type of cluster with a given architecture.""" - def __init__(self, provisioner_class: Type, cluster_type: str, architecture: str, reason: Optional[str] = None): - message = (f"The {provisioner_class} provisioner does not support making {cluster_type} clusters " - f"using nodes with the {architecture} architecture.") + + def __init__( + self, + provisioner_class: type, + cluster_type: str, + architecture: str, + reason: Optional[str] = None, + ): + message = ( + f"The {provisioner_class} provisioner does not support making {cluster_type} clusters " + f"using nodes with the {architecture} architecture." + ) if reason is not None: message += f" This is because: {reason}" super().__init__(message) diff --git a/src/toil/provisioners/abstractProvisioner.py b/src/toil/provisioners/abstractProvisioner.py index 64d54efc21..fff95c986b 100644 --- a/src/toil/provisioners/abstractProvisioner.py +++ b/src/toil/provisioners/abstractProvisioner.py @@ -20,7 +20,7 @@ import textwrap from abc import ABC, abstractmethod from functools import total_ordering -from typing import Any, Dict, List, Optional, Set, Tuple, Union +from typing import Any, Optional, Union from urllib.parse import quote from uuid import uuid4 @@ -55,6 +55,7 @@ class Shape: The memory and disk attributes store the number of bytes required by a job (or provided by a node) in RAM or on disk (SSD or HDD), respectively. """ + def __init__( self, wallTime: Union[int, float], @@ -70,11 +71,13 @@ def __init__( self.preemptible = preemptible def __eq__(self, other: Any) -> bool: - return (self.wallTime == other.wallTime and - self.memory == other.memory and - self.cores == other.cores and - self.disk == other.disk and - self.preemptible == other.preemptible) + return ( + self.wallTime == other.wallTime + and self.memory == other.memory + and self.cores == other.cores + and self.disk == other.disk + and self.preemptible == other.preemptible + ) def greater_than(self, other: Any) -> bool: if self.preemptible < other.preemptible: @@ -104,12 +107,13 @@ def __gt__(self, other: Any) -> bool: return self.greater_than(other) def __repr__(self) -> str: - return "Shape(wallTime=%s, memory=%s, cores=%s, disk=%s, preemptible=%s)" % \ - (self.wallTime, - self.memory, - self.cores, - self.disk, - self.preemptible) + return "Shape(wallTime=%s, memory=%s, cores=%s, disk=%s, preemptible=%s)" % ( + self.wallTime, + self.memory, + self.cores, + self.disk, + self.preemptible, + ) def __str__(self) -> str: return self.__repr__() @@ -117,17 +121,14 @@ def __str__(self) -> str: def __hash__(self) -> int: # Since we replaced __eq__ we need to replace __hash__ as well. return hash( - (self.wallTime, - self.memory, - self.cores, - self.disk, - self.preemptible)) + (self.wallTime, self.memory, self.cores, self.disk, self.preemptible) + ) class AbstractProvisioner(ABC): """Interface for provisioning worker nodes to use in a Toil cluster.""" - LEADER_HOME_DIR = '/root/' # home directory in the Toil appliance on an instance + LEADER_HOME_DIR = "/root/" # home directory in the Toil appliance on an instance cloud: str = None def __init__( @@ -136,8 +137,8 @@ def __init__( clusterType: Optional[str] = "mesos", zone: Optional[str] = None, nodeStorage: int = 50, - nodeStorageOverrides: Optional[List[str]] = None, - enable_fuse: bool = False + nodeStorageOverrides: Optional[list[str]] = None, + enable_fuse: bool = False, ) -> None: """ Initialize provisioner. @@ -161,7 +162,7 @@ def __init__( self._nodeStorage = nodeStorage self._nodeStorageOverrides = {} for override in nodeStorageOverrides or []: - nodeShape, storageOverride = override.split(':') + nodeShape, storageOverride = override.split(":") self._nodeStorageOverrides[nodeShape] = int(storageOverride) self._leaderPrivateIP: Optional[str] = None # This will hold an SSH public key for Mesos clusters, or the @@ -179,7 +180,7 @@ def __init__( self.readClusterSettings() @abstractmethod - def supportedClusterTypes(self) -> Set[str]: + def supportedClusterTypes(self) -> set[str]: """ Get all the cluster types that this provisioner implementation supports. @@ -245,12 +246,14 @@ def _setLeaderWorkerAuthentication(self, leader: Node = None): :param leader: Node to pull credentials from, if not the current machine. """ - if self.clusterType == 'mesos': + if self.clusterType == "mesos": # We're using a Mesos cluster, so set up SSH from leader to workers. self._leaderWorkerAuthentication = self._setSSH(leader=leader) - elif self.clusterType == 'kubernetes': + elif self.clusterType == "kubernetes": # We're using a Kubernetes cluster. - self._leaderWorkerAuthentication = self._getKubernetesJoiningInfo(leader=leader) + self._leaderWorkerAuthentication = self._getKubernetesJoiningInfo( + leader=leader + ) def _clearLeaderWorkerAuthentication(self): """ @@ -277,16 +280,22 @@ def _setSSH(self, leader: Node = None) -> str: # To work locally or remotely we need to do all our setup work as one # big bash -c - command = ['bash', '-c', ('set -e; if [ ! -e /root/.sshSuccess ] ; ' - 'then ssh-keygen -f /root/.ssh/id_rsa -t rsa -N ""; ' - 'touch /root/.sshSuccess; fi; chmod 700 /root/.ssh;')] + command = [ + "bash", + "-c", + ( + "set -e; if [ ! -e /root/.sshSuccess ] ; " + 'then ssh-keygen -f /root/.ssh/id_rsa -t rsa -N ""; ' + "touch /root/.sshSuccess; fi; chmod 700 /root/.ssh;" + ), + ] if leader is None: # Run locally subprocess.check_call(command) # Grab from local file - with open('/root/.ssh/id_rsa.pub') as f: + with open("/root/.ssh/id_rsa.pub") as f: leaderPublicKey = f.read() else: # Run remotely @@ -294,20 +303,20 @@ def _setSSH(self, leader: Node = None) -> str: # Grab from remote file with tempfile.TemporaryDirectory() as tmpdir: - localFile = os.path.join(tmpdir, 'id_rsa.pub') - leader.extractFile('/root/.ssh/id_rsa.pub', localFile, 'toil_leader') + localFile = os.path.join(tmpdir, "id_rsa.pub") + leader.extractFile("/root/.ssh/id_rsa.pub", localFile, "toil_leader") with open(localFile) as f: leaderPublicKey = f.read() # Drop the key type and keep just the key data - leaderPublicKey = leaderPublicKey.split(' ')[1] + leaderPublicKey = leaderPublicKey.split(" ")[1] # confirm it really is an RSA public key - assert leaderPublicKey.startswith('AAAAB3NzaC1yc2E'), leaderPublicKey + assert leaderPublicKey.startswith("AAAAB3NzaC1yc2E"), leaderPublicKey return leaderPublicKey - def _getKubernetesJoiningInfo(self, leader: Node = None) -> Dict[str, str]: + def _getKubernetesJoiningInfo(self, leader: Node = None) -> dict[str, str]: """ Get the Kubernetes joining info created when Kubernetes was set up on this node, which is the leader, or on a different specified Node. @@ -327,22 +336,24 @@ def _getKubernetesJoiningInfo(self, leader: Node = None) -> Dict[str, str]: # This info is always supposed to be set up before the Toil appliance # starts, and mounted in at the same path as on the host. So we just go # read it. - with open('/etc/kubernetes/worker.ini') as f: + with open("/etc/kubernetes/worker.ini") as f: config.read_file(f) else: # Grab from remote file with tempfile.TemporaryDirectory() as tmpdir: - localFile = os.path.join(tmpdir, 'worker.ini') - leader.extractFile('/etc/kubernetes/worker.ini', localFile, 'toil_leader') + localFile = os.path.join(tmpdir, "worker.ini") + leader.extractFile( + "/etc/kubernetes/worker.ini", localFile, "toil_leader" + ) with open(localFile) as f: config.read_file(f) # Grab everything out of the default section where our setup script put # it. - return dict(config['DEFAULT']) + return dict(config["DEFAULT"]) - def setAutoscaledNodeTypes(self, nodeTypes: List[Tuple[Set[str], Optional[float]]]): + def setAutoscaledNodeTypes(self, nodeTypes: list[tuple[set[str], Optional[float]]]): """ Set node types, shapes and spot bids for Toil-managed autoscaling. :param nodeTypes: A list of node types, as parsed with parse_node_types. @@ -375,13 +386,13 @@ def hasAutoscaledNodeTypes(self) -> bool: """ return len(self.getAutoscaledInstanceShapes()) > 0 - def getAutoscaledInstanceShapes(self) -> Dict[Shape, str]: + def getAutoscaledInstanceShapes(self) -> dict[Shape, str]: """ Get all the node shapes and their named instance types that the Toil autoscaler should manage. """ - if hasattr(self, '_shape_to_instance_type'): + if hasattr(self, "_shape_to_instance_type"): # We have had Toil-managed autoscaling set up return dict(self._shape_to_instance_type) else: @@ -418,7 +429,7 @@ def launchCluster(self, *args, **kwargs): @abstractmethod def addNodes( self, - nodeTypes: Set[str], + nodeTypes: set[str], numNodes: int, preemptible: bool, spotBid: Optional[float] = None, @@ -433,7 +444,9 @@ def addNodes( """ raise NotImplementedError - def addManagedNodes(self, nodeTypes: Set[str], minNodes, maxNodes, preemptible, spotBid=None) -> None: + def addManagedNodes( + self, nodeTypes: set[str], minNodes, maxNodes, preemptible, spotBid=None + ) -> None: """ Add a group of managed nodes of the given type, up to the given maximum. The nodes will automatically be launched and terminated depending on cluster load. @@ -448,10 +461,12 @@ def addManagedNodes(self, nodeTypes: Set[str], minNodes, maxNodes, preemptible, """ # Not available by default - raise ManagedNodesNotSupportedException("Managed nodes not supported by this provisioner") + raise ManagedNodesNotSupportedException( + "Managed nodes not supported by this provisioner" + ) @abstractmethod - def terminateNodes(self, nodes: List[Node]) -> None: + def terminateNodes(self, nodes: list[Node]) -> None: """ Terminate the nodes represented by given Node objects @@ -467,7 +482,9 @@ def getLeader(self): raise NotImplementedError @abstractmethod - def getProvisionedWorkers(self, instance_type: Optional[str] = None, preemptible: Optional[bool] = None) -> List[Node]: + def getProvisionedWorkers( + self, instance_type: Optional[str] = None, preemptible: Optional[bool] = None + ) -> list[Node]: """ Gets all nodes, optionally of the given instance type or preemptability, from the provisioner. Includes both static and @@ -514,7 +531,14 @@ def __init__(self): # Holds strings like "ssh-rsa actualKeyData" for keys to authorize (independently of cloud provider's system) self.sshPublicKeys = [] - def addFile(self, path: str, filesystem: str = 'root', mode: Union[str, int] = '0755', contents: str = '', append: bool = False): + def addFile( + self, + path: str, + filesystem: str = "root", + mode: Union[str, int] = "0755", + contents: str = "", + append: bool = False, + ): """ Make a file on the instance with the given filesystem, mode, and contents. @@ -526,16 +550,21 @@ def addFile(self, path: str, filesystem: str = 'root', mode: Union[str, int] = ' mode = int(mode, 8) assert isinstance(mode, int) - contents = 'data:,' + quote(contents.encode('utf-8')) + contents = "data:," + quote(contents.encode("utf-8")) - ignition_file = {'path': path, 'filesystem': filesystem, 'mode': mode, 'contents': {'source': contents}} + ignition_file = { + "path": path, + "filesystem": filesystem, + "mode": mode, + "contents": {"source": contents}, + } if append: ignition_file["append"] = append self.files.append(ignition_file) - def addUnit(self, name: str, enabled: bool = True, contents: str = ''): + def addUnit(self, name: str, enabled: bool = True, contents: str = ""): """ Make a systemd unit on the instance with the given name (including .service), and content. Units will be enabled by default. @@ -546,7 +575,7 @@ def addUnit(self, name: str, enabled: bool = True, contents: str = ''): journalctl -xe """ - self.units.append({'name': name, 'enabled': enabled, 'contents': contents}) + self.units.append({"name": name, "enabled": enabled, "contents": contents}) def addSSHRSAKey(self, keyData: str): """ @@ -563,30 +592,19 @@ def toIgnitionConfig(self) -> str: # Define the base config. We're using Flatcar's v2.2.0 fork # See: https://github.com/kinvolk/ignition/blob/flatcar-master/doc/configuration-v2_2.md config = { - 'ignition': { - 'version': '2.2.0' - }, - 'storage': { - 'files': self.files - }, - 'systemd': { - 'units': self.units - } + "ignition": {"version": "2.2.0"}, + "storage": {"files": self.files}, + "systemd": {"units": self.units}, } if len(self.sshPublicKeys) > 0: # Add SSH keys if needed - config['passwd'] = { - 'users': [ - { - 'name': 'core', - 'sshAuthorizedKeys': self.sshPublicKeys - } - ] + config["passwd"] = { + "users": [{"name": "core", "sshAuthorizedKeys": self.sshPublicKeys}] } # Serialize as JSON - return json.dumps(config, separators=(',', ':')) + return json.dumps(config, separators=(",", ":")) def getBaseInstanceConfiguration(self) -> InstanceConfiguration: """ @@ -596,10 +614,16 @@ def getBaseInstanceConfiguration(self) -> InstanceConfiguration: config = self.InstanceConfiguration() # We set Flatcar's update reboot strategy to off - config.addFile("/etc/coreos/update.conf", mode='0644', contents=textwrap.dedent("""\ + config.addFile( + "/etc/coreos/update.conf", + mode="0644", + contents=textwrap.dedent( + """\ GROUP=stable REBOOT_STRATEGY=off - """)) + """ + ), + ) # Then we have volume mounting. That always happens. self.addVolumesService(config) @@ -621,7 +645,10 @@ def addVolumesService(self, config: InstanceConfiguration): # # TODO: check what kind of instance this is, and what ephemeral volumes # *should* be there, and declaratively RAID and mount them. - config.addFile("/home/core/volumes.sh", contents=textwrap.dedent("""\ + config.addFile( + "/home/core/volumes.sh", + contents=textwrap.dedent( + """\ #!/bin/bash set -x ephemeral_count=0 @@ -684,9 +711,14 @@ def addVolumesService(self, config: InstanceConfiguration): sudo mkdir -p /var/$directory sudo mount --bind /mnt/ephemeral/var/$directory /var/$directory done - """)) + """ + ), + ) # TODO: Make this retry? - config.addUnit("volume-mounting.service", contents=textwrap.dedent("""\ + config.addUnit( + "volume-mounting.service", + contents=textwrap.dedent( + """\ [Unit] Description=mounts ephemeral volumes & bind mounts toil directories Before=docker.service @@ -698,14 +730,19 @@ def addVolumesService(self, config: InstanceConfiguration): [Install] WantedBy=multi-user.target - """)) + """ + ), + ) def addNodeExporterService(self, config: InstanceConfiguration): """ Add the node exporter service for Prometheus to an instance configuration. """ - config.addUnit("node-exporter.service", contents=textwrap.dedent('''\ + config.addUnit( + "node-exporter.service", + contents=textwrap.dedent( + """\ [Unit] Description=node-exporter container After=docker.service @@ -728,12 +765,20 @@ def addNodeExporterService(self, config: InstanceConfiguration): [Install] WantedBy=multi-user.target - ''')) + """ + ), + ) def toil_service_env_options(self) -> str: return "-e TMPDIR=/var/tmp" - def add_toil_service(self, config: InstanceConfiguration, role: str, keyPath: str = None, preemptible: bool = False): + def add_toil_service( + self, + config: InstanceConfiguration, + role: str, + keyPath: str = None, + preemptible: bool = False, + ): """ Add the Toil leader or worker service to an instance configuration. @@ -750,46 +795,59 @@ def add_toil_service(self, config: InstanceConfiguration, role: str, keyPath: st # transferred. The waitForKey.sh script loops on the new VM until it finds the keyPath file, then it starts the # mesos-agent. If there are multiple keys to be transferred, then the last one to be transferred must be # set to keyPath. - MESOS_LOG_DIR = '--log_dir=/var/lib/mesos ' - LEADER_DOCKER_ARGS = '--registry=in_memory --cluster={name}' + MESOS_LOG_DIR = "--log_dir=/var/lib/mesos " + LEADER_DOCKER_ARGS = "--registry=in_memory --cluster={name}" # --no-systemd_enable_support is necessary in Ubuntu 16.04 (otherwise, # Mesos attempts to contact systemd but can't find its run file) - WORKER_DOCKER_ARGS = '--work_dir=/var/lib/mesos --master={ip}:5050 --attributes=preemptible:{preemptible} --no-hostname_lookup --no-systemd_enable_support' - - if self.clusterType == 'mesos': - if role == 'leader': - entryPoint = 'mesos-master' - entryPointArgs = MESOS_LOG_DIR + LEADER_DOCKER_ARGS.format(name=self.clusterName) - elif role == 'worker': - entryPoint = 'mesos-agent' - entryPointArgs = MESOS_LOG_DIR + WORKER_DOCKER_ARGS.format(ip=self._leaderPrivateIP, - preemptible=preemptible) + WORKER_DOCKER_ARGS = "--work_dir=/var/lib/mesos --master={ip}:5050 --attributes=preemptible:{preemptible} --no-hostname_lookup --no-systemd_enable_support" + + if self.clusterType == "mesos": + if role == "leader": + entryPoint = "mesos-master" + entryPointArgs = MESOS_LOG_DIR + LEADER_DOCKER_ARGS.format( + name=self.clusterName + ) + elif role == "worker": + entryPoint = "mesos-agent" + entryPointArgs = MESOS_LOG_DIR + WORKER_DOCKER_ARGS.format( + ip=self._leaderPrivateIP, preemptible=preemptible + ) else: raise RuntimeError("Unknown role %s" % role) - elif self.clusterType == 'kubernetes': - if role == 'leader': + elif self.clusterType == "kubernetes": + if role == "leader": # We need *an* entry point or the leader container will finish # and go away, and thus not be available to take user logins. - entryPoint = 'sleep' - entryPointArgs = 'infinity' + entryPoint = "sleep" + entryPointArgs = "infinity" else: - raise RuntimeError('Toil service not needed for %s nodes in a %s cluster', - role, self.clusterType) + raise RuntimeError( + "Toil service not needed for %s nodes in a %s cluster", + role, + self.clusterType, + ) else: - raise RuntimeError('Toil service not needed in a %s cluster', self.clusterType) + raise RuntimeError( + "Toil service not needed in a %s cluster", self.clusterType + ) if keyPath: - entryPointArgs = keyPath + ' ' + entryPointArgs + entryPointArgs = keyPath + " " + entryPointArgs entryPoint = "waitForKey.sh" customDockerInitCommand = customDockerInitCmd() if customDockerInitCommand: - entryPointArgs = " ".join(["'" + customDockerInitCommand + "'", entryPoint, entryPointArgs]) + entryPointArgs = " ".join( + ["'" + customDockerInitCommand + "'", entryPoint, entryPointArgs] + ) entryPoint = "customDockerInit.sh" # Set up the service. Make sure to make it default to using the # actually-big temp directory of /var/tmp (see # https://systemd.io/TEMPORARY_DIRECTORIES/). - config.addUnit(f"toil-{role}.service", contents=textwrap.dedent(f'''\ + config.addUnit( + f"toil-{role}.service", + contents=textwrap.dedent( + f"""\ [Unit] Description=toil-{role} container After=docker.service @@ -828,9 +886,11 @@ def add_toil_service(self, config: InstanceConfiguration, role: str, keyPath: st [Install] WantedBy=multi-user.target - ''')) + """ + ), + ) - def getKubernetesValues(self, architecture: str = 'amd64'): + def getKubernetesValues(self, architecture: str = "amd64"): """ Returns a dict of Kubernetes component versions and paths for formatting into Kubernetes-related templates. """ @@ -857,10 +917,14 @@ def getKubernetesValues(self, architecture: str = 'amd64'): METRICS_API_VERSION="v0.3.7", CLUSTER_NAME=self.clusterName, # YAML line that tells the Kubelet to use a cloud provider, if we need one. - CLOUD_PROVIDER_SPEC=('cloud-provider: ' + cloud_provider) if cloud_provider else '' + CLOUD_PROVIDER_SPEC=( + ("cloud-provider: " + cloud_provider) if cloud_provider else "" + ), ) - def addKubernetesServices(self, config: InstanceConfiguration, architecture: str = 'amd64'): + def addKubernetesServices( + self, config: InstanceConfiguration, architecture: str = "amd64" + ): """ Add installing Kubernetes and Kubeadm and setting up the Kubelet to run when configured to an instance configuration. The same process applies to leaders and workers. @@ -869,7 +933,10 @@ def addKubernetesServices(self, config: InstanceConfiguration, architecture: str values = self.getKubernetesValues(architecture) # We're going to ship the Kubelet service from Kubernetes' release pipeline via cloud-config - config.addUnit("kubelet.service", contents=textwrap.dedent('''\ + config.addUnit( + "kubelet.service", + contents=textwrap.dedent( + """\ # This came from https://raw.githubusercontent.com/kubernetes/release/v0.4.0/cmd/kubepkg/templates/latest/deb/kubelet/lib/systemd/system/kubelet.service # It has been modified to replace /usr/bin with {DOWNLOAD_DIR} # License: https://raw.githubusercontent.com/kubernetes/release/v0.4.0/LICENSE @@ -888,11 +955,16 @@ def addKubernetesServices(self, config: InstanceConfiguration, architecture: str [Install] WantedBy=multi-user.target - ''').format(**values)) + """ + ).format(**values), + ) # It needs this config file - config.addFile("/etc/systemd/system/kubelet.service.d/10-kubeadm.conf", mode='0644', - contents=textwrap.dedent('''\ + config.addFile( + "/etc/systemd/system/kubelet.service.d/10-kubeadm.conf", + mode="0644", + contents=textwrap.dedent( + """\ # This came from https://raw.githubusercontent.com/kubernetes/release/v0.4.0/cmd/kubepkg/templates/latest/deb/kubeadm/10-kubeadm.conf # It has been modified to replace /usr/bin with {DOWNLOAD_DIR} # License: https://raw.githubusercontent.com/kubernetes/release/v0.4.0/LICENSE @@ -908,7 +980,9 @@ def addKubernetesServices(self, config: InstanceConfiguration, architecture: str EnvironmentFile=-/etc/default/kubelet ExecStart= ExecStart={DOWNLOAD_DIR}/kubelet $KUBELET_KUBECONFIG_ARGS $KUBELET_CONFIG_ARGS $KUBELET_KUBEADM_ARGS $KUBELET_EXTRA_ARGS - ''').format(**values)) + """ + ).format(**values), + ) # Before we let the kubelet try to start, we have to actually download it (and kubeadm) # We set up this service so it can restart on failure despite not @@ -919,7 +993,10 @@ def addKubernetesServices(self, config: InstanceConfiguration, architecture: str # restarts work if the script fails. We also use a condition which # treats the service as successful and skips it if it made a file to # say it already ran. - config.addFile("/home/core/install-kubernetes.sh", contents=textwrap.dedent('''\ + config.addFile( + "/home/core/install-kubernetes.sh", + contents=textwrap.dedent( + """\ #!/usr/bin/env bash set -e FLAG_FILE="{SETUP_STATE_DIR}/install-kubernetes.done" @@ -938,8 +1015,13 @@ def addKubernetesServices(self, config: InstanceConfiguration, architecture: str mkdir -p "{SETUP_STATE_DIR}" touch "$FLAG_FILE" - ''').format(**values)) - config.addUnit("install-kubernetes.service", contents=textwrap.dedent('''\ + """ + ).format(**values), + ) + config.addUnit( + "install-kubernetes.service", + contents=textwrap.dedent( + """\ [Unit] Description=base Kubernetes installation Wants=network-online.target @@ -957,12 +1039,14 @@ def addKubernetesServices(self, config: InstanceConfiguration, architecture: str [Install] WantedBy=multi-user.target RequiredBy=kubelet.service - ''').format(**values)) + """ + ).format(**values), + ) # Now we should have the kubeadm command, and the bootlooping kubelet # waiting for kubeadm to configure it. - def getKubernetesAutoscalerSetupCommands(self, values: Dict[str, str]) -> str: + def getKubernetesAutoscalerSetupCommands(self, values: dict[str, str]) -> str: """ Return Bash commands that set up the Kubernetes cluster autoscaler for provisioning from the environment supported by this provisioner. @@ -997,7 +1081,11 @@ def addKubernetesLeader(self, config: InstanceConfiguration): # Customize scheduler to pack jobs into as few nodes as possible # See: https://kubernetes.io/docs/reference/scheduling/config/#profiles - config.addFile("/home/core/scheduler-config.yml", mode='0644', contents=textwrap.dedent('''\ + config.addFile( + "/home/core/scheduler-config.yml", + mode="0644", + contents=textwrap.dedent( + """\ apiVersion: kubescheduler.config.k8s.io/v1beta1 kind: KubeSchedulerConfiguration clientConnection: @@ -1011,13 +1099,21 @@ def addKubernetesLeader(self, config: InstanceConfiguration): enabled: - name: NodeResourcesMostAllocated weight: 1 - '''.format(**values))) + """.format( + **values + ) + ), + ) # Main kubeadm cluster configuration. # Make sure to mount the scheduler config where the scheduler can see # it, which is undocumented but inferred from # https://pkg.go.dev/k8s.io/kubernetes@v1.21.0/cmd/kubeadm/app/apis/kubeadm#ControlPlaneComponent - config.addFile("/home/core/kubernetes-leader.yml", mode='0644', contents=textwrap.dedent('''\ + config.addFile( + "/home/core/kubernetes-leader.yml", + mode="0644", + contents=textwrap.dedent( + """\ apiVersion: kubeadm.k8s.io/v1beta2 kind: InitConfiguration nodeRegistration: @@ -1049,11 +1145,18 @@ def addKubernetesLeader(self, config: InstanceConfiguration): serverTLSBootstrap: true rotateCertificates: true cgroupDriver: systemd - '''.format(**values))) + """.format( + **values + ) + ), + ) # Make a script to apply that and the other cluster components # Note that we're escaping {{thing}} as {{{{thing}}}} because we need to match mustaches in a yaml we hack up. - config.addFile("/home/core/create-kubernetes-cluster.sh", contents=textwrap.dedent('''\ + config.addFile( + "/home/core/create-kubernetes-cluster.sh", + contents=textwrap.dedent( + """\ #!/usr/bin/env bash set -e @@ -1086,7 +1189,11 @@ def addKubernetesLeader(self, config: InstanceConfiguration): kubectl apply -f https://raw.githubusercontent.com/kontena/kubelet-rubber-stamp/release/{RUBBER_STAMP_VERSION}/deploy/role_binding.yaml kubectl apply -f https://raw.githubusercontent.com/kontena/kubelet-rubber-stamp/release/{RUBBER_STAMP_VERSION}/deploy/operator.yaml - ''').format(**values) + self.getKubernetesAutoscalerSetupCommands(values) + textwrap.dedent('''\ + """ + ).format(**values) + + self.getKubernetesAutoscalerSetupCommands(values) + + textwrap.dedent( + """\ # Set up metrics server, which needs serverTLSBootstrap and rubber stamp, and insists on running on a worker curl -sSL https://github.com/kubernetes-sigs/metrics-server/releases/download/{METRICS_API_VERSION}/components.yaml | \\ sed 's/ - --secure-port=4443/ - --secure-port=4443\\n - --kubelet-preferred-address-types=Hostname/' | \\ @@ -1100,8 +1207,13 @@ def addKubernetesLeader(self, config: InstanceConfiguration): mkdir -p "{SETUP_STATE_DIR}" touch "$FLAG_FILE" - ''').format(**values)) - config.addUnit("create-kubernetes-cluster.service", contents=textwrap.dedent('''\ + """ + ).format(**values), + ) + config.addUnit( + "create-kubernetes-cluster.service", + contents=textwrap.dedent( + """\ [Unit] Description=Kubernetes cluster bootstrap After=install-kubernetes.service @@ -1120,10 +1232,15 @@ def addKubernetesLeader(self, config: InstanceConfiguration): [Install] WantedBy=multi-user.target RequiredBy=toil-leader.service - ''').format(**values)) + """ + ).format(**values), + ) # We also need a node cleaner service - config.addFile("/home/core/cleanup-nodes.sh", contents=textwrap.dedent('''\ + config.addFile( + "/home/core/cleanup-nodes.sh", + contents=textwrap.dedent( + """\ #!/usr/bin/env bash # cleanup-nodes.sh: constantly clean up NotReady nodes that are tainted as having been deleted set -e @@ -1142,8 +1259,13 @@ def addKubernetesLeader(self, config: InstanceConfiguration): done sleep 300 done - ''').format(**values)) - config.addUnit("cleanup-nodes.service", contents=textwrap.dedent('''\ + """ + ).format(**values), + ) + config.addUnit( + "cleanup-nodes.service", + contents=textwrap.dedent( + """\ [Unit] Description=Remove scaled-in nodes After=create-kubernetes-cluster.service @@ -1155,9 +1277,16 @@ def addKubernetesLeader(self, config: InstanceConfiguration): RestartSec=10 [Install] WantedBy=multi-user.target - ''')) + """ + ), + ) - def addKubernetesWorker(self, config: InstanceConfiguration, authVars: Dict[str, str], preemptible: bool = False): + def addKubernetesWorker( + self, + config: InstanceConfiguration, + authVars: dict[str, str], + preemptible: bool = False, + ): """ Add services to configure as a Kubernetes worker, if Kubernetes is already set to be installed. @@ -1177,10 +1306,16 @@ def addKubernetesWorker(self, config: InstanceConfiguration, authVars: Dict[str, # TODO: We use the same label that EKS uses here, because nothing is standardized. # This won't be quite appropriate as we aren't on EKS and we might not # even be on AWS, but the batch system should understand it. - values['WORKER_LABEL_SPEC'] = 'node-labels: "eks.amazonaws.com/capacityType=SPOT"' if preemptible else '' + values["WORKER_LABEL_SPEC"] = ( + 'node-labels: "eks.amazonaws.com/capacityType=SPOT"' if preemptible else "" + ) # Kubeadm worker configuration - config.addFile("/home/core/kubernetes-worker.yml", mode='0644', contents=textwrap.dedent('''\ + config.addFile( + "/home/core/kubernetes-worker.yml", + mode="0644", + contents=textwrap.dedent( + """\ apiVersion: kubeadm.k8s.io/v1beta2 kind: JoinConfiguration nodeRegistration: @@ -1198,10 +1333,17 @@ def addKubernetesWorker(self, config: InstanceConfiguration, authVars: Dict[str, apiVersion: kubelet.config.k8s.io/v1beta1 kind: KubeletConfiguration cgroupDriver: systemd - '''.format(**values))) + """.format( + **values + ) + ), + ) # Make a script to join the cluster using that configuration - config.addFile("/home/core/join-kubernetes-cluster.sh", contents=textwrap.dedent('''\ + config.addFile( + "/home/core/join-kubernetes-cluster.sh", + contents=textwrap.dedent( + """\ #!/usr/bin/env bash set -e FLAG_FILE="{SETUP_STATE_DIR}/join-kubernetes-cluster.done" @@ -1216,9 +1358,14 @@ def addKubernetesWorker(self, config: InstanceConfiguration, authVars: Dict[str, mkdir -p "{SETUP_STATE_DIR}" touch "$FLAG_FILE" - ''').format(**values)) + """ + ).format(**values), + ) - config.addUnit("join-kubernetes-cluster.service", contents=textwrap.dedent('''\ + config.addUnit( + "join-kubernetes-cluster.service", + contents=textwrap.dedent( + """\ [Unit] Description=Kubernetes cluster membership After=install-kubernetes.service @@ -1236,9 +1383,17 @@ def addKubernetesWorker(self, config: InstanceConfiguration, authVars: Dict[str, [Install] WantedBy=multi-user.target - ''').format(**values)) + """ + ).format(**values), + ) - def _getIgnitionUserData(self, role: str, keyPath: Optional[str] = None, preemptible: bool = False, architecture: str = 'amd64') -> str: + def _getIgnitionUserData( + self, + role: str, + keyPath: Optional[str] = None, + preemptible: bool = False, + architecture: str = "amd64", + ) -> str: """ Return the text (not bytes) user data to pass to a provisioned node. @@ -1252,33 +1407,35 @@ def _getIgnitionUserData(self, role: str, keyPath: Optional[str] = None, preempt # Start with a base config config = self.getBaseInstanceConfiguration() - if self.clusterType == 'kubernetes': + if self.clusterType == "kubernetes": # Install Kubernetes self.addKubernetesServices(config, architecture) - if role == 'leader': + if role == "leader": # Set up the cluster self.addKubernetesLeader(config) # We can't actually set up a Kubernetes worker without credentials # to connect back to the leader. - if self.clusterType == 'mesos' or role == 'leader': + if self.clusterType == "mesos" or role == "leader": # Leaders, and all nodes in a Mesos cluster, need a Toil service self.add_toil_service(config, role, keyPath, preemptible) - if role == 'worker' and self._leaderWorkerAuthentication is not None: + if role == "worker" and self._leaderWorkerAuthentication is not None: # We need to connect the worker to the leader. - if self.clusterType == 'mesos': + if self.clusterType == "mesos": # This involves an SSH public key form the leader config.addSSHRSAKey(self._leaderWorkerAuthentication) - elif self.clusterType == 'kubernetes': + elif self.clusterType == "kubernetes": # We can install the Kubernetes worker and make it phone home # to the leader. # TODO: this puts sufficient info to fake a malicious worker # into the worker config, which probably is accessible by # anyone in the cloud account. - self.addKubernetesWorker(config, self._leaderWorkerAuthentication, preemptible=preemptible) + self.addKubernetesWorker( + config, self._leaderWorkerAuthentication, preemptible=preemptible + ) # Make it into a string for Ignition user_data = config.toIgnitionConfig() @@ -1289,21 +1446,29 @@ def _getIgnitionUserData(self, role: str, keyPath: Optional[str] = None, preempt user_data_limit: int = self._get_user_data_limit() if len(user_data) > user_data_limit: - logger.warning(f"Ignition config size exceeds the user data limit ({len(user_data)} > {user_data_limit}). " - "Writing to cloud storage...") - - src = self._write_file_to_cloud(f'configs/{role}/config-{uuid4()}.ign', contents=user_data.encode('utf-8')) - - return json.dumps({ - 'ignition': { - 'version': '2.2.0', - # See: https://github.com/coreos/ignition/blob/spec2x/doc/configuration-v2_2.md - 'config': { - 'replace': { - 'source': src, - } + logger.warning( + f"Ignition config size exceeds the user data limit ({len(user_data)} > {user_data_limit}). " + "Writing to cloud storage..." + ) + + src = self._write_file_to_cloud( + f"configs/{role}/config-{uuid4()}.ign", + contents=user_data.encode("utf-8"), + ) + + return json.dumps( + { + "ignition": { + "version": "2.2.0", + # See: https://github.com/coreos/ignition/blob/spec2x/doc/configuration-v2_2.md + "config": { + "replace": { + "source": src, + } + }, } - } - }, separators=(',', ':')) + }, + separators=(",", ":"), + ) return user_data diff --git a/src/toil/provisioners/aws/__init__.py b/src/toil/provisioners/aws/__init__.py index 694780591d..89b8e4509b 100644 --- a/src/toil/provisioners/aws/__init__.py +++ b/src/toil/provisioners/aws/__init__.py @@ -16,23 +16,28 @@ from collections import namedtuple from operator import attrgetter from statistics import mean, stdev -from typing import List, Optional +from typing import Optional from botocore.client import BaseClient -from toil.lib.aws import (get_aws_zone_from_boto, - get_aws_zone_from_environment, - get_aws_zone_from_environment_region, - get_aws_zone_from_metadata) +from toil.lib.aws import ( + get_aws_zone_from_boto, + get_aws_zone_from_environment, + get_aws_zone_from_environment_region, + get_aws_zone_from_metadata, +) logger = logging.getLogger(__name__) -ZoneTuple = namedtuple('ZoneTuple', ['name', 'price_deviation']) +ZoneTuple = namedtuple("ZoneTuple", ["name", "price_deviation"]) -def get_aws_zone_from_spot_market(spotBid: Optional[float], nodeType: Optional[str], - boto3_ec2: Optional[BaseClient], zone_options: Optional[List[str]]) -> \ -Optional[str]: +def get_aws_zone_from_spot_market( + spotBid: Optional[float], + nodeType: Optional[str], + boto3_ec2: Optional[BaseClient], + zone_options: Optional[list[str]], +) -> Optional[str]: """ If a spot bid, node type, and Boto2 EC2 connection are specified, picks a zone where instances are easy to buy from the zones in the region of the @@ -52,14 +57,22 @@ def get_aws_zone_from_spot_market(spotBid: Optional[float], nodeType: Optional[s # We can use all the zones in the region zone_options = [z.name for z in boto3_ec2.describe_availability_zones()] - return optimize_spot_bid(boto3_ec2, instance_type=nodeType, spot_bid=float(spotBid), zone_options=zone_options) + return optimize_spot_bid( + boto3_ec2, + instance_type=nodeType, + spot_bid=float(spotBid), + zone_options=zone_options, + ) else: return None -def get_best_aws_zone(spotBid: Optional[float] = None, nodeType: Optional[str] = None, - boto3_ec2: Optional[BaseClient] = None, - zone_options: Optional[List[str]] = None) -> Optional[str]: +def get_best_aws_zone( + spotBid: Optional[float] = None, + nodeType: Optional[str] = None, + boto3_ec2: Optional[BaseClient] = None, + zone_options: Optional[list[str]] = None, +) -> Optional[str]: """ Get the right AWS zone to use. @@ -84,15 +97,20 @@ def get_best_aws_zone(spotBid: Optional[float] = None, nodeType: Optional[str] = Returns None if no method can produce a zone to use. """ - return get_aws_zone_from_environment() or \ - get_aws_zone_from_metadata() or \ - get_aws_zone_from_spot_market(spotBid, nodeType, boto3_ec2, zone_options) or \ - get_aws_zone_from_environment_region() or \ - get_aws_zone_from_boto() - - -def choose_spot_zone(zones: List[str], bid: float, - spot_history: List['boto.ec2.spotpricehistory.SpotPriceHistory']) -> str: + return ( + get_aws_zone_from_environment() + or get_aws_zone_from_metadata() + or get_aws_zone_from_spot_market(spotBid, nodeType, boto3_ec2, zone_options) + or get_aws_zone_from_environment_region() + or get_aws_zone_from_boto() + ) + + +def choose_spot_zone( + zones: list[str], + bid: float, + spot_history: list["boto.ec2.spotpricehistory.SpotPriceHistory"], +) -> str: """ Returns the zone to put the spot request based on, in order of priority: @@ -131,7 +149,11 @@ def choose_spot_zone(zones: List[str], bid: float, # standard deviation values. markets_under_bid, markets_over_bid = [], [] for zone in zones: - zone_histories = [zone_history for zone_history in spot_history if zone_history.availability_zone == zone] + zone_histories = [ + zone_history + for zone_history in spot_history + if zone_history.availability_zone == zone + ] if zone_histories: price_deviation = stdev([history.price for history in zone_histories]) recent_price = zone_histories[0].price @@ -140,10 +162,14 @@ def choose_spot_zone(zones: List[str], bid: float, zone_tuple = ZoneTuple(name=zone, price_deviation=price_deviation) (markets_over_bid, markets_under_bid)[recent_price < bid].append(zone_tuple) - return min(markets_under_bid or markets_over_bid, key=attrgetter('price_deviation')).name + return min( + markets_under_bid or markets_over_bid, key=attrgetter("price_deviation") + ).name -def optimize_spot_bid(boto3_ec2: BaseClient, instance_type: str, spot_bid: float, zone_options: List[str]): +def optimize_spot_bid( + boto3_ec2: BaseClient, instance_type: str, spot_bid: float, zone_options: list[str] +): """ Check whether the bid is in line with history and makes an effort to place the instance in a sensible zone. @@ -188,8 +214,12 @@ def _check_spot_bid(spot_bid, spot_history): """ average = mean([datum.price for datum in spot_history]) if spot_bid > average * 2: - logger.warning("Your bid $ %f is more than double this instance type's average " - "spot price ($ %f) over the last week", spot_bid, average) + logger.warning( + "Your bid $ %f is more than double this instance type's average " + "spot price ($ %f) over the last week", + spot_bid, + average, + ) def _get_spot_history(boto3_ec2: BaseClient, instance_type: str): @@ -200,8 +230,10 @@ def _get_spot_history(boto3_ec2: BaseClient, instance_type: str): :rtype: list[SpotPriceHistory] """ one_week_ago = datetime.datetime.now() - datetime.timedelta(days=7) - spot_data = boto3_ec2.describe_spot_price_history(StartTime=one_week_ago.isoformat(), - InstanceTypes=[instance_type], - ProductDescriptions=["Linux/UNIX"]) + spot_data = boto3_ec2.describe_spot_price_history( + StartTime=one_week_ago.isoformat(), + InstanceTypes=[instance_type], + ProductDescriptions=["Linux/UNIX"], + ) spot_data.sort(key=attrgetter("timestamp"), reverse=True) return spot_data diff --git a/src/toil/provisioners/aws/awsProvisioner.py b/src/toil/provisioners/aws/awsProvisioner.py index be1e5fd34a..b5259b0fd8 100644 --- a/src/toil/provisioners/aws/awsProvisioner.py +++ b/src/toil/provisioners/aws/awsProvisioner.py @@ -22,40 +22,25 @@ import textwrap import time import uuid +from collections.abc import Collection, Iterable from functools import wraps from shlex import quote -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Collection, - Dict, - Iterable, - List, - Literal, - Optional, - Set, - TypeVar, - Union, - cast, -) -from urllib.parse import unquote +from typing import TYPE_CHECKING, Any, Callable, TypeVar, cast # We need these to exist as attributes we can get off of the boto object from botocore.exceptions import ClientError -from mypy_extensions import KwArg, VarArg from toil.lib.aws import AWSRegionName, AWSServerErrors, zone_to_region from toil.lib.aws.ami import get_flatcar_ami from toil.lib.aws.iam import ( CLUSTER_LAUNCHING_PERMISSIONS, + create_iam_role, get_policy_permissions, policy_permissions_allow, - create_iam_role ) from toil.lib.aws.session import AWSConnectionManager from toil.lib.aws.session import client as get_client -from toil.lib.aws.utils import boto3_pager, create_s3_bucket, flatten_tags +from toil.lib.aws.utils import boto3_pager, create_s3_bucket from toil.lib.conversions import human2bytes from toil.lib.ec2 import ( a_short_time, @@ -74,7 +59,6 @@ from toil.lib.memoize import memoize from toil.lib.misc import truncExpBackoff from toil.lib.retry import ( - ErrorCondition, get_error_body, get_error_code, get_error_message, @@ -119,12 +103,12 @@ logger = logging.getLogger(__name__) logging.getLogger("boto").setLevel(logging.CRITICAL) # Role name (used as the suffix) for EC2 instance profiles that are automatically created by Toil. -_INSTANCE_PROFILE_ROLE_NAME = 'toil' +_INSTANCE_PROFILE_ROLE_NAME = "toil" # The tag key that specifies the Toil node type ("leader" or "worker") so that # leader vs. worker nodes can be robustly identified. -_TAG_KEY_TOIL_NODE_TYPE = 'ToilNodeType' +_TAG_KEY_TOIL_NODE_TYPE = "ToilNodeType" # The tag that specifies the cluster name on all nodes -_TAG_KEY_TOIL_CLUSTER_NAME = 'clusterName' +_TAG_KEY_TOIL_CLUSTER_NAME = "clusterName" # How much storage on the root volume is expected to go to overhead and be # unavailable to jobs when the node comes up? # TODO: measure @@ -132,7 +116,7 @@ # The maximum length of a S3 bucket _S3_BUCKET_MAX_NAME_LEN = 63 # The suffix of the S3 bucket associated with the cluster -_S3_BUCKET_INTERNAL_SUFFIX = '--internal' +_S3_BUCKET_INTERNAL_SUFFIX = "--internal" def awsRetryPredicate(e: Exception) -> bool: @@ -141,14 +125,14 @@ def awsRetryPredicate(e: Exception) -> bool: # socket.gaierror: [Errno -2] Name or service not known return True # boto/AWS gives multiple messages for the same error... - if get_error_status(e) == 503 and 'Request limit exceeded' in get_error_body(e): + if get_error_status(e) == 503 and "Request limit exceeded" in get_error_body(e): return True - elif get_error_status(e) == 400 and 'Rate exceeded' in get_error_body(e): + elif get_error_status(e) == 400 and "Rate exceeded" in get_error_body(e): return True - elif get_error_status(e) == 400 and 'NotFound' in get_error_body(e): + elif get_error_status(e) == 400 and "NotFound" in get_error_body(e): # EC2 can take a while to propagate instance IDs to all servers. return True - elif get_error_status(e) == 400 and get_error_code(e) == 'Throttling': + elif get_error_status(e) == 400 and get_error_code(e) == "Throttling": return True return False @@ -162,7 +146,7 @@ def expectedShutdownErrors(e: Exception) -> bool: impossible or unnecessary (such as errors resulting from a thing not existing to be deleted). """ - return get_error_status(e) == 400 and 'dependent object' in get_error_body(e) + return get_error_status(e) == 400 and "dependent object" in get_error_body(e) F = TypeVar("F") # so mypy understands passed through types @@ -177,9 +161,9 @@ def awsRetry(f: Callable[..., F]) -> Callable[..., F]: @wraps(f) def wrapper(*args: Any, **kwargs: Any) -> Any: - for attempt in old_retry(delays=truncExpBackoff(), - timeout=300, - predicate=awsRetryPredicate): + for attempt in old_retry( + delays=truncExpBackoff(), timeout=300, predicate=awsRetryPredicate + ): with attempt: return f(*args, **kwargs) @@ -187,20 +171,32 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: def awsFilterImpairedNodes( - nodes: List["InstanceTypeDef"], boto3_ec2: "EC2Client" -) -> List["InstanceTypeDef"]: + nodes: list[InstanceTypeDef], boto3_ec2: EC2Client +) -> list[InstanceTypeDef]: # if TOIL_AWS_NODE_DEBUG is set don't terminate nodes with # failing status checks so they can be debugged - nodeDebug = os.environ.get('TOIL_AWS_NODE_DEBUG') in ('True', 'TRUE', 'true', True) + nodeDebug = os.environ.get("TOIL_AWS_NODE_DEBUG") in ("True", "TRUE", "true", True) if not nodeDebug: return nodes nodeIDs = [node["InstanceId"] for node in nodes] statuses = boto3_ec2.describe_instance_status(InstanceIds=nodeIDs) - statusMap = {status["InstanceId"]: status["InstanceStatus"]["Status"] for status in statuses["InstanceStatuses"]} - healthyNodes = [node for node in nodes if statusMap.get(node["InstanceId"], None) != 'impaired'] - impairedNodes = [node["InstanceId"] for node in nodes if statusMap.get(node["InstanceId"], None) == 'impaired'] - logger.warning('TOIL_AWS_NODE_DEBUG is set and nodes %s have failed EC2 status checks so ' - 'will not be terminated.', ' '.join(impairedNodes)) + statusMap = { + status["InstanceId"]: status["InstanceStatus"]["Status"] + for status in statuses["InstanceStatuses"] + } + healthyNodes = [ + node for node in nodes if statusMap.get(node["InstanceId"], None) != "impaired" + ] + impairedNodes = [ + node["InstanceId"] + for node in nodes + if statusMap.get(node["InstanceId"], None) == "impaired" + ] + logger.warning( + "TOIL_AWS_NODE_DEBUG is set and nodes %s have failed EC2 status checks so " + "will not be terminated.", + " ".join(impairedNodes), + ) return healthyNodes @@ -208,13 +204,13 @@ class InvalidClusterStateException(Exception): pass -def collapse_tags(instance_tags: List["TagTypeDef"]) -> Dict[str, str]: +def collapse_tags(instance_tags: list[TagTypeDef]) -> dict[str, str]: """ Collapse tags from boto3 format to node format :param instance_tags: tags as a list :return: Dict of tags """ - collapsed_tags: Dict[str, str] = dict() + collapsed_tags: dict[str, str] = dict() for tag in instance_tags: if tag.get("Key") is not None: collapsed_tags[tag["Key"]] = tag["Value"] @@ -222,10 +218,17 @@ def collapse_tags(instance_tags: List["TagTypeDef"]) -> Dict[str, str]: class AWSProvisioner(AbstractProvisioner): - def __init__(self, clusterName: Optional[str], clusterType: Optional[str], zone: Optional[str], - nodeStorage: int, nodeStorageOverrides: Optional[List[str]], sseKey: Optional[str], - enable_fuse: bool): - self.cloud = 'aws' + def __init__( + self, + clusterName: str | None, + clusterType: str | None, + zone: str | None, + nodeStorage: int, + nodeStorageOverrides: list[str] | None, + sseKey: str | None, + enable_fuse: bool, + ): + self.cloud = "aws" self._sseKey = sseKey # self._zone will be filled in by base class constructor # We will use it as the leader zone. @@ -233,9 +236,11 @@ def __init__(self, clusterName: Optional[str], clusterType: Optional[str], zone: if zone is None: # Can't proceed without a real zone - raise RuntimeError('No AWS availability zone specified. Configure in Boto ' - 'configuration file, TOIL_AWS_ZONE environment variable, or ' - 'on the command line.') + raise RuntimeError( + "No AWS availability zone specified. Configure in Boto " + "configuration file, TOIL_AWS_ZONE environment variable, or " + "on the command line." + ) # Determine our region to work in, before readClusterSettings() which # might need it. TODO: support multiple regions in one cluster @@ -246,24 +251,37 @@ def __init__(self, clusterName: Optional[str], clusterType: Optional[str], zone: # Set our architecture to the current machine architecture # Assume the same architecture unless specified differently in launchCluster() - self._architecture = 'amd64' if platform.machine() in ['x86_64', 'amd64'] else 'arm64' + self._architecture = ( + "amd64" if platform.machine() in ["x86_64", "amd64"] else "arm64" + ) # Call base class constructor, which will call createClusterSettings() # or readClusterSettings() - super().__init__(clusterName, clusterType, zone, nodeStorage, nodeStorageOverrides, enable_fuse) + super().__init__( + clusterName, + clusterType, + zone, + nodeStorage, + nodeStorageOverrides, + enable_fuse, + ) if self._zone is None: - logger.warning("Leader zone was never initialized before creating AWS provisioner. Defaulting to cluster zone.") + logger.warning( + "Leader zone was never initialized before creating AWS provisioner. Defaulting to cluster zone." + ) self._leader_subnet: str = self._get_default_subnet(self._zone or zone) - self._tags: Dict[str, Any] = {} + self._tags: dict[str, Any] = {} # After self.clusterName is set, generate a valid name for the S3 bucket associated with this cluster suffix = _S3_BUCKET_INTERNAL_SUFFIX - self.s3_bucket_name = self.clusterName[:_S3_BUCKET_MAX_NAME_LEN - len(suffix)] + suffix + self.s3_bucket_name = ( + self.clusterName[: _S3_BUCKET_MAX_NAME_LEN - len(suffix)] + suffix + ) - def supportedClusterTypes(self) -> Set[str]: - return {'mesos', 'kubernetes'} + def supportedClusterTypes(self) -> set[str]: + return {"mesos", "kubernetes"} def createClusterSettings(self) -> None: """ @@ -283,7 +301,7 @@ def readClusterSettings(self) -> None: from ec2_metadata import ec2_metadata boto3_ec2 = self.aws.client(self._region, "ec2") - instance: "InstanceTypeDef" = boto3_ec2.describe_instances( + instance: InstanceTypeDef = boto3_ec2.describe_instances( InstanceIds=[ec2_metadata.instance_id] )["Reservations"][0]["Instances"][0] # The cluster name is the same as the name of the leader. @@ -297,7 +315,11 @@ def readClusterSettings(self) -> None: self._worker_subnets_by_zone = self._get_good_subnets_like(self._leader_subnet) self._leaderPrivateIP = ec2_metadata.private_ipv4 # this is PRIVATE IP - self._tags = {k: v for k, v in (self.getLeader().tags or {}).items() if k != _TAG_KEY_TOIL_NODE_TYPE} + self._tags = { + k: v + for k, v in (self.getLeader().tags or {}).items() + if k != _TAG_KEY_TOIL_NODE_TYPE + } # Grab the ARN name of the instance profile (a str) to apply to workers leader_info = None for attempt in old_retry(timeout=300, predicate=lambda e: True): @@ -314,9 +336,9 @@ def readClusterSettings(self) -> None: # The existing metadata API returns a single string if there is one security group, but # a list when there are multiple: change the format to always be a list. rawSecurityGroups = ec2_metadata.security_groups - self._leaderSecurityGroupNames: Set[str] = set(rawSecurityGroups) + self._leaderSecurityGroupNames: set[str] = set(rawSecurityGroups) # Since we have access to the names, we don't also need to use any IDs - self._leaderSecurityGroupIDs: Set[str] = set() + self._leaderSecurityGroupIDs: set[str] = set() # Let the base provisioner work out how to deploy duly authorized # workers for this leader. @@ -327,8 +349,8 @@ def _write_file_to_cloud(self, key: str, contents: bytes) -> str: bucket_name = self.s3_bucket_name # Connect to S3 - s3 = self.aws.resource(self._region, 's3') - s3_client = self.aws.client(self._region, 's3') + s3 = self.aws.resource(self._region, "s3") + s3_client = self.aws.client(self._region, "s3") # create bucket if needed, then write file to S3 try: @@ -337,14 +359,18 @@ def _write_file_to_cloud(self, key: str, contents: bytes) -> str: bucket = s3.Bucket(bucket_name) except ClientError as err: if get_error_status(err) == 404: - bucket = create_s3_bucket(s3, bucket_name=bucket_name, region=self._region) + bucket = create_s3_bucket( + s3, bucket_name=bucket_name, region=self._region + ) bucket.wait_until_exists() bucket.Versioning().enable() - owner_tag = os.environ.get('TOIL_OWNER_TAG') + owner_tag = os.environ.get("TOIL_OWNER_TAG") if owner_tag: bucket_tagging = s3.BucketTagging(bucket_name) - bucket_tagging.put(Tagging={'TagSet': [{'Key': 'Owner', 'Value': owner_tag}]}) + bucket_tagging.put( + Tagging={"TagSet": [{"Key": "Owner", "Value": owner_tag}]} + ) else: raise @@ -354,34 +380,38 @@ def _write_file_to_cloud(self, key: str, contents: bytes) -> str: obj.put(Body=contents) obj.wait_until_exists() - return f's3://{bucket_name}/{key}' + return f"s3://{bucket_name}/{key}" def _read_file_from_cloud(self, key: str) -> bytes: bucket_name = self.s3_bucket_name - obj = self.aws.resource(self._region, 's3').Object(bucket_name, key) + obj = self.aws.resource(self._region, "s3").Object(bucket_name, key) try: - return obj.get()['Body'].read() + return obj.get()["Body"].read() except ClientError as e: if get_error_status(e) == 404: - logger.warning(f'Trying to read non-existent file "{key}" from {bucket_name}.') + logger.warning( + f'Trying to read non-existent file "{key}" from {bucket_name}.' + ) raise def _get_user_data_limit(self) -> int: # See: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-add-user-data.html - return human2bytes('16KB') - - def launchCluster(self, - leaderNodeType: str, - leaderStorage: int, - owner: str, - keyName: str, - botoPath: str, - userTags: Optional[Dict[str, str]], - vpcSubnet: Optional[str], - awsEc2ProfileArn: Optional[str], - awsEc2ExtraSecurityGroupIds: Optional[List[str]], - **kwargs: Dict[str, Any]) -> None: + return human2bytes("16KB") + + def launchCluster( + self, + leaderNodeType: str, + leaderStorage: int, + owner: str, + keyName: str, + botoPath: str, + userTags: dict[str, str] | None, + vpcSubnet: str | None, + awsEc2ProfileArn: str | None, + awsEc2ExtraSecurityGroupIds: list[str] | None, + **kwargs: dict[str, Any], + ) -> None: """ Starts a single leader node and populates this class with the leader's metadata. @@ -397,29 +427,42 @@ def launchCluster(self, :return: None """ - if 'network' in kwargs: - logger.warning('AWS provisioner does not support a network parameter. Ignoring %s!', kwargs["network"]) + if "network" in kwargs: + logger.warning( + "AWS provisioner does not support a network parameter. Ignoring %s!", + kwargs["network"], + ) # First, pre-flight-check our permissions before making anything. - if not policy_permissions_allow(get_policy_permissions(region=self._region), CLUSTER_LAUNCHING_PERMISSIONS): + if not policy_permissions_allow( + get_policy_permissions(region=self._region), CLUSTER_LAUNCHING_PERMISSIONS + ): # Function prints a more specific warning to the log, but give some context. - logger.warning('Toil may not be able to properly launch (or destroy!) your cluster.') + logger.warning( + "Toil may not be able to properly launch (or destroy!) your cluster." + ) leader_type = E2Instances[leaderNodeType] - if self.clusterType == 'kubernetes': + if self.clusterType == "kubernetes": if leader_type.cores < 2: # Kubernetes won't run here. - raise RuntimeError('Kubernetes requires 2 or more cores, and %s is too small' % - leaderNodeType) + raise RuntimeError( + "Kubernetes requires 2 or more cores, and %s is too small" + % leaderNodeType + ) self._keyName = keyName self._architecture = leader_type.architecture - if self.clusterType == 'mesos' and self._architecture != 'amd64': + if self.clusterType == "mesos" and self._architecture != "amd64": # Mesos images aren't currently available for this architecture, so we can't start a Mesos cluster. # Complain about this before we create anything. - raise ClusterCombinationNotSupportedException(type(self), self.clusterType, self._architecture, - reason="Mesos is only available for amd64.") + raise ClusterCombinationNotSupportedException( + type(self), + self.clusterType, + self._architecture, + reason="Mesos is only available for amd64.", + ) if vpcSubnet: # This is where we put the leader @@ -432,27 +475,29 @@ def launchCluster(self, bdms = self._getBoto3BlockDeviceMappings(leader_type, rootVolSize=leaderStorage) # Make up the tags - self._tags = {'Name': self.clusterName, - 'Owner': owner, - _TAG_KEY_TOIL_CLUSTER_NAME: self.clusterName} + self._tags = { + "Name": self.clusterName, + "Owner": owner, + _TAG_KEY_TOIL_CLUSTER_NAME: self.clusterName, + } if userTags is not None: self._tags.update(userTags) # All user specified tags have been set - userData = self._getIgnitionUserData('leader', architecture=self._architecture) + userData = self._getIgnitionUserData("leader", architecture=self._architecture) - if self.clusterType == 'kubernetes': + if self.clusterType == "kubernetes": # All nodes need a tag putting them in the cluster. # This tag needs to be on there before the a leader can finish its startup. - self._tags['kubernetes.io/cluster/' + self.clusterName] = '' + self._tags["kubernetes.io/cluster/" + self.clusterName] = "" # Make tags for the leader specifically leader_tags = dict(self._tags) - leader_tags[_TAG_KEY_TOIL_NODE_TYPE] = 'leader' - logger.debug('Launching leader with tags: %s', leader_tags) + leader_tags[_TAG_KEY_TOIL_NODE_TYPE] = "leader" + logger.debug("Launching leader with tags: %s", leader_tags) - instances: List["Instance"] = create_instances( + instances: list[Instance] = create_instances( self.aws.resource(self._region, "ec2"), image_id=self._discoverAMI(), num_instances=1, @@ -472,7 +517,7 @@ def launchCluster(self, leader.wait_until_exists() # Don't go on until the leader is started - logger.info('Waiting for leader instance %s to be running', leader) + logger.info("Waiting for leader instance %s to be running", leader) leader.wait_until_running() # Now reload it to make sure all the IPs are set. @@ -482,22 +527,31 @@ def launchCluster(self, # Sometimes AWS just fails to assign a public IP when we really need one. # But sometimes people intend to use private IPs only in Toil-managed clusters. # TODO: detect if we have a route to the private IP and fail fast if not. - logger.warning("AWS did not assign a public IP to the cluster leader. If you aren't " - "connected to the private subnet, cluster setup will fail!") + logger.warning( + "AWS did not assign a public IP to the cluster leader. If you aren't " + "connected to the private subnet, cluster setup will fail!" + ) # Remember enough about the leader to let us launch workers in its # cluster. self._leaderPrivateIP = leader.private_ip_address self._worker_subnets_by_zone = self._get_good_subnets_like(self._leader_subnet) self._leaderSecurityGroupNames = set() - self._leaderSecurityGroupIDs = set(createdSGs + (awsEc2ExtraSecurityGroupIds or [])) + self._leaderSecurityGroupIDs = set( + createdSGs + (awsEc2ExtraSecurityGroupIds or []) + ) self._leaderProfileArn = profileArn - leaderNode = Node(publicIP=leader.public_ip_address, privateIP=leader.private_ip_address, - name=leader.id, launchTime=leader.launch_time, - nodeType=leader_type.name, preemptible=False, - tags=collapse_tags(leader.tags)) - leaderNode.waitForNode('toil_leader') + leaderNode = Node( + publicIP=leader.public_ip_address, + privateIP=leader.private_ip_address, + name=leader.id, + launchTime=leader.launch_time, + nodeType=leader_type.name, + preemptible=False, + tags=collapse_tags(leader.tags), + ) + leaderNode.waitForNode("toil_leader") # Download credentials self._setLeaderWorkerAuthentication(leaderNode) @@ -510,7 +564,7 @@ def toil_service_env_options(self) -> str: instance_base_tags = json.dumps(self._tags) return config + " -e TOIL_AWS_TAGS=" + quote(instance_base_tags) - def _get_worker_subnets(self) -> List[str]: + def _get_worker_subnets(self) -> list[str]: """ Get all worker subnets we should balance across, as a flat list. """ @@ -526,7 +580,7 @@ def _get_worker_subnets(self) -> List[str]: return collected @awsRetry - def _get_good_subnets_like(self, base_subnet_id: str) -> Dict[str, List[str]]: + def _get_good_subnets_like(self, base_subnet_id: str) -> dict[str, list[str]]: """ Given a subnet ID, get all the similar subnets (including it), organized by availability zone. @@ -537,9 +591,9 @@ def _get_good_subnets_like(self, base_subnet_id: str) -> Dict[str, List[str]]: """ # Grab the ec2 resource we need to make queries - ec2 = self.aws.resource(self._region, 'ec2') + ec2 = self.aws.resource(self._region, "ec2") # And the client - ec2_client = self.aws.client(self._region, 'ec2') + ec2_client = self.aws.client(self._region, "ec2") # What subnet are we basing this on? base_subnet = ec2.Subnet(base_subnet_id) @@ -554,26 +608,33 @@ def _get_good_subnets_like(self, base_subnet_id: str) -> Dict[str, List[str]]: acls = set(self._get_subnet_acls(base_subnet_id)) # Compose a filter that selects the subnets we might want - filters: List["FilterTypeDef"] = [ + filters: list[FilterTypeDef] = [ {"Name": "vpc-id", "Values": [vpc_id]}, {"Name": "default-for-az", "Values": ["true" if is_default else "false"]}, {"Name": "state", "Values": ["available"]}, ] # Fill in this collection - by_az: Dict[str, List[str]] = {} + by_az: dict[str, list[str]] = {} # Go get all the subnets. There's no way to page manually here so it # must page automatically. - for subnet in self.aws.resource(self._region, 'ec2').subnets.filter(Filters=filters): + for subnet in self.aws.resource(self._region, "ec2").subnets.filter( + Filters=filters + ): # For each subnet in the VPC # See if it has the right ACLs subnet_acls = set(self._get_subnet_acls(subnet.subnet_id)) if subnet_acls != acls: # Reject this subnet because it has different ACLs - logger.debug('Subnet %s is a lot like subnet %s but has ACLs of %s instead of %s; skipping', - subnet.subnet_id, base_subnet_id, subnet_acls, acls) + logger.debug( + "Subnet %s is a lot like subnet %s but has ACLs of %s instead of %s; skipping", + subnet.subnet_id, + base_subnet_id, + subnet_acls, + acls, + ) continue if subnet.availability_zone not in by_az: @@ -585,24 +646,24 @@ def _get_good_subnets_like(self, base_subnet_id: str) -> Dict[str, List[str]]: return by_az @awsRetry - def _get_subnet_acls(self, subnet: str) -> List[str]: + def _get_subnet_acls(self, subnet: str) -> list[str]: """ Get all Network ACL IDs associated with a given subnet ID. """ # Grab the connection we need to use for this operation. - ec2 = self.aws.client(self._region, 'ec2') + ec2 = self.aws.client(self._region, "ec2") # Compose a filter that selects the default subnet in the AZ - filters = [{ - 'Name': 'association.subnet-id', - 'Values': [subnet] - }] + filters = [{"Name": "association.subnet-id", "Values": [subnet]}] # TODO: Can't we use the resource's network_acls.filter(Filters=)? - return [item['NetworkAclId'] for item in boto3_pager(ec2.describe_network_acls, - 'NetworkAcls', - Filters=filters)] + return [ + item["NetworkAclId"] + for item in boto3_pager( + ec2.describe_network_acls, "NetworkAcls", Filters=filters + ) + ] @awsRetry def _get_default_subnet(self, zone: str) -> str: @@ -612,27 +673,32 @@ def _get_default_subnet(self, zone: str) -> str: """ # Compose a filter that selects the default subnet in the AZ - filters: List["FilterTypeDef"] = [ + filters: list[FilterTypeDef] = [ {"Name": "default-for-az", "Values": ["true"]}, {"Name": "availability-zone", "Values": [zone]}, ] - for subnet in self.aws.resource(zone_to_region(zone), 'ec2').subnets.filter(Filters=filters): + for subnet in self.aws.resource(zone_to_region(zone), "ec2").subnets.filter( + Filters=filters + ): # There should only be one result, so when we see it, return it return subnet.subnet_id # If we don't find a subnet, something is wrong. Maybe this zone was # added after your account? - raise RuntimeError(f"No default subnet found in availability zone {zone}. " - f"Note that Amazon does not add default subnets for new " - f"zones to old accounts. Specify a VPC subnet ID to use, " - f"or create a default subnet in the zone.") + raise RuntimeError( + f"No default subnet found in availability zone {zone}. " + f"Note that Amazon does not add default subnets for new " + f"zones to old accounts. Specify a VPC subnet ID to use, " + f"or create a default subnet in the zone." + ) - def getKubernetesAutoscalerSetupCommands(self, values: Dict[str, str]) -> str: + def getKubernetesAutoscalerSetupCommands(self, values: dict[str, str]) -> str: """ Get the Bash commands necessary to configure the Kubernetes Cluster Autoscaler for AWS. """ - return textwrap.dedent('''\ + return textwrap.dedent( + """\ curl -sSL https://raw.githubusercontent.com/kubernetes/autoscaler/cluster-autoscaler-{AUTOSCALER_VERSION}/cluster-autoscaler/cloudprovider/aws/examples/cluster-autoscaler-run-on-master.yaml | \\ sed "s|--nodes={{{{ node_asg_min }}}}:{{{{ node_asg_max }}}}:{{{{ name }}}}|--node-group-auto-discovery=asg:tag=k8s.io/cluster-autoscaler/enabled,k8s.io/cluster-autoscaler/{CLUSTER_NAME}|" | \\ sed 's|kubernetes.io/role: master|node-role.kubernetes.io/master: ""|' | \\ @@ -640,36 +706,41 @@ def getKubernetesAutoscalerSetupCommands(self, values: Dict[str, str]) -> str: sed '/value: "true"/d' | \\ sed 's|path: "/etc/ssl/certs/ca-bundle.crt"|path: "/usr/share/ca-certificates/ca-certificates.crt"|' | \\ kubectl apply -f - - ''').format(**values) + """ + ).format(**values) - def getKubernetesCloudProvider(self) -> Optional[str]: + def getKubernetesCloudProvider(self) -> str | None: """ Use the "aws" Kubernetes cloud provider when setting up Kubernetes. """ - return 'aws' + return "aws" - def getNodeShape(self, instance_type: str, preemptible: bool=False) -> Shape: + def getNodeShape(self, instance_type: str, preemptible: bool = False) -> Shape: """ Get the Shape for the given instance type (e.g. 't2.medium'). """ type_info = E2Instances[instance_type] - disk = type_info.disks * type_info.disk_capacity * 2 ** 30 + disk = type_info.disks * type_info.disk_capacity * 2**30 if disk == 0: # This is an EBS-backed instance. We will use the root # volume, so add the amount of EBS storage requested for # the root volume - disk = self._nodeStorageOverrides.get(instance_type, self._nodeStorage) * 2 ** 30 + disk = ( + self._nodeStorageOverrides.get(instance_type, self._nodeStorage) * 2**30 + ) # Underestimate memory by 100M to prevent autoscaler from disagreeing with # mesos about whether a job can run on a particular node type - memory = (type_info.memory - 0.1) * 2 ** 30 - return Shape(wallTime=60 * 60, - memory=int(memory), - cores=type_info.cores, - disk=int(disk), - preemptible=preemptible) + memory = (type_info.memory - 0.1) * 2**30 + return Shape( + wallTime=60 * 60, + memory=int(memory), + cores=type_info.cores, + disk=int(disk), + preemptible=preemptible, + ) @staticmethod def retryPredicate(e: Exception) -> bool: @@ -684,14 +755,14 @@ def destroyCluster(self) -> None: try: leader = self._getLeaderInstanceBoto3() vpcId = leader.get("VpcId") - logger.info('Terminating the leader first ...') + logger.info("Terminating the leader first ...") self._terminateInstances([leader]) except (NoSuchClusterException, InvalidClusterStateException): # It's ok if the leader is not found. We'll terminate any remaining # instances below anyway. pass - logger.debug('Deleting autoscaling groups ...') + logger.debug("Deleting autoscaling groups ...") removed = False for attempt in old_retry(timeout=300, predicate=expectedShutdownErrors): @@ -699,39 +770,47 @@ def destroyCluster(self) -> None: for asgName in self._getAutoScalingGroupNames(): try: # We delete the group and all the instances via ForceDelete. - self.aws.client(self._region, 'autoscaling').delete_auto_scaling_group(AutoScalingGroupName=asgName, ForceDelete=True) + self.aws.client( + self._region, "autoscaling" + ).delete_auto_scaling_group( + AutoScalingGroupName=asgName, ForceDelete=True + ) removed = True except ClientError as e: - if get_error_code(e) == 'ValidationError' and 'AutoScalingGroup name not found' in get_error_message(e): + if get_error_code( + e + ) == "ValidationError" and "AutoScalingGroup name not found" in get_error_message( + e + ): # This ASG does not need to be removed (or a # previous delete returned an error but also # succeeded). pass if removed: - logger.debug('... Successfully deleted autoscaling groups') + logger.debug("... Successfully deleted autoscaling groups") # Do the workers after the ASGs because some may belong to ASGs - logger.info('Terminating any remaining workers ...') + logger.info("Terminating any remaining workers ...") removed = False instances = self._get_nodes_in_cluster_boto3(include_stopped_nodes=True) spotIDs = self._getSpotRequestIDs() - boto3_ec2: "EC2Client" = self.aws.client( - region=self._region, service_name="ec2" - ) + boto3_ec2: EC2Client = self.aws.client(region=self._region, service_name="ec2") if spotIDs: boto3_ec2.cancel_spot_instance_requests(SpotInstanceRequestIds=spotIDs) # self.aws.boto2(self._region, 'ec2').cancel_spot_instance_requests(request_ids=spotIDs) removed = True - instancesToTerminate = awsFilterImpairedNodes(instances, self.aws.client(self._region, 'ec2')) + instancesToTerminate = awsFilterImpairedNodes( + instances, self.aws.client(self._region, "ec2") + ) if instancesToTerminate: vpcId = vpcId or instancesToTerminate[0].get("VpcId") self._terminateInstances(instancesToTerminate) removed = True if removed: - logger.debug('... Successfully terminated workers') + logger.debug("... Successfully terminated workers") - logger.info('Deleting launch templates ...') + logger.info("Deleting launch templates ...") removed = False for attempt in old_retry(timeout=300, predicate=expectedShutdownErrors): with attempt: @@ -740,7 +819,7 @@ def destroyCluster(self) -> None: mistake = False for ltID in self._get_launch_template_ids(): response = boto3_ec2.delete_launch_template(LaunchTemplateId=ltID) - if 'LaunchTemplate' not in response: + if "LaunchTemplate" not in response: mistake = True else: removed = True @@ -748,51 +827,59 @@ def destroyCluster(self) -> None: # We missed something removed = False if removed: - logger.debug('... Successfully deleted launch templates') + logger.debug("... Successfully deleted launch templates") if len(instances) == len(instancesToTerminate): # All nodes are gone now. - logger.info('Deleting IAM roles ...') + logger.info("Deleting IAM roles ...") self._deleteRoles(self._getRoleNames()) self._deleteInstanceProfiles(self._getInstanceProfileNames()) - logger.info('Deleting security group ...') + logger.info("Deleting security group ...") removed = False for attempt in old_retry(timeout=300, predicate=expectedShutdownErrors): with attempt: - security_groups: List["SecurityGroupTypeDef"] = ( + security_groups: list[SecurityGroupTypeDef] = ( boto3_ec2.describe_security_groups()["SecurityGroups"] ) for security_group in security_groups: # TODO: If we terminate the leader and the workers but # miss the security group, we won't find it now because # we won't have vpcId set. - if security_group.get("GroupName") == self.clusterName and vpcId and security_group.get("VpcId") == vpcId: + if ( + security_group.get("GroupName") == self.clusterName + and vpcId + and security_group.get("VpcId") == vpcId + ): try: - boto3_ec2.delete_security_group(GroupId=security_group["GroupId"]) + boto3_ec2.delete_security_group( + GroupId=security_group["GroupId"] + ) removed = True except ClientError as e: - if get_error_code(e) == 'InvalidGroup.NotFound': + if get_error_code(e) == "InvalidGroup.NotFound": pass else: raise if removed: - logger.debug('... Successfully deleted security group') + logger.debug("... Successfully deleted security group") else: assert len(instances) > len(instancesToTerminate) # the security group can't be deleted until all nodes are terminated - logger.warning('The TOIL_AWS_NODE_DEBUG environment variable is set and some nodes ' - 'have failed health checks. As a result, the security group & IAM ' - 'roles will not be deleted.') + logger.warning( + "The TOIL_AWS_NODE_DEBUG environment variable is set and some nodes " + "have failed health checks. As a result, the security group & IAM " + "roles will not be deleted." + ) # delete S3 buckets that might have been created by `self._write_file_to_cloud()` - logger.info('Deleting S3 buckets ...') + logger.info("Deleting S3 buckets ...") removed = False for attempt in old_retry(timeout=300, predicate=awsRetryPredicate): with attempt: # Grab the S3 resource to use - s3 = self.aws.resource(self._region, 's3') + s3 = self.aws.resource(self._region, "s3") try: bucket = s3.Bucket(self.s3_bucket_name) @@ -808,13 +895,15 @@ def destroyCluster(self) -> None: else: raise # retry this if removed: - print('... Successfully deleted S3 buckets') + print("... Successfully deleted S3 buckets") - def terminateNodes(self, nodes: List[Node]) -> None: + def terminateNodes(self, nodes: list[Node]) -> None: if nodes: self._terminateIDs([x.name for x in nodes]) - def _recover_node_type_bid(self, node_type: Set[str], spot_bid: Optional[float]) -> Optional[float]: + def _recover_node_type_bid( + self, node_type: set[str], spot_bid: float | None + ) -> float | None: """ The old Toil-managed autoscaler will tell us to make some nodes of particular instance types, and to just work out a bid, but it doesn't @@ -841,15 +930,23 @@ def _recover_node_type_bid(self, node_type: Set[str], spot_bid: Optional[float]) break if spot_bid is None: # We didn't bid on any class including this type either - raise RuntimeError("No spot bid given for a preemptible node request.") + raise RuntimeError( + "No spot bid given for a preemptible node request." + ) else: raise RuntimeError("No spot bid given for a preemptible node request.") return spot_bid - def addNodes(self, nodeTypes: Set[str], numNodes: int, preemptible: bool, spotBid: Optional[float]=None) -> int: + def addNodes( + self, + nodeTypes: set[str], + numNodes: int, + preemptible: bool, + spotBid: float | None = None, + ) -> int: # Grab the AWS connection we need - boto3_ec2 = get_client(service_name='ec2', region_name=self._region) + boto3_ec2 = get_client(service_name="ec2", region_name=self._region) assert self._leaderPrivateIP if preemptible: @@ -861,8 +958,7 @@ def addNodes(self, nodeTypes: Set[str], numNodes: int, preemptible: bool, spotBi node_type = next(iter(nodeTypes)) type_info = E2Instances[node_type] root_vol_size = self._nodeStorageOverrides.get(node_type, self._nodeStorage) - bdm = self._getBoto3BlockDeviceMapping(type_info, - rootVolSize=root_vol_size) + bdm = self._getBoto3BlockDeviceMapping(type_info, rootVolSize=root_vol_size) # Pick a zone and subnet_id to launch into if preemptible: @@ -884,7 +980,9 @@ def addNodes(self, nodeTypes: Set[str], numNodes: int, preemptible: bool, spotBi # Pick an arbitrary zone we can use. zone = next(iter(self._worker_subnets_by_zone.keys())) if zone is None: - logger.exception("Could not find a valid zone. Make sure TOIL_AWS_ZONE is set or spot bids are not too low.") + logger.exception( + "Could not find a valid zone. Make sure TOIL_AWS_ZONE is set or spot bids are not too low." + ) raise NoSuchZoneException() if self._leader_subnet in self._worker_subnets_by_zone.get(zone, []): # The leader's subnet is an option for this zone, so use it. @@ -894,40 +992,38 @@ def addNodes(self, nodeTypes: Set[str], numNodes: int, preemptible: bool, spotBi subnet_id = next(iter(self._worker_subnets_by_zone[zone])) keyPath = self._sseKey if self._sseKey else None - userData: str = self._getIgnitionUserData('worker', keyPath, preemptible, self._architecture) + userData: str = self._getIgnitionUserData( + "worker", keyPath, preemptible, self._architecture + ) userDataBytes: bytes = b"" if isinstance(userData, str): # Spot-market provisioning requires bytes for user data. - userDataBytes = userData.encode('utf-8') - - spot_kwargs = {'KeyName': self._keyName, - 'LaunchSpecification': { - 'SecurityGroupIds': self._getSecurityGroupIDs(), - 'InstanceType': type_info.name, - 'UserData': userDataBytes, - 'BlockDeviceMappings': bdm, - 'IamInstanceProfile': { - 'Arn': self._leaderProfileArn - }, - 'Placement': { - 'AvailabilityZone': zone - }, - 'SubnetId': subnet_id} - } - on_demand_kwargs = {'KeyName': self._keyName, - 'SecurityGroupIds': self._getSecurityGroupIDs(), - 'InstanceType': type_info.name, - 'UserData': userDataBytes, - 'BlockDeviceMappings': bdm, - 'IamInstanceProfile': { - 'Arn': self._leaderProfileArn - }, - 'Placement': { - 'AvailabilityZone': zone - }, - 'SubnetId': subnet_id} - - instancesLaunched: List["InstanceTypeDef"] = [] + userDataBytes = userData.encode("utf-8") + + spot_kwargs = { + "KeyName": self._keyName, + "LaunchSpecification": { + "SecurityGroupIds": self._getSecurityGroupIDs(), + "InstanceType": type_info.name, + "UserData": userDataBytes, + "BlockDeviceMappings": bdm, + "IamInstanceProfile": {"Arn": self._leaderProfileArn}, + "Placement": {"AvailabilityZone": zone}, + "SubnetId": subnet_id, + }, + } + on_demand_kwargs = { + "KeyName": self._keyName, + "SecurityGroupIds": self._getSecurityGroupIDs(), + "InstanceType": type_info.name, + "UserData": userDataBytes, + "BlockDeviceMappings": bdm, + "IamInstanceProfile": {"Arn": self._leaderProfileArn}, + "Placement": {"AvailabilityZone": zone}, + "SubnetId": subnet_id, + } + + instancesLaunched: list[InstanceTypeDef] = [] for attempt in old_retry(predicate=awsRetryPredicate): with attempt: @@ -935,59 +1031,85 @@ def addNodes(self, nodeTypes: Set[str], numNodes: int, preemptible: bool, spotBi # the biggest obstacle is AWS request throttling, so we retry on these errors at # every request in this method if not preemptible: - logger.debug('Launching %s non-preemptible nodes', numNodes) - instancesLaunched = create_ondemand_instances(boto3_ec2=boto3_ec2, - image_id=self._discoverAMI(), - spec=on_demand_kwargs, num_instances=numNodes) + logger.debug("Launching %s non-preemptible nodes", numNodes) + instancesLaunched = create_ondemand_instances( + boto3_ec2=boto3_ec2, + image_id=self._discoverAMI(), + spec=on_demand_kwargs, + num_instances=numNodes, + ) else: - logger.debug('Launching %s preemptible nodes', numNodes) + logger.debug("Launching %s preemptible nodes", numNodes) # force generator to evaluate - generatedInstancesLaunched: List[ - "DescribeInstancesResultTypeDef" - ] = list( - create_spot_instances( - boto3_ec2=boto3_ec2, - price=spotBid, - image_id=self._discoverAMI(), - tags={_TAG_KEY_TOIL_CLUSTER_NAME: self.clusterName}, - spec=spot_kwargs, - num_instances=numNodes, - tentative=True, + generatedInstancesLaunched: list[DescribeInstancesResultTypeDef] = ( + list( + create_spot_instances( + boto3_ec2=boto3_ec2, + price=spotBid, + image_id=self._discoverAMI(), + tags={_TAG_KEY_TOIL_CLUSTER_NAME: self.clusterName}, + spec=spot_kwargs, + num_instances=numNodes, + tentative=True, + ) ) ) # flatten the list - flatten_reservations: List["ReservationTypeDef"] = [ + flatten_reservations: list[ReservationTypeDef] = [ reservation for subdict in generatedInstancesLaunched for reservation in subdict["Reservations"] for key, value in subdict.items() ] # get a flattened list of all requested instances, as before instancesLaunched is a dict of reservations which is a dict of instance requests - instancesLaunched = [instance for instances in flatten_reservations for instance in instances['Instances']] + instancesLaunched = [ + instance + for instances in flatten_reservations + for instance in instances["Instances"] + ] for attempt in old_retry(predicate=awsRetryPredicate): with attempt: - list(wait_instances_running(boto3_ec2, instancesLaunched)) # ensure all instances are running + list( + wait_instances_running(boto3_ec2, instancesLaunched) + ) # ensure all instances are running increase_instance_hop_limit(boto3_ec2, instancesLaunched) - self._tags[_TAG_KEY_TOIL_NODE_TYPE] = 'worker' + self._tags[_TAG_KEY_TOIL_NODE_TYPE] = "worker" AWSProvisioner._addTags(boto3_ec2, instancesLaunched, self._tags) if self._sseKey: for i in instancesLaunched: self._waitForIP(i) - node = Node(publicIP=i['PublicIpAddress'], privateIP=i['PrivateIpAddress'], name=i['InstanceId'], - launchTime=i['LaunchTime'], nodeType=i['InstanceType'], preemptible=preemptible, - tags=collapse_tags(i['Tags'])) - node.waitForNode('toil_worker') - node.coreRsync([self._sseKey, ':' + self._sseKey], applianceName='toil_worker') - logger.debug('Launched %s new instance(s)', numNodes) + node = Node( + publicIP=i["PublicIpAddress"], + privateIP=i["PrivateIpAddress"], + name=i["InstanceId"], + launchTime=i["LaunchTime"], + nodeType=i["InstanceType"], + preemptible=preemptible, + tags=collapse_tags(i["Tags"]), + ) + node.waitForNode("toil_worker") + node.coreRsync( + [self._sseKey, ":" + self._sseKey], applianceName="toil_worker" + ) + logger.debug("Launched %s new instance(s)", numNodes) return len(instancesLaunched) - def addManagedNodes(self, nodeTypes: Set[str], minNodes: int, maxNodes: int, preemptible: bool, spotBid: Optional[float] = None) -> None: + def addManagedNodes( + self, + nodeTypes: set[str], + minNodes: int, + maxNodes: int, + preemptible: bool, + spotBid: float | None = None, + ) -> None: - if self.clusterType != 'kubernetes': - raise ManagedNodesNotSupportedException("Managed nodes only supported for Kubernetes clusters") + if self.clusterType != "kubernetes": + raise ManagedNodesNotSupportedException( + "Managed nodes only supported for Kubernetes clusters" + ) assert self._leaderPrivateIP @@ -999,27 +1121,51 @@ def addManagedNodes(self, nodeTypes: Set[str], minNodes: int, maxNodes: int, pre # Make one template per node type, so we can apply storage overrides correctly # TODO: deduplicate these if the same instance type appears in multiple sets? - launch_template_ids = {n: self._get_worker_launch_template(n, preemptible=preemptible) for n in nodeTypes} + launch_template_ids = { + n: self._get_worker_launch_template(n, preemptible=preemptible) + for n in nodeTypes + } # Make the ASG across all of them - self._createWorkerAutoScalingGroup(launch_template_ids, nodeTypes, minNodes, maxNodes, - spot_bid=spotBid) + self._createWorkerAutoScalingGroup( + launch_template_ids, nodeTypes, minNodes, maxNodes, spot_bid=spotBid + ) - def getProvisionedWorkers(self, instance_type: Optional[str] = None, preemptible: Optional[bool] = None) -> List[Node]: + def getProvisionedWorkers( + self, instance_type: str | None = None, preemptible: bool | None = None + ) -> list[Node]: assert self._leaderPrivateIP entireCluster = self._get_nodes_in_cluster_boto3(instance_type=instance_type) logger.debug("All nodes in cluster: %s", entireCluster) - workerInstances: List["InstanceTypeDef"] = [ + workerInstances: list[InstanceTypeDef] = [ i for i in entireCluster if i["PrivateIpAddress"] != self._leaderPrivateIP ] logger.debug("All workers found in cluster: %s", workerInstances) if preemptible is not None: - workerInstances = [i for i in workerInstances if preemptible == (i["SpotInstanceRequestId"] is not None)] - logger.debug('%spreemptible workers found in cluster: %s', 'non-' if not preemptible else '', workerInstances) - workerInstances = awsFilterImpairedNodes(workerInstances, self.aws.client(self._region, 'ec2')) - return [Node(publicIP=i["PublicIpAddress"], privateIP=i["PrivateIpAddress"], - name=i["InstanceId"], launchTime=i["LaunchTime"], nodeType=i["InstanceType"], - preemptible=i["SpotInstanceRequestId"] is not None, tags=collapse_tags(i["Tags"])) - for i in workerInstances] + workerInstances = [ + i + for i in workerInstances + if preemptible == (i["SpotInstanceRequestId"] is not None) + ] + logger.debug( + "%spreemptible workers found in cluster: %s", + "non-" if not preemptible else "", + workerInstances, + ) + workerInstances = awsFilterImpairedNodes( + workerInstances, self.aws.client(self._region, "ec2") + ) + return [ + Node( + publicIP=i["PublicIpAddress"], + privateIP=i["PrivateIpAddress"], + name=i["InstanceId"], + launchTime=i["LaunchTime"], + nodeType=i["InstanceType"], + preemptible=i["SpotInstanceRequestId"] is not None, + tags=collapse_tags(i["Tags"]), + ) + for i in workerInstances + ] @memoize def _discoverAMI(self) -> str: @@ -1027,17 +1173,19 @@ def _discoverAMI(self) -> str: :return: The AMI ID (a string like 'ami-0a9a5d2b65cce04eb') for Flatcar. :rtype: str """ - return get_flatcar_ami(self.aws.client(self._region, 'ec2'), self._architecture) + return get_flatcar_ami(self.aws.client(self._region, "ec2"), self._architecture) def _toNameSpace(self) -> str: assert isinstance(self.clusterName, (str, bytes)) - if any(char.isupper() for char in self.clusterName) or '_' in self.clusterName: - raise RuntimeError("The cluster name must be lowercase and cannot contain the '_' " - "character.") + if any(char.isupper() for char in self.clusterName) or "_" in self.clusterName: + raise RuntimeError( + "The cluster name must be lowercase and cannot contain the '_' " + "character." + ) namespace = self.clusterName - if not namespace.startswith('/'): - namespace = '/' + namespace + '/' - return namespace.replace('-', '/') + if not namespace.startswith("/"): + namespace = "/" + namespace + "/" + return namespace.replace("-", "/") def _namespace_name(self, name: str) -> str: """ @@ -1048,7 +1196,7 @@ def _namespace_name(self, name: str) -> str: # This logic is a bit weird, but it's what Boto2Context used to use. # Drop the leading / from the absolute-path-style "namespace" name and # then encode underscores and slashes. - return (self._toNameSpace() + name)[1:].replace('_', '__').replace('/', '_') + return (self._toNameSpace() + name)[1:].replace("_", "__").replace("/", "_") def _is_our_namespaced_name(self, namespaced_name: str) -> bool: """ @@ -1056,15 +1204,17 @@ def _is_our_namespaced_name(self, namespaced_name: str) -> bool: and was generated by _namespace_name(). """ - denamespaced = '/' + '_'.join(s.replace('_', '/') for s in namespaced_name.split('__')) + denamespaced = "/" + "_".join( + s.replace("_", "/") for s in namespaced_name.split("__") + ) return denamespaced.startswith(self._toNameSpace()) - def _getLeaderInstanceBoto3(self) -> "InstanceTypeDef": + def _getLeaderInstanceBoto3(self) -> InstanceTypeDef: """ Get the Boto 3 instance for the cluster's leader. """ # Tags are stored differently in Boto 3 - instances: List["InstanceTypeDef"] = self._get_nodes_in_cluster_boto3( + instances: list[InstanceTypeDef] = self._get_nodes_in_cluster_boto3( include_stopped_nodes=True ) instances.sort(key=lambda x: x["LaunchTime"]) @@ -1073,37 +1223,41 @@ def _getLeaderInstanceBoto3(self) -> "InstanceTypeDef": except IndexError: raise NoSuchClusterException(self.clusterName) if leader.get("Tags") is not None: - tag_value = next(item["Value"] for item in leader["Tags"] if item["Key"] == _TAG_KEY_TOIL_NODE_TYPE) + tag_value = next( + item["Value"] + for item in leader["Tags"] + if item["Key"] == _TAG_KEY_TOIL_NODE_TYPE + ) else: tag_value = None - if (tag_value or 'leader') != 'leader': + if (tag_value or "leader") != "leader": raise InvalidClusterStateException( - 'Invalid cluster state! The first launched instance appears not to be the leader ' + "Invalid cluster state! The first launched instance appears not to be the leader " 'as it is missing the "leader" tag. The safest recovery is to destroy the cluster ' - 'and restart the job. Incorrect Leader ID: %s' % leader["InstanceId"] + "and restart the job. Incorrect Leader ID: %s" % leader["InstanceId"] ) return leader - def _getLeaderInstance(self) -> "InstanceTypeDef": + def _getLeaderInstance(self) -> InstanceTypeDef: """ Get the Boto 2 instance for the cluster's leader. """ instances = self._get_nodes_in_cluster_boto3(include_stopped_nodes=True) instances.sort(key=lambda x: x["LaunchTime"]) try: - leader: "InstanceTypeDef" = instances[0] # assume leader was launched first + leader: InstanceTypeDef = instances[0] # assume leader was launched first except IndexError: raise NoSuchClusterException(self.clusterName) - tagged_node_type: str = 'leader' + tagged_node_type: str = "leader" for tag in leader["Tags"]: # If a tag specifying node type exists, if tag.get("Key") is not None and tag["Key"] == _TAG_KEY_TOIL_NODE_TYPE: tagged_node_type = tag["Value"] - if tagged_node_type != 'leader': + if tagged_node_type != "leader": raise InvalidClusterStateException( - 'Invalid cluster state! The first launched instance appears not to be the leader ' + "Invalid cluster state! The first launched instance appears not to be the leader " 'as it is missing the "leader" tag. The safest recovery is to destroy the cluster ' - 'and restart the job. Incorrect Leader ID: %s' % leader["InstanceId"] + "and restart the job. Incorrect Leader ID: %s" % leader["InstanceId"] ) return leader @@ -1111,72 +1265,87 @@ def getLeader(self, wait: bool = False) -> Node: """ Get the leader for the cluster as a Toil Node object. """ - leader: "InstanceTypeDef" = self._getLeaderInstanceBoto3() + leader: InstanceTypeDef = self._getLeaderInstanceBoto3() - leaderNode = Node(publicIP=leader["PublicIpAddress"], privateIP=leader["PrivateIpAddress"], - name=leader["InstanceId"], launchTime=leader["LaunchTime"], nodeType=None, - preemptible=False, tags=collapse_tags(leader["Tags"])) + leaderNode = Node( + publicIP=leader["PublicIpAddress"], + privateIP=leader["PrivateIpAddress"], + name=leader["InstanceId"], + launchTime=leader["LaunchTime"], + nodeType=None, + preemptible=False, + tags=collapse_tags(leader["Tags"]), + ) if wait: logger.debug("Waiting for toil_leader to enter 'running' state...") - wait_instances_running(self.aws.client(self._region, 'ec2'), [leader]) - logger.debug('... toil_leader is running') + wait_instances_running(self.aws.client(self._region, "ec2"), [leader]) + logger.debug("... toil_leader is running") self._waitForIP(leader) - leaderNode.waitForNode('toil_leader') + leaderNode.waitForNode("toil_leader") return leaderNode @classmethod @awsRetry def _addTag( - cls, boto3_ec2: "EC2Client", instance: "InstanceTypeDef", key: str, value: str + cls, boto3_ec2: EC2Client, instance: InstanceTypeDef, key: str, value: str ) -> None: if instance.get("Tags") is None: instance["Tags"] = [] - new_tag: "TagTypeDef" = {"Key": key, "Value": value} + new_tag: TagTypeDef = {"Key": key, "Value": value} boto3_ec2.create_tags(Resources=[instance["InstanceId"]], Tags=[new_tag]) @classmethod def _addTags( cls, - boto3_ec2: "EC2Client", - instances: List["InstanceTypeDef"], - tags: Dict[str, str], + boto3_ec2: EC2Client, + instances: list[InstanceTypeDef], + tags: dict[str, str], ) -> None: for instance in instances: for key, value in tags.items(): cls._addTag(boto3_ec2, instance, key, value) @classmethod - def _waitForIP(cls, instance: "InstanceTypeDef") -> None: + def _waitForIP(cls, instance: InstanceTypeDef) -> None: """ Wait until the instances has a public IP address assigned to it. :type instance: boto.ec2.instance.Instance """ - logger.debug('Waiting for ip...') + logger.debug("Waiting for ip...") while True: time.sleep(a_short_time) - if instance.get("PublicIpAddress") or instance.get("PublicDnsName") or instance.get("PrivateIpAddress"): - logger.debug('...got ip') + if ( + instance.get("PublicIpAddress") + or instance.get("PublicDnsName") + or instance.get("PrivateIpAddress") + ): + logger.debug("...got ip") break - def _terminateInstances(self, instances: List["InstanceTypeDef"]) -> None: + def _terminateInstances(self, instances: list[InstanceTypeDef]) -> None: instanceIDs = [x["InstanceId"] for x in instances] self._terminateIDs(instanceIDs) - logger.info('... Waiting for instance(s) to shut down...') + logger.info("... Waiting for instance(s) to shut down...") for instance in instances: - wait_transition(self.aws.client(region=self._region, service_name="ec2"), instance, {'pending', 'running', 'shutting-down', 'stopping', 'stopped'}, 'terminated') - logger.info('Instance(s) terminated.') + wait_transition( + self.aws.client(region=self._region, service_name="ec2"), + instance, + {"pending", "running", "shutting-down", "stopping", "stopped"}, + "terminated", + ) + logger.info("Instance(s) terminated.") @awsRetry - def _terminateIDs(self, instanceIDs: List[str]) -> None: - logger.info('Terminating instance(s): %s', instanceIDs) + def _terminateIDs(self, instanceIDs: list[str]) -> None: + logger.info("Terminating instance(s): %s", instanceIDs) boto3_ec2 = self.aws.client(region=self._region, service_name="ec2") boto3_ec2.terminate_instances(InstanceIds=instanceIDs) - logger.info('Instance(s) terminated.') + logger.info("Instance(s) terminated.") @awsRetry - def _deleteRoles(self, names: List[str]) -> None: + def _deleteRoles(self, names: list[str]) -> None: """ Delete all the given named IAM roles. Detatches but does not delete associated instance profiles. @@ -1189,22 +1358,24 @@ def _deleteRoles(self, names: List[str]) -> None: for attempt in old_retry(timeout=300, predicate=expectedShutdownErrors): with attempt: - boto3_iam.remove_role_from_instance_profile(InstanceProfileName=profile_name, - RoleName=role_name) + boto3_iam.remove_role_from_instance_profile( + InstanceProfileName=profile_name, RoleName=role_name + ) # We also need to drop all inline policies for policy_name in self._getRoleInlinePolicyNames(role_name): for attempt in old_retry(timeout=300, predicate=expectedShutdownErrors): with attempt: - boto3_iam.delete_role_policy(PolicyName=policy_name, - RoleName=role_name) + boto3_iam.delete_role_policy( + PolicyName=policy_name, RoleName=role_name + ) for attempt in old_retry(timeout=300, predicate=expectedShutdownErrors): with attempt: boto3_iam.delete_role(RoleName=role_name) - logger.debug('... Successfully deleted IAM role %s', role_name) + logger.debug("... Successfully deleted IAM role %s", role_name) @awsRetry - def _deleteInstanceProfiles(self, names: List[str]) -> None: + def _deleteInstanceProfiles(self, names: list[str]) -> None: """ Delete all the given named IAM instance profiles. All roles must already be detached. @@ -1214,21 +1385,23 @@ def _deleteInstanceProfiles(self, names: List[str]) -> None: for attempt in old_retry(timeout=300, predicate=expectedShutdownErrors): with attempt: boto3_iam.delete_instance_profile(InstanceProfileName=profile_name) - logger.debug('... Succesfully deleted instance profile %s', profile_name) + logger.debug( + "... Succesfully deleted instance profile %s", profile_name + ) @classmethod def _getBoto3BlockDeviceMapping( cls, type_info: InstanceType, rootVolSize: int = 50 - ) -> List["BlockDeviceMappingTypeDef"]: + ) -> list[BlockDeviceMappingTypeDef]: # determine number of ephemeral drives via cgcloud-lib (actually this is moved into toil's lib - bdtKeys = [''] + [f'/dev/xvd{c}' for c in string.ascii_lowercase[1:]] - bdm_list: List["BlockDeviceMappingTypeDef"] = [] + bdtKeys = [""] + [f"/dev/xvd{c}" for c in string.ascii_lowercase[1:]] + bdm_list: list[BlockDeviceMappingTypeDef] = [] # Change root volume size to allow for bigger Docker instances - root_vol: "EbsBlockDeviceTypeDef" = { + root_vol: EbsBlockDeviceTypeDef = { "DeleteOnTermination": True, "VolumeSize": rootVolSize, } - bdm: "BlockDeviceMappingTypeDef" = {"DeviceName": "/dev/xvda", "Ebs": root_vol} + bdm: BlockDeviceMappingTypeDef = {"DeviceName": "/dev/xvda", "Ebs": root_vol} bdm_list.append(bdm) # The first disk is already attached for us so start with 2nd. # Disk count is weirdly a float in our instance database, so make it an int here. @@ -1240,19 +1413,19 @@ def _getBoto3BlockDeviceMapping( # bdm["Ebs"] = root_vol.update({"VirtualName": f"ephemeral{disk - 1}"}) bdm_list.append(bdm) - logger.debug('Device mapping: %s', bdm_list) + logger.debug("Device mapping: %s", bdm_list) return bdm_list @classmethod def _getBoto3BlockDeviceMappings( cls, type_info: InstanceType, rootVolSize: int = 50 - ) -> List["BlockDeviceMappingTypeDef"]: + ) -> list[BlockDeviceMappingTypeDef]: """ Get block device mappings for the root volume for a worker. """ # Start with the root - bdms: List["BlockDeviceMappingTypeDef"] = [ + bdms: list[BlockDeviceMappingTypeDef] = [ { "DeviceName": "/dev/xvda", "Ebs": { @@ -1264,112 +1437,128 @@ def _getBoto3BlockDeviceMappings( ] # Get all the virtual drives we might have - bdtKeys = [f'/dev/xvd{c}' for c in string.ascii_lowercase] + bdtKeys = [f"/dev/xvd{c}" for c in string.ascii_lowercase] # The first disk is already attached for us so start with 2nd. # Disk count is weirdly a float in our instance database, so make it an int here. for disk in range(1, int(type_info.disks) + 1): # Make a block device mapping to attach the ephemeral disk to a # virtual block device in the VM - bdms.append({ - 'DeviceName': bdtKeys[disk], - 'VirtualName': f'ephemeral{disk - 1}' # ephemeral counts start at 0 - }) - logger.debug('Device mapping: %s', bdms) + bdms.append( + { + "DeviceName": bdtKeys[disk], + "VirtualName": f"ephemeral{disk - 1}", # ephemeral counts start at 0 + } + ) + logger.debug("Device mapping: %s", bdms) return bdms @awsRetry def _get_nodes_in_cluster_boto3( - self, instance_type: Optional[str] = None, include_stopped_nodes: bool = False - ) -> List["InstanceTypeDef"]: + self, instance_type: str | None = None, include_stopped_nodes: bool = False + ) -> list[InstanceTypeDef]: """ Get Boto3 instance objects for all nodes in the cluster. """ - boto3_ec2: "EC2Client" = self.aws.client( - region=self._region, service_name="ec2" - ) - instance_filter: "FilterTypeDef" = { + boto3_ec2: EC2Client = self.aws.client(region=self._region, service_name="ec2") + instance_filter: FilterTypeDef = { "Name": "instance.group-name", "Values": [self.clusterName], } - describe_response: "DescribeInstancesResultTypeDef" = ( + describe_response: DescribeInstancesResultTypeDef = ( boto3_ec2.describe_instances(Filters=[instance_filter]) ) - all_instances: List["InstanceTypeDef"] = [] - for reservation in describe_response['Reservations']: - instances = reservation['Instances'] + all_instances: list[InstanceTypeDef] = [] + for reservation in describe_response["Reservations"]: + instances = reservation["Instances"] all_instances.extend(instances) # all_instances = self.aws.boto2(self._region, 'ec2').get_only_instances(filters={'instance.group-name': self.clusterName}) - def instanceFilter(i: "InstanceTypeDef") -> bool: + def instanceFilter(i: InstanceTypeDef) -> bool: # filter by type only if nodeType is true - rightType = not instance_type or i['InstanceType'] == instance_type - rightState = i['State']['Name'] == 'running' or i['State']['Name'] == 'pending' + rightType = not instance_type or i["InstanceType"] == instance_type + rightState = ( + i["State"]["Name"] == "running" or i["State"]["Name"] == "pending" + ) if include_stopped_nodes: - rightState = rightState or i['State']['Name'] == 'stopping' or i['State']['Name'] == 'stopped' + rightState = ( + rightState + or i["State"]["Name"] == "stopping" + or i["State"]["Name"] == "stopped" + ) return rightType and rightState return [i for i in all_instances if instanceFilter(i)] - def _getSpotRequestIDs(self) -> List[str]: + def _getSpotRequestIDs(self) -> list[str]: """ Get the IDs of all spot requests associated with the cluster. """ # Grab the connection we need to use for this operation. - ec2: "EC2Client" = self.aws.client(self._region, "ec2") + ec2: EC2Client = self.aws.client(self._region, "ec2") - requests: List["SpotInstanceRequestTypeDef"] = ( + requests: list[SpotInstanceRequestTypeDef] = ( ec2.describe_spot_instance_requests()["SpotInstanceRequests"] ) - tag_filter: "FilterTypeDef" = { + tag_filter: FilterTypeDef = { "Name": "tag:" + _TAG_KEY_TOIL_CLUSTER_NAME, "Values": [self.clusterName], } - tags: List["TagDescriptionTypeDef"] = ec2.describe_tags(Filters=[tag_filter])[ + tags: list[TagDescriptionTypeDef] = ec2.describe_tags(Filters=[tag_filter])[ "Tags" ] idsToCancel = [tag["ResourceId"] for tag in tags] - return [request["SpotInstanceRequestId"] for request in requests if request["InstanceId"] in idsToCancel] + return [ + request["SpotInstanceRequestId"] + for request in requests + if request["InstanceId"] in idsToCancel + ] - def _createSecurityGroups(self) -> List[str]: + def _createSecurityGroups(self) -> list[str]: """ Create security groups for the cluster. Returns a list of their IDs. """ + def group_not_found(e: ClientError) -> bool: - retry = (get_error_status(e) == 400 and 'does not exist in default VPC' in get_error_body(e)) + retry = get_error_status( + e + ) == 400 and "does not exist in default VPC" in get_error_body(e) return retry # Grab the connection we need to use for this operation. # The VPC connection can do anything the EC2 one can do, but also look at subnets. - boto3_ec2: "EC2Client" = self.aws.client( - region=self._region, service_name="ec2" - ) + boto3_ec2: EC2Client = self.aws.client(region=self._region, service_name="ec2") vpc_id = None if self._leader_subnet: - subnets = boto3_ec2.describe_subnets(SubnetIds=[self._leader_subnet])["Subnets"] + subnets = boto3_ec2.describe_subnets(SubnetIds=[self._leader_subnet])[ + "Subnets" + ] if len(subnets) > 0: vpc_id = subnets[0]["VpcId"] try: # Security groups need to belong to the same VPC as the leader. If we # put the leader in a particular non-default subnet, it may be in a # particular non-default VPC, which we need to know about. - other = {"GroupName": self.clusterName, "Description": "Toil appliance security group"} + other = { + "GroupName": self.clusterName, + "Description": "Toil appliance security group", + } if vpc_id is not None: other["VpcId"] = vpc_id # mypy stubs don't explicitly state kwargs even though documentation allows it, and mypy gets confused - web_response: "CreateSecurityGroupResultTypeDef" = boto3_ec2.create_security_group(**other) # type: ignore[arg-type] + web_response: CreateSecurityGroupResultTypeDef = boto3_ec2.create_security_group(**other) # type: ignore[arg-type] except ClientError as e: - if get_error_status(e) == 400 and 'already exists' in get_error_body(e): + if get_error_status(e) == 400 and "already exists" in get_error_body(e): pass else: raise else: for attempt in old_retry(predicate=group_not_found, timeout=300): with attempt: - ip_permissions: List["IpPermissionTypeDef"] = [ + ip_permissions: list[IpPermissionTypeDef] = [ { "IpProtocol": "tcp", "FromPort": 22, @@ -1379,21 +1568,34 @@ def group_not_found(e: ClientError) -> bool: } ] for protocol in ("tcp", "udp"): - ip_permissions.append({"IpProtocol": protocol, - "FromPort": 0, - "ToPort": 65535, - "UserIdGroupPairs": - [{"GroupId": web_response["GroupId"], - "GroupName": self.clusterName}]}) - boto3_ec2.authorize_security_group_ingress(IpPermissions=ip_permissions, GroupName=self.clusterName, GroupId=web_response["GroupId"]) + ip_permissions.append( + { + "IpProtocol": protocol, + "FromPort": 0, + "ToPort": 65535, + "UserIdGroupPairs": [ + { + "GroupId": web_response["GroupId"], + "GroupName": self.clusterName, + } + ], + } + ) + boto3_ec2.authorize_security_group_ingress( + IpPermissions=ip_permissions, + GroupName=self.clusterName, + GroupId=web_response["GroupId"], + ) out = [] for sg in boto3_ec2.describe_security_groups()["SecurityGroups"]: - if sg["GroupName"] == self.clusterName and (vpc_id is None or sg["VpcId"] == vpc_id): + if sg["GroupName"] == self.clusterName and ( + vpc_id is None or sg["VpcId"] == vpc_id + ): out.append(sg["GroupId"]) return out @awsRetry - def _getSecurityGroupIDs(self) -> List[str]: + def _getSecurityGroupIDs(self) -> list[str]: """ Get all the security group IDs to apply to leaders and workers. """ @@ -1402,15 +1604,20 @@ def _getSecurityGroupIDs(self) -> List[str]: # Depending on if we enumerated them on the leader or locally, we might # know the required security groups by name, ID, or both. - boto3_ec2 = self.aws.client(region=self._region, service_name='ec2') - return [sg["GroupId"] for sg in boto3_ec2.describe_security_groups()["SecurityGroups"] - if (sg["GroupName"] in self._leaderSecurityGroupNames or - sg["GroupId"] in self._leaderSecurityGroupIDs)] + boto3_ec2 = self.aws.client(region=self._region, service_name="ec2") + return [ + sg["GroupId"] + for sg in boto3_ec2.describe_security_groups()["SecurityGroups"] + if ( + sg["GroupName"] in self._leaderSecurityGroupNames + or sg["GroupId"] in self._leaderSecurityGroupIDs + ) + ] @awsRetry def _get_launch_template_ids( - self, filters: Optional[List["FilterTypeDef"]] = None - ) -> List[str]: + self, filters: list[FilterTypeDef] | None = None + ) -> list[str]: """ Find all launch templates associated with the cluster. @@ -1418,10 +1625,10 @@ def _get_launch_template_ids( """ # Grab the connection we need to use for this operation. - ec2: "EC2Client" = self.aws.client(self._region, "ec2") + ec2: EC2Client = self.aws.client(self._region, "ec2") # How do we match the right templates? - combined_filters: List["FilterTypeDef"] = [ + combined_filters: list[FilterTypeDef] = [ {"Name": "tag:" + _TAG_KEY_TOIL_CLUSTER_NAME, "Values": [self.clusterName]} ] @@ -1431,16 +1638,21 @@ def _get_launch_template_ids( allTemplateIDs = [] # Get the first page with no NextToken - response = ec2.describe_launch_templates(Filters=combined_filters, - MaxResults=200) + response = ec2.describe_launch_templates( + Filters=combined_filters, MaxResults=200 + ) while True: # Process the current page - allTemplateIDs += [item['LaunchTemplateId'] for item in response.get('LaunchTemplates', [])] - if 'NextToken' in response: + allTemplateIDs += [ + item["LaunchTemplateId"] for item in response.get("LaunchTemplates", []) + ] + if "NextToken" in response: # There are more pages. Get the next one, supplying the token. - response = ec2.describe_launch_templates(Filters=filters or [], - NextToken=response['NextToken'], - MaxResults=200) + response = ec2.describe_launch_templates( + Filters=filters or [], + NextToken=response["NextToken"], + MaxResults=200, + ) else: # No more pages break @@ -1448,7 +1660,9 @@ def _get_launch_template_ids( return allTemplateIDs @awsRetry - def _get_worker_launch_template(self, instance_type: str, preemptible: bool = False, backoff: float = 1.0) -> str: + def _get_worker_launch_template( + self, instance_type: str, preemptible: bool = False, backoff: float = 1.0 + ) -> str: """ Get a launch template for instances with the given parameters. Only one such launch template will be created, no matter how many times the @@ -1467,38 +1681,55 @@ def _get_worker_launch_template(self, instance_type: str, preemptible: bool = Fa :return: The ID of the template. """ - lt_name = self._name_worker_launch_template(instance_type, preemptible=preemptible) + lt_name = self._name_worker_launch_template( + instance_type, preemptible=preemptible + ) # How do we match the right templates? - filters: List["FilterTypeDef"] = [ + filters: list[FilterTypeDef] = [ {"Name": "launch-template-name", "Values": [lt_name]} ] # Get the templates - templates: List[str] = self._get_launch_template_ids(filters=filters) + templates: list[str] = self._get_launch_template_ids(filters=filters) if len(templates) > 1: # There shouldn't ever be multiple templates with our reserved name - raise RuntimeError(f"Multiple launch templates already exist named {lt_name}; " - "something else is operating in our cluster namespace.") + raise RuntimeError( + f"Multiple launch templates already exist named {lt_name}; " + "something else is operating in our cluster namespace." + ) elif len(templates) == 0: # Template doesn't exist so we can create it. try: - return self._create_worker_launch_template(instance_type, preemptible=preemptible) + return self._create_worker_launch_template( + instance_type, preemptible=preemptible + ) except ClientError as e: - if get_error_code(e) == 'InvalidLaunchTemplateName.AlreadyExistsException': + if ( + get_error_code(e) + == "InvalidLaunchTemplateName.AlreadyExistsException" + ): # Someone got to it before us (or we couldn't read our own # writes). Recurse to try again, because now it exists. - logger.info('Waiting %f seconds for template %s to be available', backoff, lt_name) + logger.info( + "Waiting %f seconds for template %s to be available", + backoff, + lt_name, + ) time.sleep(backoff) - return self._get_worker_launch_template(instance_type, preemptible=preemptible, backoff=backoff * 2) + return self._get_worker_launch_template( + instance_type, preemptible=preemptible, backoff=backoff * 2 + ) else: raise else: # There must be exactly one template return templates[0] - def _name_worker_launch_template(self, instance_type: str, preemptible: bool = False) -> str: + def _name_worker_launch_template( + self, instance_type: str, preemptible: bool = False + ) -> str: """ Get the name we should use for the launch template with the given parameters. @@ -1509,13 +1740,15 @@ def _name_worker_launch_template(self, instance_type: str, preemptible: bool = F """ # The name has the cluster name in it - lt_name = f'{self.clusterName}-lt-{instance_type}' + lt_name = f"{self.clusterName}-lt-{instance_type}" if preemptible: - lt_name += '-spot' + lt_name += "-spot" return lt_name - def _create_worker_launch_template(self, instance_type: str, preemptible: bool = False) -> str: + def _create_worker_launch_template( + self, instance_type: str, preemptible: bool = False + ) -> str: """ Create the launch template for launching worker instances for the cluster. @@ -1535,27 +1768,33 @@ def _create_worker_launch_template(self, instance_type: str, preemptible: bool = bdms = self._getBoto3BlockDeviceMappings(type_info, rootVolSize=rootVolSize) keyPath = self._sseKey if self._sseKey else None - userData = self._getIgnitionUserData('worker', keyPath, preemptible, self._architecture) + userData = self._getIgnitionUserData( + "worker", keyPath, preemptible, self._architecture + ) - lt_name = self._name_worker_launch_template(instance_type, preemptible=preemptible) + lt_name = self._name_worker_launch_template( + instance_type, preemptible=preemptible + ) # But really we find it by tag tags = dict(self._tags) - tags[_TAG_KEY_TOIL_NODE_TYPE] = 'worker' - - return create_launch_template(self.aws.client(self._region, 'ec2'), - template_name=lt_name, - image_id=self._discoverAMI(), - key_name=self._keyName, - security_group_ids=self._getSecurityGroupIDs(), - instance_type=instance_type, - user_data=userData, - block_device_map=bdms, - instance_profile_arn=self._leaderProfileArn, - tags=tags) + tags[_TAG_KEY_TOIL_NODE_TYPE] = "worker" + + return create_launch_template( + self.aws.client(self._region, "ec2"), + template_name=lt_name, + image_id=self._discoverAMI(), + key_name=self._keyName, + security_group_ids=self._getSecurityGroupIDs(), + instance_type=instance_type, + user_data=userData, + block_device_map=bdms, + instance_profile_arn=self._leaderProfileArn, + tags=tags, + ) @awsRetry - def _getAutoScalingGroupNames(self) -> List[str]: + def _getAutoScalingGroupNames(self) -> list[str]: """ Find all auto-scaling groups associated with the cluster. @@ -1563,13 +1802,13 @@ def _getAutoScalingGroupNames(self) -> List[str]: """ # Grab the connection we need to use for this operation. - autoscaling: "AutoScalingClient" = self.aws.client(self._region, "autoscaling") + autoscaling: AutoScalingClient = self.aws.client(self._region, "autoscaling") # AWS won't filter ASGs server-side for us in describe_auto_scaling_groups. # So we search instances of applied tags for the ASGs they are on. # The ASGs tagged with our cluster are our ASGs. # The filtering is on different fields of the tag object itself. - filters: List["FilterTypeDef"] = [ + filters: list[FilterTypeDef] = [ {"Name": "key", "Values": [_TAG_KEY_TOIL_CLUSTER_NAME]}, {"Name": "value", "Values": [self.clusterName]}, ] @@ -1579,13 +1818,17 @@ def _getAutoScalingGroupNames(self) -> List[str]: response = autoscaling.describe_tags(Filters=filters) while True: # Process the current page - matchedASGs += [item['ResourceId'] for item in response.get('Tags', []) - if item['Key'] == _TAG_KEY_TOIL_CLUSTER_NAME and - item['Value'] == self.clusterName] - if 'NextToken' in response: + matchedASGs += [ + item["ResourceId"] + for item in response.get("Tags", []) + if item["Key"] == _TAG_KEY_TOIL_CLUSTER_NAME + and item["Value"] == self.clusterName + ] + if "NextToken" in response: # There are more pages. Get the next one, supplying the token. - response = autoscaling.describe_tags(Filters=filters, - NextToken=response['NextToken']) + response = autoscaling.describe_tags( + Filters=filters, NextToken=response["NextToken"] + ) else: # No more pages break @@ -1593,16 +1836,18 @@ def _getAutoScalingGroupNames(self) -> List[str]: for name in matchedASGs: # Double check to make sure we definitely aren't finding non-Toil # things - assert name.startswith('toil-') + assert name.startswith("toil-") return matchedASGs - def _createWorkerAutoScalingGroup(self, - launch_template_ids: Dict[str, str], - instance_types: Collection[str], - min_size: int, - max_size: int, - spot_bid: Optional[float] = None) -> str: + def _createWorkerAutoScalingGroup( + self, + launch_template_ids: dict[str, str], + instance_types: Collection[str], + min_size: int, + max_size: int, + spot_bid: float | None = None, + ) -> str: """ Create an autoscaling group. @@ -1632,8 +1877,12 @@ def _createWorkerAutoScalingGroup(self, for instance_type in instance_types: spec = E2Instances[instance_type] spec_gigs = spec.disks * spec.disk_capacity - rootVolSize = self._nodeStorageOverrides.get(instance_type, self._nodeStorage) - storage_gigs.append(max(rootVolSize - _STORAGE_ROOT_OVERHEAD_GIGS, spec_gigs)) + rootVolSize = self._nodeStorageOverrides.get( + instance_type, self._nodeStorage + ) + storage_gigs.append( + max(rootVolSize - _STORAGE_ROOT_OVERHEAD_GIGS, spec_gigs) + ) # Get the min storage we expect to see, but not less than 0. min_gigs = max(min(storage_gigs), 0) @@ -1642,32 +1891,36 @@ def _createWorkerAutoScalingGroup(self, tags = dict(self._tags) # We tag the ASG with the Toil type, although nothing cares. - tags[_TAG_KEY_TOIL_NODE_TYPE] = 'worker' + tags[_TAG_KEY_TOIL_NODE_TYPE] = "worker" - if self.clusterType == 'kubernetes': + if self.clusterType == "kubernetes": # We also need to tag it with Kubernetes autoscaler info (empty tags) - tags['k8s.io/cluster-autoscaler/' + self.clusterName] = '' - assert (self.clusterName != 'enabled') - tags['k8s.io/cluster-autoscaler/enabled'] = '' - tags['k8s.io/cluster-autoscaler/node-template/resources/ephemeral-storage'] = f'{min_gigs}G' + tags["k8s.io/cluster-autoscaler/" + self.clusterName] = "" + assert self.clusterName != "enabled" + tags["k8s.io/cluster-autoscaler/enabled"] = "" + tags[ + "k8s.io/cluster-autoscaler/node-template/resources/ephemeral-storage" + ] = f"{min_gigs}G" # Now we need to make up a unique name # TODO: can we make this more semantic without risking collisions? Maybe count up in memory? - asg_name = 'toil-' + str(uuid.uuid4()) - - create_auto_scaling_group(self.aws.client(self._region, 'autoscaling'), - asg_name=asg_name, - launch_template_ids=launch_template_ids, - vpc_subnets=self._get_worker_subnets(), - min_size=min_size, - max_size=max_size, - instance_types=instance_types, - spot_bid=spot_bid, - tags=tags) + asg_name = "toil-" + str(uuid.uuid4()) + + create_auto_scaling_group( + self.aws.client(self._region, "autoscaling"), + asg_name=asg_name, + launch_template_ids=launch_template_ids, + vpc_subnets=self._get_worker_subnets(), + min_size=min_size, + max_size=max_size, + instance_types=instance_types, + spot_bid=spot_bid, + tags=tags, + ) return asg_name - def _boto2_pager(self, requestor_callable: Callable[[...], Any], result_attribute_name: str) -> Iterable[Dict[str, Any]]: # type: ignore[misc] + def _boto2_pager(self, requestor_callable: Callable[[...], Any], result_attribute_name: str) -> Iterable[dict[str, Any]]: # type: ignore[misc] """ Yield all the results from calling the given Boto 2 method and paging through all the results using the "marker" field. Results are to be @@ -1677,20 +1930,20 @@ def _boto2_pager(self, requestor_callable: Callable[[...], Any], result_attribut while True: result = requestor_callable(marker=marker) # type: ignore[call-arg] yield from getattr(result, result_attribute_name) - if result.is_truncated == 'true': + if result.is_truncated == "true": marker = result.marker else: break @awsRetry - def _getRoleNames(self) -> List[str]: + def _getRoleNames(self) -> list[str]: """ Get all the IAM roles belonging to the cluster, as names. """ results = [] - boto3_iam = self.aws.client(self._region, 'iam') - for result in boto3_pager(boto3_iam.list_roles, 'Roles'): + boto3_iam = self.aws.client(self._region, "iam") + for result in boto3_pager(boto3_iam.list_roles, "Roles"): # For each Boto2 role object # Grab out the name result2 = cast("RoleTypeDef", result) @@ -1701,15 +1954,14 @@ def _getRoleNames(self) -> List[str]: return results @awsRetry - def _getInstanceProfileNames(self) -> List[str]: + def _getInstanceProfileNames(self) -> list[str]: """ Get all the instance profiles belonging to the cluster, as names. """ results = [] - boto3_iam = self.aws.client(self._region, 'iam') - for result in boto3_pager(boto3_iam.list_instance_profiles, - 'InstanceProfiles'): + boto3_iam = self.aws.client(self._region, "iam") + for result in boto3_pager(boto3_iam.list_instance_profiles, "InstanceProfiles"): # Grab out the name result2 = cast("InstanceProfileTypeDef", result) name = result2["InstanceProfileName"] @@ -1719,7 +1971,7 @@ def _getInstanceProfileNames(self) -> List[str]: return results @awsRetry - def _getRoleInstanceProfileNames(self, role_name: str) -> List[str]: + def _getRoleInstanceProfileNames(self, role_name: str) -> list[str]: """ Get all the instance profiles with the IAM role with the given name. @@ -1727,14 +1979,19 @@ def _getRoleInstanceProfileNames(self, role_name: str) -> List[str]: """ # Grab the connection we need to use for this operation. - boto3_iam: "IAMClient" = self.aws.client(self._region, "iam") - - return [item['InstanceProfileName'] for item in boto3_pager(boto3_iam.list_instance_profiles_for_role, - 'InstanceProfiles', - RoleName=role_name)] + boto3_iam: IAMClient = self.aws.client(self._region, "iam") + + return [ + item["InstanceProfileName"] + for item in boto3_pager( + boto3_iam.list_instance_profiles_for_role, + "InstanceProfiles", + RoleName=role_name, + ) + ] @awsRetry - def _getRolePolicyArns(self, role_name: str) -> List[str]: + def _getRolePolicyArns(self, role_name: str) -> list[str]: """ Get all the policies attached to the IAM role with the given name. @@ -1744,34 +2001,44 @@ def _getRolePolicyArns(self, role_name: str) -> List[str]: """ # Grab the connection we need to use for this operation. - boto3_iam: "IAMClient" = self.aws.client(self._region, "iam") + boto3_iam: IAMClient = self.aws.client(self._region, "iam") # TODO: we don't currently use attached policies. - return [item['PolicyArn'] for item in boto3_pager(boto3_iam.list_attached_role_policies, - 'AttachedPolicies', - RoleName=role_name)] + return [ + item["PolicyArn"] + for item in boto3_pager( + boto3_iam.list_attached_role_policies, + "AttachedPolicies", + RoleName=role_name, + ) + ] @awsRetry - def _getRoleInlinePolicyNames(self, role_name: str) -> List[str]: + def _getRoleInlinePolicyNames(self, role_name: str) -> list[str]: """ Get all the policies inline in the given IAM role. Returns policy names. """ # Grab the connection we need to use for this operation. - boto3_iam: "IAMClient" = self.aws.client(self._region, "iam") + boto3_iam: IAMClient = self.aws.client(self._region, "iam") - return list(boto3_pager(boto3_iam.list_role_policies, 'PolicyNames', RoleName=role_name)) + return list( + boto3_pager(boto3_iam.list_role_policies, "PolicyNames", RoleName=role_name) + ) - def full_policy(self, resource: str) -> Dict[str, Any]: + def full_policy(self, resource: str) -> dict[str, Any]: """ Produce a dict describing the JSON form of a full-access-granting AWS IAM policy for the service with the given name (e.g. 's3'). """ - return dict(Version="2012-10-17", Statement=[dict(Effect="Allow", Resource="*", Action=f"{resource}:*")]) + return dict( + Version="2012-10-17", + Statement=[dict(Effect="Allow", Resource="*", Action=f"{resource}:*")], + ) - def kubernetes_policy(self) -> Dict[str, Any]: + def kubernetes_policy(self) -> dict[str, Any]: """ Get the Kubernetes policy grants not provided by the full grants on EC2 and IAM. See @@ -1785,69 +2052,85 @@ def kubernetes_policy(self) -> Dict[str, Any]: Some of these are really only needed on the leader. """ - return dict(Version="2012-10-17", Statement=[dict(Effect="Allow", Resource="*", Action=[ - "ecr:GetAuthorizationToken", - "ecr:BatchCheckLayerAvailability", - "ecr:GetDownloadUrlForLayer", - "ecr:GetRepositoryPolicy", - "ecr:DescribeRepositories", - "ecr:ListImages", - "ecr:BatchGetImage", - "autoscaling:DescribeAutoScalingGroups", - "autoscaling:DescribeAutoScalingInstances", - "autoscaling:DescribeLaunchConfigurations", - "autoscaling:DescribeTags", - "autoscaling:SetDesiredCapacity", - "autoscaling:TerminateInstanceInAutoScalingGroup", - "elasticloadbalancing:AddTags", - "elasticloadbalancing:ApplySecurityGroupsToLoadBalancer", - "elasticloadbalancing:AttachLoadBalancerToSubnets", - "elasticloadbalancing:ConfigureHealthCheck", - "elasticloadbalancing:CreateListener", - "elasticloadbalancing:CreateLoadBalancer", - "elasticloadbalancing:CreateLoadBalancerListeners", - "elasticloadbalancing:CreateLoadBalancerPolicy", - "elasticloadbalancing:CreateTargetGroup", - "elasticloadbalancing:DeleteListener", - "elasticloadbalancing:DeleteLoadBalancer", - "elasticloadbalancing:DeleteLoadBalancerListeners", - "elasticloadbalancing:DeleteTargetGroup", - "elasticloadbalancing:DeregisterInstancesFromLoadBalancer", - "elasticloadbalancing:DeregisterTargets", - "elasticloadbalancing:DescribeListeners", - "elasticloadbalancing:DescribeLoadBalancerAttributes", - "elasticloadbalancing:DescribeLoadBalancerPolicies", - "elasticloadbalancing:DescribeLoadBalancers", - "elasticloadbalancing:DescribeTargetGroups", - "elasticloadbalancing:DescribeTargetHealth", - "elasticloadbalancing:DetachLoadBalancerFromSubnets", - "elasticloadbalancing:ModifyListener", - "elasticloadbalancing:ModifyLoadBalancerAttributes", - "elasticloadbalancing:ModifyTargetGroup", - "elasticloadbalancing:RegisterInstancesWithLoadBalancer", - "elasticloadbalancing:RegisterTargets", - "elasticloadbalancing:SetLoadBalancerPoliciesForBackendServer", - "elasticloadbalancing:SetLoadBalancerPoliciesOfListener", - "kms:DescribeKey" - ])]) - - def _setup_iam_ec2_role(self, local_role_name: str, policies: Dict[str, Any]) -> str: + return dict( + Version="2012-10-17", + Statement=[ + dict( + Effect="Allow", + Resource="*", + Action=[ + "ecr:GetAuthorizationToken", + "ecr:BatchCheckLayerAvailability", + "ecr:GetDownloadUrlForLayer", + "ecr:GetRepositoryPolicy", + "ecr:DescribeRepositories", + "ecr:ListImages", + "ecr:BatchGetImage", + "autoscaling:DescribeAutoScalingGroups", + "autoscaling:DescribeAutoScalingInstances", + "autoscaling:DescribeLaunchConfigurations", + "autoscaling:DescribeTags", + "autoscaling:SetDesiredCapacity", + "autoscaling:TerminateInstanceInAutoScalingGroup", + "elasticloadbalancing:AddTags", + "elasticloadbalancing:ApplySecurityGroupsToLoadBalancer", + "elasticloadbalancing:AttachLoadBalancerToSubnets", + "elasticloadbalancing:ConfigureHealthCheck", + "elasticloadbalancing:CreateListener", + "elasticloadbalancing:CreateLoadBalancer", + "elasticloadbalancing:CreateLoadBalancerListeners", + "elasticloadbalancing:CreateLoadBalancerPolicy", + "elasticloadbalancing:CreateTargetGroup", + "elasticloadbalancing:DeleteListener", + "elasticloadbalancing:DeleteLoadBalancer", + "elasticloadbalancing:DeleteLoadBalancerListeners", + "elasticloadbalancing:DeleteTargetGroup", + "elasticloadbalancing:DeregisterInstancesFromLoadBalancer", + "elasticloadbalancing:DeregisterTargets", + "elasticloadbalancing:DescribeListeners", + "elasticloadbalancing:DescribeLoadBalancerAttributes", + "elasticloadbalancing:DescribeLoadBalancerPolicies", + "elasticloadbalancing:DescribeLoadBalancers", + "elasticloadbalancing:DescribeTargetGroups", + "elasticloadbalancing:DescribeTargetHealth", + "elasticloadbalancing:DetachLoadBalancerFromSubnets", + "elasticloadbalancing:ModifyListener", + "elasticloadbalancing:ModifyLoadBalancerAttributes", + "elasticloadbalancing:ModifyTargetGroup", + "elasticloadbalancing:RegisterInstancesWithLoadBalancer", + "elasticloadbalancing:RegisterTargets", + "elasticloadbalancing:SetLoadBalancerPoliciesForBackendServer", + "elasticloadbalancing:SetLoadBalancerPoliciesOfListener", + "kms:DescribeKey", + ], + ) + ], + ) + + def _setup_iam_ec2_role( + self, local_role_name: str, policies: dict[str, Any] + ) -> str: """ Create an IAM role with the given policies, using the given name in addition to the cluster name, and return its full name. """ - ec2_role_policy_document = json.dumps({ - "Version": "2012-10-17", - "Statement": [{ - "Effect": "Allow", - "Principal": {"Service": ["ec2.amazonaws.com"]}, - "Action": ["sts:AssumeRole"]} - ]}) + ec2_role_policy_document = json.dumps( + { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": {"Service": ["ec2.amazonaws.com"]}, + "Action": ["sts:AssumeRole"], + } + ], + } + ) return create_iam_role( role_name=self._namespace_name(local_role_name), assume_role_policy_document=ec2_role_policy_document, policies=policies, - region=self._region + region=self._region, ) @awsRetry @@ -1860,22 +2143,30 @@ def _createProfileArn(self) -> str: """ # Grab the connection we need to use for this operation. - boto3_iam: "IAMClient" = self.aws.client(self._region, "iam") + boto3_iam: IAMClient = self.aws.client(self._region, "iam") - policy = dict(iam_full=self.full_policy('iam'), ec2_full=self.full_policy('ec2'), - s3_full=self.full_policy('s3'), sbd_full=self.full_policy('sdb')) - if self.clusterType == 'kubernetes': + policy = dict( + iam_full=self.full_policy("iam"), + ec2_full=self.full_policy("ec2"), + s3_full=self.full_policy("s3"), + sbd_full=self.full_policy("sdb"), + ) + if self.clusterType == "kubernetes": # We also need autoscaling groups and some other stuff for AWS-Kubernetes integrations. # TODO: We use one merged policy for leader and worker, but we could be more specific. - policy['kubernetes_merged'] = self.kubernetes_policy() + policy["kubernetes_merged"] = self.kubernetes_policy() iamRoleName = self._setup_iam_ec2_role(_INSTANCE_PROFILE_ROLE_NAME, policy) try: - profile_result = boto3_iam.get_instance_profile(InstanceProfileName=iamRoleName) - profile: "InstanceProfileTypeDef" = profile_result["InstanceProfile"] + profile_result = boto3_iam.get_instance_profile( + InstanceProfileName=iamRoleName + ) + profile: InstanceProfileTypeDef = profile_result["InstanceProfile"] logger.debug("Have preexisting instance profile: %s", profile) except boto3_iam.exceptions.NoSuchEntityException: - profile_result = boto3_iam.create_instance_profile(InstanceProfileName=iamRoleName) + profile_result = boto3_iam.create_instance_profile( + InstanceProfileName=iamRoleName + ) profile = profile_result["InstanceProfile"] logger.debug("Created new instance profile: %s", profile) else: @@ -1891,21 +2182,28 @@ def _createProfileArn(self) -> str: # This is too many roles. We probably grabbed something we should # not have by mistake, and this is some important profile for # something else. - raise RuntimeError(f'Did not expect instance profile {profile_arn} to contain ' - f'more than one role; is it really a Toil-managed profile?') + raise RuntimeError( + f"Did not expect instance profile {profile_arn} to contain " + f"more than one role; is it really a Toil-managed profile?" + ) elif len(profile["Roles"]) == 1: if profile["Roles"][0]["RoleName"] == iamRoleName: return profile_arn else: # Drop this wrong role and use the fallback code for 0 roles - boto3_iam.remove_role_from_instance_profile(InstanceProfileName=iamRoleName, - RoleName=profile["Roles"][0]["RoleName"]) + boto3_iam.remove_role_from_instance_profile( + InstanceProfileName=iamRoleName, + RoleName=profile["Roles"][0]["RoleName"], + ) # If we get here, we had 0 roles on the profile, or we had 1 but we removed it. for attempt in old_retry(predicate=lambda err: get_error_status(err) == 404): with attempt: # Put the IAM role on the profile - boto3_iam.add_role_to_instance_profile(InstanceProfileName=profile["InstanceProfileName"], RoleName=iamRoleName) + boto3_iam.add_role_to_instance_profile( + InstanceProfileName=profile["InstanceProfileName"], + RoleName=iamRoleName, + ) logger.debug("Associated role %s with profile", iamRoleName) return profile_arn diff --git a/src/toil/provisioners/clusterScaler.py b/src/toil/provisioners/clusterScaler.py index c31ed5b84a..f331cee06b 100644 --- a/src/toil/provisioners/clusterScaler.py +++ b/src/toil/provisioners/clusterScaler.py @@ -18,27 +18,21 @@ import os import time from collections import defaultdict -from typing import (TYPE_CHECKING, - Any, - Callable, - Dict, - List, - Optional, - Set, - Tuple, - Union) - -from toil.batchSystems.abstractBatchSystem import (AbstractBatchSystem, - AbstractScalableBatchSystem, - NodeInfo) +from typing import TYPE_CHECKING, Any, Callable, Optional, Union + +from toil.batchSystems.abstractBatchSystem import ( + AbstractBatchSystem, + AbstractScalableBatchSystem, + NodeInfo, +) from toil.bus import ClusterDesiredSizeMessage, ClusterSizeMessage from toil.common import Config -from toil.options.common import defaultTargetTime from toil.job import JobDescription, ServiceJobDescription from toil.lib.conversions import bytes2human, human2bytes from toil.lib.retry import old_retry from toil.lib.threading import ExceptionalThread from toil.lib.throttle import throttle +from toil.options.common import defaultTargetTime from toil.provisioners.abstractProvisioner import AbstractProvisioner, Shape if TYPE_CHECKING: @@ -48,18 +42,25 @@ logger = logging.getLogger(__name__) # Properties of GKE's memory overhead algorithm -EVICTION_THRESHOLD = human2bytes('100MiB') -RESERVE_SMALL_LIMIT = human2bytes('1GiB') -RESERVE_SMALL_AMOUNT = human2bytes('255MiB') -RESERVE_BREAKPOINTS: List[Union[int, float]] = [human2bytes('4GiB'), human2bytes('8GiB'), human2bytes('16GiB'), human2bytes('128GiB'), math.inf] +EVICTION_THRESHOLD = human2bytes("100MiB") +RESERVE_SMALL_LIMIT = human2bytes("1GiB") +RESERVE_SMALL_AMOUNT = human2bytes("255MiB") +RESERVE_BREAKPOINTS: list[Union[int, float]] = [ + human2bytes("4GiB"), + human2bytes("8GiB"), + human2bytes("16GiB"), + human2bytes("128GiB"), + math.inf, +] RESERVE_FRACTIONS = [0.25, 0.2, 0.1, 0.06, 0.02] # Guess of how much disk space on the root volume is used for the OS and essential container images -OS_SIZE = human2bytes('5G') +OS_SIZE = human2bytes("5G") # Define a type for an explanation of why a job can't fit on a node. # Consists of a resource name and a constraining value for that resource. -FailedConstraint = Tuple[str, Union[int, float, bool]] +FailedConstraint = tuple[str, Union[int, float, bool]] + class BinPackedFit: """ @@ -80,24 +81,30 @@ class BinPackedFit: :returns: The minimum number of minimal node allocations estimated to be required to run all the jobs in jobShapes. """ - nodeReservations: Dict[Shape, List['NodeReservation']] - def __init__(self, nodeShapes: List[Shape], targetTime: float = defaultTargetTime) -> None: + nodeReservations: dict[Shape, list["NodeReservation"]] + + def __init__( + self, nodeShapes: list[Shape], targetTime: float = defaultTargetTime + ) -> None: self.nodeShapes = sorted(nodeShapes) self.targetTime = targetTime self.nodeReservations = {nodeShape: [] for nodeShape in nodeShapes} - def binPack(self, jobShapes: List[Shape]) -> Dict[Shape, List[FailedConstraint]]: + def binPack(self, jobShapes: list[Shape]) -> dict[Shape, list[FailedConstraint]]: """ Pack a list of jobShapes into the fewest nodes reasonable. - + Can be run multiple times. - + Returns any distinct Shapes that did not fit, mapping to reasons they did not fit. """ # TODO: Check for redundancy with batchsystems.mesos.JobQueue() sorting - logger.debug('Running bin packing for node shapes %s and %s job(s).', - self.nodeShapes, len(jobShapes)) + logger.debug( + "Running bin packing for node shapes %s and %s job(s).", + self.nodeShapes, + len(jobShapes), + ) # Sort in descending order from largest to smallest. The FFD like-strategy will pack the # jobs in order from longest to shortest. jobShapes.sort() @@ -111,11 +118,13 @@ def binPack(self, jobShapes: List[Shape]) -> Dict[Shape, List[FailedConstraint]] could_not_fit[rejection[0]] = rejection[1] return could_not_fit - def addJobShape(self, jobShape: Shape) -> Optional[Tuple[Shape, List[FailedConstraint]]]: + def addJobShape( + self, jobShape: Shape + ) -> Optional[tuple[Shape, list[FailedConstraint]]]: """ Add the job to the first node reservation in which it will fit. (This is the bin-packing aspect). - + Returns the job shape again, and a list of failed constraints, if it did not fit. """ chosenNodeShape = None @@ -126,24 +135,33 @@ def addJobShape(self, jobShape: Shape) -> Optional[Tuple[Shape, List[FailedConst break if chosenNodeShape is None: - logger.debug("Couldn't fit job with requirements %s into any nodes in the nodeTypes " - "list.", jobShape) + logger.debug( + "Couldn't fit job with requirements %s into any nodes in the nodeTypes " + "list.", + jobShape, + ) # Go back and debug why this happened. - fewest_constraints: Optional[List[FailedConstraint]] = None + fewest_constraints: Optional[list[FailedConstraint]] = None for shape in self.nodeShapes: failures = NodeReservation(nodeShape).get_failed_constraints(jobShape) - if fewest_constraints is None or len(failures) < len(fewest_constraints): + if fewest_constraints is None or len(failures) < len( + fewest_constraints + ): # This was closer to fitting. # TODO: Check the actual constraint values so we don't tell # the user to raise the memory on the smallest machine? fewest_constraints = failures - - return jobShape, fewest_constraints if fewest_constraints is not None else [] + + return jobShape, ( + fewest_constraints if fewest_constraints is not None else [] + ) # grab current list of job objects appended to this instance type nodeReservations = self.nodeReservations[chosenNodeShape] for nodeReservation in nodeReservations: - if nodeReservation.attemptToAddJob(jobShape, chosenNodeShape, self.targetTime): + if nodeReservation.attemptToAddJob( + jobShape, chosenNodeShape, self.targetTime + ): # We succeeded adding the job to this node reservation. Now we're done. return None @@ -160,7 +178,7 @@ def addJobShape(self, jobShape: Shape) -> Optional[Tuple[Shape, List[FailedConst reservation = extendThisReservation return None - def getRequiredNodes(self) -> Dict[Shape, int]: + def getRequiredNodes(self) -> dict[Shape, int]: """Return a dict from node shape to number of nodes required to run the packed jobs.""" return { nodeShape: len(self.nodeReservations[nodeShape]) @@ -184,48 +202,72 @@ def __init__(self, shape: Shape) -> None: self.nReservation: Optional[NodeReservation] = None def __str__(self) -> str: - return "-------------------\n" \ - "Current Reservation\n" \ - "-------------------\n" \ - "Shape wallTime: %s\n" \ - "Shape memory: %s\n" \ - "Shape cores: %s\n" \ - "Shape disk: %s\n" \ - "Shape preempt: %s\n" \ - "\n" \ - "nReserv wallTime: %s\n" \ - "nReserv memory: %s\n" \ - "nReserv cores: %s\n" \ - "nReserv disk: %s\n" \ - "nReserv preempt: %s\n" \ - "\n" \ - "Time slices: %s\n" \ - "\n" % \ - (self.shape.wallTime, + return ( + "-------------------\n" + "Current Reservation\n" + "-------------------\n" + "Shape wallTime: %s\n" + "Shape memory: %s\n" + "Shape cores: %s\n" + "Shape disk: %s\n" + "Shape preempt: %s\n" + "\n" + "nReserv wallTime: %s\n" + "nReserv memory: %s\n" + "nReserv cores: %s\n" + "nReserv disk: %s\n" + "nReserv preempt: %s\n" + "\n" + "Time slices: %s\n" + "\n" + % ( + self.shape.wallTime, self.shape.memory, self.shape.cores, self.shape.disk, self.shape.preemptible, - self.nReservation.shape.wallTime if self.nReservation is not None else str(None), - self.nReservation.shape.memory if self.nReservation is not None else str(None), - self.nReservation.shape.cores if self.nReservation is not None else str(None), - self.nReservation.shape.disk if self.nReservation is not None else str(None), - self.nReservation.shape.preemptible if self.nReservation is not None else str(None), - str(len(self.shapes()))) - - def get_failed_constraints(self, job_shape: Shape) -> List[FailedConstraint]: + ( + self.nReservation.shape.wallTime + if self.nReservation is not None + else str(None) + ), + ( + self.nReservation.shape.memory + if self.nReservation is not None + else str(None) + ), + ( + self.nReservation.shape.cores + if self.nReservation is not None + else str(None) + ), + ( + self.nReservation.shape.disk + if self.nReservation is not None + else str(None) + ), + ( + self.nReservation.shape.preemptible + if self.nReservation is not None + else str(None) + ), + str(len(self.shapes())), + ) + ) + + def get_failed_constraints(self, job_shape: Shape) -> list[FailedConstraint]: """ Check if a job shape's resource requirements will fit within this allocation. - + If the job does *not* fit, returns the failing constraints: the resources that can't be accomodated, and the limits that were hit. - + If the job *does* fit, returns an empty list. - + Must always agree with fits()! This codepath is slower and used for diagnosis. """ - - failures: List[FailedConstraint] = [] + + failures: list[FailedConstraint] = [] if job_shape.memory > self.shape.memory: failures.append(("memory", self.shape.memory)) if job_shape.cores > self.shape.cores: @@ -235,15 +277,17 @@ def get_failed_constraints(self, job_shape: Shape) -> List[FailedConstraint]: if not job_shape.preemptible and self.shape.preemptible: failures.append(("preemptible", self.shape.preemptible)) return failures - + def fits(self, jobShape: Shape) -> bool: """Check if a job shape's resource requirements will fit within this allocation.""" - return jobShape.memory <= self.shape.memory and \ - jobShape.cores <= self.shape.cores and \ - jobShape.disk <= self.shape.disk and \ - (jobShape.preemptible or not self.shape.preemptible) + return ( + jobShape.memory <= self.shape.memory + and jobShape.cores <= self.shape.cores + and jobShape.disk <= self.shape.disk + and (jobShape.preemptible or not self.shape.preemptible) + ) - def shapes(self) -> List[Shape]: + def shapes(self) -> list[Shape]: """Get all time-slice shapes, in order, from this reservation on.""" shapes = [] curRes: Optional[NodeReservation] = self @@ -254,11 +298,13 @@ def shapes(self) -> List[Shape]: def subtract(self, jobShape: Shape) -> None: """Subtract the resources necessary to run a jobShape from the reservation.""" - self.shape = Shape(self.shape.wallTime, - self.shape.memory - jobShape.memory, - self.shape.cores - jobShape.cores, - self.shape.disk - jobShape.disk, - self.shape.preemptible) + self.shape = Shape( + self.shape.wallTime, + self.shape.memory - jobShape.memory, + self.shape.cores - jobShape.cores, + self.shape.disk - jobShape.disk, + self.shape.preemptible, + ) def attemptToAddJob( self, jobShape: Shape, nodeShape: Shape, targetTime: float @@ -286,27 +332,42 @@ def attemptToAddJob( # does the job time fit in the reservation's remaining time? if availableTime >= jobShape.wallTime: timeSlice: float = 0 - while (startingReservation != endingReservation): + while startingReservation != endingReservation: # removes resources only (NO time) from startingReservation startingReservation.subtract(jobShape) # type: ignore # set aside the timeSlice timeSlice += startingReservation.shape.wallTime # type: ignore startingReservation = startingReservation.nReservation # type: ignore - assert jobShape.wallTime - timeSlice <= startingReservation.shape.wallTime - adjustEndingReservationForJob(endingReservation, jobShape, timeSlice) + assert ( + jobShape.wallTime - timeSlice + <= startingReservation.shape.wallTime + ) + adjustEndingReservationForJob( + endingReservation, jobShape, timeSlice + ) # Packed the job. return True # If the job would fit, but is longer than the total node allocation # extend the node allocation - elif endingReservation.nReservation == None and startingReservation == self: + elif ( + endingReservation.nReservation == None + and startingReservation == self + ): # Extend the node reservation to accommodate jobShape endingReservation.nReservation = NodeReservation(nodeShape) # can't run the job with the current resources else: - if startingReservationTime + availableTime + endingReservation.shape.wallTime <= targetTime: + if ( + startingReservationTime + + availableTime + + endingReservation.shape.wallTime + <= targetTime + ): startingReservation = endingReservation.nReservation - startingReservationTime += availableTime + endingReservation.shape.wallTime + startingReservationTime += ( + availableTime + endingReservation.shape.wallTime + ) availableTime = 0 else: break @@ -332,7 +393,9 @@ def adjustEndingReservationForJob( """ if jobShape.wallTime - wallTime < reservation.shape.wallTime: # This job only partially fills one of the slices. Create a new slice. - reservation.shape, nS = split(reservation.shape, jobShape, jobShape.wallTime - wallTime) + reservation.shape, nS = split( + reservation.shape, jobShape, jobShape.wallTime - wallTime + ) nS.nReservation = reservation.nReservation reservation.nReservation = nS else: @@ -342,30 +405,40 @@ def adjustEndingReservationForJob( def split( nodeShape: Shape, jobShape: Shape, wallTime: float -) -> Tuple[Shape, NodeReservation]: +) -> tuple[Shape, NodeReservation]: """ Partition a node allocation into two to fit the job. Returning the modified shape of the node and a new node reservation for the extra time that the job didn't fill. """ - return (Shape(wallTime, - nodeShape.memory - jobShape.memory, - nodeShape.cores - jobShape.cores, - nodeShape.disk - jobShape.disk, - nodeShape.preemptible), - NodeReservation(Shape(nodeShape.wallTime - wallTime, - nodeShape.memory, - nodeShape.cores, - nodeShape.disk, - nodeShape.preemptible))) - - -def binPacking(nodeShapes: List[Shape], jobShapes: List[Shape], goalTime: float) -> Tuple[Dict[Shape, int], Dict[Shape, List[FailedConstraint]]]: + return ( + Shape( + wallTime, + nodeShape.memory - jobShape.memory, + nodeShape.cores - jobShape.cores, + nodeShape.disk - jobShape.disk, + nodeShape.preemptible, + ), + NodeReservation( + Shape( + nodeShape.wallTime - wallTime, + nodeShape.memory, + nodeShape.cores, + nodeShape.disk, + nodeShape.preemptible, + ) + ), + ) + + +def binPacking( + nodeShapes: list[Shape], jobShapes: list[Shape], goalTime: float +) -> tuple[dict[Shape, int], dict[Shape, list[FailedConstraint]]]: """ Using the given node shape bins, pack the given job shapes into nodes to get them done in the given amount of time. - + Returns a dict saying how many of each node will be needed, a dict from job shapes that could not fit to reasons why. """ @@ -388,34 +461,37 @@ def __init__( self.provisioner = provisioner self.leader = leader self.config = config - self.static: Dict[bool, Dict[str, "Node"]] = {} - + self.static: dict[bool, dict[str, "Node"]] = {} + # If we encounter a Shape of job that we don't think we can run, call # these callbacks with the Shape that didn't fit and the Shapes that # were available. - self.on_too_big: List[Callable[[Shape, List[Shape]], Any]] = [] + self.on_too_big: list[Callable[[Shape, list[Shape]], Any]] = [] # Dictionary of job names to their average runtime, used to estimate wall time of queued # jobs for bin-packing - self.jobNameToAvgRuntime: Dict[str, float] = {} - self.jobNameToNumCompleted: Dict[str, int] = {} + self.jobNameToAvgRuntime: dict[str, float] = {} + self.jobNameToNumCompleted: dict[str, int] = {} self.totalAvgRuntime = 0.0 self.totalJobsCompleted = 0 self.targetTime: float = config.targetTime if self.targetTime <= 0: - raise RuntimeError('targetTime (%s) must be a positive integer!' % self.targetTime) + raise RuntimeError( + "targetTime (%s) must be a positive integer!" % self.targetTime + ) self.betaInertia = config.betaInertia if not 0.0 <= self.betaInertia <= 0.9: - raise RuntimeError('betaInertia (%f) must be between 0.0 and 0.9!' % self.betaInertia) - + raise RuntimeError( + "betaInertia (%f) must be between 0.0 and 0.9!" % self.betaInertia + ) # Pull scaling information from the provisioner. self.nodeShapeToType = provisioner.getAutoscaledInstanceShapes() self.instance_types = list(self.nodeShapeToType.values()) self.nodeShapes = list(self.nodeShapeToType.keys()) - self.ignoredNodes: Set[str] = set() + self.ignoredNodes: set[str] = set() # A *deficit* exists when we have more jobs that can run on preemptible # nodes than we have preemptible nodes. In order to not block these jobs, @@ -426,13 +502,17 @@ def __init__( # of provisioned preemptible nodes and the number of nodes that were requested. # Then, when provisioning non-preemptible nodes of the same type, we attempt to # make up the deficit. - self.preemptibleNodeDeficit = {instance_type: 0 for instance_type in self.instance_types} + self.preemptibleNodeDeficit = { + instance_type: 0 for instance_type in self.instance_types + } # Keeps track of the last raw (i.e. float, not limited by # max/min nodes) estimates of the number of nodes needed for # each node shape. NB: we start with an estimate of 0, so # scaling up is smoothed as well. - self.previousWeightedEstimate = {nodeShape: 0.0 for nodeShape in self.nodeShapes} + self.previousWeightedEstimate = { + nodeShape: 0.0 for nodeShape in self.nodeShapes + } assert len(self.nodeShapes) > 0 @@ -454,26 +534,38 @@ def __init__( self.nodeShapes.sort() # Nodes might not actually provide all the resources of their nominal shapes - self.node_shapes_after_overhead = self.nodeShapes if config.assume_zero_overhead else [self._reserve_overhead(s) for s in self.nodeShapes] - self.without_overhead = {k: v for k, v in zip(self.node_shapes_after_overhead, self.nodeShapes)} + self.node_shapes_after_overhead = ( + self.nodeShapes + if config.assume_zero_overhead + else [self._reserve_overhead(s) for s in self.nodeShapes] + ) + self.without_overhead = { + k: v for k, v in zip(self.node_shapes_after_overhead, self.nodeShapes) + } - #Node shape to number of currently provisioned nodes - totalNodes: Dict[Shape, int] = defaultdict(int) - if isinstance(leader.batchSystem, AbstractScalableBatchSystem) and leader.provisioner: + # Node shape to number of currently provisioned nodes + totalNodes: dict[Shape, int] = defaultdict(int) + if ( + isinstance(leader.batchSystem, AbstractScalableBatchSystem) + and leader.provisioner + ): for preemptible in (True, False): - nodes: List["Node"] = [] + nodes: list["Node"] = [] for nodeShape, instance_type in self.nodeShapeToType.items(): - nodes_thisType = leader.provisioner.getProvisionedWorkers(instance_type=instance_type, - preemptible=preemptible) + nodes_thisType = leader.provisioner.getProvisionedWorkers( + instance_type=instance_type, preemptible=preemptible + ) totalNodes[nodeShape] += len(nodes_thisType) nodes.extend(nodes_thisType) self.setStaticNodes(nodes, preemptible) - logger.debug('Starting with the following nodes in the cluster: %s' % totalNodes) + logger.debug( + "Starting with the following nodes in the cluster: %s" % totalNodes + ) if not sum(config.maxNodes) > 0: - raise RuntimeError('Not configured to create nodes of any type.') + raise RuntimeError("Not configured to create nodes of any type.") def _round(self, number: float) -> int: """ @@ -529,7 +621,7 @@ def _reserve_overhead(self, full_node: Shape) -> Shape: # TODO: Figure out if the disk is an OS disk of a scratch disk smaller.disk -= self._disk_overhead(smaller.disk) - logger.debug('Node shape %s can hold jobs of shape %s', full_node, smaller) + logger.debug("Node shape %s can hold jobs of shape %s", full_node, smaller) return smaller @@ -558,12 +650,21 @@ def _gke_overhead(self, memory_bytes: int) -> int: # since the previous breakpoint, like a progressive income tax. limit = min(breakpoint, memory_bytes) reservation = fraction * (limit - accounted) - logger.debug('Reserve %s of memory between %s and %s', bytes2human(reservation), bytes2human(accounted), bytes2human(limit)) + logger.debug( + "Reserve %s of memory between %s and %s", + bytes2human(reservation), + bytes2human(accounted), + bytes2human(limit), + ) reserved += reservation accounted = limit if accounted >= memory_bytes: break - logger.debug('Reserved %s/%s memory for overhead', bytes2human(reserved), bytes2human(memory_bytes)) + logger.debug( + "Reserved %s/%s memory for overhead", + bytes2human(reserved), + bytes2human(memory_bytes), + ) return int(reserved) + EVICTION_THRESHOLD @@ -579,15 +680,20 @@ def _disk_overhead(self, disk_bytes: int) -> int: if disk_bytes <= disk_needed: # We don't think we can actually use any of this disk - logger.warning('All %sB of disk on a node type are likely to be needed by the OS! The node probably cannot do any useful work!', bytes2human(disk_bytes)) + logger.warning( + "All %sB of disk on a node type are likely to be needed by the OS! The node probably cannot do any useful work!", + bytes2human(disk_bytes), + ) return disk_bytes if disk_needed * 2 > disk_bytes: - logger.warning('A node type has only %sB disk, of which more than half are expected to be used by the OS. Consider using a larger --nodeStorage', bytes2human(disk_bytes)) + logger.warning( + "A node type has only %sB disk, of which more than half are expected to be used by the OS. Consider using a larger --nodeStorage", + bytes2human(disk_bytes), + ) return disk_needed - def getAverageRuntime(self, jobName: str, service: bool = False) -> float: if service: # We short-circuit service jobs and assume that they will @@ -599,15 +705,15 @@ def getAverageRuntime(self, jobName: str, service: bool = False) -> float: # be running at once for any actual work to get done. return self.targetTime * 24 + 3600 if jobName in self.jobNameToAvgRuntime: - #Have seen jobs of this type before, so estimate - #the runtime based on average of previous jobs of this type + # Have seen jobs of this type before, so estimate + # the runtime based on average of previous jobs of this type return self.jobNameToAvgRuntime[jobName] elif self.totalAvgRuntime > 0: - #Haven't seen this job yet, so estimate its runtime as - #the average runtime of all completed jobs + # Haven't seen this job yet, so estimate its runtime as + # the average runtime of all completed jobs return self.totalAvgRuntime else: - #Have no information whatsoever + # Have no information whatsoever return 1.0 def addCompletedJob(self, job: JobDescription, wallTime: int) -> None: @@ -618,21 +724,25 @@ def addCompletedJob(self, job: JobDescription, wallTime: int) -> None: :param int wallTime: The wall-time taken to complete the job in seconds. """ - #Adjust average runtimes to include this job. + # Adjust average runtimes to include this job. if job.jobName in self.jobNameToAvgRuntime: prevAvg = self.jobNameToAvgRuntime[job.jobName] prevNum = self.jobNameToNumCompleted[job.jobName] - self.jobNameToAvgRuntime[job.jobName] = float(prevAvg*prevNum + wallTime)/(prevNum + 1) + self.jobNameToAvgRuntime[job.jobName] = float( + prevAvg * prevNum + wallTime + ) / (prevNum + 1) self.jobNameToNumCompleted[job.jobName] += 1 else: self.jobNameToAvgRuntime[job.jobName] = wallTime self.jobNameToNumCompleted[job.jobName] = 1 self.totalJobsCompleted += 1 - self.totalAvgRuntime = float(self.totalAvgRuntime * (self.totalJobsCompleted - 1) + \ - wallTime)/self.totalJobsCompleted + self.totalAvgRuntime = ( + float(self.totalAvgRuntime * (self.totalJobsCompleted - 1) + wallTime) + / self.totalJobsCompleted + ) - def setStaticNodes(self, nodes: List["Node"], preemptible: bool) -> None: + def setStaticNodes(self, nodes: list["Node"], preemptible: bool) -> None: """ Used to track statically provisioned nodes. This method must be called before any auto-scaled nodes are provisioned. @@ -642,12 +752,12 @@ def setStaticNodes(self, nodes: List["Node"], preemptible: bool) -> None: :param nodes: list of Node objects """ - prefix = 'non-' if not preemptible else '' + prefix = "non-" if not preemptible else "" logger.debug("Adding %s to %spreemptible static nodes", nodes, prefix) if nodes is not None: - self.static[preemptible] = {node.privateIP : node for node in nodes} + self.static[preemptible] = {node.privateIP: node for node in nodes} - def getStaticNodes(self, preemptible: bool) -> Dict[str, "Node"]: + def getStaticNodes(self, preemptible: bool) -> dict[str, "Node"]: """ Returns nodes set in setStaticNodes(). @@ -662,14 +772,17 @@ def smoothEstimate(self, nodeShape: Shape, estimatedNodeCount: int) -> int: Returns an integer. """ - weightedEstimate = (1 - self.betaInertia) * estimatedNodeCount + \ - self.betaInertia * self.previousWeightedEstimate[nodeShape] + weightedEstimate = ( + 1 - self.betaInertia + ) * estimatedNodeCount + self.betaInertia * self.previousWeightedEstimate[ + nodeShape + ] self.previousWeightedEstimate[nodeShape] = weightedEstimate return self._round(weightedEstimate) def getEstimatedNodeCounts( - self, queuedJobShapes: List[Shape], currentNodeCounts: Dict[Shape, int] - ) -> Tuple[Dict[Shape, int], Dict[Shape, List[FailedConstraint]]]: + self, queuedJobShapes: list[Shape], currentNodeCounts: dict[Shape, int] + ) -> tuple[dict[Shape, int], dict[Shape, list[FailedConstraint]]]: """ Given the resource requirements of queued jobs and the current size of the cluster. @@ -682,21 +795,30 @@ def getEstimatedNodeCounts( nodesToRunQueuedJobs, could_not_fit = binPacking( jobShapes=queuedJobShapes, nodeShapes=self.node_shapes_after_overhead, - goalTime=self.targetTime + goalTime=self.targetTime, ) - + # Then translate back to get results in terms of full nodes without overhead. - nodesToRunQueuedJobs = {self.without_overhead[k]: v for k, v in nodesToRunQueuedJobs.items()} + nodesToRunQueuedJobs = { + self.without_overhead[k]: v for k, v in nodesToRunQueuedJobs.items() + } estimatedNodeCounts = {} for nodeShape in self.nodeShapes: instance_type = self.nodeShapeToType[nodeShape] - logger.debug(f"Nodes of type {instance_type} to run queued jobs: {nodesToRunQueuedJobs[nodeShape]}") + logger.debug( + f"Nodes of type {instance_type} to run queued jobs: {nodesToRunQueuedJobs[nodeShape]}" + ) # Actual calculation of the estimated number of nodes required - estimatedNodeCount = 0 if nodesToRunQueuedJobs[nodeShape] == 0 \ + estimatedNodeCount = ( + 0 + if nodesToRunQueuedJobs[nodeShape] == 0 else max(1, self._round(nodesToRunQueuedJobs[nodeShape])) - logger.debug("Estimating %i nodes of shape %s" % (estimatedNodeCount, nodeShape)) + ) + logger.debug( + "Estimating %i nodes of shape %s" % (estimatedNodeCount, nodeShape) + ) # Use inertia parameter to smooth out fluctuations according to an exponentially # weighted moving average. @@ -710,37 +832,56 @@ def getEstimatedNodeCounts( # The number of nodes we provision as compensation for missing preemptible # nodes is the product of the deficit (the number of preemptible nodes we did # _not_ allocate) and configuration preference. - compensationNodes = self._round(self.preemptibleNodeDeficit[instance_type] * compensation) + compensationNodes = self._round( + self.preemptibleNodeDeficit[instance_type] * compensation + ) if compensationNodes > 0: - logger.debug('Adding %d non-preemptible nodes of type %s to compensate for a ' - 'deficit of %d preemptible ones.', compensationNodes, - instance_type, - self.preemptibleNodeDeficit[instance_type]) + logger.debug( + "Adding %d non-preemptible nodes of type %s to compensate for a " + "deficit of %d preemptible ones.", + compensationNodes, + instance_type, + self.preemptibleNodeDeficit[instance_type], + ) estimatedNodeCount += compensationNodes # Tell everyone how big the cluster is - logger.debug("Currently %i nodes of type %s in cluster" % (currentNodeCounts[nodeShape], - instance_type)) - self.leader.toilState.bus.publish(ClusterSizeMessage(instance_type, currentNodeCounts[nodeShape])) - self.leader.toilState.bus.publish(ClusterDesiredSizeMessage(instance_type, estimatedNodeCount)) + logger.debug( + "Currently %i nodes of type %s in cluster" + % (currentNodeCounts[nodeShape], instance_type) + ) + self.leader.toilState.bus.publish( + ClusterSizeMessage(instance_type, currentNodeCounts[nodeShape]) + ) + self.leader.toilState.bus.publish( + ClusterDesiredSizeMessage(instance_type, estimatedNodeCount) + ) # Bound number using the max and min node parameters if estimatedNodeCount > self.maxNodes[nodeShape]: - logger.debug('Limiting the estimated number of necessary %s (%s) to the ' - 'configured maximum (%s).', instance_type, - estimatedNodeCount, - self.maxNodes[nodeShape]) + logger.debug( + "Limiting the estimated number of necessary %s (%s) to the " + "configured maximum (%s).", + instance_type, + estimatedNodeCount, + self.maxNodes[nodeShape], + ) estimatedNodeCount = self.maxNodes[nodeShape] elif estimatedNodeCount < self.minNodes[nodeShape]: - logger.debug('Raising the estimated number of necessary %s (%s) to the ' - 'configured minimum (%s).', instance_type, - estimatedNodeCount, - self.minNodes[nodeShape]) + logger.debug( + "Raising the estimated number of necessary %s (%s) to the " + "configured minimum (%s).", + instance_type, + estimatedNodeCount, + self.minNodes[nodeShape], + ) estimatedNodeCount = self.minNodes[nodeShape] estimatedNodeCounts[nodeShape] = estimatedNodeCount return estimatedNodeCounts, could_not_fit - def updateClusterSize(self, estimatedNodeCounts: Dict[Shape, int]) -> Dict[Shape, int]: + def updateClusterSize( + self, estimatedNodeCounts: dict[Shape, int] + ) -> dict[Shape, int]: """ Given the desired and current size of the cluster, attempts to launch/remove instances to get to the desired size. @@ -752,21 +893,26 @@ def updateClusterSize(self, estimatedNodeCounts: Dict[Shape, int]) -> Dict[Shape for nodeShape, estimatedNodeCount in estimatedNodeCounts.items(): instance_type = self.nodeShapeToType[nodeShape] - newNodeCount = self.setNodeCount(instance_type, estimatedNodeCount, preemptible=nodeShape.preemptible) + newNodeCount = self.setNodeCount( + instance_type, estimatedNodeCount, preemptible=nodeShape.preemptible + ) # If we were scaling up a preemptible node type and failed to meet # our target, we will attempt to compensate for the deficit while scaling # non-preemptible nodes of this type. if nodeShape.preemptible: if newNodeCount < estimatedNodeCount: deficit = estimatedNodeCount - newNodeCount - logger.debug('Preemptible scaler detected deficit of %d nodes of type %s.' % (deficit, instance_type)) + logger.debug( + "Preemptible scaler detected deficit of %d nodes of type %s." + % (deficit, instance_type) + ) self.preemptibleNodeDeficit[instance_type] = deficit else: self.preemptibleNodeDeficit[instance_type] = 0 newNodeCounts[nodeShape] = newNodeCount - #Attempt to terminate any nodes that we previously designated for - #termination, but which still had workers running. + # Attempt to terminate any nodes that we previously designated for + # termination, but which still had workers running. self._terminateIgnoredNodes() return newNodeCounts @@ -800,18 +946,29 @@ def setNodeCount( actual cluster size at the time this method returns. """ if not isinstance(self.leader.batchSystem, AbstractScalableBatchSystem): - raise RuntimeError('Non-scalable batch system abusing a scalable-only function.') + raise RuntimeError( + "Non-scalable batch system abusing a scalable-only function." + ) for attempt in old_retry(predicate=self.provisioner.retryPredicate): with attempt: nodes = self.getNodes(preemptible) logger.debug("Cluster contains %i instances" % len(nodes)) - nodes = {node: nodes[node] for node in nodes if node.nodeType == instance_type} - ignoredNodes = [node for node in nodes if node.privateIP in self.ignoredNodes] + nodes = { + node: nodes[node] + for node in nodes + if node.nodeType == instance_type + } + ignoredNodes = [ + node for node in nodes if node.privateIP in self.ignoredNodes + ] numIgnoredNodes = len(ignoredNodes) numCurrentNodes = len(nodes) - logger.debug("Cluster contains %i instances of type %s (%i ignored and draining jobs until " - "they can be safely terminated)" % (numCurrentNodes, instance_type, numIgnoredNodes)) + logger.debug( + "Cluster contains %i instances of type %s (%i ignored and draining jobs until " + "they can be safely terminated)" + % (numCurrentNodes, instance_type, numIgnoredNodes) + ) if not force: delta = numNodes - (numCurrentNodes - numIgnoredNodes) else: @@ -819,38 +976,59 @@ def setNodeCount( if delta > 0 and numIgnoredNodes > 0: # We can un-ignore a few nodes to compensate for the additional nodes we want. numNodesToUnignore = min(delta, numIgnoredNodes) - logger.debug('Unignoring %i nodes because we want to scale back up again.' % numNodesToUnignore) + logger.debug( + "Unignoring %i nodes because we want to scale back up again." + % numNodesToUnignore + ) delta -= numNodesToUnignore for node in ignoredNodes[:numNodesToUnignore]: self.ignoredNodes.remove(node.privateIP) self.leader.batchSystem.unignoreNode(node.privateIP) if delta > 0: - logger.info('Adding %i %s nodes to get to desired cluster size of %i.', - delta, - 'preemptible' if preemptible else 'non-preemptible', - numNodes) - numNodes = numCurrentNodes + self._addNodes(instance_type, numNodes=delta, - preemptible=preemptible) + logger.info( + "Adding %i %s nodes to get to desired cluster size of %i.", + delta, + "preemptible" if preemptible else "non-preemptible", + numNodes, + ) + numNodes = numCurrentNodes + self._addNodes( + instance_type, numNodes=delta, preemptible=preemptible + ) elif delta < 0: - logger.info('Removing %i %s nodes to get to desired cluster size of %i.', -delta, 'preemptible' if preemptible else 'non-preemptible', numNodes) - numNodes = numCurrentNodes - self._removeNodes(nodes, - instance_type=instance_type, - num_nodes=-delta, - preemptible=preemptible, - force=force) + logger.info( + "Removing %i %s nodes to get to desired cluster size of %i.", + -delta, + "preemptible" if preemptible else "non-preemptible", + numNodes, + ) + numNodes = numCurrentNodes - self._removeNodes( + nodes, + instance_type=instance_type, + num_nodes=-delta, + preemptible=preemptible, + force=force, + ) elif force: - logger.debug('Cluster already at desired size of %i. Nothing to do.', numNodes) + logger.debug( + "Cluster already at desired size of %i. Nothing to do.", + numNodes, + ) else: - logger.debug('Cluster (minus ignored nodes) already at desired size of %i. Nothing to do.', numNodes) + logger.debug( + "Cluster (minus ignored nodes) already at desired size of %i. Nothing to do.", + numNodes, + ) return numNodes def _addNodes(self, instance_type: str, numNodes: int, preemptible: bool) -> int: - return self.provisioner.addNodes(nodeTypes={instance_type}, numNodes=numNodes, preemptible=preemptible) + return self.provisioner.addNodes( + nodeTypes={instance_type}, numNodes=numNodes, preemptible=preemptible + ) def _removeNodes( self, - nodes: Dict["Node", NodeInfo], + nodes: dict["Node", NodeInfo], instance_type: str, num_nodes: int, preemptible: bool = False, @@ -867,17 +1045,18 @@ def _removeNodes( nodes = self.getNodes(preemptible) # Filter down to nodes of the correct node type - nodes = {node: nodes[node] for node in nodes if - node.nodeType == instance_type} + nodes = { + node: nodes[node] for node in nodes if node.nodeType == instance_type + } filtered_nodes = self.filter_out_static_nodes(nodes, preemptible) filtered_nodes = filtered_nodes[:num_nodes] # Join nodes and instances on private IP address. - logger.debug('Nodes considered to terminate: %s', ' '.join(map(str, nodes))) + logger.debug("Nodes considered to terminate: %s", " ".join(map(str, nodes))) # Tell the batch system to stop sending jobs to these nodes - for (node, nodeInfo) in filtered_nodes: + for node, nodeInfo in filtered_nodes: self.ignoredNodes.add(node.privateIP) self.leader.batchSystem.ignoreNode(node.privateIP) @@ -886,8 +1065,11 @@ def _removeNodes( # will be terminated in _removeIgnoredNodes later on # once all jobs have finished, but they will be ignored by # the batch system and cluster scaler from now on - filtered_nodes = [(node, nodeInfo) for (node, nodeInfo) in filtered_nodes if - nodeInfo and nodeInfo.workers < 1] + filtered_nodes = [ + (node, nodeInfo) + for (node, nodeInfo) in filtered_nodes + if nodeInfo and nodeInfo.workers < 1 + ] nodes_to_terminate = [node for (node, nodeInfo) in filtered_nodes] for node in nodes_to_terminate: if node.privateIP in self.ignoredNodes: @@ -895,10 +1077,12 @@ def _removeNodes( self.leader.batchSystem.unignoreNode(node.privateIP) else: # Without load info all we can do is sort instances by time left in billing cycle. - nodes_to_terminate = sorted(nodes.keys(), key=lambda x: x.remainingBillingInterval()) + nodes_to_terminate = sorted( + nodes.keys(), key=lambda x: x.remainingBillingInterval() + ) nodes_to_terminate = nodes_to_terminate[:num_nodes] number_terminated = len(nodes_to_terminate) - logger.debug('Terminating %i instance(s).', number_terminated) + logger.debug("Terminating %i instance(s).", number_terminated) for node in nodes_to_terminate: if node.privateIP in self.ignoredNodes: # TODO: Why are we undoing what was just done above??? @@ -912,7 +1096,9 @@ def _terminateIgnoredNodes(self) -> None: but which still have workers running. """ if not isinstance(self.leader.batchSystem, AbstractScalableBatchSystem): - raise RuntimeError('Non-scalable batch system abusing a scalable-only function.') + raise RuntimeError( + "Non-scalable batch system abusing a scalable-only function." + ) # start with a dictionary of all nodes and filter down nodes = self.getNodes() @@ -926,10 +1112,18 @@ def _terminateIgnoredNodes(self) -> None: self.ignoredNodes.remove(ip) self.leader.batchSystem.unignoreNode(ip) - logger.debug("There are %i nodes being ignored by the batch system, " - "checking if they can be terminated" % len(self.ignoredNodes)) - nodes = {node: info for node, info in nodes.items() if node.privateIP in self.ignoredNodes} - nodes = {node: info for node, info in nodes.items() if info and info.workers < 1} + logger.debug( + "There are %i nodes being ignored by the batch system, " + "checking if they can be terminated" % len(self.ignoredNodes) + ) + nodes = { + node: info + for node, info in nodes.items() + if node.privateIP in self.ignoredNodes + } + nodes = { + node: info for node, info in nodes.items() if info and info.workers < 1 + } nodes_to_terminate = list(nodes.keys()) for node in nodes_to_terminate: @@ -938,25 +1132,32 @@ def _terminateIgnoredNodes(self) -> None: self.provisioner.terminateNodes(nodes_to_terminate) def filter_out_static_nodes( - self, - nodes: Dict["Node", NodeInfo], - preemptible: bool = False) -> List[Tuple["Node", NodeInfo]]: + self, nodes: dict["Node", NodeInfo], preemptible: bool = False + ) -> list[tuple["Node", NodeInfo]]: filtered_nodes = [] for node, nodeInfo in nodes.items(): if node: - non = 'non-' if not preemptible else '' + non = "non-" if not preemptible else "" if node.privateIP in self.getStaticNodes(preemptible): # we don't want to automatically terminate any statically provisioned nodes - logger.debug(f'Found {node.privateIP} in {non}preemptible static nodes') + logger.debug( + f"Found {node.privateIP} in {non}preemptible static nodes" + ) else: - logger.debug(f'Did not find {node.privateIP} in {non}preemptible static nodes') + logger.debug( + f"Did not find {node.privateIP} in {non}preemptible static nodes" + ) filtered_nodes.append((node, nodeInfo)) # Sort nodes by number of workers and time left in billing cycle - filtered_nodes.sort(key=lambda node_nodeInfo: ( - node_nodeInfo[1].workers if node_nodeInfo[1] else 1, node_nodeInfo[0].remainingBillingInterval())) + filtered_nodes.sort( + key=lambda node_nodeInfo: ( + node_nodeInfo[1].workers if node_nodeInfo[1] else 1, + node_nodeInfo[0].remainingBillingInterval(), + ) + ) return filtered_nodes - def getNodes(self, preemptible: Optional[bool] = None) -> Dict["Node", NodeInfo]: + def getNodes(self, preemptible: Optional[bool] = None) -> dict["Node", NodeInfo]: """ Returns a dictionary mapping node identifiers of preemptible or non-preemptible nodes to NodeInfo objects, one for each node. @@ -968,25 +1169,31 @@ def getNodes(self, preemptible: Optional[bool] = None) -> Dict["Node", NodeInfo] If None, all nodes will be returned. """ if not isinstance(self.leader.batchSystem, AbstractScalableBatchSystem): - raise RuntimeError('Non-scalable batch system abusing a scalable-only function.') + raise RuntimeError( + "Non-scalable batch system abusing a scalable-only function." + ) # nodes seen within the last 600 seconds (10 minutes) recent_nodes = self.leader.batchSystem.getNodes(preemptible, timeout=600) # all available nodes all_nodes = self.leader.batchSystem.getNodes(preemptible) # nodes that are supposedly doing something - provisioned_nodes = self.provisioner.getProvisionedWorkers(preemptible=preemptible) + provisioned_nodes = self.provisioner.getProvisionedWorkers( + preemptible=preemptible + ) if len(recent_nodes) != len(provisioned_nodes): logger.debug("Consolidating state between mesos and provisioner") - nodeToInfo: Dict["Node", NodeInfo] = {} + nodeToInfo: dict["Node", NodeInfo] = {} # fixme: what happens if awsFilterImpairedNodes is used? # if this assertion is false it means that user-managed nodes are being # used that are outside the provisioner's control # this would violate many basic assumptions in autoscaling so it currently not allowed for node, ip in ((node, node.privateIP) for node in provisioned_nodes): if ip not in recent_nodes: - logger.debug("Worker node at %s is not reporting executor information", ip) + logger.debug( + "Worker node at %s is not reporting executor information", ip + ) # get up-to-date information about the node, if available info = all_nodes.get(ip) @@ -1009,9 +1216,15 @@ def getNodes(self, preemptible: Optional[bool] = None) -> Dict["Node", NodeInfo] # # In all 3 situations it's safe to fake executor info with 0 workers, # since in all cases there are no workers running. - info = NodeInfo(coresTotal=1, coresUsed=0, requestedCores=0, - memoryTotal=1, memoryUsed=0, requestedMemory=0, - workers=0) + info = NodeInfo( + coresTotal=1, + coresUsed=0, + requestedCores=0, + memoryTotal=1, + memoryUsed=0, + requestedMemory=0, + workers=0, + ) else: # mesos knows about the ip & we have up-to-date information - easy! info = recent_nodes[ip] @@ -1020,40 +1233,55 @@ def getNodes(self, preemptible: Optional[bool] = None) -> Dict["Node", NodeInfo] return nodeToInfo def shutDown(self) -> None: - logger.debug('Forcing provisioner to reduce cluster size to zero.') + logger.debug("Forcing provisioner to reduce cluster size to zero.") for nodeShape in self.nodeShapes: preemptible = nodeShape.preemptible instance_type = self.nodeShapeToType[nodeShape] - self.setNodeCount(instance_type=instance_type, numNodes=0, preemptible=preemptible, force=True) + self.setNodeCount( + instance_type=instance_type, + numNodes=0, + preemptible=preemptible, + force=True, + ) + class JobTooBigError(Exception): """ Raised in the scaler thread when a job cannot fit in any available node type and is likely to lock up the workflow. """ - - def __init__(self, job: Optional[JobDescription] = None, shape: Optional[Shape] = None, constraints: Optional[List[FailedConstraint]] = None): + + def __init__( + self, + job: Optional[JobDescription] = None, + shape: Optional[Shape] = None, + constraints: Optional[list[FailedConstraint]] = None, + ): """ Make a JobTooBigError. - + Can have a job, the job's shape, and the limiting resources and amounts. All are optional. """ self.job = job self.shape = shape self.constraints = constraints if constraints is not None else [] - + parts = [ f"The job {self.job}" if self.job else "A job", f" with shape {self.shape}" if self.shape else "", - " is too big for any available node type." + " is too big for any available node type.", ] - + if self.constraints: parts.append(" It could have fit if it only needed ") - parts.append(", ".join([f"{limit} {resource}" for resource, limit in self.constraints])) - parts.append(".") - - self.msg = ''.join(parts) + parts.append( + ", ".join( + [f"{limit} {resource}" for resource, limit in self.constraints] + ) + ) + parts.append(".") + + self.msg = "".join(parts) super().__init__() def __str__(self) -> str: @@ -1062,6 +1290,7 @@ def __str__(self) -> str: """ return self.msg + class ScalerThread(ExceptionalThread): """ A thread that automatically scales the number of either preemptible or non-preemptible worker @@ -1077,10 +1306,17 @@ class ScalerThread(ExceptionalThread): is made, else the size of the cluster is adapted. The beta factor is an inertia parameter that prevents continual fluctuations in the number of nodes. """ - def __init__(self, provisioner: AbstractProvisioner, leader: "Leader", config: Config, stop_on_exception: bool = False) -> None: - super().__init__(name='scaler') + + def __init__( + self, + provisioner: AbstractProvisioner, + leader: "Leader", + config: Config, + stop_on_exception: bool = False, + ) -> None: + super().__init__(name="scaler") self.scaler = ClusterScaler(provisioner, leader, config) - + # Indicates that the scaling thread should shutdown self.stop = False # Indicates that we should stop the thread if we encounter an error. @@ -1090,13 +1326,13 @@ def __init__(self, provisioner: AbstractProvisioner, leader: "Leader", config: C self.stats = None if config.clusterStats: logger.debug("Starting up cluster statistics...") - self.stats = ClusterStats(leader.config.clusterStats, - leader.batchSystem, - provisioner.clusterName) + self.stats = ClusterStats( + leader.config.clusterStats, leader.batchSystem, provisioner.clusterName + ) for preemptible in [True, False]: self.stats.startStats(preemptible=preemptible) logger.debug("...Cluster stats started.") - + def check(self) -> None: """ Attempt to join any existing scaler threads that may have died or finished. @@ -1121,20 +1357,27 @@ def addCompletedJob(self, job: JobDescription, wallTime: int) -> None: def tryRun(self) -> None: if self.scaler.leader.provisioner is None: - raise RuntimeError('No provisioner found for a scaling cluster ' - '(cannot access "getProvisionedWorkers").') + raise RuntimeError( + "No provisioner found for a scaling cluster " + '(cannot access "getProvisionedWorkers").' + ) while not self.stop: with throttle(self.scaler.config.scaleInterval): try: queuedJobs = self.scaler.leader.getJobs() queuedJobShapes = [ - Shape(wallTime=self.scaler.getAverageRuntime( - jobName=job.jobName, - service=isinstance(job, ServiceJobDescription)), + Shape( + wallTime=self.scaler.getAverageRuntime( + jobName=job.jobName, + service=isinstance(job, ServiceJobDescription), + ), memory=job.memory, cores=job.cores, disk=job.disk, - preemptible=job.preemptible) for job in queuedJobs] + preemptible=job.preemptible, + ) + for job in queuedJobs + ] currentNodeCounts = {} for nodeShape in self.scaler.nodeShapes: instance_type = self.scaler.nodeShapeToType[nodeShape] @@ -1144,14 +1387,16 @@ def tryRun(self) -> None: preemptible=nodeShape.preemptible, ) ) - estimatedNodeCounts, could_not_fit = self.scaler.getEstimatedNodeCounts( - queuedJobShapes, currentNodeCounts + estimatedNodeCounts, could_not_fit = ( + self.scaler.getEstimatedNodeCounts( + queuedJobShapes, currentNodeCounts + ) ) self.scaler.updateClusterSize(estimatedNodeCounts) if self.stats: self.stats.checkStats() - - if len(could_not_fit) != 0: + + if len(could_not_fit) != 0: # If we have any jobs left over that we couldn't fit, complain. bad_job: Optional[JobDescription] = None bad_shape: Optional[Shape] = None @@ -1164,39 +1409,49 @@ def tryRun(self) -> None: if bad_shape is None: # If we can't find an offending job, grab an arbitrary offending shape. bad_shape = next(iter(could_not_fit)) - - raise JobTooBigError(job=bad_job, shape=bad_shape, constraints=could_not_fit[bad_shape]) - + + raise JobTooBigError( + job=bad_job, + shape=bad_shape, + constraints=could_not_fit[bad_shape], + ) + except: if self.stop_on_exception: logger.critical("Stopping ScalerThread due to an error.") raise else: - logger.exception("Exception encountered in scaler thread. Making a best-effort " - "attempt to keep going, but things may go wrong from now on.") + logger.exception( + "Exception encountered in scaler thread. Making a best-effort " + "attempt to keep going, but things may go wrong from now on." + ) self.scaler.shutDown() + class ClusterStats: def __init__( self, path: str, batchSystem: AbstractBatchSystem, clusterName: Optional[str] ) -> None: logger.debug("Initializing cluster statistics") - self.stats: Dict[str, Dict[str, List[Dict[str, Any]]]] = {} - self.statsThreads: List[ExceptionalThread] = [] + self.stats: dict[str, dict[str, list[dict[str, Any]]]] = {} + self.statsThreads: list[ExceptionalThread] = [] self.statsPath = path self.stop = False self.clusterName = clusterName self.batchSystem = batchSystem - self.scaleable = isinstance(self.batchSystem, AbstractScalableBatchSystem) \ - if batchSystem else False + self.scaleable = ( + isinstance(self.batchSystem, AbstractScalableBatchSystem) + if batchSystem + else False + ) def shutDownStats(self) -> None: if self.stop: return def getFileName() -> str: - extension = '.json' - file = '%s-stats' % self.clusterName + extension = ".json" + file = "%s-stats" % self.clusterName counter = 0 while True: suffix = str(counter).zfill(3) + extension @@ -1204,12 +1459,13 @@ def getFileName() -> str: if not os.path.exists(fullName): return fullName counter += 1 + if self.statsPath and self.scaleable: self.stop = True for thread in self.statsThreads: thread.join() fileName = getFileName() - with open(fileName, 'w') as f: + with open(fileName, "w") as f: json.dump(self.stats, f) def startStats(self, preemptible: bool) -> None: @@ -1223,22 +1479,26 @@ def checkStats(self) -> None: thread.join(timeout=0) def _gatherStats(self, preemptible: bool) -> None: - def toDict(nodeInfo: NodeInfo) -> Dict[str, Any]: + def toDict(nodeInfo: NodeInfo) -> dict[str, Any]: # convert NodeInfo object to dict to improve JSON output - return dict(memory=nodeInfo.memoryUsed, - cores=nodeInfo.coresUsed, - memoryTotal=nodeInfo.memoryTotal, - coresTotal=nodeInfo.coresTotal, - requestedCores=nodeInfo.requestedCores, - requestedMemory=nodeInfo.requestedMemory, - workers=nodeInfo.workers, - time=time.time() # add time stamp - ) + return dict( + memory=nodeInfo.memoryUsed, + cores=nodeInfo.coresUsed, + memoryTotal=nodeInfo.memoryTotal, + coresTotal=nodeInfo.coresTotal, + requestedCores=nodeInfo.requestedCores, + requestedMemory=nodeInfo.requestedMemory, + workers=nodeInfo.workers, + time=time.time(), # add time stamp + ) + if self.scaleable: logger.debug("Starting to gather statistics") - stats: Dict[str, List[Dict[str, Any]]] = {} + stats: dict[str, list[dict[str, Any]]] = {} if not isinstance(self.batchSystem, AbstractScalableBatchSystem): - raise RuntimeError('Non-scalable batch system abusing a scalable-only function.') + raise RuntimeError( + "Non-scalable batch system abusing a scalable-only function." + ) try: while not self.stop: nodeInfo = self.batchSystem.getNodes(preemptible) @@ -1255,6 +1515,8 @@ def toDict(nodeInfo: NodeInfo) -> Dict[str, Any]: stats[nodeIP] = [nodeStatsDict] time.sleep(60) finally: - threadName = 'Preemptible' if preemptible else 'Non-preemptible' - logger.debug('%s provisioner stats thread shut down successfully.', threadName) + threadName = "Preemptible" if preemptible else "Non-preemptible" + logger.debug( + "%s provisioner stats thread shut down successfully.", threadName + ) self.stats[threadName] = stats diff --git a/src/toil/provisioners/gceProvisioner.py b/src/toil/provisioners/gceProvisioner.py index 24c7ef1214..99310044de 100644 --- a/src/toil/provisioners/gceProvisioner.py +++ b/src/toil/provisioners/gceProvisioner.py @@ -17,7 +17,7 @@ import threading import time import uuid -from typing import Optional, Set +from typing import Optional import requests from libcloud.compute.drivers.gce import GCEFailedNode @@ -25,8 +25,8 @@ from libcloud.compute.types import Provider from toil.jobStores.googleJobStore import GoogleJobStore -from toil.lib.conversions import human2bytes from toil.lib.compatibility import compat_bytes_recursive +from toil.lib.conversions import human2bytes from toil.provisioners import NoSuchClusterException from toil.provisioners.abstractProvisioner import AbstractProvisioner, Shape from toil.provisioners.node import Node @@ -34,24 +34,41 @@ logger = logging.getLogger(__name__) logging.getLogger("urllib3.connectionpool").setLevel(logging.WARNING) + class GCEProvisioner(AbstractProvisioner): """ Implements a Google Compute Engine Provisioner using libcloud. """ NODE_BOTO_PATH = "/root/.boto" # boto file path on instances - SOURCE_IMAGE = b'projects/kinvolk-public/global/images/family/flatcar-stable' - - def __init__(self, clusterName, clusterType, zone, nodeStorage, nodeStorageOverrides, sseKey, enable_fuse): - self.cloud = 'gce' + SOURCE_IMAGE = b"projects/kinvolk-public/global/images/family/flatcar-stable" + + def __init__( + self, + clusterName, + clusterType, + zone, + nodeStorage, + nodeStorageOverrides, + sseKey, + enable_fuse, + ): + self.cloud = "gce" self._sseKey = sseKey # Call base class constructor, which will call createClusterSettings() # or readClusterSettings() - super().__init__(clusterName, clusterType, zone, nodeStorage, nodeStorageOverrides, enable_fuse) + super().__init__( + clusterName, + clusterType, + zone, + nodeStorage, + nodeStorageOverrides, + enable_fuse, + ) def supportedClusterTypes(self): - return {'mesos'} + return {"mesos"} def createClusterSettings(self): # All we need to do is read the Google credentials we need to provision @@ -65,30 +82,38 @@ def readClusterSettings(self): reading the metadata. """ metadata_server = "http://metadata/computeMetadata/v1/instance/" - metadata_flavor = {'Metadata-Flavor': 'Google'} - zone = requests.get(metadata_server + 'zone', headers = metadata_flavor).text - self._zone = zone.split('/')[-1] + metadata_flavor = {"Metadata-Flavor": "Google"} + zone = requests.get(metadata_server + "zone", headers=metadata_flavor).text + self._zone = zone.split("/")[-1] project_metadata_server = "http://metadata/computeMetadata/v1/project/" - self._projectId = requests.get(project_metadata_server + 'project-id', headers = metadata_flavor).text + self._projectId = requests.get( + project_metadata_server + "project-id", headers=metadata_flavor + ).text # From a GCE instance, these values can be blank. Only the projectId is needed - self._googleJson = '' - self._clientEmail = '' + self._googleJson = "" + self._clientEmail = "" - self._tags = requests.get(metadata_server + 'description', headers = metadata_flavor).text + self._tags = requests.get( + metadata_server + "description", headers=metadata_flavor + ).text tags = json.loads(self._tags) - self.clusterName = tags['clusterName'] + self.clusterName = tags["clusterName"] self._gceDriver = self._getDriver() - self._instanceGroup = self._gceDriver.ex_get_instancegroup(self.clusterName, zone=self._zone) + self._instanceGroup = self._gceDriver.ex_get_instancegroup( + self.clusterName, zone=self._zone + ) leader = self.getLeader() self._leaderPrivateIP = leader.privateIP # The location of the Google credentials file on instances. self._credentialsPath = GoogleJobStore.nodeServiceAccountJson - self._keyName = 'core' # key name leader users to communicate with works - self._botoPath = self.NODE_BOTO_PATH # boto credentials (used if reading an AWS bucket) + self._keyName = "core" # key name leader users to communicate with works + self._botoPath = ( + self.NODE_BOTO_PATH + ) # boto credentials (used if reading an AWS bucket) # Let the base provisioner work out how to deploy duly authorized # workers for this leader. @@ -98,28 +123,32 @@ def _readCredentials(self): """ Get the credentials from the file specified by GOOGLE_APPLICATION_CREDENTIALS. """ - self._googleJson = os.getenv('GOOGLE_APPLICATION_CREDENTIALS') + self._googleJson = os.getenv("GOOGLE_APPLICATION_CREDENTIALS") if not self._googleJson: - raise RuntimeError('GOOGLE_APPLICATION_CREDENTIALS not set.') + raise RuntimeError("GOOGLE_APPLICATION_CREDENTIALS not set.") try: with open(self._googleJson) as jsonFile: self.googleConnectionParams = json.loads(jsonFile.read()) except: - raise RuntimeError('GCEProvisioner: Could not parse the Google service account json file %s' - % self._googleJson) + raise RuntimeError( + "GCEProvisioner: Could not parse the Google service account json file %s" + % self._googleJson + ) - self._projectId = self.googleConnectionParams['project_id'] - self._clientEmail = self.googleConnectionParams['client_email'] + self._projectId = self.googleConnectionParams["project_id"] + self._clientEmail = self.googleConnectionParams["client_email"] self._credentialsPath = self._googleJson self._clearLeaderWorkerAuthentication() # TODO: Why are we doing this? self._gceDriver = self._getDriver() def _write_file_to_cloud(self, key: str, contents: bytes) -> str: - raise NotImplementedError("The gceProvisioner doesn't support _write_file_to_cloud().") + raise NotImplementedError( + "The gceProvisioner doesn't support _write_file_to_cloud()." + ) def _get_user_data_limit(self) -> int: # See: https://cloud.google.com/compute/docs/metadata/setting-custom-metadata#limitations - return human2bytes('256KB') + return human2bytes("256KB") def launchCluster(self, leaderNodeType, leaderStorage, owner, **kwargs): """ @@ -131,39 +160,42 @@ def launchCluster(self, leaderNodeType, leaderStorage, owner, **kwargs): vpcSubnet: A subnet (optional). use_private_ip: even though a public ip exists, ignore it (optional) """ - if 'keyName' not in kwargs: + if "keyName" not in kwargs: raise RuntimeError("A keyPairName is required for the GCE provisioner.") - self._keyName = kwargs['keyName'] - if 'botoPath' in kwargs: - self._botoPath = kwargs['botoPath'] - self._vpcSubnet = kwargs.get('vpcSubnet', None) - self._network = kwargs.get('network', None) - self._use_private_ip = kwargs.get('use_private_ip', None) + self._keyName = kwargs["keyName"] + if "botoPath" in kwargs: + self._botoPath = kwargs["botoPath"] + self._vpcSubnet = kwargs.get("vpcSubnet", None) + self._network = kwargs.get("network", None) + self._use_private_ip = kwargs.get("use_private_ip", None) # Throws an error if cluster exists - self._instanceGroup = self._gceDriver.ex_create_instancegroup(self.clusterName, self._zone) - logger.debug('Launching leader') + self._instanceGroup = self._gceDriver.ex_create_instancegroup( + self.clusterName, self._zone + ) + logger.debug("Launching leader") # GCE doesn't have a dictionary tags field. The tags field is just a string list. # Therefore, dumping tags into the description. - tags = {'Owner': self._keyName, 'clusterName': self.clusterName} - if 'userTags' in kwargs: - tags.update(kwargs['userTags']) + tags = {"Owner": self._keyName, "clusterName": self.clusterName} + if "userTags" in kwargs: + tags.update(kwargs["userTags"]) self._tags = json.dumps(tags) - metadata = {'items': [{'key': 'user-data', 'value': self._getIgnitionUserData('leader')}]} - imageType = 'flatcar-stable' - sa_scopes = [{'scopes': ['compute', 'storage-full']}] + metadata = { + "items": [ + {"key": "user-data", "value": self._getIgnitionUserData("leader")} + ] + } + imageType = "flatcar-stable" + sa_scopes = [{"scopes": ["compute", "storage-full"]}] disk = {} - disk['initializeParams'] = { - 'sourceImage': self.SOURCE_IMAGE, - 'diskSizeGb': leaderStorage + disk["initializeParams"] = { + "sourceImage": self.SOURCE_IMAGE, + "diskSizeGb": leaderStorage, } - disk.update({ - 'boot': True, - 'autoDelete': True - }) - name = 'l' + str(uuid.uuid4()) + disk.update({"boot": True, "autoDelete": True}) + name = "l" + str(uuid.uuid4()) leader = self._gceDriver.create_node( name, @@ -174,9 +206,9 @@ def launchCluster(self, leaderNodeType, leaderStorage, owner, **kwargs): ex_metadata=compat_bytes_recursive(metadata), ex_network=self._network, ex_subnetwork=self._vpcSubnet, - ex_disks_gce_struct = [ compat_bytes_recursive(disk) ], + ex_disks_gce_struct=[compat_bytes_recursive(disk)], description=self._tags, - ex_preemptible=False + ex_preemptible=False, ) self._instanceGroup.add_instances([leader]) @@ -184,18 +216,27 @@ def launchCluster(self, leaderNodeType, leaderStorage, owner, **kwargs): # self.subnetID = leader.subnet_id # TODO: get subnetID # Wait for the appliance to start and inject credentials. - leaderNode = Node(publicIP=leader.public_ips[0], privateIP=leader.private_ips[0], - name=leader.name, launchTime=leader.created_at, nodeType=leader.size, - preemptible=False, tags=self._tags, use_private_ip=self._use_private_ip) - leaderNode.waitForNode('toil_leader', keyName=self._keyName) + leaderNode = Node( + publicIP=leader.public_ips[0], + privateIP=leader.private_ips[0], + name=leader.name, + launchTime=leader.created_at, + nodeType=leader.size, + preemptible=False, + tags=self._tags, + use_private_ip=self._use_private_ip, + ) + leaderNode.waitForNode("toil_leader", keyName=self._keyName) leaderNode.copySshKeys(self._keyName) - leaderNode.injectFile(self._credentialsPath, GoogleJobStore.nodeServiceAccountJson, 'toil_leader') + leaderNode.injectFile( + self._credentialsPath, GoogleJobStore.nodeServiceAccountJson, "toil_leader" + ) if self._botoPath: - leaderNode.injectFile(self._botoPath, self.NODE_BOTO_PATH, 'toil_leader') + leaderNode.injectFile(self._botoPath, self.NODE_BOTO_PATH, "toil_leader") # Download credentials self._setLeaderWorkerAuthentication(leaderNode) - logger.debug('Launched leader') + logger.debug("Launched leader") def getNodeShape(self, instance_type: str, preemptible=False) -> Shape: # TODO: read this value only once @@ -208,21 +249,25 @@ def getNodeShape(self, instance_type: str, preemptible=False) -> Shape: if disk == 0: # This is an EBS-backed instance. We will use the root # volume, so add the amount of EBS storage requested forhe root volume - disk = self._nodeStorageOverrides.get(instance_type, self._nodeStorage) * 2 ** 30 + disk = ( + self._nodeStorageOverrides.get(instance_type, self._nodeStorage) * 2**30 + ) # Ram is in M. # Underestimate memory by 100M to prevent autoscaler from disagreeing with # mesos about whether a job can run on a particular node type - memory = (instanceType.ram/1000 - 0.1) * 2 ** 30 - return Shape(wallTime=60 * 60, - memory=memory, - cores=instanceType.extra['guestCpus'], - disk=disk, - preemptible=preemptible) + memory = (instanceType.ram / 1000 - 0.1) * 2**30 + return Shape( + wallTime=60 * 60, + memory=memory, + cores=instanceType.extra["guestCpus"], + disk=disk, + preemptible=preemptible, + ) @staticmethod def retryPredicate(e): - """ Not used by GCE """ + """Not used by GCE""" return False def destroyCluster(self) -> None: @@ -238,7 +283,9 @@ def destroyCluster(self) -> None: attempts += 1 # remove group - instanceGroup = self._gceDriver.ex_get_instancegroup(self.clusterName, zone=self._zone) + instanceGroup = self._gceDriver.ex_get_instancegroup( + self.clusterName, zone=self._zone + ) instanceGroup.destroy() def terminateNodes(self, nodes): @@ -248,7 +295,7 @@ def terminateNodes(self, nodes): instancesToKill = [i for i in instances if i.name in nodeNames] self._terminateInstances(instancesToKill) - def addNodes(self, nodeTypes: Set[str], numNodes, preemptible, spotBid=None) -> int: + def addNodes(self, nodeTypes: set[str], numNodes, preemptible, spotBid=None) -> int: assert self._leaderPrivateIP # We don't support any balancing here so just pick one of the @@ -268,23 +315,21 @@ def addNodes(self, nodeTypes: Set[str], numNodes, preemptible, spotBid=None) -> keyPath = self._sseKey if not preemptible: - logger.debug('Launching %s non-preemptible nodes', numNodes) + logger.debug("Launching %s non-preemptible nodes", numNodes) else: - logger.debug('Launching %s preemptible nodes', numNodes) + logger.debug("Launching %s preemptible nodes", numNodes) # kwargs["subnet_id"] = self.subnetID if self.subnetID else self._getClusterInstance(self.instanceMetaData).subnet_id - userData = self._getIgnitionUserData('worker', keyPath, preemptible) - metadata = {'items': [{'key': 'user-data', 'value': userData}]} - imageType = 'flatcar-stable' - sa_scopes = [{'scopes': ['compute', 'storage-full']}] + userData = self._getIgnitionUserData("worker", keyPath, preemptible) + metadata = {"items": [{"key": "user-data", "value": userData}]} + imageType = "flatcar-stable" + sa_scopes = [{"scopes": ["compute", "storage-full"]}] disk = {} - disk['initializeParams'] = { - 'sourceImage': self.SOURCE_IMAGE, - 'diskSizeGb': self._nodeStorageOverrides.get(node_type, self._nodeStorage) } - disk.update({ - 'boot': True, - 'autoDelete': True - }) + disk["initializeParams"] = { + "sourceImage": self.SOURCE_IMAGE, + "diskSizeGb": self._nodeStorageOverrides.get(node_type, self._nodeStorage), + } + disk.update({"boot": True, "autoDelete": True}) # TODO: # - bug in gce.py for ex_create_multiple_nodes (erroneously, doesn't allow image and disk to specified) @@ -294,26 +339,38 @@ def addNodes(self, nodeTypes: Set[str], numNodes, preemptible, spotBid=None) -> retries = 0 workersCreated = 0 # Try a few times to create the requested number of workers - while numNodes-workersCreated > 0 and retries < 3: + while numNodes - workersCreated > 0 and retries < 3: instancesLaunched = self.ex_create_multiple_nodes( - '', node_type, imageType, numNodes-workersCreated, - location=self._zone, - ex_service_accounts=sa_scopes, - ex_metadata=metadata, - ex_disks_gce_struct=[disk], - description=self._tags, - ex_preemptible=preemptible - ) + "", + node_type, + imageType, + numNodes - workersCreated, + location=self._zone, + ex_service_accounts=sa_scopes, + ex_metadata=metadata, + ex_disks_gce_struct=[disk], + description=self._tags, + ex_preemptible=preemptible, + ) failedWorkers = [] for instance in instancesLaunched: if isinstance(instance, GCEFailedNode): - logger.error("Worker failed to launch with code %s. Error message: %s" - % (instance.code, instance.error)) + logger.error( + "Worker failed to launch with code %s. Error message: %s" + % (instance.code, instance.error) + ) continue - node = Node(publicIP=instance.public_ips[0], privateIP=instance.private_ips[0], - name=instance.name, launchTime=instance.created_at, nodeType=instance.size, - preemptible=False, tags=self._tags, use_private_ip=self._use_private_ip) # FIXME: what should tags be set to? + node = Node( + publicIP=instance.public_ips[0], + privateIP=instance.private_ips[0], + name=instance.name, + launchTime=instance.created_at, + nodeType=instance.size, + preemptible=False, + tags=self._tags, + use_private_ip=self._use_private_ip, + ) # FIXME: what should tags be set to? try: self._injectWorkerFiles(node, botoExists) @@ -321,43 +378,55 @@ def addNodes(self, nodeTypes: Set[str], numNodes, preemptible, spotBid=None) -> self._instanceGroup.add_instances([instance]) workersCreated += 1 except Exception as e: - logger.error(f"Failed to configure worker {node.name}. Error message: {e}") + logger.error( + f"Failed to configure worker {node.name}. Error message: {e}" + ) failedWorkers.append(instance) if failedWorkers: logger.error("Terminating %d failed workers" % len(failedWorkers)) self._terminateInstances(failedWorkers) retries += 1 - logger.debug('Launched %d new instance(s)', numNodes) + logger.debug("Launched %d new instance(s)", numNodes) if numNodes != workersCreated: - logger.error("Failed to launch %d worker(s)", numNodes-workersCreated) + logger.error("Failed to launch %d worker(s)", numNodes - workersCreated) return workersCreated - def getProvisionedWorkers(self, instance_type: Optional[str] = None, preemptible: Optional[bool] = None): + def getProvisionedWorkers( + self, instance_type: Optional[str] = None, preemptible: Optional[bool] = None + ): assert self._leaderPrivateIP entireCluster = self._getNodesInCluster(instance_type=instance_type) - logger.debug('All nodes in cluster: %s', entireCluster) + logger.debug("All nodes in cluster: %s", entireCluster) workerInstances = [] for instance in entireCluster: if preemptible is not None: - scheduling = instance.extra.get('scheduling') + scheduling = instance.extra.get("scheduling") # If this field is not found in the extra meta-data, assume the node is not preemptible. - if scheduling and scheduling.get('preemptible', False) != preemptible: + if scheduling and scheduling.get("preemptible", False) != preemptible: continue isWorker = True for ip in instance.private_ips: if ip == self._leaderPrivateIP: isWorker = False break # don't include the leader - if isWorker and instance.state == 'running': + if isWorker and instance.state == "running": workerInstances.append(instance) - logger.debug('All workers found in cluster: %s', workerInstances) - return [Node(publicIP=i.public_ips[0], privateIP=i.private_ips[0], - name=i.name, launchTime=i.created_at, nodeType=i.size, - preemptible=i.extra.get('scheduling', {}).get('preemptible', False), - tags=None, use_private_ip=self._use_private_ip) - for i in workerInstances] + logger.debug("All workers found in cluster: %s", workerInstances) + return [ + Node( + publicIP=i.public_ips[0], + privateIP=i.private_ips[0], + name=i.name, + launchTime=i.created_at, + nodeType=i.size, + preemptible=i.extra.get("scheduling", {}).get("preemptible", False), + tags=None, + use_private_ip=self._use_private_ip, + ) + for i in workerInstances + ] def getLeader(self): instances = self._getNodesInCluster() @@ -366,49 +435,64 @@ def getLeader(self): leader = instances[0] # assume leader was launched first except IndexError: raise NoSuchClusterException(self.clusterName) - return Node(publicIP=leader.public_ips[0], privateIP=leader.private_ips[0], - name=leader.name, launchTime=leader.created_at, nodeType=leader.size, - preemptible=False, tags=None, use_private_ip=self._use_private_ip) + return Node( + publicIP=leader.public_ips[0], + privateIP=leader.private_ips[0], + name=leader.name, + launchTime=leader.created_at, + nodeType=leader.size, + preemptible=False, + tags=None, + use_private_ip=self._use_private_ip, + ) def _injectWorkerFiles(self, node, botoExists): """ Set up the credentials on the worker. """ - node.waitForNode('toil_worker', keyName=self._keyName) + node.waitForNode("toil_worker", keyName=self._keyName) node.copySshKeys(self._keyName) - node.injectFile(self._credentialsPath, GoogleJobStore.nodeServiceAccountJson, 'toil_worker') + node.injectFile( + self._credentialsPath, GoogleJobStore.nodeServiceAccountJson, "toil_worker" + ) if self._sseKey: - node.injectFile(self._sseKey, self._sseKey, 'toil_worker') + node.injectFile(self._sseKey, self._sseKey, "toil_worker") if botoExists: - node.injectFile(self._botoPath, self.NODE_BOTO_PATH, 'toil_worker') + node.injectFile(self._botoPath, self.NODE_BOTO_PATH, "toil_worker") def _getNodesInCluster(self, instance_type: Optional[str] = None): - instanceGroup = self._gceDriver.ex_get_instancegroup(self.clusterName, zone=self._zone) + instanceGroup = self._gceDriver.ex_get_instancegroup( + self.clusterName, zone=self._zone + ) instances = instanceGroup.list_instances() if instance_type: - instances = [instance for instance in instances if instance.size == instance_type] + instances = [ + instance for instance in instances if instance.size == instance_type + ] return instances def _getDriver(self): - """ Connect to GCE """ + """Connect to GCE""" driverCls = get_driver(Provider.GCE) - return driverCls(self._clientEmail, - self._googleJson, - project=self._projectId, - datacenter=self._zone) + return driverCls( + self._clientEmail, + self._googleJson, + project=self._projectId, + datacenter=self._zone, + ) def _terminateInstances(self, instances): def worker(driver, instance): - logger.debug('Terminating instance: %s', instance.name) + logger.debug("Terminating instance: %s", instance.name) driver.destroy_node(instance) threads = [] for instance in instances: - t = threading.Thread(target=worker, args=(self._gceDriver,instance)) + t = threading.Thread(target=worker, args=(self._gceDriver, instance)) threads.append(t) t.start() - logger.debug('... Waiting for instance(s) to shut down...') + logger.debug("... Waiting for instance(s) to shut down...") for t in threads: t.join() @@ -416,20 +500,37 @@ def worker(driver, instance): DEFAULT_TASK_COMPLETION_TIMEOUT = 180 def ex_create_multiple_nodes( - self, base_name, size, image, number, location=None, - ex_network='default', ex_subnetwork=None, ex_tags=None, - ex_metadata=None, ignore_errors=True, use_existing_disk=True, - poll_interval=2, external_ip='ephemeral', - ex_disk_type='pd-standard', ex_disk_auto_delete=True, - ex_service_accounts=None, timeout=DEFAULT_TASK_COMPLETION_TIMEOUT, - description=None, ex_can_ip_forward=None, ex_disks_gce_struct=None, - ex_nic_gce_struct=None, ex_on_host_maintenance=None, - ex_automatic_restart=None, ex_image_family=None, - ex_preemptible=None): + self, + base_name, + size, + image, + number, + location=None, + ex_network="default", + ex_subnetwork=None, + ex_tags=None, + ex_metadata=None, + ignore_errors=True, + use_existing_disk=True, + poll_interval=2, + external_ip="ephemeral", + ex_disk_type="pd-standard", + ex_disk_auto_delete=True, + ex_service_accounts=None, + timeout=DEFAULT_TASK_COMPLETION_TIMEOUT, + description=None, + ex_can_ip_forward=None, + ex_disks_gce_struct=None, + ex_nic_gce_struct=None, + ex_on_host_maintenance=None, + ex_automatic_restart=None, + ex_image_family=None, + ex_preemptible=None, + ): """ - Monkey patch to gce.py in libcloud to allow disk and images to be specified. - Also changed name to a uuid below. - The prefix 'wp' identifies preemptible nodes and 'wn' non-preemptible nodes. + Monkey patch to gce.py in libcloud to allow disk and images to be specified. + Also changed name to a uuid below. + The prefix 'wp' identifies preemptible nodes and 'wn' non-preemptible nodes. """ # if image and ex_disks_gce_struct: # raise ValueError("Cannot specify both 'image' and " @@ -437,78 +538,80 @@ def ex_create_multiple_nodes( driver = self._getDriver() if image and ex_image_family: - raise ValueError("Cannot specify both 'image' and " - "'ex_image_family'") + raise ValueError("Cannot specify both 'image' and " "'ex_image_family'") location = location or driver.zone - if not hasattr(location, 'name'): + if not hasattr(location, "name"): location = driver.ex_get_zone(location) - if not hasattr(size, 'name'): + if not hasattr(size, "name"): size = driver.ex_get_size(size, location) - if not hasattr(ex_network, 'name'): + if not hasattr(ex_network, "name"): ex_network = driver.ex_get_network(ex_network) - if ex_subnetwork and not hasattr(ex_subnetwork, 'name'): - ex_subnetwork = \ - driver.ex_get_subnetwork(ex_subnetwork, - region=driver._get_region_from_zone(location)) + if ex_subnetwork and not hasattr(ex_subnetwork, "name"): + ex_subnetwork = driver.ex_get_subnetwork( + ex_subnetwork, region=driver._get_region_from_zone(location) + ) if ex_image_family: image = driver.ex_get_image_from_family(ex_image_family) - if image and not hasattr(image, 'name'): + if image and not hasattr(image, "name"): image = driver.ex_get_image(image) - if not hasattr(ex_disk_type, 'name'): + if not hasattr(ex_disk_type, "name"): ex_disk_type = driver.ex_get_disktype(ex_disk_type, zone=location) - node_attrs = {'size': size, - 'image': image, - 'location': location, - 'network': ex_network, - 'subnetwork': ex_subnetwork, - 'tags': ex_tags, - 'metadata': ex_metadata, - 'ignore_errors': ignore_errors, - 'use_existing_disk': use_existing_disk, - 'external_ip': external_ip, - 'ex_disk_type': ex_disk_type, - 'ex_disk_auto_delete': ex_disk_auto_delete, - 'ex_service_accounts': ex_service_accounts, - 'description': description, - 'ex_can_ip_forward': ex_can_ip_forward, - 'ex_disks_gce_struct': ex_disks_gce_struct, - 'ex_nic_gce_struct': ex_nic_gce_struct, - 'ex_on_host_maintenance': ex_on_host_maintenance, - 'ex_automatic_restart': ex_automatic_restart, - 'ex_preemptible': ex_preemptible} + node_attrs = { + "size": size, + "image": image, + "location": location, + "network": ex_network, + "subnetwork": ex_subnetwork, + "tags": ex_tags, + "metadata": ex_metadata, + "ignore_errors": ignore_errors, + "use_existing_disk": use_existing_disk, + "external_ip": external_ip, + "ex_disk_type": ex_disk_type, + "ex_disk_auto_delete": ex_disk_auto_delete, + "ex_service_accounts": ex_service_accounts, + "description": description, + "ex_can_ip_forward": ex_can_ip_forward, + "ex_disks_gce_struct": ex_disks_gce_struct, + "ex_nic_gce_struct": ex_nic_gce_struct, + "ex_on_host_maintenance": ex_on_host_maintenance, + "ex_automatic_restart": ex_automatic_restart, + "ex_preemptible": ex_preemptible, + } # List for holding the status information for disk/node creation. status_list = [] for i in range(number): - name = 'wp' if ex_preemptible else 'wn' + name = "wp" if ex_preemptible else "wn" name += str(uuid.uuid4()) # '%s-%03d' % (base_name, i) - status = {'name': name, 'node_response': None, 'node': None} + status = {"name": name, "node_response": None, "node": None} status_list.append(status) start_time = time.time() complete = False while not complete: if time.time() - start_time >= timeout: - raise Exception("Timeout (%s sec) while waiting for multiple " - "instances") + raise Exception( + "Timeout (%s sec) while waiting for multiple " "instances" + ) complete = True time.sleep(poll_interval) for status in status_list: # Create the node or check status if already in progress. - if not status['node']: - if not status['node_response']: + if not status["node"]: + if not status["node_response"]: driver._multi_create_node(status, node_attrs) else: driver._multi_check_node(status, node_attrs) # If any of the nodes have not been created (or failed) we are # not done yet. - if not status['node']: + if not status["node"]: complete = False # Return list of nodes node_list = [] for status in status_list: - node_list.append(status['node']) + node_list.append(status["node"]) return node_list diff --git a/src/toil/provisioners/node.py b/src/toil/provisioners/node.py index dc025370c1..16a23be226 100644 --- a/src/toil/provisioners/node.py +++ b/src/toil/provisioners/node.py @@ -13,12 +13,12 @@ # limitations under the License. import datetime import logging -from shlex import quote import socket import subprocess import time from itertools import count -from typing import Union, Dict, Optional, List, Any +from shlex import quote +from typing import Any, Optional, Union from toil.lib.memoize import parse_iso_utc @@ -30,12 +30,21 @@ class Node: maxWaitTime = 7 * 60 - def __init__(self, publicIP: str, privateIP: str, name: str, launchTime: Union[datetime.datetime, str], - nodeType: Optional[str], preemptible: bool, tags: Optional[Dict[str, str]] = None, use_private_ip: Optional[bool] = None) -> None: + def __init__( + self, + publicIP: str, + privateIP: str, + name: str, + launchTime: Union[datetime.datetime, str], + nodeType: Optional[str], + preemptible: bool, + tags: Optional[dict[str, str]] = None, + use_private_ip: Optional[bool] = None, + ) -> None: self.publicIP = publicIP self.privateIP = privateIP if use_private_ip: - self.effectiveIP = self.privateIP #or self.publicIP? + self.effectiveIP = self.privateIP # or self.publicIP? else: self.effectiveIP = self.publicIP or self.privateIP self.name = name @@ -78,7 +87,7 @@ def remainingBillingInterval(self) -> float: else: return 1 - def waitForNode(self, role: str, keyName: str='core') -> None: + def waitForNode(self, role: str, keyName: str = "core") -> None: self._waitForSSHPort() # wait here so docker commands can be used reliably afterwards self._waitForSSHKeys(keyName=keyName) @@ -86,8 +95,8 @@ def waitForNode(self, role: str, keyName: str='core') -> None: self._waitForAppliance(role=role, keyName=keyName) def copySshKeys(self, keyName): - """ Copy authorized_keys file to the core user from the keyName user.""" - if keyName == 'core': + """Copy authorized_keys file to the core user from the keyName user.""" + if keyName == "core": return # No point. # Make sure that keys are there. @@ -96,9 +105,17 @@ def copySshKeys(self, keyName): # copy keys to core user so that the ssh calls will work # - normal mechanism failed unless public key was in the google-ssh format # - even so, the key wasn't copied correctly to the core account - keyFile = '/home/%s/.ssh/authorized_keys' % keyName - self.sshInstance('/usr/bin/sudo', '/usr/bin/cp', keyFile, '/home/core/.ssh', user=keyName) - self.sshInstance('/usr/bin/sudo', '/usr/bin/chown', 'core', '/home/core/.ssh/authorized_keys', user=keyName) + keyFile = "/home/%s/.ssh/authorized_keys" % keyName + self.sshInstance( + "/usr/bin/sudo", "/usr/bin/cp", keyFile, "/home/core/.ssh", user=keyName + ) + self.sshInstance( + "/usr/bin/sudo", + "/usr/bin/chown", + "core", + "/home/core/.ssh/authorized_keys", + user=keyName, + ) def injectFile(self, fromFile, toFile, role): """ @@ -110,9 +127,13 @@ def injectFile(self, fromFile, toFile, role): self.coreRsync([fromFile, ":" + toFile], applianceName=role) return True except Exception as e: - logger.debug("Rsync to new node failed, trying again. Error message: %s" % e) + logger.debug( + "Rsync to new node failed, trying again. Error message: %s" % e + ) time.sleep(10 * retry) - raise RuntimeError(f"Failed to inject file {fromFile} to {role} with ip {self.effectiveIP}") + raise RuntimeError( + f"Failed to inject file {fromFile} to {role} with ip {self.effectiveIP}" + ) def extractFile(self, fromFile, toFile, role): """ @@ -124,74 +145,111 @@ def extractFile(self, fromFile, toFile, role): self.coreRsync([":" + fromFile, toFile], applianceName=role) return True except Exception as e: - logger.debug("Rsync from new node failed, trying again. Error message: %s" % e) + logger.debug( + "Rsync from new node failed, trying again. Error message: %s" % e + ) time.sleep(10 * retry) - raise RuntimeError(f"Failed to extract file {fromFile} from {role} with ip {self.effectiveIP}") + raise RuntimeError( + f"Failed to extract file {fromFile} from {role} with ip {self.effectiveIP}" + ) - def _waitForSSHKeys(self, keyName='core'): + def _waitForSSHKeys(self, keyName="core"): # the propagation of public ssh keys vs. opening the SSH port is racey, so this method blocks until # the keys are propagated and the instance can be SSH into start_time = time.time() last_error = None while True: if time.time() - start_time > self.maxWaitTime: - raise RuntimeError(f"Key propagation failed on machine with ip {self.effectiveIP}." + - ("\n\nMake sure that your public key is attached to your account and you are using " - "the correct private key. If you are using a key with a passphrase, be sure to " - "set up ssh-agent. For details, refer to " - "https://toil.readthedocs.io/en/latest/running/cloud/cloud.html." - if last_error and 'Permission denied' in last_error else "")) + raise RuntimeError( + f"Key propagation failed on machine with ip {self.effectiveIP}." + + ( + "\n\nMake sure that your public key is attached to your account and you are using " + "the correct private key. If you are using a key with a passphrase, be sure to " + "set up ssh-agent. For details, refer to " + "https://toil.readthedocs.io/en/latest/running/cloud/cloud.html." + if last_error and "Permission denied" in last_error + else "" + ) + ) try: - logger.info('Attempting to establish SSH connection to %s@%s...', keyName, self.effectiveIP) - self.sshInstance('ps', sshOptions=['-oBatchMode=yes'], user=keyName) + logger.info( + "Attempting to establish SSH connection to %s@%s...", + keyName, + self.effectiveIP, + ) + self.sshInstance("ps", sshOptions=["-oBatchMode=yes"], user=keyName) except RuntimeError as err: last_error = str(err) - logger.info('Connection rejected, waiting for public SSH key to be propagated. Trying again in 10s.') + logger.info( + "Connection rejected, waiting for public SSH key to be propagated. Trying again in 10s." + ) time.sleep(10) else: - logger.info('...SSH connection established.') + logger.info("...SSH connection established.") return - def _waitForDockerDaemon(self, keyName='core'): - logger.info('Waiting for docker on %s to start...', self.effectiveIP) + def _waitForDockerDaemon(self, keyName="core"): + logger.info("Waiting for docker on %s to start...", self.effectiveIP) sleepTime = 10 startTime = time.time() while True: if time.time() - startTime > self.maxWaitTime: - raise RuntimeError("Docker daemon failed to start on machine with ip %s" % self.effectiveIP) + raise RuntimeError( + "Docker daemon failed to start on machine with ip %s" + % self.effectiveIP + ) try: - output = self.sshInstance('/usr/bin/ps', 'auxww', sshOptions=['-oBatchMode=yes'], user=keyName) - if b'dockerd' in output: + output = self.sshInstance( + "/usr/bin/ps", "auxww", sshOptions=["-oBatchMode=yes"], user=keyName + ) + if b"dockerd" in output: # docker daemon has started - logger.info('Docker daemon running') + logger.info("Docker daemon running") break else: - logger.info('... Still waiting for docker daemon, trying in %s sec...' % sleepTime) + logger.info( + "... Still waiting for docker daemon, trying in %s sec..." + % sleepTime + ) time.sleep(sleepTime) except RuntimeError: logger.info("Wait for docker daemon failed ssh, trying again.") sleepTime += 20 - def _waitForAppliance(self, role, keyName='core'): - logger.info('Waiting for %s Toil appliance to start...', role) + def _waitForAppliance(self, role, keyName="core"): + logger.info("Waiting for %s Toil appliance to start...", role) sleepTime = 20 startTime = time.time() while True: if time.time() - startTime > self.maxWaitTime: - raise RuntimeError("Appliance failed to start on machine with IP: " + self.effectiveIP + - "\nCheck if TOIL_APPLIANCE_SELF is set correctly and the container exists.") + raise RuntimeError( + "Appliance failed to start on machine with IP: " + + self.effectiveIP + + "\nCheck if TOIL_APPLIANCE_SELF is set correctly and the container exists." + ) try: - output = self.sshInstance('/usr/bin/docker', 'ps', sshOptions=['-oBatchMode=yes'], user=keyName) - - role = bytes(role, encoding='utf-8') if type(role) != type(output) else role + output = self.sshInstance( + "/usr/bin/docker", + "ps", + sshOptions=["-oBatchMode=yes"], + user=keyName, + ) + + role = ( + bytes(role, encoding="utf-8") + if type(role) != type(output) + else role + ) if role in output: - logger.info('...Toil appliance started') + logger.info("...Toil appliance started") break else: - logger.info('...Still waiting for appliance, trying again in %s sec...' % sleepTime) - logger.debug(f'Role: {role}\n' - f'Output: {output}\n\n') + logger.info( + "...Still waiting for appliance, trying again in %s sec..." + % sleepTime + ) + logger.debug(f"Role: {role}\n" f"Output: {output}\n\n") time.sleep(sleepTime) except RuntimeError: # ignore exceptions, keep trying @@ -205,13 +263,13 @@ def _waitForSSHPort(self): :return: the number of unsuccessful attempts to connect to the port before a the first success """ - logger.debug('Waiting for ssh port on %s to open...', self.effectiveIP) + logger.debug("Waiting for ssh port on %s to open...", self.effectiveIP) for i in count(): s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) try: s.settimeout(a_short_time) s.connect((self.effectiveIP, 22)) - logger.debug('...ssh port open') + logger.debug("...ssh port open") return i except OSError: pass @@ -225,7 +283,7 @@ def sshAppliance(self, *args, **kwargs): interactive SSHing. The default value is False. Input=string is passed as input to the Popen call. """ - kwargs['appliance'] = True + kwargs["appliance"] = True return self.coreSSH(*args, **kwargs) def sshInstance(self, *args, **kwargs): @@ -233,7 +291,7 @@ def sshInstance(self, *args, **kwargs): Run a command on the instance. Returns the binary output of the command. """ - kwargs['collectStdout'] = True + kwargs["collectStdout"] = True return self.coreSSH(*args, **kwargs) def coreSSH(self, *args, **kwargs): @@ -249,64 +307,74 @@ def coreSSH(self, *args, **kwargs): :param bytes input: UTF-8 encoded input bytes to send to the command """ - commandTokens = ['ssh', '-tt'] - if not kwargs.pop('strict', False): - kwargs['sshOptions'] = ['-oUserKnownHostsFile=/dev/null', '-oStrictHostKeyChecking=no'] + kwargs.get( - 'sshOptions', []) - sshOptions = kwargs.pop('sshOptions', None) + commandTokens = ["ssh", "-tt"] + if not kwargs.pop("strict", False): + kwargs["sshOptions"] = [ + "-oUserKnownHostsFile=/dev/null", + "-oStrictHostKeyChecking=no", + ] + kwargs.get("sshOptions", []) + sshOptions = kwargs.pop("sshOptions", None) # Forward ports: # 5050 for Mesos dashboard (although to talk to agents you will need a proxy) - commandTokens.extend(['-L', '5050:localhost:5050']) + commandTokens.extend(["-L", "5050:localhost:5050"]) if sshOptions: # add specified options to ssh command assert isinstance(sshOptions, list) commandTokens.extend(sshOptions) # specify host - user = kwargs.pop('user', 'core') # CHANGED: Is this needed? - commandTokens.append(f'{user}@{str(self.effectiveIP)}') + user = kwargs.pop("user", "core") # CHANGED: Is this needed? + commandTokens.append(f"{user}@{str(self.effectiveIP)}") - inputString = kwargs.pop('input', None) + inputString = kwargs.pop("input", None) if inputString is not None: - kwargs['stdin'] = subprocess.PIPE + kwargs["stdin"] = subprocess.PIPE - if kwargs.pop('collectStdout', None): - kwargs['stdout'] = subprocess.PIPE - kwargs['stderr'] = subprocess.PIPE + if kwargs.pop("collectStdout", None): + kwargs["stdout"] = subprocess.PIPE + kwargs["stderr"] = subprocess.PIPE - tty = kwargs.pop('tty', None) - if kwargs.pop('appliance', None): - ttyFlag = '-t' if tty else '' - commandTokens += ['docker', 'exec', '-i', ttyFlag, 'toil_leader'] + tty = kwargs.pop("tty", None) + if kwargs.pop("appliance", None): + ttyFlag = "-t" if tty else "" + commandTokens += ["docker", "exec", "-i", ttyFlag, "toil_leader"] - logger.debug('Node %s: %s', self.effectiveIP, ' '.join(args)) + logger.debug("Node %s: %s", self.effectiveIP, " ".join(args)) args = list(map(quote, args)) commandTokens += args - logger.debug('Full command %s', ' '.join(commandTokens)) + logger.debug("Full command %s", " ".join(commandTokens)) process = subprocess.Popen(commandTokens, **kwargs) stdout, stderr = process.communicate(input=inputString) # at this point the process has already exited, no need for a timeout exit_code = process.returncode # ssh has been throwing random 255 errors - why? if exit_code != 0: - logger.info('Executing the command "%s" on the appliance returned a non-zero ' - 'exit code %s with stdout %s and stderr %s' - % (' '.join(args), exit_code, stdout, stderr)) - raise RuntimeError('Executing the command "%s" on the appliance returned a non-zero ' - 'exit code %s with stdout %s and stderr %s' - % (' '.join(args), exit_code, stdout, stderr)) + logger.info( + 'Executing the command "%s" on the appliance returned a non-zero ' + "exit code %s with stdout %s and stderr %s" + % (" ".join(args), exit_code, stdout, stderr) + ) + raise RuntimeError( + 'Executing the command "%s" on the appliance returned a non-zero ' + "exit code %s with stdout %s and stderr %s" + % (" ".join(args), exit_code, stdout, stderr) + ) return stdout - def coreRsync(self, args: List[str], applianceName: str = 'toil_leader', **kwargs: Any) -> int: - remoteRsync = "docker exec -i %s rsync -v" % applianceName # Access rsync inside appliance + def coreRsync( + self, args: list[str], applianceName: str = "toil_leader", **kwargs: Any + ) -> int: + remoteRsync = ( + "docker exec -i %s rsync -v" % applianceName + ) # Access rsync inside appliance parsedArgs = [] sshCommand = "ssh" - if not kwargs.pop('strict', False): + if not kwargs.pop("strict", False): sshCommand = "ssh -oUserKnownHostsFile=/dev/null -oStrictHostKeyChecking=no" hostInserted = False # Insert remote host address for i in args: if i.startswith(":") and not hostInserted: - user = kwargs.pop('user', 'core') # CHANGED: Is this needed? + user = kwargs.pop("user", "core") # CHANGED: Is this needed? i = (f"{user}@{self.effectiveIP}") + i hostInserted = True elif i.startswith(":") and hostInserted: @@ -314,7 +382,7 @@ def coreRsync(self, args: List[str], applianceName: str = 'toil_leader', **kwarg parsedArgs.append(i) if not hostInserted: raise ValueError("No remote host found in argument list") - command = ['rsync', '-e', sshCommand, '--rsync-path', remoteRsync] + command = ["rsync", "-e", sshCommand, "--rsync-path", remoteRsync] logger.debug("Running %r.", command + parsedArgs) return subprocess.check_call(command + parsedArgs) diff --git a/src/toil/realtimeLogger.py b/src/toil/realtimeLogger.py index 53738960ca..2cc6336955 100644 --- a/src/toil/realtimeLogger.py +++ b/src/toil/realtimeLogger.py @@ -20,7 +20,7 @@ import socketserver as SocketServer import threading from types import TracebackType -from typing import TYPE_CHECKING, Any, Optional, Type +from typing import TYPE_CHECKING, Any, Optional from toil.lib.misc import get_public_ip from toil.statsAndLogging import set_log_level @@ -49,7 +49,7 @@ def handle(self) -> None: try: # Parse it as JSON - message_attrs = json.loads(data.decode('utf-8')) + message_attrs = json.loads(data.decode("utf-8")) # Fluff it up into a proper logging record record = logging.makeLogRecord(message_attrs) if isinstance(record.args, list): @@ -81,7 +81,7 @@ class JSONDatagramHandler(logging.handlers.DatagramHandler): def makePickle(self, record: logging.LogRecord) -> bytes: """Actually, encode the record as bare JSON instead.""" - return json.dumps(record.__dict__).encode('utf-8') + return json.dumps(record.__dict__).encode("utf-8") class RealtimeLoggerMetaclass(type): @@ -113,7 +113,7 @@ class RealtimeLogger(metaclass=RealtimeLoggerMetaclass): envPrefix = "TOIL_RT_LOGGING_" # Avoid duplicating the default level everywhere - defaultLevel = 'INFO' + defaultLevel = "INFO" # State maintained on server and client @@ -131,19 +131,24 @@ class RealtimeLogger(metaclass=RealtimeLoggerMetaclass): logger = None @classmethod - def _startLeader(cls, batchSystem: 'AbstractBatchSystem', level: str = defaultLevel) -> None: + def _startLeader( + cls, batchSystem: "AbstractBatchSystem", level: str = defaultLevel + ) -> None: with cls.lock: if cls.initialized == 0: cls.initialized += 1 if level: - logger.info('Starting real-time logging.') + logger.info("Starting real-time logging.") # Start up the logging server cls.loggingServer = SocketServer.ThreadingUDPServer( - server_address=('0.0.0.0', 0), - RequestHandlerClass=LoggingDatagramHandler) + server_address=("0.0.0.0", 0), + RequestHandlerClass=LoggingDatagramHandler, + ) # Set up a thread to do all the serving in the background and exit when we do - cls.serverThread = threading.Thread(target=cls.loggingServer.serve_forever) + cls.serverThread = threading.Thread( + target=cls.loggingServer.serve_forever + ) cls.serverThread.daemon = True cls.serverThread.start() @@ -156,28 +161,30 @@ def _setEnv(name: str, value: str) -> None: os.environ[name] = value batchSystem.setEnv(name) - _setEnv('ADDRESS', '%s:%i' % (ip, port)) - _setEnv('LEVEL', level) + _setEnv("ADDRESS", "%s:%i" % (ip, port)) + _setEnv("LEVEL", level) else: - logger.debug('Real-time logging disabled') + logger.debug("Real-time logging disabled") else: if level: - logger.warning('Ignoring nested request to start real-time logging') + logger.warning("Ignoring nested request to start real-time logging") @classmethod def _stopLeader(cls) -> None: """Stop the server on the leader.""" with cls.lock: if cls.initialized == 0: - raise RuntimeError("Can't stop the server on the leader as the leader was never initialized.") + raise RuntimeError( + "Can't stop the server on the leader as the leader was never initialized." + ) cls.initialized -= 1 if cls.initialized == 0: if cls.loggingServer: - logger.info('Stopping real-time logging server.') + logger.info("Stopping real-time logging server.") cls.loggingServer.shutdown() cls.loggingServer = None if cls.serverThread: - logger.info('Joining real-time logging server thread.') + logger.info("Joining real-time logging server thread.") cls.serverThread.join() cls.serverThread = None for k in list(os.environ.keys()): @@ -198,9 +205,9 @@ def getLogger(cls) -> logging.Logger: if cls.logger is None: with cls.lock: if cls.logger is None: - cls.logger = logging.getLogger('toil-rt') + cls.logger = logging.getLogger("toil-rt") try: - level = os.environ[cls.envPrefix + 'LEVEL'] + level = os.environ[cls.envPrefix + "LEVEL"] except KeyError: # There is no server running on the leader, so suppress most log messages # and skip the UDP stuff. @@ -209,16 +216,16 @@ def getLogger(cls) -> logging.Logger: # Adopt the logging level set on the leader. set_log_level(level, cls.logger) try: - address = os.environ[cls.envPrefix + 'ADDRESS'] + address = os.environ[cls.envPrefix + "ADDRESS"] except KeyError: pass else: # We know where to send messages to, so send them. - host, port = address.split(':') + host, port = address.split(":") cls.logger.addHandler(JSONDatagramHandler(host, int(port))) return cls.logger - def __init__(self, batchSystem: 'AbstractBatchSystem', level: str = defaultLevel): + def __init__(self, batchSystem: "AbstractBatchSystem", level: str = defaultLevel): """ Create a context manager that starts up the UDP server. @@ -237,5 +244,10 @@ def __enter__(self) -> None: RealtimeLogger._startLeader(self.__batchSystem, level=self.__level) # noinspection PyUnusedLocal - def __exit__(self, exc_type: Optional[Type[BaseException]], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType]) -> None: + def __exit__( + self, + exc_type: Optional[type[BaseException]], + exc_val: Optional[BaseException], + exc_tb: Optional[TracebackType], + ) -> None: RealtimeLogger._stopLeader() diff --git a/src/toil/resource.py b/src/toil/resource.py index d48150aba3..5156e69f72 100644 --- a/src/toil/resource.py +++ b/src/toil/resource.py @@ -20,17 +20,12 @@ import shutil import sys from collections import namedtuple +from collections.abc import Sequence from contextlib import closing from io import BytesIO from pydoc import locate from types import ModuleType -from typing import (IO, - TYPE_CHECKING, - BinaryIO, - Callable, - Optional, - Sequence, - Type) +from typing import IO, TYPE_CHECKING, BinaryIO, Callable, Optional from urllib.error import HTTPError from urllib.request import urlopen from zipfile import ZipFile @@ -47,7 +42,8 @@ logger = logging.getLogger(__name__) -class Resource(namedtuple('Resource', ('name', 'pathHash', 'url', 'contentHash'))): + +class Resource(namedtuple("Resource", ("name", "pathHash", "url", "contentHash"))): """ Represents a file or directory that will be deployed to each node before any jobs in the user script are invoked. @@ -67,9 +63,9 @@ class Resource(namedtuple('Resource', ('name', 'pathHash', 'url', 'contentHash') ZIP archive of that directory. """ - resourceEnvNamePrefix = 'JTRES_' + resourceEnvNamePrefix = "JTRES_" - rootDirPathEnvName = resourceEnvNamePrefix + 'ROOT' + rootDirPathEnvName = resourceEnvNamePrefix + "ROOT" @classmethod def create(cls, jobStore: "AbstractJobStore", leaderPath: str) -> "Resource": @@ -86,20 +82,26 @@ def create(cls, jobStore: "AbstractJobStore", leaderPath: str) -> "Resource": contentHash = hashlib.md5() # noinspection PyProtectedMember with cls._load(leaderPath) as src: - with jobStore.write_shared_file_stream(shared_file_name=pathHash, encrypted=False) as dst: + with jobStore.write_shared_file_stream( + shared_file_name=pathHash, encrypted=False + ) as dst: userScript = src.read() contentHash.update(userScript) dst.write(userScript) - return cls(name=os.path.basename(leaderPath), - pathHash=pathHash, - url=jobStore.getSharedPublicUrl(sharedFileName=pathHash), - contentHash=contentHash.hexdigest()) + return cls( + name=os.path.basename(leaderPath), + pathHash=pathHash, + url=jobStore.getSharedPublicUrl(sharedFileName=pathHash), + contentHash=contentHash.hexdigest(), + ) def refresh(self, jobStore: "AbstractJobStore") -> "Resource": - return type(self)(name=self.name, - pathHash=self.pathHash, - url=jobStore.get_shared_public_url(shared_file_name=self.pathHash), - contentHash=self.contentHash) + return type(self)( + name=self.name, + pathHash=self.pathHash, + url=jobStore.get_shared_public_url(shared_file_name=self.pathHash), + contentHash=self.contentHash, + ) @classmethod def prepareSystem(cls) -> None: @@ -165,7 +167,9 @@ def download(self, callback: Optional[Callable[[str], None]] = None) -> None: """ dirPath = self.localDirPath if not os.path.exists(dirPath): - tempDirPath = mkdtemp(dir=os.path.dirname(dirPath), prefix=self.contentHash + "-") + tempDirPath = mkdtemp( + dir=os.path.dirname(dirPath), prefix=self.contentHash + "-" + ) self._save(tempDirPath) if callback is not None: callback(tempDirPath) @@ -199,16 +203,22 @@ def localDirPath(self) -> str: return os.path.join(rootDirPath, self.contentHash) def pickle(self) -> str: - return self.__class__.__module__ + "." + self.__class__.__name__ + ':' + json.dumps(self) + return ( + self.__class__.__module__ + + "." + + self.__class__.__name__ + + ":" + + json.dumps(self) + ) @classmethod def unpickle(cls, s: str) -> "Resource": - className, _json = s.split(':', 1) - return locate(className)(*json.loads(_json)) # type: ignore + className, _json = s.split(":", 1) + return locate(className)(*json.loads(_json)) # type: ignore @classmethod def _pathHash(cls, path: str) -> str: - return hashlib.md5(path.encode('utf-8')).hexdigest() + return hashlib.md5(path.encode("utf-8")).hexdigest() @classmethod def _load(cls, path: str) -> IO[bytes]: @@ -230,11 +240,7 @@ def _save(self, dirPath: str) -> None: """ raise NotImplementedError() - @retry(errors=[ - ErrorCondition( - error=HTTPError, - error_codes=[400]) - ]) + @retry(errors=[ErrorCondition(error=HTTPError, error_codes=[400])]) def _download(self, dstFile: IO[bytes]) -> None: """ Download this resource from its URL to the given file object. @@ -254,10 +260,10 @@ class FileResource(Resource): @classmethod def _load(cls, path: str) -> BinaryIO: - return open(path, 'rb') + return open(path, "rb") def _save(self, dirPath: str) -> None: - with open(os.path.join(dirPath, self.name), mode='wb') as localFile: + with open(os.path.join(dirPath, self.name), mode="wb") as localFile: self._download(localFile) @property @@ -277,7 +283,7 @@ class DirectoryResource(Resource): @classmethod def _load(cls, path: str) -> BytesIO: bytesIO = BytesIO() - initfile = os.path.join(path, '__init__.py') + initfile = os.path.join(path, "__init__.py") if os.path.isfile(initfile): # This is a package directory. To emulate # PyZipFile.writepy's behavior, we need to keep everything @@ -286,20 +292,37 @@ def _load(cls, path: str) -> BytesIO: else: # This is a simple user script (with possibly a few helper files) rootDir = path - skipdirList = ['/tmp', '/var', '/etc', '/bin', '/sbin', '/home', '/dev', '/sys', '/usr', '/run'] + skipdirList = [ + "/tmp", + "/var", + "/etc", + "/bin", + "/sbin", + "/home", + "/dev", + "/sys", + "/usr", + "/run", + ] if path not in skipdirList: - with ZipFile(file=bytesIO, mode='w') as zipFile: + with ZipFile(file=bytesIO, mode="w") as zipFile: for dirName, _, fileList in os.walk(path): for fileName in fileList: try: fullPath = os.path.join(dirName, fileName) zipFile.write(fullPath, os.path.relpath(fullPath, rootDir)) except OSError: - logger.critical('Cannot access and read the file at path: %s' % fullPath) + logger.critical( + "Cannot access and read the file at path: %s" % fullPath + ) sys.exit(1) else: - logger.critical("Couldn't package the directory at {} for hot deployment. Would recommend to create a \ - subdirectory (ie {}/MYDIR_HERE/)".format(path, path)) + logger.critical( + "Couldn't package the directory at {} for hot deployment. Would recommend to create a \ + subdirectory (ie {}/MYDIR_HERE/)".format( + path, path + ) + ) sys.exit(1) bytesIO.seek(0) return bytesIO @@ -308,7 +331,7 @@ def _save(self, dirPath: str) -> None: bytesIO = BytesIO() self._download(bytesIO) bytesIO.seek(0) - with ZipFile(file=bytesIO, mode='r') as zipFile: + with ZipFile(file=bytesIO, mode="r") as zipFile: zipFile.extractall(path=dirPath) @property @@ -325,10 +348,10 @@ class VirtualEnvResource(DirectoryResource): @classmethod def _load(cls, path: str) -> BytesIO: - if os.path.basename(path) != 'site-packages': + if os.path.basename(path) != "site-packages": raise RuntimeError("An incorrect path was passed through.") bytesIO = BytesIO() - with ZipFile(file=bytesIO, mode='w') as zipFile: + with ZipFile(file=bytesIO, mode="w") as zipFile: for dirName, _, fileList in os.walk(path): zipFile.write(dirName) for fileName in fileList: @@ -338,7 +361,9 @@ def _load(cls, path: str) -> BytesIO: return bytesIO -class ModuleDescriptor(namedtuple('ModuleDescriptor', ('dirPath', 'name', 'fromVirtualEnv'))): +class ModuleDescriptor( + namedtuple("ModuleDescriptor", ("dirPath", "name", "fromVirtualEnv")) +): """ A path to a Python module decomposed into a namedtuple of three elements @@ -378,6 +403,7 @@ class ModuleDescriptor(namedtuple('ModuleDescriptor', ('dirPath', 'name', 'fromV Clean up >>> rmtree( dirPath ) """ + dirPath: str name: str @@ -392,7 +418,7 @@ def forModule(cls, name: str) -> "ModuleDescriptor": """ module = sys.modules[name] if module.__file__ is None: - raise Exception(f'Module {name} does not exist.') + raise Exception(f"Module {name} does not exist.") fileAbsPath = os.path.abspath(module.__file__) filePath = fileAbsPath.split(os.path.sep) filePath[-1], extension = os.path.splitext(filePath[-1]) @@ -405,12 +431,12 @@ def forModule(cls, name: str) -> "ModuleDescriptor": # Invoked as a module via python -m foo.bar logger.debug("Script was invoked as a module") nameList = [filePath.pop()] - for package in reversed(module.__package__.split('.')): + for package in reversed(module.__package__.split(".")): dirPathTail = filePath.pop() if dirPathTail != package: raise RuntimeError("Incorrect path to package.") nameList.append(dirPathTail) - name = '.'.join(reversed(nameList)) + name = ".".join(reversed(nameList)) dirPath = os.path.sep.join(filePath) else: # Invoked as a script via python foo/bar.py @@ -419,20 +445,27 @@ def forModule(cls, name: str) -> "ModuleDescriptor": cls._check_conflict(dirPath, name) else: # User module was imported. Determine the directory containing the top-level package - if filePath[-1] == '__init__': + if filePath[-1] == "__init__": # module is a subpackage filePath.pop() - for package in reversed(name.split('.')): + for package in reversed(name.split(".")): dirPathTail = filePath.pop() if dirPathTail != package: raise RuntimeError("Incorrect path to package.") dirPath = os.path.abspath(os.path.sep.join(filePath)) absPrefix = os.path.abspath(sys.prefix) inVenv = inVirtualEnv() - logger.debug("Module dir is %s, our prefix is %s, virtualenv: %s", dirPath, absPrefix, inVenv) + logger.debug( + "Module dir is %s, our prefix is %s, virtualenv: %s", + dirPath, + absPrefix, + inVenv, + ) if not os.path.isdir(dirPath): - raise Exception(f'Bad directory path {dirPath} for module {name}. Note that hot-deployment does not support .egg-link files yet, or scripts located in the root directory.') + raise Exception( + f"Bad directory path {dirPath} for module {name}. Note that hot-deployment does not support .egg-link files yet, or scripts located in the root directory." + ) fromVirtualEnv = inVenv and dirPath.startswith(absPrefix) return cls(dirPath=dirPath, name=name, fromVirtualEnv=fromVirtualEnv) @@ -446,7 +479,11 @@ def _check_conflict(cls, dirPath: str, name: str) -> None: """ old_sys_path = sys.path try: - sys.path = [d for d in old_sys_path if os.path.realpath(d) != os.path.realpath(dirPath)] + sys.path = [ + d + for d in old_sys_path + if os.path.realpath(d) != os.path.realpath(dirPath) + ] try: colliding_module = importlib.import_module(name) except ImportError: @@ -454,7 +491,9 @@ def _check_conflict(cls, dirPath: str, name: str) -> None: else: raise ResourceException( "The user module '{}' collides with module '{} from '{}'.".format( - name, colliding_module.__name__, colliding_module.__file__)) + name, colliding_module.__name__, colliding_module.__file__ + ) + ) finally: sys.path = old_sys_path @@ -463,7 +502,7 @@ def belongsToToil(self) -> bool: """ True if this module is part of the Toil distribution """ - return self.name.startswith('toil.') + return self.name.startswith("toil.") def saveAsResourceTo(self, jobStore: "AbstractJobStore") -> Resource: """ @@ -475,11 +514,11 @@ def saveAsResourceTo(self, jobStore: "AbstractJobStore") -> Resource: """ return self._getResourceClass().create(jobStore, self._resourcePath) - def _getResourceClass(self) -> Type[Resource]: + def _getResourceClass(self) -> type[Resource]: """ Return the concrete subclass of Resource that's appropriate for auto-deploying this module. """ - subcls: Type[Resource] + subcls: type[Resource] if self.fromVirtualEnv: subcls = VirtualEnvResource elif os.path.isdir(self._resourcePath): @@ -487,7 +526,9 @@ def _getResourceClass(self) -> Type[Resource]: elif os.path.isfile(self._resourcePath): subcls = FileResource elif os.path.exists(self._resourcePath): - raise AssertionError("Neither a file or a directory: '%s'" % self._resourcePath) + raise AssertionError( + "Neither a file or a directory: '%s'" % self._resourcePath + ) else: raise AssertionError("No such file or directory: '%s'" % self._resourcePath) return subcls @@ -501,27 +542,30 @@ def localize(self) -> "ModuleDescriptor": the leader, this method returns this resource, i.e. self. """ if not self._runningOnWorker(): - logger.warning('The localize() method should only be invoked on a worker.') + logger.warning("The localize() method should only be invoked on a worker.") resource = Resource.lookup(self._resourcePath) if resource is None: return self else: + def stash(tmpDirPath: str) -> None: # Save the original dirPath such that we can restore it in globalize() - with open(os.path.join(tmpDirPath, '.stash'), 'w') as f: - f.write('1' if self.fromVirtualEnv else '0') + with open(os.path.join(tmpDirPath, ".stash"), "w") as f: + f.write("1" if self.fromVirtualEnv else "0") f.write(self.dirPath) resource.download(callback=stash) - return self.__class__(dirPath=resource.localDirPath, - name=self.name, - fromVirtualEnv=self.fromVirtualEnv) + return self.__class__( + dirPath=resource.localDirPath, + name=self.name, + fromVirtualEnv=self.fromVirtualEnv, + ) def _runningOnWorker(self) -> bool: try: - mainModule = sys.modules['__main__'] + mainModule = sys.modules["__main__"] except KeyError: - logger.warning('Cannot determine main program module.') + logger.warning("Cannot determine main program module.") return False else: # If __file__ is not a valid attribute, it's because @@ -535,7 +579,12 @@ def _runningOnWorker(self) -> bool: except AttributeError: return False - workerModuleFiles = ['worker.py', 'worker.pyc', 'worker.pyo', '_toil_worker'] # setuptools entry point + workerModuleFiles = [ + "worker.py", + "worker.pyc", + "worker.pyo", + "_toil_worker", + ] # setuptools entry point return mainModuleFile in workerModuleFiles def globalize(self) -> "ModuleDescriptor": @@ -543,7 +592,7 @@ def globalize(self) -> "ModuleDescriptor": Reverse the effect of localize(). """ try: - with open(os.path.join(self.dirPath, '.stash')) as f: + with open(os.path.join(self.dirPath, ".stash")) as f: fromVirtualEnv = [False, True][int(f.read(1))] dirPath = f.read() except OSError as e: @@ -552,9 +601,9 @@ def globalize(self) -> "ModuleDescriptor": else: raise else: - return self.__class__(dirPath=dirPath, - name=self.name, - fromVirtualEnv=fromVirtualEnv) + return self.__class__( + dirPath=dirPath, name=self.name, fromVirtualEnv=fromVirtualEnv + ) @property def _resourcePath(self) -> str: @@ -564,7 +613,7 @@ def _resourcePath(self) -> str: """ if self.fromVirtualEnv: return self.dirPath - elif '.' in self.name: + elif "." in self.name: return os.path.join(self.dirPath, self._rootPackage()) else: initName = self._initModuleName(self.dirPath) @@ -572,22 +621,31 @@ def _resourcePath(self) -> str: raise ResourceException( "Toil does not support loading a user script from a package directory. You " "may want to remove %s from %s or invoke the user script as a module via " - "'PYTHONPATH=\"%s\" %s -m %s.%s'." % - tuple(concat(initName, self.dirPath, exactPython, os.path.split(self.dirPath), self.name))) + "'PYTHONPATH=\"%s\" %s -m %s.%s'." + % tuple( + concat( + initName, + self.dirPath, + exactPython, + os.path.split(self.dirPath), + self.name, + ) + ) + ) return self.dirPath @classmethod def _initModuleName(cls, dirPath: str) -> Optional[str]: - for name in ('__init__.py', '__init__.pyc', '__init__.pyo'): + for name in ("__init__.py", "__init__.pyc", "__init__.pyo"): if os.path.exists(os.path.join(dirPath, name)): return name return None def _rootPackage(self) -> str: try: - head, tail = self.name.split('.', 1) + head, tail = self.name.split(".", 1) except ValueError: - raise ValueError('%r is stand-alone module.' % self.__repr__()) + raise ValueError("%r is stand-alone module." % self.__repr__()) else: return head @@ -598,7 +656,9 @@ def toCommand(self) -> Sequence[str]: def fromCommand(cls, command: Sequence[str]) -> "ModuleDescriptor": if len(command) != 3: raise RuntimeError("Incorrect number of arguments (Expected 3).") - return cls(dirPath=command[0], name=command[1], fromVirtualEnv=strict_bool(command[2])) + return cls( + dirPath=command[0], name=command[1], fromVirtualEnv=strict_bool(command[2]) + ) def makeLoadable(self) -> "ModuleDescriptor": module = self if self.belongsToToil else self.localize() @@ -611,7 +671,9 @@ def load(self) -> Optional[ModuleType]: try: return importlib.import_module(module.name) except ImportError: - logger.error('Failed to import user module %r from sys.path (%r).', module, sys.path) + logger.error( + "Failed to import user module %r from sys.path (%r).", module, sys.path + ) raise diff --git a/src/toil/server/app.py b/src/toil/server/app.py index ec06ccdff2..77260c4b67 100644 --- a/src/toil/server/app.py +++ b/src/toil/server/app.py @@ -29,49 +29,103 @@ def parser_with_server_options() -> argparse.ArgumentParser: parser = ArgumentParser(description="Toil server mode.") - parser.add_argument("--debug", action="store_true", default=False, - help="Enable debug mode.") - parser.add_argument("--bypass_celery", action="store_true", default=False, - help="Skip sending workflows to Celery and just run them under the " - "server. For testing.") - parser.add_argument("--host", type=str, default="127.0.0.1", - help="The host interface that the Toil server binds on. (default: '127.0.0.1').") - parser.add_argument("--port", type=int, default=8080, - help="The port that the Toil server listens on. (default: 8080).") - parser.add_argument("--swagger_ui", action="store_true", default=False, - help="If True, the swagger UI will be enabled and hosted on the " - "`{api_base_path}/ui` endpoint. (default: False)") + parser.add_argument( + "--debug", action="store_true", default=False, help="Enable debug mode." + ) + parser.add_argument( + "--bypass_celery", + action="store_true", + default=False, + help="Skip sending workflows to Celery and just run them under the " + "server. For testing.", + ) + parser.add_argument( + "--host", + type=str, + default="127.0.0.1", + help="The host interface that the Toil server binds on. (default: '127.0.0.1').", + ) + parser.add_argument( + "--port", + type=int, + default=8080, + help="The port that the Toil server listens on. (default: 8080).", + ) + parser.add_argument( + "--swagger_ui", + action="store_true", + default=False, + help="If True, the swagger UI will be enabled and hosted on the " + "`{api_base_path}/ui` endpoint. (default: False)", + ) # CORS - parser.add_argument("--cors", action="store_true", default=False, - help="Enable Cross Origin Resource Sharing (CORS). This should only be turned on " - "if the server is intended to be used by a website or domain.") - parser.add_argument("--cors_origins", type=str, default="*", - help="Ignored if --cors is False. This sets the allowed origins for CORS. " - "For details about CORS and its security risks, see: " - "https://w3id.org/ga4gh/product-approval-support/cors. (default: '*')") + parser.add_argument( + "--cors", + action="store_true", + default=False, + help="Enable Cross Origin Resource Sharing (CORS). This should only be turned on " + "if the server is intended to be used by a website or domain.", + ) + parser.add_argument( + "--cors_origins", + type=str, + default="*", + help="Ignored if --cors is False. This sets the allowed origins for CORS. " + "For details about CORS and its security risks, see: " + "https://w3id.org/ga4gh/product-approval-support/cors. (default: '*')", + ) # production only - parser.add_argument("-w", "--workers", dest='workers', type=int, default=2, - help="Ignored if --debug is True. The number of worker processes launched by the " - "WSGI server. (default: 2).") - - parser.add_argument("--work_dir", type=str, default=os.path.join(os.getcwd(), "workflows"), - help="The directory where workflows should be stored. This directory should be " - "empty or only contain previous workflows. (default: './workflows').") - parser.add_argument("--state_store", type=str, default=None, - help="The local path or S3 URL where workflow state metadata should be stored. " - "(default: in --work_dir)") - parser.add_argument("--opt", "-o", type=str, action="append", default=[], - help="Specify the default parameters to be sent to the workflow engine for each " - "run. Options taking arguments must use = syntax. Accepts multiple values.\n" - "Example: '--opt=--logLevel=CRITICAL --opt=--workDir=/tmp'.") - parser.add_argument("--dest_bucket_base", type=str, default=None, - help="Direct CWL workflows to save output files to dynamically generated " - "unique paths under the given URL. Supports AWS S3.") - parser.add_argument("--wes_dialect", type=str, default="standard", choices=["standard", "agc"], - help="Restrict WES responses to a dialect compatible with clients that do not fully " - "implement the WES standard. (default: 'standard')") - - parser.add_argument("--version", action='version', version=version) + parser.add_argument( + "-w", + "--workers", + dest="workers", + type=int, + default=2, + help="Ignored if --debug is True. The number of worker processes launched by the " + "WSGI server. (default: 2).", + ) + + parser.add_argument( + "--work_dir", + type=str, + default=os.path.join(os.getcwd(), "workflows"), + help="The directory where workflows should be stored. This directory should be " + "empty or only contain previous workflows. (default: './workflows').", + ) + parser.add_argument( + "--state_store", + type=str, + default=None, + help="The local path or S3 URL where workflow state metadata should be stored. " + "(default: in --work_dir)", + ) + parser.add_argument( + "--opt", + "-o", + type=str, + action="append", + default=[], + help="Specify the default parameters to be sent to the workflow engine for each " + "run. Options taking arguments must use = syntax. Accepts multiple values.\n" + "Example: '--opt=--logLevel=CRITICAL --opt=--workDir=/tmp'.", + ) + parser.add_argument( + "--dest_bucket_base", + type=str, + default=None, + help="Direct CWL workflows to save output files to dynamically generated " + "unique paths under the given URL. Supports AWS S3.", + ) + parser.add_argument( + "--wes_dialect", + type=str, + default="standard", + choices=["standard", "agc"], + help="Restrict WES responses to a dialect compatible with clients that do not fully " + "implement the WES standard. (default: 'standard')", + ) + + parser.add_argument("--version", action="version", version=version) return parser @@ -79,34 +133,43 @@ def create_app(args: argparse.Namespace) -> "connexion.FlaskApp": """ Create a "connexion.FlaskApp" instance with Toil server configurations. """ - flask_app = connexion.FlaskApp(__name__, - specification_dir='api_spec/', - options={"swagger_ui": args.swagger_ui}) + flask_app = connexion.FlaskApp( + __name__, specification_dir="api_spec/", options={"swagger_ui": args.swagger_ui} + ) - flask_app.app.config['JSON_SORT_KEYS'] = False + flask_app.app.config["JSON_SORT_KEYS"] = False if args.cors: # enable cross origin resource sharing from flask_cors import CORS + CORS(flask_app.app, resources={r"/ga4gh/*": {"origins": args.cors_origins}}) # add workflow execution service (WES) API endpoints - backend = ToilBackend(work_dir=args.work_dir, - state_store=args.state_store, - options=args.opt, - dest_bucket_base=args.dest_bucket_base, - bypass_celery=args.bypass_celery, - wes_dialect=args.wes_dialect) - - flask_app.add_api('workflow_execution_service.swagger.yaml', - resolver=connexion.Resolver(backend.resolve_operation_id)) # noqa + backend = ToilBackend( + work_dir=args.work_dir, + state_store=args.state_store, + options=args.opt, + dest_bucket_base=args.dest_bucket_base, + bypass_celery=args.bypass_celery, + wes_dialect=args.wes_dialect, + ) + + flask_app.add_api( + "workflow_execution_service.swagger.yaml", + resolver=connexion.Resolver(backend.resolve_operation_id), + ) # noqa # add custom endpoints if isinstance(backend, ToilBackend): # We extend the WES API to allow presenting log data base_url = "/toil/wes/v1" - flask_app.app.add_url_rule(f"{base_url}/logs//stdout", view_func=backend.get_stdout) - flask_app.app.add_url_rule(f"{base_url}/logs//stderr", view_func=backend.get_stderr) + flask_app.app.add_url_rule( + f"{base_url}/logs//stdout", view_func=backend.get_stdout + ) + flask_app.app.add_url_rule( + f"{base_url}/logs//stderr", view_func=backend.get_stderr + ) # To be a well-behaved AGC engine we can implement the default status check endpoint flask_app.app.add_url_rule("/engine/v1/status", view_func=backend.get_health) # And we can provide lost humans some information on what they are looking at @@ -116,7 +179,7 @@ def create_app(args: argparse.Namespace) -> "connexion.FlaskApp": def start_server(args: argparse.Namespace) -> None: - """ Start a Toil server.""" + """Start a Toil server.""" # Explain a bit about who and where we are logger.info("Toil WES server version %s starting...", version) @@ -137,7 +200,10 @@ def start_server(args: argparse.Namespace) -> None: flask_app.run(host=host, port=port) else: # start a production WSGI server - run_app(flask_app.app, options={ - "bind": f"{host}:{port}", - "workers": args.workers, - }) + run_app( + flask_app.app, + options={ + "bind": f"{host}:{port}", + "workers": args.workers, + }, + ) diff --git a/src/toil/server/celery_app.py b/src/toil/server/celery_app.py index 2d8984fbec..9da15b2a4d 100644 --- a/src/toil/server/celery_app.py +++ b/src/toil/server/celery_app.py @@ -11,7 +11,9 @@ def create_celery_app() -> Celery: """ """ - broker = os.environ.get("TOIL_WES_BROKER_URL", "amqp://guest:guest@localhost:5672//") + broker = os.environ.get( + "TOIL_WES_BROKER_URL", "amqp://guest:guest@localhost:5672//" + ) app = Celery("toil_wes", broker=broker) # Celery configurations diff --git a/src/toil/server/cli/wes_cwl_runner.py b/src/toil/server/cli/wes_cwl_runner.py index 77b1f30c24..55943e8884 100644 --- a/src/toil/server/cli/wes_cwl_runner.py +++ b/src/toil/server/cli/wes_cwl_runner.py @@ -5,8 +5,9 @@ import sys import time from base64 import b64encode +from collections.abc import Iterable from io import BytesIO -from typing import Any, Dict, Iterable, List, Optional, Tuple, cast +from typing import Any, Optional, cast from urllib.parse import urldefrag, urljoin, urlparse import requests @@ -56,7 +57,7 @@ logger = logging.getLogger(__name__) -def generate_attachment_path_names(paths: List[str]) -> Tuple[str, List[str]]: +def generate_attachment_path_names(paths: list[str]) -> tuple[str, list[str]]: """ Take in a list of path names and return a list of names with the common path name stripped out, while preserving the input order. This guarantees that @@ -105,7 +106,8 @@ class WESClientWithWorkflowEngineParameters(WESClient): # type: ignore TODO: Propose a PR in wes-service to include workflow_engine_params. """ - def __init__(self, endpoint: str, auth: Optional[Tuple[str, str]] = None) -> None: + + def __init__(self, endpoint: str, auth: Optional[tuple[str, str]] = None) -> None: """ :param endpoint: The http(s) URL of the WES server. Must include the protocol. @@ -113,12 +115,21 @@ def __init__(self, endpoint: str, auth: Optional[Tuple[str, str]] = None) -> Non request to the WES server. """ proto, host = endpoint.split("://") - super().__init__({ - # TODO: use the auth argument in requests.post so we don't need to encode it ourselves - "auth": {"Authorization": "Basic " + b64encode(f"{auth[0]}:{auth[1]}".encode()).decode("utf-8")} if auth else {}, - "proto": proto, - "host": host - }) + super().__init__( + { + # TODO: use the auth argument in requests.post so we don't need to encode it ourselves + "auth": ( + { + "Authorization": "Basic " + + b64encode(f"{auth[0]}:{auth[1]}".encode()).decode("utf-8") + } + if auth + else {} + ), + "proto": proto, + "host": host, + } + ) def get_version(self, extension: str, workflow_file: str) -> str: """Determines the version of a .py, .wdl, or .cwl file.""" @@ -140,7 +151,7 @@ def get_version(self, extension: str, workflow_file: str) -> str: else: raise RuntimeError(f"Invalid workflow extension: {extension}.") - def parse_params(self, workflow_params_file: str) -> Dict[str, Any]: + def parse_params(self, workflow_params_file: str) -> dict[str, Any]: """ Parse the CWL input file into a dictionary to be attached to the body of the WES run request. @@ -155,9 +166,11 @@ def parse_params(self, workflow_params_file: str) -> Dict[str, Any]: workflow_params: Any workflow_params, _ = loader.resolve_ref(workflow_params_file, checklinks=False) - return cast(Dict[str, Any], workflow_params) + return cast(dict[str, Any], workflow_params) - def modify_param_paths(self, base_dir: str, workflow_params: Dict[str, Any]) -> None: + def modify_param_paths( + self, base_dir: str, workflow_params: dict[str, Any] + ) -> None: """ Modify the file paths in the input workflow parameters to be relative to base_dir. @@ -168,7 +181,7 @@ def modify_param_paths(self, base_dir: str, workflow_params: Dict[str, Any]) -> :param workflow_params: A dict containing the workflow parameters. """ - def replace(field: str, file_obj: Dict[str, str]) -> None: + def replace(field: str, file_obj: dict[str, str]) -> None: """ Given a file object with the "location" or "path" field, replace it to be relative to base_dir. @@ -192,15 +205,16 @@ def replace_paths(obj: Any) -> None: replace_paths(file.values()) elif isinstance(file, list): replace_paths(file) + replace_paths(workflow_params.values()) def build_wes_request( - self, - workflow_file: str, - workflow_params_file: Optional[str], - attachments: Optional[List[str]], - workflow_engine_parameters: Optional[List[str]] = None - ) -> Tuple[Dict[str, str], Iterable[Tuple[str, Tuple[str, BytesIO]]]]: + self, + workflow_file: str, + workflow_params_file: Optional[str], + attachments: Optional[list[str]], + workflow_engine_parameters: Optional[list[str]] = None, + ) -> tuple[dict[str, str], Iterable[tuple[str, tuple[str, BytesIO]]]]: """ Build the workflow run request to submit to WES. @@ -233,21 +247,21 @@ def build_wes_request( workflow_type = wf_url.lower().split(".")[-1] # Grab the file extension workflow_type_version = self.get_version(workflow_type, wf_url) - data: Dict[str, str] = { + data: dict[str, str] = { "workflow_url": workflow_file, "workflow_params": "", # to be set after attachments are processed "workflow_type": workflow_type, - "workflow_type_version": workflow_type_version + "workflow_type_version": workflow_type_version, } # Convert engine arguments into a JSON object if workflow_engine_parameters: params = {} for param in workflow_engine_parameters: - if '=' not in param: # flags like "--logDebug" + if "=" not in param: # flags like "--logDebug" k, v = param, None else: - k, v = param.split('=', 1) + k, v = param.split("=", 1) params[k] = v data["workflow_engine_parameters"] = json.dumps(params) @@ -278,12 +292,12 @@ def build_wes_request( return data, [("workflow_attachment", val) for val in workflow_attachments] def run_with_engine_options( - self, - workflow_file: str, - workflow_params_file: Optional[str], - attachments: Optional[List[str]], - workflow_engine_parameters: Optional[List[str]] - ) -> Dict[str, Any]: + self, + workflow_file: str, + workflow_params_file: Optional[str], + attachments: Optional[list[str]], + workflow_engine_parameters: Optional[list[str]], + ) -> dict[str, Any]: """ Composes and sends a post request that signals the WES server to run a workflow. @@ -297,10 +311,9 @@ def run_with_engine_options( :return: The body of the post result as a dictionary. """ - data, files = self.build_wes_request(workflow_file, - workflow_params_file, - attachments, - workflow_engine_parameters) + data, files = self.build_wes_request( + workflow_file, workflow_params_file, attachments, workflow_engine_parameters + ) post_result = requests.post( urljoin(f"{self.proto}://{self.host}", "/ga4gh/wes/v1/runs"), data=data, @@ -308,10 +321,10 @@ def run_with_engine_options( headers=self.auth, ) - return cast(Dict[str, Any], wes_response(post_result)) + return cast(dict[str, Any], wes_response(post_result)) -def get_deps_from_cwltool(cwl_file: str, input_file: Optional[str] = None) -> List[str]: +def get_deps_from_cwltool(cwl_file: str, input_file: Optional[str] = None) -> list[str]: """ Return a list of dependencies of the given workflow from cwltool. @@ -320,19 +333,21 @@ def get_deps_from_cwltool(cwl_file: str, input_file: Optional[str] = None) -> Li this returns the dependencies from the input file. """ - option = '--print-input-deps' if input_file else '--print-deps' + option = "--print-input-deps" if input_file else "--print-deps" - args = ['cwltool', option, '--relative-deps', 'cwd', cwl_file] + args = ["cwltool", option, "--relative-deps", "cwd", cwl_file] if input_file: args.append(input_file) - p = subprocess.run(args=args, check=True, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) + p = subprocess.run( + args=args, check=True, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL + ) result = p.stdout.decode() if not result: return [] - json_result: Dict[str, Any] = json.loads(result) + json_result: dict[str, Any] = json.loads(result) deps = [] def get_deps(obj: Any) -> None: @@ -368,10 +383,12 @@ def get_deps(obj: Any) -> None: return deps -def submit_run(client: WESClientWithWorkflowEngineParameters, - cwl_file: str, - input_file: Optional[str] = None, - engine_options: Optional[List[str]] = None) -> str: +def submit_run( + client: WESClientWithWorkflowEngineParameters, + cwl_file: str, + input_file: Optional[str] = None, + engine_options: Optional[list[str]] = None, +) -> str: """ Given a CWL file, its input files, and an optional list of engine options, submit the CWL workflow to the WES server via the WES client. @@ -391,23 +408,32 @@ def submit_run(client: WESClientWithWorkflowEngineParameters, if input_file: attachments.extend(get_deps_from_cwltool(cwl_file, input_file)) - run_result: Dict[str, Any] = client.run_with_engine_options( + run_result: dict[str, Any] = client.run_with_engine_options( cwl_file, input_file, attachments=attachments, - workflow_engine_parameters=engine_options) + workflow_engine_parameters=engine_options, + ) return str(run_result["run_id"]) def poll_run(client: WESClientWithWorkflowEngineParameters, run_id: str) -> bool: - """ Return True if the given workflow run is in a finished state.""" + """Return True if the given workflow run is in a finished state.""" status_result = client.get_run_status(run_id) state = status_result.get("state") - return state in ("COMPLETE", "CANCELING", "CANCELED", "EXECUTOR_ERROR", "SYSTEM_ERROR") + return state in ( + "COMPLETE", + "CANCELING", + "CANCELED", + "EXECUTOR_ERROR", + "SYSTEM_ERROR", + ) -def print_logs_and_exit(client: WESClientWithWorkflowEngineParameters, run_id: str) -> None: +def print_logs_and_exit( + client: WESClientWithWorkflowEngineParameters, run_id: str +) -> None: """ Fetch the workflow logs from the WES server, print the results, then exit the program with the same exit code as the workflow run. @@ -431,9 +457,11 @@ def main() -> None: parser.add_argument("cwl_file", type=str) parser.add_argument("input_file", type=str, nargs="?", default=None) # arguments used by the WES runner - parser.add_argument("--wes_endpoint", - default=os.environ.get("TOIL_WES_ENDPOINT", "http://localhost:8080"), - help="The http(s) URL of the WES server. (default: %(default)s)") + parser.add_argument( + "--wes_endpoint", + default=os.environ.get("TOIL_WES_ENDPOINT", "http://localhost:8080"), + help="The http(s) URL of the WES server. (default: %(default)s)", + ) # the rest of the arguments are passed as engine options to the WES server options, rest = parser.parse_known_args() @@ -449,7 +477,8 @@ def main() -> None: client = WESClientWithWorkflowEngineParameters( endpoint=endpoint, - auth=(wes_user, wes_password) if wes_user and wes_password else None) + auth=(wes_user, wes_password) if wes_user and wes_password else None, + ) run_id = submit_run(client, cwl_file, input_file, engine_options=rest) assert run_id @@ -462,5 +491,5 @@ def main() -> None: print_logs_and_exit(client, run_id) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/toil/server/utils.py b/src/toil/server/utils.py index d61f48ae25..9fb43f37a2 100644 --- a/src/toil/server/utils.py +++ b/src/toil/server/utils.py @@ -16,7 +16,7 @@ import os from abc import abstractmethod from datetime import datetime -from typing import Dict, Optional, Tuple +from typing import Optional from urllib.parse import urlparse import requests @@ -28,12 +28,14 @@ from toil.lib.aws import get_current_aws_region from toil.lib.aws.session import client from toil.lib.aws.utils import retry_s3 + HAVE_S3 = True except ImportError: HAVE_S3 = False logger = logging.getLogger(__name__) + def get_iso_time() -> str: """ Return the current time in ISO 8601 format. @@ -55,7 +57,9 @@ def link_file(src: str, dest: str) -> None: os.symlink(src, dest) -def download_file_from_internet(src: str, dest: str, content_type: Optional[str] = None) -> None: +def download_file_from_internet( + src: str, dest: str, content_type: Optional[str] = None +) -> None: """ Download a file from the Internet and write it to dest. """ @@ -64,14 +68,19 @@ def download_file_from_internet(src: str, dest: str, content_type: Optional[str] if not response.ok: raise RuntimeError("Request failed with a client error or a server error.") - if content_type and not response.headers.get("Content-Type", "").startswith(content_type): + if content_type and not response.headers.get("Content-Type", "").startswith( + content_type + ): val = response.headers.get("Content-Type") raise RuntimeError(f"Expected content type to be '{content_type}'. Not {val}.") with open(dest, "wb") as f: f.write(response.content) -def download_file_from_s3(src: str, dest: str, content_type: Optional[str] = None) -> None: + +def download_file_from_s3( + src: str, dest: str, content_type: Optional[str] = None +) -> None: """ Download a file from Amazon S3 and write it to dest. """ @@ -81,10 +90,11 @@ def download_file_from_s3(src: str, dest: str, content_type: Optional[str] = Non except ImportError: raise RuntimeError("Cannot access S3 as AWS modules are not available") - with open(dest, 'wb') as out_stream: + with open(dest, "wb") as out_stream: obj = get_object_for_url(urlparse(src), existing=True) obj.download_fileobj(out_stream) + def get_file_class(path: str) -> str: """ Return the type of the file as a human readable string. @@ -97,6 +107,7 @@ def get_file_class(path: str) -> str: return "Directory" return "Unknown" + @retry(errors=[OSError, BlockingIOError]) def safe_read_file(file: str) -> Optional[str]: """ @@ -153,6 +164,7 @@ def safe_write_file(file: str, s: str) -> None: with open(temp_name, "w") as file_obj: file_obj.write(s) + class MemoryStateCache: """ An in-memory place to store workflow state. @@ -164,7 +176,7 @@ def __init__(self) -> None: """ super().__init__() - self._data: Dict[Tuple[str, str], Optional[str]] = {} + self._data: dict[tuple[str, str], Optional[str]] = {} def get(self, workflow_id: str, key: str) -> Optional[str]: """ @@ -185,6 +197,7 @@ def set(self, workflow_id: str, key: str, value: Optional[str]) -> None: else: self._data[(workflow_id, key)] = value + class AbstractStateStore: """ A place for the WES server to keep its state: the set of workflows that @@ -250,6 +263,7 @@ def write_cache(self, workflow_id: str, key: str, value: Optional[str]) -> None: """ self._cache.set(workflow_id, key, value) + class MemoryStateStore(MemoryStateCache, AbstractStateStore): """ An in-memory place to store workflow state, for testing. @@ -261,6 +275,7 @@ class MemoryStateStore(MemoryStateCache, AbstractStateStore): def __init__(self): super().__init__() + class FileStateStore(AbstractStateStore): """ A place to store workflow state that uses a POSIX-compatible file system. @@ -275,7 +290,7 @@ def __init__(self, url: str) -> None: """ super().__init__() parse = urlparse(url) - if parse.scheme.lower() not in ['file', '']: + if parse.scheme.lower() not in ["file", ""]: # We want to catch if we get the wrong argument. raise RuntimeError(f"{url} doesn't look like a local path") if not os.path.exists(parse.path): @@ -309,7 +324,9 @@ def set(self, workflow_id: str, key: str, value: Optional[str]) -> None: # Set the value in the file safe_write_file(file_path, value) + if HAVE_S3: + class S3StateStore(AbstractStateStore): """ A place to store workflow state that uses an S3-compatible object store. @@ -327,7 +344,7 @@ def __init__(self, url: str) -> None: parse = urlparse(url) - if parse.scheme.lower() != 's3': + if parse.scheme.lower() != "s3": # We want to catch if we get the wrong argument. raise RuntimeError(f"{url} doesn't look like an S3 URL") @@ -335,12 +352,14 @@ def __init__(self, url: str) -> None: # urlparse keeps the leading '/', but here we want a path in the # bucket without a leading '/'. We also need to support an empty # path. - self._base_path = parse.path[1:] if parse.path.startswith('/') else parse.path - self._client = client('s3', region_name=get_current_aws_region()) + self._base_path = ( + parse.path[1:] if parse.path.startswith("/") else parse.path + ) + self._client = client("s3", region_name=get_current_aws_region()) logger.debug("Connected to S3StateStore at %s", url) - def _get_bucket_and_path(self, workflow_id: str, key: str) -> Tuple[str, str]: + def _get_bucket_and_path(self, workflow_id: str, key: str) -> tuple[str, str]: """ Get the bucket and path in the bucket at which a key value belongs. """ @@ -354,13 +373,12 @@ def get(self, workflow_id: str, key: str) -> Optional[str]: bucket, path = self._get_bucket_and_path(workflow_id, key) for attempt in retry_s3(): try: - logger.debug('Fetch %s path %s', bucket, path) + logger.debug("Fetch %s path %s", bucket, path) response = self._client.get_object(Bucket=bucket, Key=path) - return response['Body'].read().decode('utf-8') + return response["Body"].read().decode("utf-8") except self._client.exceptions.NoSuchKey: return None - def set(self, workflow_id: str, key: str, value: Optional[str]) -> None: """ Set or clear a key value on S3. @@ -369,18 +387,21 @@ def set(self, workflow_id: str, key: str, value: Optional[str]) -> None: for attempt in retry_s3(): if value is None: # Get rid of it. - logger.debug('Clear %s path %s', bucket, path) + logger.debug("Clear %s path %s", bucket, path) self._client.delete_object(Bucket=bucket, Key=path) return else: # Store it, clobbering anything there already. - logger.debug('Set %s path %s', bucket, path) - self._client.put_object(Bucket=bucket, Key=path, - Body=value.encode('utf-8')) + logger.debug("Set %s path %s", bucket, path) + self._client.put_object( + Bucket=bucket, Key=path, Body=value.encode("utf-8") + ) return + # We want to memoize state stores so we can cache on them. -state_store_cache: Dict[str, AbstractStateStore] = {} +state_store_cache: dict[str, AbstractStateStore] = {} + def connect_to_state_store(url: str) -> AbstractStateStore: """ @@ -392,25 +413,30 @@ def connect_to_state_store(url: str) -> AbstractStateStore: if url not in state_store_cache: # We need to actually make the state store parse = urlparse(url) - if parse.scheme.lower() == 's3': + if parse.scheme.lower() == "s3": # It's an S3 URL if HAVE_S3: # And we can use S3, so make the right implementation for S3. state_store_cache[url] = S3StateStore(url) else: # We can't actually use S3, so complain. - raise RuntimeError(f'Cannot connect to {url} because Toil AWS ' - f'dependencies are not available. Did you ' - f'install Toil with the [aws] extra?') - elif parse.scheme.lower() in ['file', '']: + raise RuntimeError( + f"Cannot connect to {url} because Toil AWS " + f"dependencies are not available. Did you " + f"install Toil with the [aws] extra?" + ) + elif parse.scheme.lower() in ["file", ""]: # It's a file URL or path state_store_cache[url] = FileStateStore(url) else: - raise RuntimeError(f'Cannot connect to {url} because we do not ' - f'implement its URL scheme') + raise RuntimeError( + f"Cannot connect to {url} because we do not " + f"implement its URL scheme" + ) return state_store_cache[url] + class WorkflowStateStore: """ Slice of a state store for the state of a particular workflow. @@ -463,6 +489,7 @@ def connect_to_workflow_state_store(url: str, workflow_id: str) -> WorkflowState return WorkflowStateStore(connect_to_state_store(url), workflow_id) + # When we see one of these terminal states, we stay there forever. TERMINAL_STATES = {"COMPLETE", "EXECUTOR_ERROR", "SYSTEM_ERROR", "CANCELED"} @@ -470,6 +497,7 @@ def connect_to_workflow_state_store(url: str, workflow_id: str) -> WorkflowState # workflow running task is gone and move it to CANCELED? MAX_CANCELING_SECONDS = 30 + class WorkflowStateMachine: """ Class for managing the WES workflow state machine. @@ -628,5 +656,3 @@ def get_current_state(self) -> str: state = "UNKNOWN" return state - - diff --git a/src/toil/server/wes/abstract_backend.py b/src/toil/server/wes/abstract_backend.py index 32abff6e3b..3502fc63cf 100644 --- a/src/toil/server/wes/abstract_backend.py +++ b/src/toil/server/wes/abstract_backend.py @@ -4,7 +4,7 @@ import logging import os from abc import abstractmethod -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Optional, Union from urllib.parse import urldefrag import connexion # type: ignore @@ -16,18 +16,25 @@ # Define a type for WES task log entries in responses # TODO: make this a typed dict with all the WES task log field names and their types. -TaskLog = Dict[str, Union[str, int, None]] +TaskLog = dict[str, Union[str, int, None]] class VersionNotImplementedException(Exception): """ Raised when the requested workflow version is not implemented. """ - def __init__(self, - wf_type: str, version: Optional[str] = None, supported_versions: Optional[List[str]] = None) -> None: + + def __init__( + self, + wf_type: str, + version: Optional[str] = None, + supported_versions: Optional[list[str]] = None, + ) -> None: if version: - message = ("workflow_type '{}' requires 'workflow_type_version' to be one of '{}'. " - "Got '{}' instead.".format(wf_type, str(supported_versions), version)) + message = ( + "workflow_type '{}' requires 'workflow_type_version' to be one of '{}'. " + "Got '{}' instead.".format(wf_type, str(supported_versions), version) + ) else: message = f"workflow_type '{wf_type}' is not supported." @@ -38,6 +45,7 @@ class MalformedRequestException(Exception): """ Raised when the request is malformed. """ + def __init__(self, message: str) -> None: super().__init__(message) @@ -46,6 +54,7 @@ class WorkflowNotFoundException(Exception): """ Raised when the requested run ID is not found. """ + def __init__(self) -> None: super().__init__("The requested workflow run wasn't found.") @@ -54,6 +63,7 @@ class WorkflowConflictException(Exception): """ Raised when the requested workflow is not in the expected state. """ + def __init__(self, run_id: str): super().__init__(f"Workflow {run_id} exists when it shouldn't.") @@ -62,6 +72,7 @@ class OperationForbidden(Exception): """ Raised when the request is forbidden. """ + def __init__(self, message: str) -> None: super().__init__(message) @@ -70,6 +81,7 @@ class WorkflowExecutionException(Exception): """ Raised when an internal error occurred during the execution of the workflow. """ + def __init__(self, message: str) -> None: super().__init__(message) @@ -81,8 +93,10 @@ def handle_errors(func: Callable[..., Any]) -> Callable[..., Any]: GA4GH WES spec. """ - def error(msg: Any, code: int = 500) -> Tuple[Dict[str, Any], int]: - logger.warning(f"Exception raised when calling '{func.__name__}()':", exc_info=True) + def error(msg: Any, code: int = 500) -> tuple[dict[str, Any], int]: + logger.warning( + f"Exception raised when calling '{func.__name__}()':", exc_info=True + ) return {"msg": str(msg), "status_code": code}, code @functools.wraps(func) @@ -114,7 +128,7 @@ class WESBackend: to handle user requests when they hit different endpoints. """ - def __init__(self, options: List[str]): + def __init__(self, options: list[str]): """ :param options: A list of default engine options to use when executing a workflow. Example options: @@ -135,7 +149,7 @@ def resolve_operation_id(self, operation_id: str) -> Any: return getattr(self, operation_id.split(".")[-1]) @abstractmethod - def get_service_info(self) -> Dict[str, Any]: + def get_service_info(self) -> dict[str, Any]: """ Get information about the Workflow Execution Service. @@ -144,7 +158,9 @@ def get_service_info(self) -> Dict[str, Any]: raise NotImplementedError @abstractmethod - def list_runs(self, page_size: Optional[int] = None, page_token: Optional[str] = None) -> Dict[str, Any]: + def list_runs( + self, page_size: Optional[int] = None, page_token: Optional[str] = None + ) -> dict[str, Any]: """ List the workflow runs. @@ -153,7 +169,7 @@ def list_runs(self, page_size: Optional[int] = None, page_token: Optional[str] = raise NotImplementedError @abstractmethod - def run_workflow(self) -> Dict[str, str]: + def run_workflow(self) -> dict[str, str]: """ Run a workflow. This endpoint creates a new workflow run and returns a `RunId` to monitor its progress. @@ -163,7 +179,7 @@ def run_workflow(self) -> Dict[str, str]: raise NotImplementedError @abstractmethod - def get_run_log(self, run_id: str) -> Dict[str, Any]: + def get_run_log(self, run_id: str) -> dict[str, Any]: """ Get detailed info about a workflow run. @@ -172,7 +188,7 @@ def get_run_log(self, run_id: str) -> Dict[str, Any]: raise NotImplementedError @abstractmethod - def cancel_run(self, run_id: str) -> Dict[str, str]: + def cancel_run(self, run_id: str) -> dict[str, str]: """ Cancel a running workflow. @@ -181,7 +197,7 @@ def cancel_run(self, run_id: str) -> Dict[str, str]: raise NotImplementedError @abstractmethod - def get_run_status(self, run_id: str) -> Dict[str, str]: + def get_run_status(self, run_id: str) -> dict[str, str]: """ Get quick status info about a workflow run, returning a simple result with the overall state of the workflow run. @@ -199,9 +215,17 @@ def log_for_run(run_id: Optional[str], message: str) -> None: @staticmethod def secure_path(path: str) -> str: - return os.path.join(*[str(secure_filename(p)) for p in path.split("/") if p not in ("", ".", "..")]) - - def collect_attachments(self, run_id: Optional[str], temp_dir: Optional[str]) -> Tuple[str, Dict[str, Any]]: + return os.path.join( + *[ + str(secure_filename(p)) + for p in path.split("/") + if p not in ("", ".", "..") + ] + ) + + def collect_attachments( + self, run_id: Optional[str], temp_dir: Optional[str] + ) -> tuple[str, dict[str, Any]]: """ Collect attachments from the current request by staging uploaded files to temp_dir, and return the temp_dir and parsed body of the request. @@ -212,7 +236,7 @@ def collect_attachments(self, run_id: Optional[str], temp_dir: Optional[str]) -> """ if not temp_dir: temp_dir = mkdtemp() - body: Dict[str, Any] = {} + body: dict[str, Any] = {} has_attachments = False for key, ls in connexion.request.files.lists(): try: @@ -223,12 +247,20 @@ def collect_attachments(self, run_id: Optional[str], temp_dir: Optional[str]) -> dest = os.path.join(temp_dir, self.secure_path(value.filename)) if not os.path.isdir(os.path.dirname(dest)): os.makedirs(os.path.dirname(dest)) - self.log_for_run(run_id, f"Staging attachment '{value.filename}' to '{dest}'") + self.log_for_run( + run_id, f"Staging attachment '{value.filename}' to '{dest}'" + ) value.save(dest) has_attachments = True - body[key] = f"file://{temp_dir}" # Reference to temp working dir. - - elif key in ("workflow_params", "tags", "workflow_engine_parameters"): + body[key] = ( + f"file://{temp_dir}" # Reference to temp working dir. + ) + + elif key in ( + "workflow_params", + "tags", + "workflow_engine_parameters", + ): content = value.read() body[key] = json.loads(content.decode("utf-8")) else: @@ -252,17 +284,23 @@ def collect_attachments(self, run_id: Optional[str], temp_dir: Optional[str]) -> url, ref = urldefrag(body["workflow_url"]) if ":" not in url: if not has_attachments: - raise MalformedRequestException("Relative 'workflow_url' but missing 'workflow_attachment'") + raise MalformedRequestException( + "Relative 'workflow_url' but missing 'workflow_attachment'" + ) body["workflow_url"] = self.secure_path(url) # keep this relative if ref: # append "#ref" after the url body["workflow_url"] += "#" + self.secure_path(ref) - self.log_for_run(run_id, "Using workflow_url '%s'" % body.get("workflow_url")) + self.log_for_run( + run_id, "Using workflow_url '%s'" % body.get("workflow_url") + ) else: raise MalformedRequestException("Missing 'workflow_url' in submission") if "workflow_params" in body and not isinstance(body["workflow_params"], dict): # They sent us something silly like "workflow_params": "5" - raise MalformedRequestException("Got a 'workflow_params' which does not decode to a JSON object") + raise MalformedRequestException( + "Got a 'workflow_params' which does not decode to a JSON object" + ) return temp_dir, body diff --git a/src/toil/server/wes/amazon_wes_utils.py b/src/toil/server/wes/amazon_wes_utils.py index 7ea390223a..f05c8b84d8 100644 --- a/src/toil/server/wes/amazon_wes_utils.py +++ b/src/toil/server/wes/amazon_wes_utils.py @@ -20,21 +20,15 @@ import json import logging -import sys import zipfile from os import path -from typing import IO, List, Optional, cast - -if sys.version_info >= (3, 8): - from typing import TypedDict -else: - from typing_extensions import TypedDict - +from typing import IO, Optional, TypedDict, cast from urllib.parse import ParseResult, urlparse from toil.bus import JobStatus -from toil.server.wes.abstract_backend import \ - MalformedRequestException as InvalidRequestError +from toil.server.wes.abstract_backend import ( + MalformedRequestException as InvalidRequestError, +) from toil.server.wes.abstract_backend import TaskLog logger = logging.getLogger(__name__) @@ -53,20 +47,25 @@ # The official spec we are working with here is: https://aws.github.io/amazon-genomics-cli/docs/concepts/workflows/#multi-file-workflows + class WorkflowPlan(TypedDict): """ These functions pass around dicts of a certain type, with `data` and `files` keys. """ + data: "DataDict" files: "FilesDict" + class DataDict(TypedDict, total=False): """ Under `data`, there can be: * `workflowUrl` (required if no `workflowSource`): URL to main workflow code. """ + workflowUrl: str + class FilesDict(TypedDict, total=False): """ Under `files`, there can be: @@ -75,11 +74,13 @@ class FilesDict(TypedDict, total=False): * `workflowOptions`: Open binary-mode file for a JSON of options sent along with the workflow. * `workflowDependencies`: Open binary-mode file for the zip the workflow came in, if any. """ + workflowSource: IO[bytes] - workflowInputFiles: List[IO[bytes]] + workflowInputFiles: list[IO[bytes]] workflowOptions: IO[bytes] workflowDependencies: IO[bytes] + def parse_workflow_zip_file(file: str, workflow_type: str) -> WorkflowPlan: r""" Processes a workflow zip bundle @@ -163,9 +164,9 @@ def parse_workflow_manifest_file(manifest_file: str) -> WorkflowPlan: :rtype: dict of `data` and `files` MANIFEST.json is expected to be formatted like: - + .. code-block:: json - + { "mainWorkflowURL": "relpath/to/workflow", "inputFileURLs": [ @@ -235,7 +236,9 @@ def parse_workflow_manifest_file(manifest_file: str) -> WorkflowPlan: return {"data": data, "files": files} -def workflow_manifest_url_to_path(url: ParseResult, parent_dir: Optional[str] = None) -> str: +def workflow_manifest_url_to_path( + url: ParseResult, parent_dir: Optional[str] = None +) -> str: """ Interpret a possibly-relative parsed URL, relative to the given parent directory. """ @@ -244,6 +247,7 @@ def workflow_manifest_url_to_path(url: ParseResult, parent_dir: Optional[str] = return path.join(parent_dir, relpath) return relpath + # This one is all UCSC code def task_filter(task: TaskLog, job_status: JobStatus) -> Optional[TaskLog]: """ @@ -264,6 +268,8 @@ def task_filter(task: TaskLog, job_status: JobStatus) -> Optional[TaskLog]: modified_task = dict(task) # Tack the batch ID onto the end of the name with the required separator - modified_task["name"] = "|".join([cast(str, modified_task.get("name", "")), batch_id]) + modified_task["name"] = "|".join( + [cast(str, modified_task.get("name", "")), batch_id] + ) logger.info("Transformed task %s to %s", task, modified_task) return modified_task diff --git a/src/toil/server/wes/tasks.py b/src/toil/server/wes/tasks.py index 169e897a52..872a97edc5 100644 --- a/src/toil/server/wes/tasks.py +++ b/src/toil/server/wes/tasks.py @@ -20,7 +20,7 @@ import sys import tempfile import zipfile -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Optional, Union from urllib.parse import urldefrag from celery.exceptions import SoftTimeLimitExceeded # type: ignore @@ -29,13 +29,15 @@ from toil.common import Toil from toil.jobStores.utils import generate_locator from toil.server.celery_app import celery -from toil.server.utils import (WorkflowStateMachine, - connect_to_workflow_state_store, - download_file_from_internet, - download_file_from_s3, - get_file_class, - get_iso_time, - link_file) +from toil.server.utils import ( + WorkflowStateMachine, + connect_to_workflow_state_store, + download_file_from_internet, + download_file_from_s3, + get_file_class, + get_iso_time, + link_file, +) logger = logging.getLogger(__name__) @@ -45,6 +47,7 @@ # our patience for CANCELING WES workflows to time out to CANCELED. WAIT_FOR_DEATH_TIMEOUT = 20 + class ToilWorkflowRunner: """ A class to represent a workflow runner to run the requested workflow. @@ -53,7 +56,14 @@ class ToilWorkflowRunner: that command, and collecting the outputs of the resulting workflow run. """ - def __init__(self, base_scratch_dir: str, state_store_url: str, workflow_id: str, request: Dict[str, Any], engine_options: List[str]): + def __init__( + self, + base_scratch_dir: str, + state_store_url: str, + workflow_id: str, + request: dict[str, Any], + engine_options: list[str], + ): """ Make a new ToilWorkflowRunner to actually run a workflow leader based on a WES request. @@ -85,8 +95,11 @@ def __init__(self, base_scratch_dir: str, state_store_url: str, workflow_id: str self.out_dir = os.path.join(self.scratch_dir, "outputs") # Compose the right kind of job store to use it the user doesn't specify one. - default_type = os.getenv('TOIL_WES_JOB_STORE_TYPE', 'file') - self.default_job_store = generate_locator(default_type, local_suggestion=os.path.join(self.scratch_dir, "toil_job_store")) + default_type = os.getenv("TOIL_WES_JOB_STORE_TYPE", "file") + self.default_job_store = generate_locator( + default_type, + local_suggestion=os.path.join(self.scratch_dir, "toil_job_store"), + ) self.job_store = self.default_job_store @@ -112,7 +125,9 @@ def write_workflow(self, src_url: str) -> str: logger.info("Linking workflow from filesystem.") link_file(src=src_url[7:], dest=dest) elif src_url.startswith(("http://", "https://")): - logger.info(f"Downloading workflow_url from the Internet. src: {src_url} dst: {dest}") + logger.info( + f"Downloading workflow_url from the Internet. src: {src_url} dst: {dest}" + ) download_file_from_internet(src=src_url, dest=dest, content_type="text/") elif src_url.startswith("s3://"): logger.info("Downloading workflow_url from Amazon S3.") @@ -127,7 +142,9 @@ def write_workflow(self, src_url: str) -> str: return dest - def sort_options(self, workflow_engine_parameters: Optional[Dict[str, Optional[str]]] = None) -> List[str]: + def sort_options( + self, workflow_engine_parameters: Optional[dict[str, Optional[str]]] = None + ) -> list[str]: """ Sort the command line arguments in the order that can be recognized by the workflow execution engine. @@ -188,7 +205,7 @@ def sort_options(self, workflow_engine_parameters: Optional[Dict[str, Optional[s return options - def initialize_run(self) -> List[str]: + def initialize_run(self) -> list[str]: """ Write workflow and input files and construct a list of shell commands to be executed. Return that list of shell commands that should be @@ -204,56 +221,69 @@ def initialize_run(self) -> List[str]: # Obtain main workflow file to a path (no-scheme file URL). workflow_url = self.write_workflow(src_url=self.request["workflow_url"]) - if os.path.basename(workflow_url) == "workflow.zip" and zipfile.is_zipfile(workflow_url): + if os.path.basename(workflow_url) == "workflow.zip" and zipfile.is_zipfile( + workflow_url + ): # We've been sent a zip file. We should interpret this as an Amazon Genomics CLI-style zip file. # Extract everything next to the zip and find and open relvant files. logger.info("Extracting and parsing Amazon-style workflow bundle...") - zip_info = amazon_wes_utils.parse_workflow_zip_file(workflow_url, self.wf_type) + zip_info = amazon_wes_utils.parse_workflow_zip_file( + workflow_url, self.wf_type + ) # Now parse Amazon's internal format into our own. # Find the real workflow source for the entrypoint file - if 'workflowSource' in zip_info['files']: + if "workflowSource" in zip_info["files"]: # They sent a file, which has been opened, so grab its path - workflow_url = zip_info['files']['workflowSource'].name + workflow_url = zip_info["files"]["workflowSource"].name logger.info("Workflow source file: '%s'", workflow_url) - elif 'workflowUrl' in zip_info['data']: + elif "workflowUrl" in zip_info["data"]: # They are pointing to another URL. # TODO: What does Amazon expect this to mean? Are we supposed to recurse? # For now just forward it. - workflow_url = zip_info['data']['workflowUrl'] + workflow_url = zip_info["data"]["workflowUrl"] logger.info("Workflow reference URL: '%s'", workflow_url) else: # The parser is supposed to throw so we can't get here - raise RuntimeError("Parser could not find workflow source or URL in zip") - - if 'workflowInputFiles' in zip_info['files'] and len(zip_info['files']['workflowInputFiles']) > 0: + raise RuntimeError( + "Parser could not find workflow source or URL in zip" + ) + + if ( + "workflowInputFiles" in zip_info["files"] + and len(zip_info["files"]["workflowInputFiles"]) > 0 + ): # The bundle contains a list of input files. # We interpret these as JSON, and layer them on top of each # other, and then apply workflow_params from the request. - logger.info("Workflow came with %d bundled inputs files; coalescing final parameters", - len(zip_info['files']['workflowInputFiles'])) + logger.info( + "Workflow came with %d bundled inputs files; coalescing final parameters", + len(zip_info["files"]["workflowInputFiles"]), + ) coalesced_parameters = {} - for binary_file in zip_info['files']['workflowInputFiles']: + for binary_file in zip_info["files"]["workflowInputFiles"]: try: # Read each input file as a JSON loaded_parameters = json.load(binary_file) except json.JSONDecodeError as e: - raise RuntimeError(f"Could not parse inputs JSON {os.path.basename(binary_file.name)}: {e}") + raise RuntimeError( + f"Could not parse inputs JSON {os.path.basename(binary_file.name)}: {e}" + ) # And merge them together in order coalesced_parameters.update(loaded_parameters) # Then apply and replace the parameters that came with the request coalesced_parameters.update(workflow_params) workflow_params = coalesced_parameters - if 'workflowOptions' in zip_info['files']: + if "workflowOptions" in zip_info["files"]: # The bundle contains an options JSON. We interpret these as # defaults for workflow_engine_parameters. logger.info(f"Workflow came with bundled options JSON") try: # Read as a JSON - loaded_options = json.load(zip_info['files']['workflowOptions']) + loaded_options = json.load(zip_info["files"]["workflowOptions"]) except json.JSONDecodeError as e: raise RuntimeError(f"Could not parse options JSON: {e}") # Apply and replace the engine parameters that came with the @@ -275,13 +305,9 @@ def initialize_run(self) -> List[str]: # construct and return the command to run if self.wf_type == "cwl": - command_args = ( - ["toil-cwl-runner"] + options + [workflow_url, input_json] - ) + command_args = ["toil-cwl-runner"] + options + [workflow_url, input_json] elif self.wf_type == "wdl": - command_args = ( - ["toil-wdl-runner"] + options + [workflow_url, input_json] - ) + command_args = ["toil-wdl-runner"] + options + [workflow_url, input_json] elif self.wf_type == "py": command_args = ["python", workflow_url] + options else: @@ -290,7 +316,9 @@ def initialize_run(self) -> List[str]: return command_args - def call_cmd(self, cmd: Union[List[str], str], cwd: str) -> "subprocess.Popen[bytes]": + def call_cmd( + self, cmd: Union[list[str], str], cwd: str + ) -> "subprocess.Popen[bytes]": """ Calls a command with Popen. Writes stdout, stderr, and the command to separate files. @@ -300,7 +328,9 @@ def call_cmd(self, cmd: Union[List[str], str], cwd: str) -> "subprocess.Popen[by with open(stdout_f, "w") as stdout, open(stderr_f, "w") as stderr: logger.info(f"Calling: '{' '.join(cmd)}'") - process = subprocess.Popen(cmd, stdout=stdout, stderr=stderr, close_fds=True, cwd=cwd) + process = subprocess.Popen( + cmd, stdout=stdout, stderr=stderr, close_fds=True, cwd=cwd + ) return process @@ -348,7 +378,7 @@ def run(self) -> None: self.store.set("end_time", get_iso_time()) self.store.set("exit_code", str(exit_code)) - logger.info('Toil child finished with code %s', exit_code) + logger.info("Toil child finished with code %s", exit_code) if exit_code == 0: self.state_machine.send_complete() @@ -387,7 +417,14 @@ def write_output_files(self) -> None: self.write_scratch_file("outputs.json", json.dumps(output_obj)) -def run_wes_task(base_scratch_dir: str, state_store_url: str, workflow_id: str, request: Dict[str, Any], engine_options: List[str]) -> str: + +def run_wes_task( + base_scratch_dir: str, + state_store_url: str, + workflow_id: str, + request: dict[str, Any], + engine_options: list[str], +) -> str: """ Run a requested workflow. @@ -402,8 +439,13 @@ def run_wes_task(base_scratch_dir: str, state_store_url: str, workflow_id: str, logger.info("Starting WES workflow") - runner = ToilWorkflowRunner(base_scratch_dir, state_store_url, workflow_id, - request=request, engine_options=engine_options) + runner = ToilWorkflowRunner( + base_scratch_dir, + state_store_url, + workflow_id, + request=request, + engine_options=engine_options, + ) try: runner.run() @@ -416,24 +458,27 @@ def run_wes_task(base_scratch_dir: str, state_store_url: str, workflow_id: str, runner.write_output_files() except (KeyboardInterrupt, SystemExit, SoftTimeLimitExceeded): # We canceled the workflow run - logger.info('Canceling the workflow') + logger.info("Canceling the workflow") runner.state_machine.send_canceled() except Exception: # The workflow run broke. We still count as the executor here. - logger.exception('Running Toil produced an exception.') + logger.exception("Running Toil produced an exception.") runner.state_machine.send_executor_error() raise return runner.get_state() + # Wrap the task function as a Celery task run_wes = celery.task(name="run_wes")(run_wes_task) + def cancel_run(task_id: str) -> None: """ Send a SIGTERM signal to the process that is running task_id. """ - celery.control.terminate(task_id, signal='SIGUSR1') + celery.control.terminate(task_id, signal="SIGUSR1") + class TaskRunner: """ @@ -443,13 +488,13 @@ class TaskRunner: """ @staticmethod - def run(args: Tuple[str, str, str, Dict[str, Any], List[str]], task_id: str) -> None: + def run( + args: tuple[str, str, str, dict[str, Any], list[str]], task_id: str + ) -> None: """ Run the given task args with the given ID on Celery. """ - run_wes.apply_async(args=args, - task_id=task_id, - ignore_result=True) + run_wes.apply_async(args=args, task_id=task_id, ignore_result=True) @staticmethod def cancel(task_id: str) -> None: @@ -467,8 +512,10 @@ def is_ok(task_id: str) -> bool: # Nothing to do for Celery return True + # If Celery can't be set up, we can just use this fake version instead. + class MultiprocessingTaskRunner(TaskRunner): """ Version of TaskRunner that just runs tasks with Multiprocessing. @@ -478,11 +525,13 @@ class MultiprocessingTaskRunner(TaskRunner): ToilWorkflowRunner) don't poll for it. """ - _id_to_process: Dict[str, multiprocessing.Process] = {} - _id_to_log: Dict[str, str] = {} + _id_to_process: dict[str, multiprocessing.Process] = {} + _id_to_log: dict[str, str] = {} @staticmethod - def set_up_and_run_task(output_path: str, args: Tuple[str, str, str, Dict[str, Any], List[str]]) -> None: + def set_up_and_run_task( + output_path: str, args: tuple[str, str, str, dict[str, Any], list[str]] + ) -> None: """ Set up logging for the process into the given file and then call run_wes_task with the given arguments. @@ -499,8 +548,8 @@ def set_up_and_run_task(output_path: str, args: Tuple[str, str, str, Dict[str, A # that we were told about, so the server can come get the log if we # unexpectedly die. - output_file = open(output_path, 'w') - output_file.write('Initializing task log\n') + output_file = open(output_path, "w") + output_file.write("Initializing task log\n") output_file.flush() # Take over logging. @@ -523,22 +572,24 @@ def handle_sigterm(_: Any, __: Any) -> None: signal.signal(signal.SIGTERM, handle_sigterm) try: - logger.info('Running task') + logger.info("Running task") output_file.flush() run_wes_task(*args) except Exception: - logger.exception('Exception in task!') + logger.exception("Exception in task!") raise else: # If the task does not crash, clean up the log os.unlink(output_path) finally: - logger.debug('Finishing task log') + logger.debug("Finishing task log") output_file.flush() output_file.close() @classmethod - def run(cls, args: Tuple[str, str, str, Dict[str, Any], List[str]], task_id: str) -> None: + def run( + cls, args: tuple[str, str, str, dict[str, Any], list[str]], task_id: str + ) -> None: """ Run the given task args with the given ID. """ @@ -549,9 +600,13 @@ def run(cls, args: Tuple[str, str, str, Dict[str, Any], List[str]], task_id: str # Store the log filename before the process, like is_ok() expects. cls._id_to_log[task_id] = path - logger.info("Starting task %s in a process that should log to %s", task_id, path) + logger.info( + "Starting task %s in a process that should log to %s", task_id, path + ) - cls._id_to_process[task_id] = multiprocessing.Process(target=cls.set_up_and_run_task, args=(path, args)) + cls._id_to_process[task_id] = multiprocessing.Process( + target=cls.set_up_and_run_task, args=(path, args) + ) cls._id_to_process[task_id].start() @classmethod @@ -581,9 +636,14 @@ def is_ok(cls, task_id: str) -> bool: # being canceled by cancel(), then it is OK. ACCEPTABLE_EXIT_CODES = [0, -signal.SIGTERM] - if process.exitcode is not None and process.exitcode not in ACCEPTABLE_EXIT_CODES: + if ( + process.exitcode is not None + and process.exitcode not in ACCEPTABLE_EXIT_CODES + ): # Something went wring in the task and it couldn't handle it. - logger.error("Process for running %s failed with code %s", task_id, process.exitcode) + logger.error( + "Process for running %s failed with code %s", task_id, process.exitcode + ) try: for line in open(cls._id_to_log[task_id]): # Dump the task log @@ -596,5 +656,3 @@ def is_ok(cls, task_id: str) -> bool: return False return True - - diff --git a/src/toil/server/wes/toil_backend.py b/src/toil/server/wes/toil_backend.py index 1c4cbca5dd..5332b000bb 100644 --- a/src/toil/server/wes/toil_backend.py +++ b/src/toil/server/wes/toil_backend.py @@ -17,18 +17,9 @@ import shutil import uuid from collections import Counter +from collections.abc import Generator from contextlib import contextmanager -from typing import (Any, - Callable, - Dict, - Generator, - List, - Optional, - TextIO, - Tuple, - Type, - Union, - overload) +from typing import Any, Callable, Optional, TextIO, Union, overload from flask import send_from_directory from werkzeug.utils import redirect @@ -38,22 +29,24 @@ from toil.bus import JobStatus, replay_message_bus from toil.lib.io import AtomicFileCreate from toil.lib.threading import global_mutex -from toil.server.utils import (WorkflowStateMachine, - connect_to_workflow_state_store) -from toil.server.wes.abstract_backend import (OperationForbidden, - TaskLog, - VersionNotImplementedException, - WESBackend, - WorkflowConflictException, - WorkflowExecutionException, - WorkflowNotFoundException, - handle_errors) +from toil.server.utils import WorkflowStateMachine, connect_to_workflow_state_store +from toil.server.wes.abstract_backend import ( + OperationForbidden, + TaskLog, + VersionNotImplementedException, + WESBackend, + WorkflowConflictException, + WorkflowExecutionException, + WorkflowNotFoundException, + handle_errors, +) from toil.server.wes.tasks import MultiprocessingTaskRunner, TaskRunner from toil.version import baseVersion logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) + class ToilWorkflow: def __init__(self, base_work_dir: str, state_store_url: str, run_id: str): """ @@ -115,24 +108,31 @@ def fetch_scratch(self, filename: str) -> Generator[Optional[TextIO], None, None yield None def exists(self) -> bool: - """ Return True if the workflow run exists.""" + """Return True if the workflow run exists.""" return self.get_state() != "UNKNOWN" def get_state(self) -> str: - """ Return the state of the current run.""" + """Return the state of the current run.""" return self.state_machine.get_current_state() - def check_on_run(self, task_runner: Type[TaskRunner]) -> None: + def check_on_run(self, task_runner: type[TaskRunner]) -> None: """ Check to make sure nothing has gone wrong in the task runner for this workflow. If something has, log, and fail the workflow with an error. """ - if not task_runner.is_ok(self.run_id) and self.get_state() not in ['SYSTEM_ERROR', 'EXECUTOR_ERROR', 'COMPLETE', 'CANCELED']: - logger.error('Failing run %s because the task to run its leader crashed', self.run_id) + if not task_runner.is_ok(self.run_id) and self.get_state() not in [ + "SYSTEM_ERROR", + "EXECUTOR_ERROR", + "COMPLETE", + "CANCELED", + ]: + logger.error( + "Failing run %s because the task to run its leader crashed", self.run_id + ) self.state_machine.send_system_error() def set_up_run(self) -> None: - """ Set up necessary directories for the run.""" + """Set up necessary directories for the run.""" # Go to queued state self.state_machine.send_enqueue() @@ -140,11 +140,13 @@ def set_up_run(self) -> None: os.makedirs(self.exec_dir, exist_ok=True) def clean_up(self) -> None: - """ Clean directory and files related to the run.""" + """Clean directory and files related to the run.""" shutil.rmtree(self.scratch_dir) # Don't remove state; state needs to persist forever. - def queue_run(self, task_runner: Type[TaskRunner], request: Dict[str, Any], options: List[str]) -> None: + def queue_run( + self, task_runner: type[TaskRunner], request: dict[str, Any], options: list[str] + ) -> None: """This workflow should be ready to run. Hand this to the task system.""" with open(os.path.join(self.scratch_dir, "request.json"), "w") as f: # Save the request to disk for get_run_log() @@ -152,8 +154,16 @@ def queue_run(self, task_runner: Type[TaskRunner], request: Dict[str, Any], opti try: # Run the task. Set the task ID the same as our run ID - task_runner.run(args=(self.base_scratch_dir, self.state_store_url, self.run_id, request, options), - task_id=self.run_id) + task_runner.run( + args=( + self.base_scratch_dir, + self.state_store_url, + self.run_id, + request, + options, + ), + task_id=self.run_id, + ) except Exception: # Celery or the broker might be down self.state_machine.send_system_error() @@ -185,23 +195,28 @@ def get_stdout_path(self) -> Optional[str]: Return the path to the standard output log, relative to the run's scratch_dir, or None if it doesn't exist. """ - return self._get_scratch_file_path('stdout') + return self._get_scratch_file_path("stdout") def get_stderr_path(self) -> Optional[str]: """ Return the path to the standard output log, relative to the run's scratch_dir, or None if it doesn't exist. """ - return self._get_scratch_file_path('stderr') + return self._get_scratch_file_path("stderr") def get_messages_path(self) -> Optional[str]: """ Return the path to the bus message log, relative to the run's scratch_dir, or None if it doesn't exist. """ - return self._get_scratch_file_path('bus_messages') + return self._get_scratch_file_path("bus_messages") - def get_task_logs(self, filter_function: Optional[Callable[[TaskLog, JobStatus], Optional[TaskLog]]] = None) -> List[Dict[str, Union[str, int, None]]]: + def get_task_logs( + self, + filter_function: Optional[ + Callable[[TaskLog, JobStatus], Optional[TaskLog]] + ] = None, + ) -> list[dict[str, Union[str, int, None]]]: """ Return all the task log objects for the individual tasks in the workflow. @@ -226,9 +241,12 @@ def get_task_logs(self, filter_function: Optional[Callable[[TaskLog, JobStatus], abs_path = os.path.join(self.scratch_dir, path) job_statuses = replay_message_bus(abs_path) # Compose log objects from recovered job info. - logs: List[TaskLog] = [] + logs: list[TaskLog] = [] for job_status in job_statuses.values(): - task: Optional[TaskLog] = {"name": job_status.name, "exit_code": job_status.exit_code} + task: Optional[TaskLog] = { + "name": job_status.name, + "exit_code": job_status.exit_code, + } if filter_function is not None: # Convince MyPy the task is set assert task is not None @@ -236,22 +254,26 @@ def get_task_logs(self, filter_function: Optional[Callable[[TaskLog, JobStatus], task = filter_function(task, job_status) if task is not None: logs.append(task) - logger.info('Recovered task logs: %s', logs) + logger.info("Recovered task logs: %s", logs) return logs # TODO: times, log files, AWS Batch IDs if any, names from the workflow instead of IDs, commands - - - class ToilBackend(WESBackend): """ WES backend implemented for Toil to run CWL, WDL, or Toil workflows. This class is responsible for validating and executing submitted workflows. """ - def __init__(self, work_dir: str, state_store: Optional[str], options: List[str], - dest_bucket_base: Optional[str], bypass_celery: bool = False, wes_dialect: str = "standard") -> None: + def __init__( + self, + work_dir: str, + state_store: Optional[str], + options: list[str], + dest_bucket_base: Optional[str], + bypass_celery: bool = False, + wes_dialect: str = "standard", + ) -> None: """ Make a new ToilBackend for serving WES. @@ -274,19 +296,21 @@ def __init__(self, work_dir: str, state_store: Optional[str], options: List[str] acceptable in any dialect. """ for opt in options: - if not opt.startswith('-'): + if not opt.startswith("-"): # We don't allow a value to be set across multiple arguments # that would need to remain in the same order. - raise ValueError(f'Option {opt} does not begin with -') + raise ValueError(f"Option {opt} does not begin with -") super().__init__(options) # How should we generate run IDs? We apply a prefix so that we can tell # what things in our work directory suggest that runs exist and what # things don't. - self.run_id_prefix = 'run-' + self.run_id_prefix = "run-" # Use this to run Celery tasks so we can swap it out for testing. - self.task_runner = TaskRunner if not bypass_celery else MultiprocessingTaskRunner + self.task_runner = ( + TaskRunner if not bypass_celery else MultiprocessingTaskRunner + ) logger.info("Using task runner: %s", self.task_runner) # Record if we need to limit our WES responses for a particular @@ -304,7 +328,7 @@ def __init__(self, work_dir: str, state_store: Optional[str], options: List[str] if state_store is None: # Store workflow metadata under the work_dir. - self.state_store_url = os.path.join(self.work_dir, 'state_store') + self.state_store_url = os.path.join(self.work_dir, "state_store") else: # Use the provided value self.state_store_url = state_store @@ -331,14 +355,14 @@ def __init__(self, work_dir: str, state_store: Optional[str], options: List[str] pass # Assign an ID to the work directory storage. work_dir_id = None - work_dir_id_file = os.path.join(self.work_dir, 'id.txt') + work_dir_id_file = os.path.join(self.work_dir, "id.txt") if os.path.exists(work_dir_id_file): # An ID is assigned already with open(work_dir_id_file) as f: work_dir_id = uuid.UUID(f.readline().strip()) else: # We need to try and assign an ID. - with global_mutex(self.work_dir, 'id-assignment'): + with global_mutex(self.work_dir, "id-assignment"): # We need to synchronize with other processes starting up to # make sure we agree on an ID. if os.path.exists(work_dir_id_file): @@ -350,7 +374,7 @@ def __init__(self, work_dir: str, state_store: Optional[str], options: List[str] with AtomicFileCreate(work_dir_id_file) as temp_file: # Still need to be atomic here or people not locking # will see an incomplete file. - with open(temp_file, 'w') as f: + with open(temp_file, "w") as f: f.write(str(work_dir_id)) # Now combine into one ID if boot_id is not None: @@ -359,14 +383,15 @@ def __init__(self, work_dir: str, state_store: Optional[str], options: List[str] self.server_id = str(work_dir_id) logger.info("Using server ID: %s", self.server_id) - self.supported_versions = { "py": ["3.7", "3.8", "3.9"], "cwl": ["v1.0", "v1.1", "v1.2"], - "wdl": ["draft-2", "1.0"] + "wdl": ["draft-2", "1.0"], } - def _get_run(self, run_id: str, should_exists: Optional[bool] = None) -> ToilWorkflow: + def _get_run( + self, run_id: str, should_exists: Optional[bool] = None + ) -> ToilWorkflow: """ Helper method to instantiate a ToilWorkflow object. @@ -387,24 +412,32 @@ def _get_run(self, run_id: str, should_exists: Optional[bool] = None) -> ToilWor # TODO: Implement multiple servers working together. owning_server = run.fetch_state("server_id") apparent_state = run.get_state() - if (apparent_state not in ("UNKNOWN", "COMPLETE", "EXECUTOR_ERROR", "SYSTEM_ERROR", "CANCELED") and - owning_server != self.server_id): + if ( + apparent_state + not in ("UNKNOWN", "COMPLETE", "EXECUTOR_ERROR", "SYSTEM_ERROR", "CANCELED") + and owning_server != self.server_id + ): # This workflow is in a state that suggests it is doing something # but it appears to belong to a previous incarnation of the server, # and so its Celery is probably gone. Put it into system error # state if possible. - logger.warning("Run %s in state %s appears to belong to server %s and not us, server %s. " - "Its server is probably gone. Failing the workflow!", - run_id, apparent_state, owning_server, self.server_id) + logger.warning( + "Run %s in state %s appears to belong to server %s and not us, server %s. " + "Its server is probably gone. Failing the workflow!", + run_id, + apparent_state, + owning_server, + self.server_id, + ) run.state_machine.send_system_error() # Poll to make sure the run is not broken run.check_on_run(self.task_runner) return run - def get_runs(self) -> Generator[Tuple[str, str], None, None]: - """ A generator of a list of run ids and their state.""" + def get_runs(self) -> Generator[tuple[str, str], None, None]: + """A generator of a list of run ids and their state.""" if not os.path.exists(self.work_dir): return @@ -423,25 +456,24 @@ def get_state(self, run_id: str) -> str: return self._get_run(run_id, should_exists=True).get_state() @handle_errors - def get_service_info(self) -> Dict[str, Any]: - """ Get information about the Workflow Execution Service.""" + def get_service_info(self) -> dict[str, Any]: + """Get information about the Workflow Execution Service.""" state_counts = Counter(state for _, state in self.get_runs()) engine_parameters = [] for option in self.options: - if '=' not in option: # flags like "--logDebug" + if "=" not in option: # flags like "--logDebug" k, v = option, None else: - k, v = option.split('=', 1) + k, v = option.split("=", 1) engine_parameters.append((k, v)) return { "version": baseVersion, "workflow_type_versions": { - k: { - "workflow_type_version": v - } for k, v in self.supported_versions.items() + k: {"workflow_type_version": v} + for k, v in self.supported_versions.items() }, "supported_wes_versions": ["1.0.0"], "supported_filesystem_protocols": ["file", "http", "https"], @@ -449,10 +481,7 @@ def get_service_info(self) -> Dict[str, Any]: # TODO: How can we report --destBucket here, since we pass it only # for CWL workflows? "default_workflow_engine_parameters": [ - { - "name": key, - "default_value": value - } + {"name": key, "default_value": value} for key, value in engine_parameters ], "system_state_counts": state_counts, @@ -460,22 +489,21 @@ def get_service_info(self) -> Dict[str, Any]: } @handle_errors - def list_runs(self, page_size: Optional[int] = None, page_token: Optional[str] = None) -> Dict[str, Any]: - """ List the workflow runs.""" + def list_runs( + self, page_size: Optional[int] = None, page_token: Optional[str] = None + ) -> dict[str, Any]: + """List the workflow runs.""" # TODO: implement pagination return { "workflows": [ - { - "run_id": run_id, - "state": state - } for run_id, state in self.get_runs() + {"run_id": run_id, "state": state} for run_id, state in self.get_runs() ], - "next_page_token": "" + "next_page_token": "", } @handle_errors - def run_workflow(self) -> Dict[str, str]: - """ Run a workflow.""" + def run_workflow(self) -> dict[str, str]: + """Run a workflow.""" run_id = self.run_id_prefix + uuid.uuid4().hex run = self._get_run(run_id, should_exists=False) @@ -491,7 +519,11 @@ def run_workflow(self) -> Dict[str, str]: run.clean_up() raise - logger.info("Received workflow run request %s with parameters: %s", run_id, list(request.keys())) + logger.info( + "Received workflow run request %s with parameters: %s", + run_id, + list(request.keys()), + ) wf_type = request["workflow_type"].lower().strip() version = request["workflow_type_version"] @@ -514,21 +546,25 @@ def run_workflow(self) -> Dict[str, str]: workflow_options = list(self.options) if wf_type == "cwl" and self.dest_bucket_base: # Output to a directory under out base destination bucket URL. - workflow_options.append('--destBucket=' + os.path.join(self.dest_bucket_base, run_id)) + workflow_options.append( + "--destBucket=" + os.path.join(self.dest_bucket_base, run_id) + ) # Tell it to dump its messages to a file. # TODO: automatically sync file names with accessors somehow. - workflow_options.append('--writeMessages=' + os.path.join(run.scratch_dir, 'bus_messages')) + workflow_options.append( + "--writeMessages=" + os.path.join(run.scratch_dir, "bus_messages") + ) - logger.info(f"Putting workflow {run_id} into the queue. Waiting to be picked up...") + logger.info( + f"Putting workflow {run_id} into the queue. Waiting to be picked up..." + ) run.queue_run(self.task_runner, request, options=workflow_options) - return { - "run_id": run_id - } + return {"run_id": run_id} @handle_errors - def get_run_log(self, run_id: str) -> Dict[str, Any]: - """ Get detailed info about a workflow run.""" + def get_run_log(self, run_id: str) -> dict[str, Any]: + """Get detailed info about a workflow run.""" run = self._get_run(run_id, should_exists=True) state = run.get_state() @@ -550,7 +586,7 @@ def get_run_log(self, run_id: str) -> Dict[str, Any]: # path under that hostname. So we need to use a relative URL to the # logs. stdout = f"../../../../toil/wes/v1/logs/{run_id}/stdout" - stderr ="" + stderr = "" if run.get_stderr_path() is not None: # We have a standard error link. stderr = f"../../../../toil/wes/v1/logs/{run_id}/stderr" @@ -564,7 +600,9 @@ def get_run_log(self, run_id: str) -> Dict[str, Any]: filter_function = amazon_wes_utils.task_filter else: # We can emit any standard-compliant WES tasks - logger.info("WES dialect %s does not require transforming tasks", self.wes_dialect) + logger.info( + "WES dialect %s does not require transforming tasks", self.wes_dialect + ) filter_function = None task_logs = run.get_task_logs(filter_function=filter_function) @@ -589,8 +627,8 @@ def get_run_log(self, run_id: str) -> Dict[str, Any]: } @handle_errors - def cancel_run(self, run_id: str) -> Dict[str, str]: - """ Cancel a running workflow.""" + def cancel_run(self, run_id: str) -> dict[str, str]: + """Cancel a running workflow.""" run = self._get_run(run_id, should_exists=True) # Do some preflight checks on the current state. @@ -598,31 +636,30 @@ def cancel_run(self, run_id: str) -> Dict[str, str]: state = run.get_state() if state in ("CANCELING", "CANCELED", "COMPLETE"): # We don't need to do anything. - logger.warning(f"A user is attempting to cancel a workflow in state: '{state}'.") + logger.warning( + f"A user is attempting to cancel a workflow in state: '{state}'." + ) elif state in ("EXECUTOR_ERROR", "SYSTEM_ERROR"): # Something went wrong. Let the user know. - raise OperationForbidden(f"Workflow is in state: '{state}', which cannot be cancelled.") + raise OperationForbidden( + f"Workflow is in state: '{state}', which cannot be cancelled." + ) else: # Go to canceling state if allowed run.state_machine.send_cancel() # Stop the run task if it is there. self.task_runner.cancel(run_id) - return { - "run_id": run_id - } + return {"run_id": run_id} @handle_errors - def get_run_status(self, run_id: str) -> Dict[str, str]: + def get_run_status(self, run_id: str) -> dict[str, str]: """ Get quick status info about a workflow run, returning a simple result with the overall state of the workflow run. """ - return { - "run_id": run_id, - "state": self.get_state(run_id) - } + return {"run_id": run_id, "state": self.get_state(run_id)} # Toil custom endpoints that are not part of the GA4GH WES spec @@ -665,6 +702,4 @@ def get_homepage(self) -> Response: Provide a sensible result for / other than 404. """ # For now just go to the service info endpoint - return redirect('ga4gh/wes/v1/service-info', code=302) - - + return redirect("ga4gh/wes/v1/service-info", code=302) diff --git a/src/toil/server/wsgi_app.py b/src/toil/server/wsgi_app.py index f5da2e62c8..7f91911ba6 100644 --- a/src/toil/server/wsgi_app.py +++ b/src/toil/server/wsgi_app.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Dict, Optional +from typing import Any, Optional from gunicorn.app.base import BaseApplication # type: ignore @@ -29,7 +29,8 @@ class GunicornApplication(BaseApplication): # type: ignore For more details, see: https://docs.gunicorn.org/en/latest/custom.html """ - def __init__(self, app: object, options: Optional[Dict[str, Any]] = None): + + def __init__(self, app: object, options: Optional[dict[str, Any]] = None): self.options = options or {} self.application = app super().__init__() @@ -51,7 +52,7 @@ def load(self) -> object: return self.application -def run_app(app: object, options: Optional[Dict[str, Any]] = None) -> None: +def run_app(app: object, options: Optional[dict[str, Any]] = None) -> None: """ Run a Gunicorn WSGI server. """ diff --git a/src/toil/serviceManager.py b/src/toil/serviceManager.py index af0d7e0d62..a199b54ebc 100644 --- a/src/toil/serviceManager.py +++ b/src/toil/serviceManager.py @@ -15,9 +15,10 @@ import logging import time +from collections.abc import Iterable from queue import Empty, Queue from threading import Event, Thread -from typing import Iterable, Optional, Set +from typing import Optional from toil.job import ServiceJobDescription from toil.jobStores.abstractJobStore import AbstractJobStore @@ -40,7 +41,7 @@ def __init__(self, job_store: AbstractJobStore, toil_state: ToilState) -> None: # These are all the client jobs that are waiting for their services to # start. - self.__waiting_clients: Set[str] = set() + self.__waiting_clients: set[str] = set() # This is used to terminate the thread associated with the service # manager @@ -123,7 +124,9 @@ def get_ready_client(self, maxWait: float) -> Optional[str]: client_id = self.__clients_out.get(timeout=maxWait) self.__waiting_clients.remove(client_id) if self.__service_manager_jobs < 0: - raise RuntimeError("The number of jobs scheduled by the service manager cannot be negative.") + raise RuntimeError( + "The number of jobs scheduled by the service manager cannot be negative." + ) self.__service_manager_jobs -= 1 return client_id except Empty: @@ -141,7 +144,9 @@ def get_unservable_client(self, maxWait: float) -> Optional[str]: client_id = self.__failed_clients_out.get(timeout=maxWait) self.__waiting_clients.remove(client_id) if self.__service_manager_jobs < 0: - raise RuntimeError("The number of jobs scheduled by the service manager cannot be negative.") + raise RuntimeError( + "The number of jobs scheduled by the service manager cannot be negative." + ) self.__service_manager_jobs -= 1 return client_id except Empty: @@ -157,7 +162,9 @@ def get_startable_service(self, maxWait: float) -> Optional[str]: try: service_id = self.__services_out.get(timeout=maxWait) if self.__service_manager_jobs < 0: - raise RuntimeError("The number of jobs scheduled by the service manager cannot be negative.") + raise RuntimeError( + "The number of jobs scheduled by the service manager cannot be negative." + ) self.__service_manager_jobs -= 1 return service_id except Empty: @@ -226,7 +233,7 @@ def shutdown(self) -> None: Will block until all services are started and blocked. """ - logger.debug('Waiting for service manager thread to finish ...') + logger.debug("Waiting for service manager thread to finish ...") start_time = time.time() self.__terminate.set() self.__service_starter.join() @@ -251,13 +258,17 @@ def __start_services(self) -> None: while True: with throttle(1.0): if self.__terminate.is_set(): - logger.debug('Received signal to quit starting services.') + logger.debug("Received signal to quit starting services.") break try: client_id = self.__clients_in.get_nowait() client = self.__toil_state.get_job(client_id) host_id_batches = list(client.serviceHostIDsInBatches()) - logger.debug("Service manager processing client %s with %d batches of services", client, len(host_id_batches)) + logger.debug( + "Service manager processing client %s with %d batches of services", + client, + len(host_id_batches), + ) if len(host_id_batches) > 1: # Have to fall back to the old blocking behavior to # ensure entire service "groups" are issued as a whole. @@ -288,7 +299,7 @@ def __start_services(self) -> None: pending_service_count = len(starting_services) if pending_service_count > 0 and log_limiter.throttle(False): - logger.debug('%d services are starting...', pending_service_count) + logger.debug("%d services are starting...", pending_service_count) for service_id in list(starting_services): service_job_desc = self._get_service_job(service_id) @@ -297,7 +308,9 @@ def __start_services(self) -> None: or service_job_desc.errorJobStoreID is None ): raise Exception("Must be a registered ServiceJobDescription") - if not self.__job_store.file_exists(service_job_desc.startJobStoreID): + if not self.__job_store.file_exists( + service_job_desc.startJobStoreID + ): # Service has started (or failed) logger.debug( "Service %s has removed %s and is therefore started", @@ -308,9 +321,13 @@ def __start_services(self) -> None: client_id = service_to_client[service_id] remaining_services_by_client[client_id] -= 1 if remaining_services_by_client[client_id] < 0: - raise RuntimeError("The number of remaining services cannot be negative.") + raise RuntimeError( + "The number of remaining services cannot be negative." + ) del service_to_client[service_id] - if not self.__job_store.file_exists(service_job_desc.errorJobStoreID): + if not self.__job_store.file_exists( + service_job_desc.errorJobStoreID + ): logger.error( "Service %s has immediately failed before it could be used", service_job_desc, @@ -321,13 +338,22 @@ def __start_services(self) -> None: # Find if any clients have had *all* their services started. ready_clients = set() - for client_id, remainingServices in remaining_services_by_client.items(): + for ( + client_id, + remainingServices, + ) in remaining_services_by_client.items(): if remainingServices == 0: if client_id in clients_with_failed_services: - logger.error('Job %s has had all its services try to start, but at least one failed', self.__toil_state.get_job(client_id)) + logger.error( + "Job %s has had all its services try to start, but at least one failed", + self.__toil_state.get_job(client_id), + ) self.__failed_clients_out.put(client_id) else: - logger.debug('Job %s has all its services started', self.__toil_state.get_job(client_id)) + logger.debug( + "Job %s has all its services started", + self.__toil_state.get_job(client_id), + ) self.__clients_out.put(client_id) ready_clients.add(client_id) for client_id in ready_clients: @@ -344,7 +370,9 @@ def __start_batches_blocking(self, client_id: str) -> None: # Start the service jobs in batches, waiting for each batch # to become established before starting the next batch - for service_job_list in self.__toil_state.get_job(client_id).serviceHostIDsInBatches(): + for service_job_list in self.__toil_state.get_job( + client_id + ).serviceHostIDsInBatches(): # When we get the job descriptions we store them here to go over them again. wait_on = [] for service_id in service_job_list: @@ -361,9 +389,13 @@ def __start_batches_blocking(self, client_id: str) -> None: service_job_desc.startJobStoreID, ) if not self.__job_store.file_exists(service_job_desc.startJobStoreID): - raise RuntimeError(f"Service manager attempted to start service {service_job_desc} that has already started") + raise RuntimeError( + f"Service manager attempted to start service {service_job_desc} that has already started" + ) if not self.__toil_state.job_exists(str(service_job_desc.jobStoreID)): - raise RuntimeError(f"Service manager attempted to start service {service_job_desc} that is not in the job store") + raise RuntimeError( + f"Service manager attempted to start service {service_job_desc} that is not in the job store" + ) # At this point the terminateJobStoreID and errorJobStoreID # could have been deleted, since the service can be killed at # any time! So we can't assert their presence here. @@ -382,7 +414,7 @@ def __start_batches_blocking(self, client_id: str) -> None: time.sleep(1.0) if log_limiter.throttle(False): - logger.info('Service %s is starting...', service_job_desc) + logger.info("Service %s is starting...", service_job_desc) # Check if the thread should quit if self.__terminate.is_set(): @@ -395,9 +427,14 @@ def __start_batches_blocking(self, client_id: str) -> None: ): # The service job has gone away but the service never flipped its start flag. # That's not what the worker is supposed to do when running a service at all. - logger.error('Service %s has completed and been removed without ever starting', service_job_desc) + logger.error( + "Service %s has completed and been removed without ever starting", + service_job_desc, + ) # Stop everything. - raise RuntimeError(f"Service {service_job_desc} is in an inconsistent state") + raise RuntimeError( + f"Service {service_job_desc} is in an inconsistent state" + ) # We don't bail out early here. @@ -409,6 +446,5 @@ def __start_batches_blocking(self, client_id: str) -> None: # though, so they should stop immediately when we run them. TODO: # this is a bad design! - # Add the JobDescription to the output queue of jobs whose services have been started self.__clients_out.put(client_id) diff --git a/src/toil/statsAndLogging.py b/src/toil/statsAndLogging.py index 97d03ac4db..0593f98464 100644 --- a/src/toil/statsAndLogging.py +++ b/src/toil/statsAndLogging.py @@ -20,7 +20,7 @@ from argparse import ArgumentParser, Namespace from logging.handlers import RotatingFileHandler from threading import Event, Thread -from typing import IO, TYPE_CHECKING, Any, Callable, List, Optional, Union +from typing import IO, TYPE_CHECKING, Any, Callable, Optional, Union from toil.lib.conversions import strtobool from toil.lib.expando import Expando @@ -32,7 +32,7 @@ logger = logging.getLogger(__name__) root_logger = logging.getLogger() -toil_logger = logging.getLogger('toil') +toil_logger = logging.getLogger("toil") DEFAULT_LOGLEVEL = logging.INFO __loggingFiles = [] @@ -42,21 +42,26 @@ logging.addLevelName(TRACE, "TRACE") + class StatsAndLogging: """A thread to aggregate statistics and logging.""" - def __init__(self, jobStore: 'AbstractJobStore', config: 'Config') -> None: + def __init__(self, jobStore: "AbstractJobStore", config: "Config") -> None: self._stop = Event() - self._worker = Thread(target=self.statsAndLoggingAggregator, - args=(jobStore, self._stop, config), - daemon=True) + self._worker = Thread( + target=self.statsAndLoggingAggregator, + args=(jobStore, self._stop, config), + daemon=True, + ) def start(self) -> None: """Start the stats and logging thread.""" self._worker.start() @classmethod - def formatLogStream(cls, stream: Union[IO[str], IO[bytes]], stream_name: str) -> str: + def formatLogStream( + cls, stream: Union[IO[str], IO[bytes]], stream_name: str + ) -> str: """ Given a stream of text or bytes, and the job name, job itself, or some other optional stringifyable identity info for the job, return a big @@ -69,21 +74,25 @@ def formatLogStream(cls, stream: Union[IO[str], IO[bytes]], stream_name: str) -> :param stream: The stream of text or bytes to print for the user. """ - lines = [f'{stream_name} follows:', '=========>'] + lines = [f"{stream_name} follows:", "=========>"] for line in stream: if isinstance(line, bytes): - line = line.decode('utf-8', errors='replace') - lines.append('\t' + line.rstrip('\n')) + line = line.decode("utf-8", errors="replace") + lines.append("\t" + line.rstrip("\n")) - lines.append('<=========') + lines.append("<=========") - return '\n'.join(lines) + return "\n".join(lines) @classmethod - def logWithFormatting(cls, stream_name: str, jobLogs: Union[IO[str], IO[bytes]], - method: Callable[[str], None] = logger.debug, - message: Optional[str] = None) -> None: + def logWithFormatting( + cls, + stream_name: str, + jobLogs: Union[IO[str], IO[bytes]], + method: Callable[[str], None] = logger.debug, + message: Optional[str] = None, + ) -> None: if message is not None: method(message) @@ -91,28 +100,36 @@ def logWithFormatting(cls, stream_name: str, jobLogs: Union[IO[str], IO[bytes]], method(cls.formatLogStream(jobLogs, stream_name)) @classmethod - def writeLogFiles(cls, jobNames: List[str], jobLogList: List[str], config: 'Config', failed: bool = False) -> None: - def createName(logPath: str, jobName: str, logExtension: str, failed: bool = False) -> str: - logName = jobName.replace('-', '--') - logName = logName.replace('/', '-') - logName = logName.replace(' ', '_') - logName = logName.replace("'", '') - logName = logName.replace('"', '') + def writeLogFiles( + cls, + jobNames: list[str], + jobLogList: list[str], + config: "Config", + failed: bool = False, + ) -> None: + def createName( + logPath: str, jobName: str, logExtension: str, failed: bool = False + ) -> str: + logName = jobName.replace("-", "--") + logName = logName.replace("/", "-") + logName = logName.replace(" ", "_") + logName = logName.replace("'", "") + logName = logName.replace('"', "") # Add a "failed_" prefix to logs from failed jobs. - logName = ('failed_' if failed else '') + logName + logName = ("failed_" if failed else "") + logName counter = 0 while True: - suffix = '_' + str(counter).zfill(3) + logExtension + suffix = "_" + str(counter).zfill(3) + logExtension fullName = os.path.join(logPath, logName + suffix) # The maximum file name size in the default HFS+ file system is 255 UTF-16 encoding units, so basically 255 characters if len(fullName) >= 255: - return fullName[:(255 - len(suffix))] + suffix + return fullName[: (255 - len(suffix))] + suffix if not os.path.exists(fullName): return fullName counter += 1 mainFileName = jobNames[0] - extension = '.log' + extension = ".log" writeFn: Callable[..., Any] if config.writeLogs: path = config.writeLogs @@ -120,7 +137,7 @@ def createName(logPath: str, jobName: str, logExtension: str, failed: bool = Fal elif config.writeLogsGzip: path = config.writeLogsGzip writeFn = gzip.open - extension += '.gz' + extension += ".gz" else: # we don't have anywhere to write the logs, return now return @@ -129,13 +146,13 @@ def createName(logPath: str, jobName: str, logExtension: str, failed: bool = Fal os.makedirs(path, exist_ok=True) fullName = createName(path, mainFileName, extension, failed) - with writeFn(fullName, 'wb') as f: + with writeFn(fullName, "wb") as f: for l in jobLogList: if isinstance(l, bytes): - l = l.decode('utf-8') - if not l.endswith('\n'): - l += '\n' - f.write(l.encode('utf-8')) + l = l.decode("utf-8") + if not l.endswith("\n"): + l += "\n" + f.write(l.encode("utf-8")) for alternateName in jobNames[1:]: # There are chained jobs in this output - indicate this with a symlink # of the job's name to this file @@ -144,7 +161,9 @@ def createName(logPath: str, jobName: str, logExtension: str, failed: bool = Fal os.symlink(os.path.relpath(fullName, path), name) @classmethod - def statsAndLoggingAggregator(cls, jobStore: 'AbstractJobStore', stop: Event, config: 'Config') -> None: + def statsAndLoggingAggregator( + cls, jobStore: "AbstractJobStore", stop: Event, config: "Config" + ) -> None: """ The following function is used for collating stats/reporting log messages from the workers. Works inside of a thread, collates as long as the stop flag is not True. @@ -169,9 +188,12 @@ def callback(fileHandle: Union[IO[bytes], IO[str]]) -> None: pass else: for message in logs: - logger.log(int(message.level), - 'Got message from job at time %s: %s', - time.strftime('%m-%d-%Y %H:%M:%S'), message.text) + logger.log( + int(message.level), + "Got message from job at time %s: %s", + time.strftime("%m-%d-%Y %H:%M:%S"), + message.text, + ) try: # Handle all the user-level text streams reported back (command output, etc.) @@ -202,8 +224,11 @@ def callback(fileHandle: Union[IO[bytes], IO[str]]) -> None: # we may have multiple jobs per worker jobNames = logs.names messages = logs.messages - cls.logWithFormatting(f'Log from job "{jobNames[0]}"', messages, - message='Received Toil worker log. Disable debug level logging to hide this output') + cls.logWithFormatting( + f'Log from job "{jobNames[0]}"', + messages, + message="Received Toil worker log. Disable debug level logging to hide this output", + ) cls.writeLogFiles(jobNames, messages, config=config) while True: @@ -215,8 +240,13 @@ def callback(fileHandle: Union[IO[bytes], IO[str]]) -> None: time.sleep(0.5) # Avoid cycling too fast # Finish the stats file - text = json.dumps(dict(total_time=str(time.time() - startTime), - total_clock=str(ResourceMonitor.get_total_cpu_time() - startClock)), ensure_ascii=True) + text = json.dumps( + dict( + total_time=str(time.time() - startTime), + total_clock=str(ResourceMonitor.get_total_cpu_time() - startClock), + ), + ensure_ascii=True, + ) jobStore.write_logs(text) def check(self) -> None: @@ -229,11 +259,14 @@ def check(self) -> None: def shutdown(self) -> None: """Finish up the stats/logging aggregation thread.""" - logger.debug('Waiting for stats and logging collator thread to finish ...') + logger.debug("Waiting for stats and logging collator thread to finish ...") startTime = time.time() self._stop.set() self._worker.join() - logger.debug('... finished collating stats and logs. Took %s seconds', time.time() - startTime) + logger.debug( + "... finished collating stats and logs. Took %s seconds", + time.time() - startTime, + ) # in addition to cleaning on exceptions, onError should clean if there are any failed jobs @@ -255,7 +288,7 @@ def install_log_color(set_logger: Optional[logging.Logger] = None) -> None: level_styles = dict(coloredlogs.DEFAULT_LEVEL_STYLES) level_styles["trace"] = dict(level_styles["debug"]) - + # TODO: What if these fixed colors aren't right for the terminal background? # It might be light or dark or even grey. level_styles["trace"]["color"] = 242 @@ -281,7 +314,9 @@ def install_log_color(set_logger: Optional[logging.Logger] = None) -> None: ) -def add_logging_options(parser: ArgumentParser, default_level: Optional[int] = None) -> None: +def add_logging_options( + parser: ArgumentParser, default_level: Optional[int] = None +) -> None: """ Add logging options to set the global log level. @@ -294,23 +329,51 @@ def add_logging_options(parser: ArgumentParser, default_level: Optional[int] = N group = parser.add_argument_group("Logging Options") - levels = ['Critical', 'Error', 'Warning', 'Info', 'Debug', 'Trace'] + levels = ["Critical", "Error", "Warning", "Info", "Debug", "Trace"] for level in levels: - group.add_argument(f"--log{level}", dest="logLevel", default=default_level_name, action="store_const", - const=level, help=f"Set logging level to {level}. Default: {default_level_name}.") + group.add_argument( + f"--log{level}", + dest="logLevel", + default=default_level_name, + action="store_const", + const=level, + help=f"Set logging level to {level}. Default: {default_level_name}.", + ) levels += [l.lower() for l in levels] + [l.upper() for l in levels] - group.add_argument("--logOff", dest="logLevel", default=default_level_name, - action="store_const", const="CRITICAL", help="Same as --logCritical.") + group.add_argument( + "--logOff", + dest="logLevel", + default=default_level_name, + action="store_const", + const="CRITICAL", + help="Same as --logCritical.", + ) # Maybe deprecate the above in favor of --logLevel? - group.add_argument("--logLevel", dest="logLevel", default=default_level_name, choices=levels, - help=f"Set the log level. Default: {default_level_name}. Options: {levels}.") + group.add_argument( + "--logLevel", + dest="logLevel", + default=default_level_name, + choices=levels, + help=f"Set the log level. Default: {default_level_name}. Options: {levels}.", + ) group.add_argument("--logFile", dest="logFile", help="File to log in.") - group.add_argument("--rotatingLogging", dest="logRotating", action="store_true", default=False, - help="Turn on rotating logging, which prevents log files from getting too big.") - group.add_argument("--logColors", dest="colored_logs", default=True, type=strtobool, metavar="BOOL", - help="Enable or disable colored logging. Default: %(default)s") + group.add_argument( + "--rotatingLogging", + dest="logRotating", + action="store_true", + default=False, + help="Turn on rotating logging, which prevents log files from getting too big.", + ) + group.add_argument( + "--logColors", + dest="colored_logs", + default=True, + type=strtobool, + metavar="BOOL", + help="Enable or disable colored logging. Default: %(default)s", + ) def configure_root_logger() -> None: @@ -320,8 +383,10 @@ def configure_root_logger() -> None: Should be called before any entry point tries to log anything, to ensure consistent formatting. """ - logging.basicConfig(format='[%(asctime)s] [%(threadName)-10s] [%(levelname).1s] [%(name)s] %(message)s', - datefmt='%Y-%m-%dT%H:%M:%S%z') + logging.basicConfig( + format="[%(asctime)s] [%(threadName)-10s] [%(levelname).1s] [%(name)s] %(message)s", + datefmt="%Y-%m-%dT%H:%M:%S%z", + ) root_logger.setLevel(DEFAULT_LOGLEVEL) @@ -339,12 +404,16 @@ def log_to_file(log_file: Optional[str], log_rotation: bool) -> None: def set_logging_from_options(options: Union["Config", Namespace]) -> None: configure_root_logger() - options.logLevel = options.logLevel or logging.getLevelName(root_logger.getEffectiveLevel()) + options.logLevel = options.logLevel or logging.getLevelName( + root_logger.getEffectiveLevel() + ) set_log_level(options.logLevel) if options.colored_logs: install_log_color() - logger.debug(f"Root logger is at level '{logging.getLevelName(root_logger.getEffectiveLevel())}', " - f"'toil' logger at level '{logging.getLevelName(toil_logger.getEffectiveLevel())}'.") + logger.debug( + f"Root logger is at level '{logging.getLevelName(root_logger.getEffectiveLevel())}', " + f"'toil' logger at level '{logging.getLevelName(toil_logger.getEffectiveLevel())}'." + ) # start logging to log file if specified log_to_file(options.logFile, options.logRotating) @@ -362,18 +431,24 @@ def suppress_exotic_logging(local_logger: str) -> None: This is important because some packages, particularly boto3, are not always instantiated yet in the environment when this is run, and so we create the logger and set the level preemptively. """ - never_suppress = ['toil', '__init__', '__main__', 'toil-rt', 'cwltool'] - always_suppress = ['boto3', 'boto', 'botocore'] # ensure we suppress even before instantiated + never_suppress = ["toil", "__init__", "__main__", "toil-rt", "cwltool"] + always_suppress = [ + "boto3", + "boto", + "botocore", + ] # ensure we suppress even before instantiated - top_level_loggers: List[str] = [] + top_level_loggers: list[str] = [] # Due to https://stackoverflow.com/questions/61683713 for pkg_logger in list(logging.Logger.manager.loggerDict.keys()) + always_suppress: if pkg_logger != local_logger: # many sub-loggers may exist, like "boto.a", "boto.b", "boto.c"; we only want the top_level: "boto" - top_level_logger = pkg_logger.split('.')[0] if '.' in pkg_logger else pkg_logger + top_level_logger = ( + pkg_logger.split(".")[0] if "." in pkg_logger else pkg_logger + ) if top_level_logger not in top_level_loggers + never_suppress: top_level_loggers.append(top_level_logger) logging.getLogger(top_level_logger).setLevel(logging.CRITICAL) - logger.debug(f'Suppressing the following loggers: {set(top_level_loggers)}') + logger.debug(f"Suppressing the following loggers: {set(top_level_loggers)}") diff --git a/src/toil/test/__init__.py b/src/toil/test/__init__.py index 5970be5841..1ebaf74af3 100644 --- a/src/toil/test/__init__.py +++ b/src/toil/test/__init__.py @@ -1,4 +1,5 @@ """Base testing class for Toil.""" + # Copyright (C) 2015-2021 Regents of the University of California # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -20,42 +21,28 @@ import shutil import signal import subprocess -import sys import threading import time import unittest import uuid +import zoneinfo from abc import ABCMeta, abstractmethod +from collections.abc import Generator from contextlib import contextmanager from inspect import getsource from shutil import which from tempfile import mkstemp from textwrap import dedent -from typing import (Any, - Callable, - Dict, - Generator, - List, - Literal, - Optional, - Tuple, - Type, - TypeVar, - Union, - cast) +from typing import Any, Callable, Literal, Optional, TypeVar, Union, cast from unittest.util import strclass from urllib.error import HTTPError, URLError from urllib.request import urlopen - -if sys.version_info >= (3, 9): - import zoneinfo -else: - from backports import zoneinfo - from toil import ApplianceImageNotFound, applianceSelf, toilPackageDirPath -from toil.lib.accelerators import (have_working_nvidia_docker_runtime, - have_working_nvidia_smi) +from toil.lib.accelerators import ( + have_working_nvidia_docker_runtime, + have_working_nvidia_smi, +) from toil.lib.io import mkdtemp from toil.lib.iterables import concat from toil.lib.memoize import memoize @@ -82,20 +69,24 @@ class ToilTest(unittest.TestCase): """ _tempBaseDir: Optional[str] = None - _tempDirs: List[str] = [] + _tempDirs: list[str] = [] def setup_method(self, method: Any) -> None: western = zoneinfo.ZoneInfo("America/Los_Angeles") california_time = datetime.datetime.now(tz=western) timestamp = california_time.strftime("%b %d %Y %H:%M:%S:%f %Z") - print(f"\n\n[TEST] {strclass(self.__class__)}:{self._testMethodName} ({timestamp})\n\n") + print( + f"\n\n[TEST] {strclass(self.__class__)}:{self._testMethodName} ({timestamp})\n\n" + ) @classmethod def setUpClass(cls) -> None: super().setUpClass() - tempBaseDir = os.environ.get('TOIL_TEST_TEMP', None) + tempBaseDir = os.environ.get("TOIL_TEST_TEMP", None) if tempBaseDir is not None and not os.path.isabs(tempBaseDir): - tempBaseDir = os.path.abspath(os.path.join(cls._projectRootPath(), tempBaseDir)) + tempBaseDir = os.path.abspath( + os.path.join(cls._projectRootPath(), tempBaseDir) + ) os.makedirs(tempBaseDir, exist_ok=True) cls._tempBaseDir = tempBaseDir @@ -127,7 +118,8 @@ def awsRegion(cls) -> str: the instance is located """ from toil.lib.aws import running_on_ec2 - return cls._region() if running_on_ec2() else 'us-west-2' + + return cls._region() if running_on_ec2() else "us-west-2" @classmethod def _availabilityZone(cls) -> str: @@ -151,13 +143,15 @@ def _region(cls) -> str: The region will not change over the life of the instance so the result is memoized to avoid unnecessary work. """ - region = re.match(r'^([a-z]{2}-[a-z]+-[1-9][0-9]*)([a-z])$', cls._availabilityZone()) + region = re.match( + r"^([a-z]{2}-[a-z]+-[1-9][0-9]*)([a-z])$", cls._availabilityZone() + ) assert region return region.group(1) @classmethod def _getUtilScriptPath(cls, script_name: str) -> str: - return os.path.join(toilPackageDirPath(), 'utils', script_name + '.py') + return os.path.join(toilPackageDirPath(), "utils", script_name + ".py") @classmethod def _projectRootPath(cls) -> str: @@ -169,12 +163,12 @@ def _projectRootPath(cls) -> str: mode, since it assumes the existence of a src subdirectory which, in a regular install wouldn't exist. Then again, in that mode project root has no meaning anyways. """ - assert re.search(r'__init__\.pyc?$', __file__) + assert re.search(r"__init__\.pyc?$", __file__) projectRootPath = os.path.dirname(os.path.abspath(__file__)) - packageComponents = __name__.split('.') - expectedSuffix = os.path.join('src', *packageComponents) + packageComponents = __name__.split(".") + expectedSuffix = os.path.join("src", *packageComponents) assert projectRootPath.endswith(expectedSuffix) - projectRootPath = projectRootPath[:-len(expectedSuffix)] + projectRootPath = projectRootPath[: -len(expectedSuffix)] return projectRootPath def _createTempDir(self, purpose: Optional[str] = None) -> str: @@ -187,7 +181,7 @@ def _createTempDirEx(cls, *names: Optional[str]) -> str: classname = classname[len("toil.test.") :] prefix = ["toil", "test", classname] prefix.extend([_f for _f in names if _f]) - prefix.append('') + prefix.append("") temp_dir_path = os.path.realpath( mkdtemp(dir=cls._tempBaseDir, prefix="-".join(prefix)) ) @@ -277,9 +271,9 @@ def _run(cls, command: str, *args: str, **kwargs: Any) -> Optional[str]: capture = kwargs.pop("capture", False) _input = kwargs.pop("input", None) if capture: - kwargs['stdout'] = subprocess.PIPE + kwargs["stdout"] = subprocess.PIPE if _input is not None: - kwargs['stdin'] = subprocess.PIPE + kwargs["stdin"] = subprocess.PIPE popen = subprocess.Popen(args, universal_newlines=True, **kwargs) stdout, stderr = popen.communicate(input=_input) assert stderr is None @@ -295,7 +289,8 @@ def _getScriptSource(self, callable_: Callable[..., Any]) -> str: This is a naughty but incredibly useful trick that lets you embed user scripts as nested functions and expose them to the syntax checker of your IDE. """ - return dedent('\n'.join(getsource(callable_).split('\n')[1:])) + return dedent("\n".join(getsource(callable_).split("\n")[1:])) + MT = TypeVar("MT", bound=Callable[..., Any]) @@ -306,6 +301,7 @@ def _getScriptSource(self, callable_: Callable[..., Any]) -> str: # noinspection PyUnusedLocal def _mark_test(name: str, test_item: MT) -> MT: return test_item + else: def _mark_test(name: str, test_item: MT) -> MT: @@ -319,7 +315,7 @@ def get_temp_file(suffix: str = "", rootDir: Optional[str] = None) -> str: os.close(handle) return tmp_file else: - alphanumerics = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' + alphanumerics = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" tmp_file = os.path.join( rootDir, f"tmp_{''.join([random.choice(alphanumerics) for _ in range(0, 10)])}{suffix}", @@ -328,18 +324,24 @@ def get_temp_file(suffix: str = "", rootDir: Optional[str] = None) -> str: os.chmod(tmp_file, 0o777) # Ensure everyone has access to the file. return tmp_file + def needs_env_var(var_name: str, comment: Optional[str] = None) -> Callable[[MT], MT]: """ Use as a decorator before test classes or methods to run only if the given environment variable is set. Can include a comment saying what the variable should be set to. """ + def decorator(test_item: MT) -> MT: if not os.getenv(var_name): - return unittest.skip(f"Set {var_name}{' to ' + comment if comment else ''} to include this test.")(test_item) + return unittest.skip( + f"Set {var_name}{' to ' + comment if comment else ''} to include this test." + )(test_item) return test_item + return decorator + def needs_rsync3(test_item: MT) -> MT: """ Decorate classes or methods that depend on any features from rsync version 3.0.0+. @@ -347,32 +349,34 @@ def needs_rsync3(test_item: MT) -> MT: Necessary because :meth:`utilsTest.testAWSProvisionerUtils` uses option `--protect-args` which is only available in rsync 3 """ - test_item = _mark_test('rsync', test_item) + test_item = _mark_test("rsync", test_item) try: - versionInfo = subprocess.check_output(['rsync', '--version']).decode('utf-8') + versionInfo = subprocess.check_output(["rsync", "--version"]).decode("utf-8") # output looks like: 'rsync version 2.6.9 ...' if int(versionInfo.split()[2].split(".")[0]) < 3: return unittest.skip("This test depends on rsync version 3.0.0+.")( test_item ) except subprocess.CalledProcessError: - return unittest.skip('rsync needs to be installed to run this test.')(test_item) + return unittest.skip("rsync needs to be installed to run this test.")(test_item) return test_item def needs_online(test_item: MT) -> MT: """Use as a decorator before test classes or methods to run only if we are meant to talk to the Internet.""" - test_item = _mark_test('online', test_item) - if os.getenv('TOIL_SKIP_ONLINE', '').lower() == 'true': - return unittest.skip('Skipping online test.')(test_item) + test_item = _mark_test("online", test_item) + if os.getenv("TOIL_SKIP_ONLINE", "").lower() == "true": + return unittest.skip("Skipping online test.")(test_item) return test_item + def needs_aws_s3(test_item: MT) -> MT: """Use as a decorator before test classes or methods to run only if AWS S3 is usable.""" # TODO: we just check for generic access to the AWS account - test_item = _mark_test('aws-s3', needs_online(test_item)) + test_item = _mark_test("aws-s3", needs_online(test_item)) try: from boto3 import Session + session = Session() boto3_credentials = session.get_credentials() except ImportError: @@ -380,18 +384,25 @@ def needs_aws_s3(test_item: MT) -> MT: test_item ) from toil.lib.aws import running_on_ec2 - if not (boto3_credentials or os.path.exists(os.path.expanduser('~/.aws/credentials')) or running_on_ec2()): - return unittest.skip("Configure AWS credentials to include this test.")(test_item) + + if not ( + boto3_credentials + or os.path.exists(os.path.expanduser("~/.aws/credentials")) + or running_on_ec2() + ): + return unittest.skip("Configure AWS credentials to include this test.")( + test_item + ) return test_item def needs_aws_ec2(test_item: MT) -> MT: """Use as a decorator before test classes or methods to run only if AWS EC2 is usable.""" # Assume we need S3 as well as EC2 - test_item = _mark_test('aws-ec2', needs_aws_s3(test_item)) + test_item = _mark_test("aws-ec2", needs_aws_s3(test_item)) # In addition to S3 we also need an SSH key to deploy with. # TODO: We assume that if this is set we have EC2 access. - test_item = needs_env_var('TOIL_AWS_KEYNAME', 'an AWS-stored SSH key')(test_item) + test_item = needs_env_var("TOIL_AWS_KEYNAME", "an AWS-stored SSH key")(test_item) return test_item @@ -401,16 +412,23 @@ def needs_aws_batch(test_item: MT) -> MT: is usable. """ # Assume we need S3 as well as Batch - test_item = _mark_test('aws-batch', needs_aws_s3(test_item)) + test_item = _mark_test("aws-batch", needs_aws_s3(test_item)) # Assume we have Batch if the user has set these variables. - test_item = needs_env_var('TOIL_AWS_BATCH_QUEUE', 'an AWS Batch queue name or ARN')(test_item) - test_item = needs_env_var('TOIL_AWS_BATCH_JOB_ROLE_ARN', 'an IAM role ARN that grants S3 and SDB access')(test_item) + test_item = needs_env_var("TOIL_AWS_BATCH_QUEUE", "an AWS Batch queue name or ARN")( + test_item + ) + test_item = needs_env_var( + "TOIL_AWS_BATCH_JOB_ROLE_ARN", "an IAM role ARN that grants S3 and SDB access" + )(test_item) try: from toil.lib.aws import get_current_aws_region + if get_current_aws_region() is None: # We don't know a region so we need one set. # TODO: It always won't be set if we get here. - test_item = needs_env_var('TOIL_AWS_REGION', 'an AWS region to use with AWS batch')(test_item) + test_item = needs_env_var( + "TOIL_AWS_REGION", "an AWS region to use with AWS batch" + )(test_item) except ImportError: return unittest.skip("Install Toil with the 'aws' extra to include this test.")( @@ -418,67 +436,79 @@ def needs_aws_batch(test_item: MT) -> MT: ) return test_item + def needs_google_storage(test_item: MT) -> MT: """ Use as a decorator before test classes or methods to run only if Google Cloud is installed and we ought to be able to access public Google Storage URIs. """ - test_item = _mark_test('google-storage', needs_online(test_item)) + test_item = _mark_test("google-storage", needs_online(test_item)) try: from google.cloud import storage # noqa except ImportError: - return unittest.skip("Install Toil with the 'google' extra to include this test.")(test_item) + return unittest.skip( + "Install Toil with the 'google' extra to include this test." + )(test_item) return test_item + def needs_google_project(test_item: MT) -> MT: """ Use as a decorator before test classes or methods to run only if we have a Google Cloud project set. """ - test_item = _mark_test('google-project', needs_online(test_item)) - test_item = needs_env_var('TOIL_GOOGLE_PROJECTID', "a Google project ID")(test_item) + test_item = _mark_test("google-project", needs_online(test_item)) + test_item = needs_env_var("TOIL_GOOGLE_PROJECTID", "a Google project ID")(test_item) return test_item def needs_gridengine(test_item: MT) -> MT: """Use as a decorator before test classes or methods to run only if GridEngine is installed.""" - test_item = _mark_test('gridengine', test_item) - if which('qhost'): + test_item = _mark_test("gridengine", test_item) + if which("qhost"): return test_item return unittest.skip("Install GridEngine to include this test.")(test_item) def needs_torque(test_item: MT) -> MT: """Use as a decorator before test classes or methods to run only if PBS/Torque is installed.""" - test_item = _mark_test('torque', test_item) - if which('pbsnodes'): + test_item = _mark_test("torque", test_item) + if which("pbsnodes"): return test_item return unittest.skip("Install PBS/Torque to include this test.")(test_item) + def needs_kubernetes_installed(test_item: MT) -> MT: """Use as a decorator before test classes or methods to run only if Kubernetes is installed.""" - test_item = _mark_test('kubernetes', test_item) + test_item = _mark_test("kubernetes", test_item) try: import kubernetes + str(kubernetes) # to prevent removal of this import except ImportError: - return unittest.skip("Install Toil with the 'kubernetes' extra to include this test.")(test_item) + return unittest.skip( + "Install Toil with the 'kubernetes' extra to include this test." + )(test_item) return test_item + def needs_kubernetes(test_item: MT) -> MT: """Use as a decorator before test classes or methods to run only if Kubernetes is installed and configured.""" test_item = needs_kubernetes_installed(needs_online(test_item)) try: import kubernetes + try: kubernetes.config.load_kube_config() except kubernetes.config.ConfigException: try: kubernetes.config.load_incluster_config() except kubernetes.config.ConfigException: - return unittest.skip("Configure Kubernetes (~/.kube/config, $KUBECONFIG, " - "or current pod) to include this test.")(test_item) + return unittest.skip( + "Configure Kubernetes (~/.kube/config, $KUBECONFIG, " + "or current pod) to include this test." + )(test_item) except ImportError: # We should already be skipping this test pass @@ -487,37 +517,50 @@ def needs_kubernetes(test_item: MT) -> MT: def needs_mesos(test_item: MT) -> MT: """Use as a decorator before test classes or methods to run only if Mesos is installed.""" - test_item = _mark_test('mesos', test_item) - if not (which('mesos-master') or which('mesos-agent')): - return unittest.skip("Install Mesos (and Toil with the 'mesos' extra) to include this test.")(test_item) + test_item = _mark_test("mesos", test_item) + if not (which("mesos-master") or which("mesos-agent")): + return unittest.skip( + "Install Mesos (and Toil with the 'mesos' extra) to include this test." + )(test_item) try: import psutil # noqa import pymesos # noqa except ImportError: - return unittest.skip("Install Mesos (and Toil with the 'mesos' extra) to include this test.")(test_item) + return unittest.skip( + "Install Mesos (and Toil with the 'mesos' extra) to include this test." + )(test_item) return test_item def needs_slurm(test_item: MT) -> MT: """Use as a decorator before test classes or methods to run only if Slurm is installed.""" - test_item = _mark_test('slurm', test_item) - if which('squeue'): + test_item = _mark_test("slurm", test_item) + if which("squeue"): return test_item return unittest.skip("Install Slurm to include this test.")(test_item) def needs_htcondor(test_item: MT) -> MT: """Use a decorator before test classes or methods to run only if the HTCondor is installed.""" - test_item = _mark_test('htcondor', test_item) + test_item = _mark_test("htcondor", test_item) try: import htcondor - htcondor.Collector(os.getenv('TOIL_HTCONDOR_COLLECTOR')).query(constraint='False') + + htcondor.Collector(os.getenv("TOIL_HTCONDOR_COLLECTOR")).query( + constraint="False" + ) except ImportError: - return unittest.skip("Install the HTCondor Python bindings to include this test.")(test_item) + return unittest.skip( + "Install the HTCondor Python bindings to include this test." + )(test_item) except OSError: - return unittest.skip("HTCondor must be running to include this test.")(test_item) + return unittest.skip("HTCondor must be running to include this test.")( + test_item + ) except RuntimeError: - return unittest.skip("HTCondor must be installed and configured to include this test.")(test_item) + return unittest.skip( + "HTCondor must be installed and configured to include this test." + )(test_item) else: return test_item @@ -526,8 +569,8 @@ def needs_lsf(test_item: MT) -> MT: """ Use as a decorator before test classes or methods to only run them if LSF is installed. """ - test_item = _mark_test('lsf', test_item) - if which('bsub'): + test_item = _mark_test("lsf", test_item) + if which("bsub"): return test_item else: return unittest.skip("Install LSF to include this test.")(test_item) @@ -535,8 +578,8 @@ def needs_lsf(test_item: MT) -> MT: def needs_java(test_item: MT) -> MT: """Use as a test decorator to run only if java is installed.""" - test_item = _mark_test('java', test_item) - if which('java'): + test_item = _mark_test("java", test_item) + if which("java"): return test_item else: return unittest.skip("Install java to include this test.")(test_item) @@ -547,74 +590,84 @@ def needs_docker(test_item: MT) -> MT: Use as a decorator before test classes or methods to only run them if docker is installed and docker-based tests are enabled. """ - test_item = _mark_test('docker', needs_online(test_item)) - if os.getenv('TOIL_SKIP_DOCKER', '').lower() == 'true': - return unittest.skip('Skipping docker test.')(test_item) - if which('docker'): + test_item = _mark_test("docker", needs_online(test_item)) + if os.getenv("TOIL_SKIP_DOCKER", "").lower() == "true": + return unittest.skip("Skipping docker test.")(test_item) + if which("docker"): return test_item else: return unittest.skip("Install docker to include this test.")(test_item) + def needs_singularity(test_item: MT) -> MT: """ Use as a decorator before test classes or methods to only run them if singularity is installed. """ - test_item = _mark_test('singularity', needs_online(test_item)) - if which('singularity'): + test_item = _mark_test("singularity", needs_online(test_item)) + if which("singularity"): return test_item else: return unittest.skip("Install singularity to include this test.")(test_item) + def needs_singularity_or_docker(test_item: MT) -> MT: """ Use as a decorator before test classes or methods to only run them if docker is installed and docker-based tests are enabled, or if Singularity is installed. """ - + # TODO: Is there a good way to OR decorators? - if which('singularity'): + if which("singularity"): # Singularity is here, say it's a Singularity test return needs_singularity(test_item) else: # Otherwise say it's a Docker test. return needs_docker(test_item) - + + def needs_local_cuda(test_item: MT) -> MT: """ Use as a decorator before test classes or methods to only run them if a CUDA setup legible to cwltool (i.e. providing userspace nvidia-smi) is present. """ - test_item = _mark_test('local_cuda', test_item) + test_item = _mark_test("local_cuda", test_item) if have_working_nvidia_smi(): return test_item else: - return unittest.skip("Install nvidia-smi, an nvidia proprietary driver, and a CUDA-capable nvidia GPU to include this test.")(test_item) + return unittest.skip( + "Install nvidia-smi, an nvidia proprietary driver, and a CUDA-capable nvidia GPU to include this test." + )(test_item) + def needs_docker_cuda(test_item: MT) -> MT: """ Use as a decorator before test classes or methods to only run them if a CUDA setup is available through Docker. """ - test_item = _mark_test('docker_cuda', needs_online(test_item)) + test_item = _mark_test("docker_cuda", needs_online(test_item)) if have_working_nvidia_docker_runtime(): return test_item else: - return unittest.skip("Install nvidia-container-runtime on your Docker server and configure an 'nvidia' runtime to include this test.")(test_item) + return unittest.skip( + "Install nvidia-container-runtime on your Docker server and configure an 'nvidia' runtime to include this test." + )(test_item) + def needs_encryption(test_item: MT) -> MT: """ Use as a decorator before test classes or methods to only run them if PyNaCl is installed and configured. """ - test_item = _mark_test('encryption', test_item) + test_item = _mark_test("encryption", test_item) try: # noinspection PyUnresolvedReferences import nacl # noqa except ImportError: return unittest.skip( - "Install Toil with the 'encryption' extra to include this test.")(test_item) + "Install Toil with the 'encryption' extra to include this test." + )(test_item) else: return test_item @@ -624,26 +677,31 @@ def needs_cwl(test_item: MT) -> MT: Use as a decorator before test classes or methods to only run them if CWLTool is installed and configured. """ - test_item = _mark_test('cwl', test_item) + test_item = _mark_test("cwl", test_item) try: # noinspection PyUnresolvedReferences import cwltool # noqa except ImportError: - return unittest.skip("Install Toil with the 'cwl' extra to include this test.")(test_item) + return unittest.skip("Install Toil with the 'cwl' extra to include this test.")( + test_item + ) else: return test_item + def needs_wdl(test_item: MT) -> MT: """ Use as a decorator before test classes or methods to only run them if miniwdl is installed and configured. """ - test_item = _mark_test('wdl', test_item) + test_item = _mark_test("wdl", test_item) try: # noinspection PyUnresolvedReferences import WDL # noqa except ImportError: - return unittest.skip("Install Toil with the 'wdl' extra to include this test.")(test_item) + return unittest.skip("Install Toil with the 'wdl' extra to include this test.")( + test_item + ) else: return test_item @@ -652,40 +710,48 @@ def needs_server(test_item: MT) -> MT: """ Use as a decorator before test classes or methods to only run them if Connexion is installed. """ - test_item = _mark_test('server_mode', test_item) + test_item = _mark_test("server_mode", test_item) try: # noinspection PyUnresolvedReferences import connexion + print(connexion.__file__) # keep this import from being removed. except ImportError: return unittest.skip( - "Install Toil with the 'server' extra to include this test.")(test_item) + "Install Toil with the 'server' extra to include this test." + )(test_item) else: return test_item + def needs_celery_broker(test_item: MT) -> MT: """ Use as a decorator before test classes or methods to run only if RabbitMQ is set up to take Celery jobs. """ - test_item = _mark_test('celery', needs_online(test_item)) - test_item = needs_env_var('TOIL_WES_BROKER_URL', "a URL to a RabbitMQ broker for Celery")(test_item) + test_item = _mark_test("celery", needs_online(test_item)) + test_item = needs_env_var( + "TOIL_WES_BROKER_URL", "a URL to a RabbitMQ broker for Celery" + )(test_item) return test_item + def needs_wes_server(test_item: MT) -> MT: """ Use as a decorator before test classes or methods to run only if a WES server is available to run against. """ - test_item = _mark_test('wes_server', needs_online(test_item)) + test_item = _mark_test("wes_server", needs_online(test_item)) - wes_url = os.environ.get('TOIL_WES_ENDPOINT') + wes_url = os.environ.get("TOIL_WES_ENDPOINT") if not wes_url: return unittest.skip(f"Set TOIL_WES_ENDPOINT to include this test")(test_item) try: urlopen(f"{wes_url}/ga4gh/wes/v1/service-info") except (HTTPError, URLError) as e: - return unittest.skip(f"Run a WES server on {wes_url} to include this test")(test_item) + return unittest.skip(f"Run a WES server on {wes_url} to include this test")( + test_item + ) return test_item @@ -695,10 +761,10 @@ def needs_local_appliance(test_item: MT) -> MT: Use as a decorator before test classes or methods to only run them if the Toil appliance Docker image is downloaded. """ - test_item = _mark_test('appliance', test_item) - if os.getenv('TOIL_SKIP_DOCKER', '').lower() == 'true': - return unittest.skip('Skipping docker test.')(test_item) - if not which('docker'): + test_item = _mark_test("appliance", test_item) + if os.getenv("TOIL_SKIP_DOCKER", "").lower() == "true": + return unittest.skip("Skipping docker test.")(test_item) + if not which("docker"): return unittest.skip("Install docker to include this test.")(test_item) try: @@ -734,9 +800,9 @@ def needs_fetchable_appliance(test_item: MT) -> MT: the Toil appliance Docker image is able to be downloaded from the Internet. """ - test_item = _mark_test('fetchable_appliance', needs_online(test_item)) - if os.getenv('TOIL_SKIP_DOCKER', '').lower() == 'true': - return unittest.skip('Skipping docker test.')(test_item) + test_item = _mark_test("fetchable_appliance", needs_online(test_item)) + if os.getenv("TOIL_SKIP_DOCKER", "").lower() == "true": + return unittest.skip("Skipping docker test.")(test_item) try: applianceSelf() except ApplianceImageNotFound: @@ -757,12 +823,12 @@ def integrative(test_item: MT) -> MT: We define integration tests as A) involving other, non-Toil software components that we develop and/or B) having a higher cost (time or money). """ - test_item = _mark_test('integrative', test_item) - if os.getenv('TOIL_TEST_INTEGRATIVE', '').lower() == 'true': + test_item = _mark_test("integrative", test_item) + if os.getenv("TOIL_TEST_INTEGRATIVE", "").lower() == "true": return test_item else: return unittest.skip( - 'Set TOIL_TEST_INTEGRATIVE=True to include this integration test, ' + "Set TOIL_TEST_INTEGRATIVE=True to include this integration test, " "or run `make integration_test_local` to run all integration tests." )(test_item) @@ -772,14 +838,14 @@ def slow(test_item: MT) -> MT: Use this decorator to identify tests that are slow and not critical. Skip if TOIL_TEST_QUICK is true. """ - test_item = _mark_test('slow', test_item) - if os.environ.get('TOIL_TEST_QUICK', '').lower() != 'true': + test_item = _mark_test("slow", test_item) + if os.environ.get("TOIL_TEST_QUICK", "").lower() != "true": return test_item else: return unittest.skip('Skipped because TOIL_TEST_QUICK is "True"')(test_item) -methodNamePartRegex = re.compile('^[a-zA-Z_0-9]+$') +methodNamePartRegex = re.compile("^[a-zA-Z_0-9]+$") @contextmanager @@ -791,7 +857,7 @@ def timeLimit(seconds: int) -> Generator[None, None, None]: specified amount of time. See . :param seconds: maximum allowable time, in seconds - + >>> import time >>> with timeLimit(2): ... time.sleep(1) @@ -802,9 +868,10 @@ def timeLimit(seconds: int) -> Generator[None, None, None]: ... RuntimeError: Timed out """ + # noinspection PyUnusedLocal def signal_handler(signum: int, frame: Any) -> None: - raise RuntimeError('Timed out') + raise RuntimeError("Timed out") signal.signal(signal.SIGALRM, signal_handler) signal.alarm(seconds) @@ -869,6 +936,7 @@ def make_tests(generalMethod, targetClass, **kwargs): False """ + def permuteIntoLeft(left, rParamName, right): """ Permutes values in right dictionary into each parameter: value dict pair in the left @@ -894,9 +962,11 @@ def permuteIntoLeft(left, rParamName, right): """ for prmValName, lDict in list(left.items()): for rValName, rVal in list(right.items()): - nextPrmVal = (f'__{rParamName}_{rValName.lower()}') + nextPrmVal = f"__{rParamName}_{rValName.lower()}" if methodNamePartRegex.match(nextPrmVal) is None: - raise RuntimeError("The name '%s' cannot be used in a method name" % pvName) + raise RuntimeError( + "The name '%s' cannot be used in a method name" % pvName + ) aggDict = dict(lDict) aggDict[rParamName] = rVal left[prmValName + nextPrmVal] = aggDict @@ -911,7 +981,7 @@ def fx(self, prms=prms): else: return generalMethod(self) - methodName = f'test_{generalMethod.__name__}{prmNames}' + methodName = f"test_{generalMethod.__name__}{prmNames}" setattr(targetClass, methodName, fx) @@ -924,9 +994,11 @@ def fx(self, prms=prms): left = {} prmName, vals = sortedKwargs.pop() for valName, val in list(vals.items()): - pvName = f'__{prmName}_{valName.lower()}' + pvName = f"__{prmName}_{valName.lower()}" if methodNamePartRegex.match(pvName) is None: - raise RuntimeError("The name '%s' cannot be used in a method name" % pvName) + raise RuntimeError( + "The name '%s' cannot be used in a method name" % pvName + ) left[pvName] = {prmName: val} # get cartesian product @@ -952,9 +1024,9 @@ class ApplianceTestSupport(ToilTest): @contextmanager def _applianceCluster( - self, mounts: Dict[str, str], numCores: Optional[int] = None + self, mounts: dict[str, str], numCores: Optional[int] = None ) -> Generator[ - Tuple["ApplianceTestSupport.LeaderThread", "ApplianceTestSupport.WorkerThread"], + tuple["ApplianceTestSupport.LeaderThread", "ApplianceTestSupport.WorkerThread"], None, None, ]: @@ -984,10 +1056,10 @@ def _applianceCluster( class Appliance(ExceptionalThread, metaclass=ABCMeta): @abstractmethod def _getRole(self) -> str: - return 'leader' + return "leader" @abstractmethod - def _containerCommand(self) -> List[str]: + def _containerCommand(self) -> list[str]: pass @abstractmethod @@ -1000,7 +1072,7 @@ def _entryPoint(self) -> str: def __init__( self, outer: "ApplianceTestSupport", - mounts: Dict[str, str], + mounts: dict[str, str], cleanMounts: bool = False, ) -> None: assert all( @@ -1017,15 +1089,20 @@ def __enter__(self) -> "Appliance": with self.lock: image = applianceSelf() # Omitting --rm, it's unreliable, see https://github.com/docker/docker/issues/16575 - args = list(concat('docker', 'run', - '--entrypoint=' + self._entryPoint(), - '--net=host', - '-i', - '--name=' + self.containerName, - ['--volume=%s:%s' % mount for mount in self.mounts.items()], - image, - self._containerCommand())) - logger.info('Running %r', args) + args = list( + concat( + "docker", + "run", + "--entrypoint=" + self._entryPoint(), + "--net=host", + "-i", + "--name=" + self.containerName, + ["--volume=%s:%s" % mount for mount in self.mounts.items()], + image, + self._containerCommand(), + ) + ) + logger.info("Running %r", args) self.popen = subprocess.Popen(args) self.start() self.__wait_running() @@ -1033,17 +1110,17 @@ def __enter__(self) -> "Appliance": # noinspection PyUnusedLocal def __exit__( - self, exc_type: Type[BaseException], exc_val: Exception, exc_tb: Any + self, exc_type: type[BaseException], exc_val: Exception, exc_tb: Any ) -> Literal[False]: try: try: - self.outer._run('docker', 'stop', self.containerName) + self.outer._run("docker", "stop", self.containerName) self.join() finally: if self.cleanMounts: self.__cleanMounts() finally: - self.outer._run('docker', 'rm', '-f', self.containerName) + self.outer._run("docker", "rm", "-f", self.containerName) return False # don't swallow exception def __wait_running(self) -> None: @@ -1070,7 +1147,7 @@ def __wait_running(self) -> None: except subprocess.CalledProcessError: pass else: - if 'true' == running: + if "true" == running: break time.sleep(1) @@ -1083,30 +1160,33 @@ def __cleanMounts(self) -> None: was stopped, otherwise the running container might still be writing files. """ # Delete all files within each mounted directory, but not the directory itself. - cmd = 'shopt -s dotglob && rm -rf ' + ' '.join(v + '/*' - for k, v in self.mounts.items() - if os.path.isdir(k)) - self.outer._run('docker', 'run', - '--rm', - '--entrypoint=/bin/bash', - applianceSelf(), - '-c', - cmd) + cmd = "shopt -s dotglob && rm -rf " + " ".join( + v + "/*" for k, v in self.mounts.items() if os.path.isdir(k) + ) + self.outer._run( + "docker", + "run", + "--rm", + "--entrypoint=/bin/bash", + applianceSelf(), + "-c", + cmd, + ) def tryRun(self) -> None: assert self.popen self.popen.wait() - logger.info('Exiting %s', self.__class__.__name__) + logger.info("Exiting %s", self.__class__.__name__) def runOnAppliance(self, *args: str, **kwargs: Any) -> None: # Check if thread is still alive. Note that ExceptionalThread.join raises the # exception that occurred in the thread. self.join(timeout=0) # noinspection PyProtectedMember - self.outer._run('docker', 'exec', '-i', self.containerName, *args, **kwargs) + self.outer._run("docker", "exec", "-i", self.containerName, *args, **kwargs) def writeToAppliance(self, path: str, contents: Any) -> None: - self.runOnAppliance('tee', path, input=contents) + self.runOnAppliance("tee", path, input=contents) def deployScript( self, path: str, packagePath: str, script: Union[str, Callable[..., Any]] @@ -1129,42 +1209,46 @@ def deployScript( packagePath_list = packagePath.split(".") packages, module = packagePath_list[:-1], packagePath_list[-1] for package in packages: - path += '/' + package - self.runOnAppliance('mkdir', '-p', path) - self.writeToAppliance(path + '/__init__.py', '') - self.writeToAppliance(path + '/' + module + '.py', script) + path += "/" + package + self.runOnAppliance("mkdir", "-p", path) + self.writeToAppliance(path + "/__init__.py", "") + self.writeToAppliance(path + "/" + module + ".py", script) class LeaderThread(Appliance): def _entryPoint(self) -> str: - return 'mesos-master' + return "mesos-master" def _getRole(self) -> str: - return 'leader' + return "leader" - def _containerCommand(self) -> List[str]: - return ['--registry=in_memory', - '--ip=127.0.0.1', - '--port=5050', - '--allocation_interval=500ms'] + def _containerCommand(self) -> list[str]: + return [ + "--registry=in_memory", + "--ip=127.0.0.1", + "--port=5050", + "--allocation_interval=500ms", + ] class WorkerThread(Appliance): def __init__( - self, outer: "ApplianceTestSupport", mounts: Dict[str, str], numCores: int + self, outer: "ApplianceTestSupport", mounts: dict[str, str], numCores: int ) -> None: self.numCores = numCores super().__init__(outer, mounts) def _entryPoint(self) -> str: - return 'mesos-agent' + return "mesos-agent" def _getRole(self) -> str: - return 'worker' - - def _containerCommand(self) -> List[str]: - return ['--work_dir=/var/lib/mesos', - '--ip=127.0.0.1', - '--master=127.0.0.1:5050', - '--attributes=preemptible:False', - '--resources=cpus(*):%i' % self.numCores, - '--no-hostname_lookup', - '--no-systemd_enable_support'] + return "worker" + + def _containerCommand(self) -> list[str]: + return [ + "--work_dir=/var/lib/mesos", + "--ip=127.0.0.1", + "--master=127.0.0.1:5050", + "--attributes=preemptible:False", + "--resources=cpus(*):%i" % self.numCores, + "--no-hostname_lookup", + "--no-systemd_enable_support", + ] diff --git a/src/toil/test/batchSystems/batchSystemTest.py b/src/toil/test/batchSystems/batchSystemTest.py index 1619d65d66..49796bc71a 100644 --- a/src/toil/test/batchSystems/batchSystemTest.py +++ b/src/toil/test/batchSystems/batchSystemTest.py @@ -24,36 +24,43 @@ from fractions import Fraction from unittest import skipIf -from toil.batchSystems.abstractBatchSystem import (AbstractBatchSystem, - BatchSystemSupport, - InsufficientSystemResources) +from toil.batchSystems.abstractBatchSystem import ( + AbstractBatchSystem, + BatchSystemSupport, + InsufficientSystemResources, +) + # Don't import any batch systems here that depend on extras # in order to import properly. Import them later, in tests # protected by annotations. from toil.batchSystems.mesos.test import MesosTestSupport -from toil.batchSystems.registry import (add_batch_system_factory, - get_batch_system, - get_batch_systems, - restore_batch_system_plugin_state, - save_batch_system_plugin_state) +from toil.batchSystems.registry import ( + add_batch_system_factory, + get_batch_system, + get_batch_systems, + restore_batch_system_plugin_state, + save_batch_system_plugin_state, +) from toil.batchSystems.singleMachine import SingleMachineBatchSystem from toil.common import Config, Toil from toil.job import Job, JobDescription, Requirer from toil.lib.retry import retry_flaky_test from toil.lib.threading import cpu_count -from toil.test import (ToilTest, - needs_aws_batch, - needs_aws_s3, - needs_fetchable_appliance, - needs_gridengine, - needs_htcondor, - needs_kubernetes, - needs_kubernetes_installed, - needs_lsf, - needs_mesos, - needs_slurm, - needs_torque, - slow) +from toil.test import ( + ToilTest, + needs_aws_batch, + needs_aws_s3, + needs_fetchable_appliance, + needs_gridengine, + needs_htcondor, + needs_kubernetes, + needs_kubernetes_installed, + needs_lsf, + needs_mesos, + needs_slurm, + needs_torque, + slow, +) logger = logging.getLogger(__name__) @@ -66,7 +73,10 @@ # Since we aren't always attaching the config to the jobs for these tests, we # need to use fully specified requirements. -defaultRequirements = dict(memory=int(100e6), cores=1, disk=1000, preemptible=preemptible, accelerators=[]) +defaultRequirements = dict( + memory=int(100e6), cores=1, disk=1000, preemptible=preemptible, accelerators=[] +) + class BatchSystemPluginTest(ToilTest): """ @@ -91,9 +101,10 @@ def test_batch_system_factory(): # add its arguments. return SingleMachineBatchSystem - add_batch_system_factory('testBatchSystem', test_batch_system_factory) - assert 'testBatchSystem' in get_batch_systems() - assert get_batch_system('testBatchSystem') == SingleMachineBatchSystem + add_batch_system_factory("testBatchSystem", test_batch_system_factory) + assert "testBatchSystem" in get_batch_systems() + assert get_batch_system("testBatchSystem") == SingleMachineBatchSystem + class hidden: """ @@ -127,8 +138,9 @@ def createConfig(cls): """ config = Config() from uuid import uuid4 + config.workflowID = str(uuid4()) - config.cleanWorkDir = 'always' + config.cleanWorkDir = "always" return config def _createConfig(self): @@ -164,7 +176,7 @@ def setUp(self): super().setUp() self.config = self._createConfig() self.batchSystem = self.createBatchSystem() - self.tempDir = self._createTempDir('testFiles') + self.tempDir = self._createTempDir("testFiles") def tearDown(self): self.batchSystem.shutdown() @@ -182,12 +194,20 @@ def test_available_cores(self): @retry_flaky_test(prepare=[tearDown, setUp]) def test_run_jobs(self): - jobDesc1 = self._mockJobDescription(jobName='test1', unitName=None, - jobStoreID='1', requirements=defaultRequirements) - jobDesc2 = self._mockJobDescription(jobName='test2', unitName=None, - jobStoreID='2', requirements=defaultRequirements) - job1 = self.batchSystem.issueBatchJob('sleep 1000', jobDesc1) - job2 = self.batchSystem.issueBatchJob('sleep 1000', jobDesc2) + jobDesc1 = self._mockJobDescription( + jobName="test1", + unitName=None, + jobStoreID="1", + requirements=defaultRequirements, + ) + jobDesc2 = self._mockJobDescription( + jobName="test2", + unitName=None, + jobStoreID="2", + requirements=defaultRequirements, + ) + job1 = self.batchSystem.issueBatchJob("sleep 1000", jobDesc1) + job2 = self.batchSystem.issueBatchJob("sleep 1000", jobDesc2) issuedIDs = self._waitForJobsToIssue(2) self.assertEqual(set(issuedIDs), {job1, job2}) @@ -202,7 +222,9 @@ def test_run_jobs(self): # getUpdatedBatchJob, and the sleep time is longer than the time we # should spend waiting for both to start, so if our cluster can # only run one job at a time, we will fail the test. - runningJobIDs = self._waitForJobsToStart(2, tries=self.get_max_startup_seconds()) + runningJobIDs = self._waitForJobsToStart( + 2, tries=self.get_max_startup_seconds() + ) self.assertEqual(set(runningJobIDs), {job1, job2}) # Killing the jobs instead of allowing them to complete means this test can run very @@ -216,13 +238,21 @@ def test_run_jobs(self): # then check for it having happened, but we can't guarantee that # the batch system will run against the same filesystem we are # looking at. - jobDesc3 = self._mockJobDescription(jobName='test3', unitName=None, - jobStoreID='3', requirements=defaultRequirements) + jobDesc3 = self._mockJobDescription( + jobName="test3", + unitName=None, + jobStoreID="3", + requirements=defaultRequirements, + ) job3 = self.batchSystem.issueBatchJob("mktemp -d", jobDesc3) jobUpdateInfo = self.batchSystem.getUpdatedBatchJob(maxWait=1000) - jobID, exitStatus, wallTime = jobUpdateInfo.jobID, jobUpdateInfo.exitStatus, jobUpdateInfo.wallTime - logger.info(f'Third job completed: {jobID} {exitStatus} {wallTime}') + jobID, exitStatus, wallTime = ( + jobUpdateInfo.jobID, + jobUpdateInfo.exitStatus, + jobUpdateInfo.wallTime, + ) + logger.info(f"Third job completed: {jobID} {exitStatus} {wallTime}") # Since the first two jobs were killed, the only job in the updated jobs queue should # be job 3. If the first two jobs were (incorrectly) added to the queue, this will @@ -242,46 +272,68 @@ def test_run_jobs(self): def test_set_env(self): # Start with a relatively safe script - script_shell = 'if [ "x${FOO}" == "xbar" ] ; then exit 23 ; else exit 42 ; fi' + script_shell = ( + 'if [ "x${FOO}" == "xbar" ] ; then exit 23 ; else exit 42 ; fi' + ) # Escape the semicolons - script_protected = script_shell.replace(';', r'\;') + script_protected = script_shell.replace(";", r"\;") # Turn into a string which convinces bash to take all args and paste them back together and run them - command = "bash -c \"\\${@}\" bash eval " + script_protected - jobDesc4 = self._mockJobDescription(jobName='test4', unitName=None, - jobStoreID='4', requirements=defaultRequirements) + command = 'bash -c "\\${@}" bash eval ' + script_protected + jobDesc4 = self._mockJobDescription( + jobName="test4", + unitName=None, + jobStoreID="4", + requirements=defaultRequirements, + ) job4 = self.batchSystem.issueBatchJob(command, jobDesc4) jobUpdateInfo = self.batchSystem.getUpdatedBatchJob(maxWait=1000) - jobID, exitStatus, wallTime = jobUpdateInfo.jobID, jobUpdateInfo.exitStatus, jobUpdateInfo.wallTime + jobID, exitStatus, wallTime = ( + jobUpdateInfo.jobID, + jobUpdateInfo.exitStatus, + jobUpdateInfo.wallTime, + ) self.assertEqual(exitStatus, 42) self.assertEqual(jobID, job4) # Now set the variable and ensure that it is present - self.batchSystem.setEnv('FOO', 'bar') - jobDesc5 = self._mockJobDescription(jobName='test5', unitName=None, - jobStoreID='5', requirements=defaultRequirements) + self.batchSystem.setEnv("FOO", "bar") + jobDesc5 = self._mockJobDescription( + jobName="test5", + unitName=None, + jobStoreID="5", + requirements=defaultRequirements, + ) job5 = self.batchSystem.issueBatchJob(command, jobDesc5) jobUpdateInfo = self.batchSystem.getUpdatedBatchJob(maxWait=1000) self.assertEqual(jobUpdateInfo.exitStatus, 23) self.assertEqual(jobUpdateInfo.jobID, job5) def test_set_job_env(self): - """ Test the mechanism for setting per-job environment variables to batch system jobs.""" + """Test the mechanism for setting per-job environment variables to batch system jobs.""" script = 'if [ "x${FOO}" == "xbar" ] ; then exit 23 ; else exit 42 ; fi' - command = "bash -c \"\\${@}\" bash eval " + script.replace(';', r'\;') + command = 'bash -c "\\${@}" bash eval ' + script.replace(";", r"\;") # Issue a job with a job environment variable - job_desc_6 = self._mockJobDescription(jobName='test6', unitName=None, - jobStoreID='6', requirements=defaultRequirements) - job6 = self.batchSystem.issueBatchJob(command, job_desc_6, job_environment={ - 'FOO': 'bar' - }) + job_desc_6 = self._mockJobDescription( + jobName="test6", + unitName=None, + jobStoreID="6", + requirements=defaultRequirements, + ) + job6 = self.batchSystem.issueBatchJob( + command, job_desc_6, job_environment={"FOO": "bar"} + ) job_update_info = self.batchSystem.getUpdatedBatchJob(maxWait=1000) self.assertEqual(job_update_info.exitStatus, 23) # this should succeed self.assertEqual(job_update_info.jobID, job6) # Now check that the environment variable doesn't exist for other jobs - job_desc_7 = self._mockJobDescription(jobName='test7', unitName=None, - jobStoreID='7', requirements=defaultRequirements) + job_desc_7 = self._mockJobDescription( + jobName="test7", + unitName=None, + jobStoreID="7", + requirements=defaultRequirements, + ) job7 = self.batchSystem.issueBatchJob(command, job_desc_7) job_update_info = self.batchSystem.getUpdatedBatchJob(maxWait=1000) self.assertEqual(job_update_info.exitStatus, 42) @@ -291,34 +343,67 @@ def testCheckResourceRequest(self): if isinstance(self.batchSystem, BatchSystemSupport): check_resource_request = self.batchSystem.check_resource_request # Assuming we have <2000 cores, this should be too many cores - self.assertRaises(InsufficientSystemResources, check_resource_request, - Requirer(dict(memory=1000, cores=2000, disk='1G', accelerators=[]))) - self.assertRaises(InsufficientSystemResources, check_resource_request, - Requirer(dict(memory=5, cores=2000, disk='1G', accelerators=[]))) + self.assertRaises( + InsufficientSystemResources, + check_resource_request, + Requirer(dict(memory=1000, cores=2000, disk="1G", accelerators=[])), + ) + self.assertRaises( + InsufficientSystemResources, + check_resource_request, + Requirer(dict(memory=5, cores=2000, disk="1G", accelerators=[])), + ) # This should be too much memory - self.assertRaises(InsufficientSystemResources, check_resource_request, - Requirer(dict(memory='5000G', cores=1, disk='1G', accelerators=[]))) + self.assertRaises( + InsufficientSystemResources, + check_resource_request, + Requirer(dict(memory="5000G", cores=1, disk="1G", accelerators=[])), + ) # This should be too much disk - self.assertRaises(InsufficientSystemResources, check_resource_request, - Requirer(dict(memory=5, cores=1, disk='2G', accelerators=[]))) + self.assertRaises( + InsufficientSystemResources, + check_resource_request, + Requirer(dict(memory=5, cores=1, disk="2G", accelerators=[])), + ) # This should be an accelerator we don't have. # All the batch systems need code to know they don't have these accelerators. - self.assertRaises(InsufficientSystemResources, check_resource_request, - Requirer(dict(memory=5, cores=1, disk=100, accelerators=[{'kind': 'turbo-encabulator', 'count': 1}]))) + self.assertRaises( + InsufficientSystemResources, + check_resource_request, + Requirer( + dict( + memory=5, + cores=1, + disk=100, + accelerators=[{"kind": "turbo-encabulator", "count": 1}], + ) + ), + ) # These should be missing attributes - self.assertRaises(AttributeError, check_resource_request, - Requirer(dict(memory=5, cores=1, disk=1000))) - self.assertRaises(AttributeError, check_resource_request, - Requirer(dict(cores=1, disk=1000, accelerators=[]))) - self.assertRaises(AttributeError, check_resource_request, - Requirer(dict(memory=10, disk=1000, accelerators=[]))) + self.assertRaises( + AttributeError, + check_resource_request, + Requirer(dict(memory=5, cores=1, disk=1000)), + ) + self.assertRaises( + AttributeError, + check_resource_request, + Requirer(dict(cores=1, disk=1000, accelerators=[])), + ) + self.assertRaises( + AttributeError, + check_resource_request, + Requirer(dict(memory=10, disk=1000, accelerators=[])), + ) # This should actually work - check_resource_request(Requirer(dict(memory=10, cores=1, disk=100, accelerators=[]))) + check_resource_request( + Requirer(dict(memory=10, cores=1, disk=100, accelerators=[])) + ) def testScalableBatchSystem(self): # If instance of scalable batch system @@ -345,7 +430,7 @@ def _waitForJobsToStart(self, numJobs, tries=20): # prevent an endless loop, give it a few tries for it in range(tries): running = self.batchSystem.getRunningBatchJobIDs() - logger.info(f'Running jobs now: {running}') + logger.info(f"Running jobs now: {running}") runningIDs = list(running.keys()) if len(runningIDs) == numJobs: break @@ -395,18 +480,26 @@ def testJobConcurrency(self): Tests that the batch system is allocating core resources properly for concurrent tasks. """ for coresPerJob in self.allocatedCores: - tempDir = self._createTempDir('testFiles') + tempDir = self._createTempDir("testFiles") options = self.getOptions(tempDir) - counterPath = os.path.join(tempDir, 'counter') + counterPath = os.path.join(tempDir, "counter") resetCounters(counterPath) value, maxValue = getCounters(counterPath) assert (value, maxValue) == (0, 0) root = Job() for _ in range(self.cpuCount): - root.addFollowOn(Job.wrapFn(measureConcurrency, counterPath, self.sleepTime, - cores=coresPerJob, memory='1M', disk='1Mi')) + root.addFollowOn( + Job.wrapFn( + measureConcurrency, + counterPath, + self.sleepTime, + cores=coresPerJob, + memory="1M", + disk="1Mi", + ) + ) with Toil(options) as toil: toil.start(root) _, maxValue = getCounters(counterPath) @@ -420,18 +513,24 @@ def test_omp_threads(self): # mapping of the number of cores to the OMP_NUM_THREADS value 0.1: "1", 1: "1", - 2: "2" + 2: "2", } temp_dir = self._createTempDir() options = self.getOptions(temp_dir) for cores, expected_omp_threads in test_cases.items(): - if os.environ.get('OMP_NUM_THREADS'): - expected_omp_threads = os.environ.get('OMP_NUM_THREADS') - logger.info(f"OMP_NUM_THREADS is set. Using OMP_NUM_THREADS={expected_omp_threads} instead.") + if os.environ.get("OMP_NUM_THREADS"): + expected_omp_threads = os.environ.get("OMP_NUM_THREADS") + logger.info( + f"OMP_NUM_THREADS is set. Using OMP_NUM_THREADS={expected_omp_threads} instead." + ) with Toil(options) as toil: - output = toil.start(Job.wrapFn(get_omp_threads, memory='1Mi', cores=cores, disk='1Mi')) + output = toil.start( + Job.wrapFn( + get_omp_threads, memory="1Mi", cores=cores, disk="1Mi" + ) + ) self.assertEqual(output, expected_omp_threads) class AbstractGridEngineBatchSystemTest(AbstractBatchSystemTest): @@ -444,9 +543,10 @@ def _createConfig(self): config = super()._createConfig() config.statePollingWait = 0.5 # Reduce polling wait so tests run faster # can't use _getTestJobStorePath since that method removes the directory - config.jobStore = 'file:' + self._createTempDir('jobStore') + config.jobStore = "file:" + self._createTempDir("jobStore") return config + @needs_kubernetes @needs_aws_s3 @needs_fetchable_appliance @@ -461,8 +561,11 @@ def supportsWallTime(self): def createBatchSystem(self): # We know we have Kubernetes so we can import the batch system from toil.batchSystems.kubernetes import KubernetesBatchSystem - return KubernetesBatchSystem(config=self.config, - maxCores=numCores, maxMemory=1e9, maxDisk=2001) + + return KubernetesBatchSystem( + config=self.config, maxCores=numCores, maxMemory=1e9, maxDisk=2001 + ) + @needs_kubernetes_installed class KubernetesBatchSystemBenchTest(ToilTest): @@ -486,7 +589,9 @@ def test_preemptability_constraints(self): constraints = KubernetesBatchSystem.Placement() constraints.set_preemptible(False) constraints.apply(normal_spec) - self.assertEqual(textwrap.dedent(""" + self.assertEqual( + textwrap.dedent( + """ {'node_affinity': {'preferred_during_scheduling_ignored_during_execution': None, 'required_during_scheduling_ignored_during_execution': {'node_selector_terms': [{'match_expressions': [{'key': 'eks.amazonaws.com/capacityType', 'operator': 'NotIn', @@ -497,14 +602,19 @@ def test_preemptability_constraints(self): 'match_fields': None}]}}, 'pod_affinity': None, 'pod_anti_affinity': None} - """).strip(), str(normal_spec.affinity)) + """ + ).strip(), + str(normal_spec.affinity), + ) self.assertEqual(str(normal_spec.tolerations), "None") spot_spec = V1PodSpec(containers=[]) constraints = KubernetesBatchSystem.Placement() constraints.set_preemptible(True) constraints.apply(spot_spec) - self.assertEqual(textwrap.dedent(""" + self.assertEqual( + textwrap.dedent( + """ {'node_affinity': {'preferred_during_scheduling_ignored_during_execution': [{'preference': {'match_expressions': [{'key': 'eks.amazonaws.com/capacityType', 'operator': 'In', 'values': ['SPOT']}], @@ -518,14 +628,22 @@ def test_preemptability_constraints(self): 'required_during_scheduling_ignored_during_execution': None}, 'pod_affinity': None, 'pod_anti_affinity': None} - """).strip(), str(spot_spec.affinity), ) - self.assertEqual(textwrap.dedent(""" + """ + ).strip(), + str(spot_spec.affinity), + ) + self.assertEqual( + textwrap.dedent( + """ [{'effect': None, 'key': 'cloud.google.com/gke-preemptible', 'operator': None, 'toleration_seconds': None, 'value': 'true'}] - """).strip(), str(spot_spec.tolerations)) + """ + ).strip(), + str(spot_spec.tolerations), + ) def test_label_constraints(self): """ @@ -541,11 +659,13 @@ def test_label_constraints(self): spec = V1PodSpec(containers=[]) constraints = KubernetesBatchSystem.Placement() - constraints.required_labels = [('GottaBeSetTo', ['This'])] - constraints.desired_labels = [('OutghtToBeSetTo', ['That'])] - constraints.prohibited_labels = [('CannotBe', ['ABadThing'])] + constraints.required_labels = [("GottaBeSetTo", ["This"])] + constraints.desired_labels = [("OutghtToBeSetTo", ["That"])] + constraints.prohibited_labels = [("CannotBe", ["ABadThing"])] constraints.apply(spec) - self.assertEqual(textwrap.dedent(""" + self.assertEqual( + textwrap.dedent( + """ {'node_affinity': {'preferred_during_scheduling_ignored_during_execution': [{'preference': {'match_expressions': [{'key': 'OutghtToBeSetTo', 'operator': 'In', 'values': ['That']}], @@ -560,7 +680,10 @@ def test_label_constraints(self): 'match_fields': None}]}}, 'pod_affinity': None, 'pod_anti_affinity': None} - """).strip(), str(spec.affinity),) + """ + ).strip(), + str(spec.affinity), + ) self.assertEqual(str(spec.tolerations), "None") @@ -576,13 +699,16 @@ def supportsWallTime(self): def createBatchSystem(self): from toil.batchSystems.awsBatch import AWSBatchBatchSystem - return AWSBatchBatchSystem(config=self.config, - maxCores=numCores, maxMemory=1e9, maxDisk=2001) + + return AWSBatchBatchSystem( + config=self.config, maxCores=numCores, maxMemory=1e9, maxDisk=2001 + ) def get_max_startup_seconds(self) -> int: # AWS Batch may need to scale out the compute environment. return 300 + @slow @needs_mesos class MesosBatchSystemTest(hidden.AbstractBatchSystemTest, MesosTestSupport): @@ -597,7 +723,7 @@ def createConfig(cls): private IP address """ config = super().createConfig() - config.mesos_endpoint = 'localhost:5050' + config.mesos_endpoint = "localhost:5050" return config def supportsWallTime(self): @@ -606,19 +732,25 @@ def supportsWallTime(self): def createBatchSystem(self): # We know we have Mesos so we can import the batch system from toil.batchSystems.mesos.batchSystem import MesosBatchSystem + self._startMesos(numCores) - return MesosBatchSystem(config=self.config, - maxCores=numCores, maxMemory=1e9, maxDisk=1001) + return MesosBatchSystem( + config=self.config, maxCores=numCores, maxMemory=1e9, maxDisk=1001 + ) def tearDown(self): self._stopMesos() super().tearDown() def testIgnoreNode(self): - self.batchSystem.ignoreNode('localhost') - jobDesc = self._mockJobDescription(jobName='test2', unitName=None, - jobStoreID='1', requirements=defaultRequirements) - job = self.batchSystem.issueBatchJob('sleep 1000', jobDesc) + self.batchSystem.ignoreNode("localhost") + jobDesc = self._mockJobDescription( + jobName="test2", + unitName=None, + jobStoreID="1", + requirements=defaultRequirements, + ) + job = self.batchSystem.issueBatchJob("sleep 1000", jobDesc) issuedID = self._waitForJobsToIssue(1) self.assertEqual(set(issuedID), {job}) @@ -635,7 +767,7 @@ def write_temp_file(s: str, temp_dir: str) -> str: """ fd, path = tempfile.mkstemp(dir=temp_dir) try: - encoded = s.encode('utf-8') + encoded = s.encode("utf-8") assert os.write(fd, encoded) == len(encoded) except: os.unlink(path) @@ -655,8 +787,9 @@ def supportsWallTime(self) -> bool: return True def createBatchSystem(self) -> AbstractBatchSystem: - return SingleMachineBatchSystem(config=self.config, - maxCores=numCores, maxMemory=1e9, maxDisk=2001) + return SingleMachineBatchSystem( + config=self.config, maxCores=numCores, maxMemory=1e9, maxDisk=2001 + ) def testProcessEscape(self, hide: bool = False) -> None: """ @@ -677,14 +810,18 @@ def script() -> None: from typing import Any def handle_signal(sig: Any, frame: Any) -> None: - sys.stderr.write(f'{os.getpid()} ignoring signal {sig}\n') + sys.stderr.write(f"{os.getpid()} ignoring signal {sig}\n") - if hasattr(signal, 'valid_signals'): + if hasattr(signal, "valid_signals"): # We can just ask about the signals all_signals = signal.valid_signals() else: # Fish them out by name - all_signals = [getattr(signal, n) for n in dir(signal) if n.startswith('SIG') and not n.startswith('SIG_')] + all_signals = [ + getattr(signal, n) + for n in dir(signal) + if n.startswith("SIG") and not n.startswith("SIG_") + ] for sig in all_signals: # Set up to ignore all signals we can and generally be obstinate @@ -706,7 +843,7 @@ def handle_signal(sig: Any, frame: Any) -> None: fd = os.open(sys.argv[1], os.O_RDONLY) fcntl.lockf(fd, fcntl.LOCK_SH) - sys.stderr.write(f'{os.getpid()} waiting...\n') + sys.stderr.write(f"{os.getpid()} waiting...\n") while True: # Wait around forever @@ -718,27 +855,26 @@ def handle_signal(sig: Any, frame: Any) -> None: script_path = write_temp_file(self._getScriptSource(script), temp_dir) # We will have all the job processes try and lock this file shared while they are alive. - lockable_path = write_temp_file('', temp_dir) + lockable_path = write_temp_file("", temp_dir) try: - command = f'{sys.executable} {script_path} {lockable_path}' + command = f"{sys.executable} {script_path} {lockable_path}" if hide: # Tell the children to stop the first child and hide out in the # process group it made. - command += ' hide' + command += " hide" # Start the job self.batchSystem.issueBatchJob( - command, + command, self._mockJobDescription( - jobName='fork', - jobStoreID='1', - requirements=defaultRequirements) + jobName="fork", jobStoreID="1", requirements=defaultRequirements + ), ) # Wait time.sleep(10) - lockfile = open(lockable_path, 'w') + lockfile = open(lockable_path, "w") if not hide: # In hiding mode the job will finish, and the batch system will @@ -793,13 +929,14 @@ def setUp(self) -> None: # Write initial value of counter file containing a tuple of two integers (i, n) where i # is the number of currently executing tasks and n the maximum observed value of i - self.counterPath = write_temp_file('0,0', temp_dir) + self.counterPath = write_temp_file("0,0", temp_dir) def script() -> None: import fcntl import os import sys import time + def count(delta: int) -> None: """ Adjust the first integer value in a file by the given amount. If the result @@ -809,13 +946,14 @@ def count(delta: int) -> None: try: fcntl.flock(fd, fcntl.LOCK_EX) try: - s = os.read(fd, 10).decode('utf-8') - value, maxValue = list(map(int, s.split(','))) + s = os.read(fd, 10).decode("utf-8") + value, maxValue = list(map(int, s.split(","))) value += delta - if value > maxValue: maxValue = value + if value > maxValue: + maxValue = value os.lseek(fd, 0, 0) os.ftruncate(fd, 0) - os.write(fd, f'{value},{maxValue}'.encode()) + os.write(fd, f"{value},{maxValue}".encode()) finally: fcntl.flock(fd, fcntl.LOCK_UN) finally: @@ -839,7 +977,7 @@ def tearDown(self) -> None: os.unlink(self.counterPath) def scriptCommand(self) -> str: - return ' '.join([sys.executable, self.scriptPath, self.counterPath]) + return " ".join([sys.executable, self.scriptPath, self.counterPath]) @retry_flaky_test(prepare=[tearDown, setUp]) def test(self): @@ -851,7 +989,13 @@ def test(self): minCores = F(1, 10) self.assertEqual(float(minCores), SingleMachineBatchSystem.minCores) for maxCores in {F(minCores), minCores * 10, F(1), F(numCores, 2), F(numCores)}: - for coresPerJob in {F(minCores), F(minCores * 10), F(1), F(maxCores, 2), F(maxCores)}: + for coresPerJob in { + F(minCores), + F(minCores * 10), + F(1), + F(maxCores, 2), + F(maxCores), + }: for load in (F(1, 10), F(1), F(10)): jobs = int(maxCores / coresPerJob * load) if jobs >= 1 and minCores <= coresPerJob < maxCores: @@ -861,7 +1005,8 @@ def test(self): maxCores=float(maxCores), # Ensure that memory or disk requirements don't get in the way. maxMemory=jobs * 10, - maxDisk=jobs * 10) + maxDisk=jobs * 10, + ) try: jobIds = set() for i in range(0, int(jobs)): @@ -871,48 +1016,62 @@ def test(self): memory=1, disk=1, accelerators=[], - preemptible=preemptible + preemptible=preemptible, ), jobName=str(i), - unitName='' + unitName="", ) jobIds.add(bs.issueBatchJob(self.scriptCommand(), desc)) self.assertEqual(len(jobIds), jobs) while jobIds: job = bs.getUpdatedBatchJob(maxWait=10) self.assertIsNotNone(job) - jobId, status, wallTime = job.jobID, job.exitStatus, job.wallTime + jobId, status, wallTime = ( + job.jobID, + job.exitStatus, + job.wallTime, + ) self.assertEqual(status, 0) # would raise KeyError on absence jobIds.remove(jobId) finally: bs.shutdown() - concurrentTasks, maxConcurrentTasks = getCounters(self.counterPath) + concurrentTasks, maxConcurrentTasks = getCounters( + self.counterPath + ) self.assertEqual(concurrentTasks, 0) - logger.info(f'maxCores: {maxCores}, ' - f'coresPerJob: {coresPerJob}, ' - f'load: {load}') + logger.info( + f"maxCores: {maxCores}, " + f"coresPerJob: {coresPerJob}, " + f"load: {load}" + ) # This is the key assertion: we shouldn't run too many jobs. # Because of nondeterminism we can't guarantee hitting the limit. expectedMaxConcurrentTasks = min(maxCores // coresPerJob, jobs) - self.assertLessEqual(maxConcurrentTasks, expectedMaxConcurrentTasks) + self.assertLessEqual( + maxConcurrentTasks, expectedMaxConcurrentTasks + ) resetCounters(self.counterPath) - @skipIf(SingleMachineBatchSystem.numCores < 3, 'Need at least three cores to run this test') + @skipIf( + SingleMachineBatchSystem.numCores < 3, + "Need at least three cores to run this test", + ) def testServices(self): options = Job.Runner.getDefaultOptions(self._getTestJobStorePath()) options.logLevel = "DEBUG" options.maxCores = 3 self.assertTrue(options.maxCores <= SingleMachineBatchSystem.numCores) Job.Runner.startToil(Job.wrapJobFn(parentJob, self.scriptCommand()), options) - with open(self.counterPath, 'r+') as f: + with open(self.counterPath, "r+") as f: s = f.read() - logger.info('Counter is %s', s) + logger.info("Counter is %s", s) self.assertEqual(getCounters(self.counterPath), (0, 3)) # Toil can use only top-level functions so we have to add them here: + def parentJob(job, cmd): job.addChildJobFn(childJob, cmd) @@ -939,13 +1098,13 @@ def __init__(self, cmd): self.cmd = cmd def start(self, fileStore): - subprocess.check_call(self.cmd + ' 1', shell=True) + subprocess.check_call(self.cmd + " 1", shell=True) def check(self): return True def stop(self, fileStore): - subprocess.check_call(self.cmd + ' -1', shell=True) + subprocess.check_call(self.cmd + " -1", shell=True) @slow @@ -957,14 +1116,17 @@ class GridEngineBatchSystemTest(hidden.AbstractGridEngineBatchSystemTest): def createBatchSystem(self) -> AbstractBatchSystem: from toil.batchSystems.gridengine import GridEngineBatchSystem - return GridEngineBatchSystem(config=self.config, maxCores=numCores, maxMemory=1000e9, - maxDisk=1e9) + + return GridEngineBatchSystem( + config=self.config, maxCores=numCores, maxMemory=1000e9, maxDisk=1e9 + ) def tearDown(self): super().tearDown() # Cleanup GridEngine output log file from qsub from glob import glob - for f in glob('toil_job*.o*'): + + for f in glob("toil_job*.o*"): os.unlink(f) @@ -977,14 +1139,17 @@ class SlurmBatchSystemTest(hidden.AbstractGridEngineBatchSystemTest): def createBatchSystem(self) -> AbstractBatchSystem: from toil.batchSystems.slurm import SlurmBatchSystem - return SlurmBatchSystem(config=self.config, maxCores=numCores, maxMemory=1000e9, - maxDisk=1e9) + + return SlurmBatchSystem( + config=self.config, maxCores=numCores, maxMemory=1000e9, maxDisk=1e9 + ) def tearDown(self): super().tearDown() # Cleanup 'slurm-%j.out' produced by sbatch from glob import glob - for f in glob('slurm-*.out'): + + for f in glob("slurm-*.out"): os.unlink(f) @@ -994,10 +1159,13 @@ class LSFBatchSystemTest(hidden.AbstractGridEngineBatchSystemTest): """ Tests against the LSF batch system """ + def createBatchSystem(self) -> AbstractBatchSystem: from toil.batchSystems.lsf import LSFBatchSystem - return LSFBatchSystem(config=self.config, maxCores=numCores, - maxMemory=1000e9, maxDisk=1e9) + + return LSFBatchSystem( + config=self.config, maxCores=numCores, maxMemory=1000e9, maxDisk=1e9 + ) @slow @@ -1010,19 +1178,22 @@ class TorqueBatchSystemTest(hidden.AbstractGridEngineBatchSystemTest): def _createDummyConfig(self): config = super()._createDummyConfig() # can't use _getTestJobStorePath since that method removes the directory - config.jobStore = self._createTempDir('jobStore') + config.jobStore = self._createTempDir("jobStore") return config def createBatchSystem(self) -> AbstractBatchSystem: from toil.batchSystems.torque import TorqueBatchSystem - return TorqueBatchSystem(config=self.config, maxCores=numCores, maxMemory=1000e9, - maxDisk=1e9) + + return TorqueBatchSystem( + config=self.config, maxCores=numCores, maxMemory=1000e9, maxDisk=1e9 + ) def tearDown(self): super().tearDown() # Cleanup 'toil_job-%j.out' produced by sbatch from glob import glob - for f in glob('toil_job_*.[oe]*'): + + for f in glob("toil_job_*.[oe]*"): os.unlink(f) @@ -1035,8 +1206,10 @@ class HTCondorBatchSystemTest(hidden.AbstractGridEngineBatchSystemTest): def createBatchSystem(self) -> AbstractBatchSystem: from toil.batchSystems.htcondor import HTCondorBatchSystem - return HTCondorBatchSystem(config=self.config, maxCores=numCores, maxMemory=1000e9, - maxDisk=1e9) + + return HTCondorBatchSystem( + config=self.config, maxCores=numCores, maxMemory=1000e9, maxDisk=1e9 + ) def tearDown(self): super().tearDown() @@ -1051,46 +1224,71 @@ def getBatchSystemName(self): return "single_machine" @slow - @retry_flaky_test(prepare=[hidden.AbstractBatchSystemJobTest.tearDown, hidden.AbstractBatchSystemJobTest.setUp]) + @retry_flaky_test( + prepare=[ + hidden.AbstractBatchSystemJobTest.tearDown, + hidden.AbstractBatchSystemJobTest.setUp, + ] + ) def testConcurrencyWithDisk(self): """ Tests that the batch system is allocating disk resources properly """ - tempDir = self._createTempDir('testFiles') + tempDir = self._createTempDir("testFiles") options = Job.Runner.getDefaultOptions(self._getTestJobStorePath()) options.workDir = tempDir from toil import physicalDisk + availableDisk = physicalDisk(options.workDir) - logger.info('Testing disk concurrency limits with %s disk space', availableDisk) + logger.info("Testing disk concurrency limits with %s disk space", availableDisk) # More disk might become available by the time Toil starts, so we limit it here options.maxDisk = availableDisk options.batchSystem = self.batchSystemName - counterPath = os.path.join(tempDir, 'counter') + counterPath = os.path.join(tempDir, "counter") resetCounters(counterPath) value, maxValue = getCounters(counterPath) assert (value, maxValue) == (0, 0) half_disk = availableDisk // 2 more_than_half_disk = half_disk + 500 - logger.info('Dividing into parts of %s and %s', half_disk, more_than_half_disk) + logger.info("Dividing into parts of %s and %s", half_disk, more_than_half_disk) root = Job() # Physically, we're asking for 50% of disk and 50% of disk + 500bytes in the two jobs. The # batchsystem should not allow the 2 child jobs to run concurrently. - root.addChild(Job.wrapFn(measureConcurrency, counterPath, self.sleepTime, cores=1, - memory='1M', disk=half_disk)) - root.addChild(Job.wrapFn(measureConcurrency, counterPath, self.sleepTime, cores=1, - memory='1M', disk=more_than_half_disk)) + root.addChild( + Job.wrapFn( + measureConcurrency, + counterPath, + self.sleepTime, + cores=1, + memory="1M", + disk=half_disk, + ) + ) + root.addChild( + Job.wrapFn( + measureConcurrency, + counterPath, + self.sleepTime, + cores=1, + memory="1M", + disk=more_than_half_disk, + ) + ) Job.Runner.startToil(root, options) _, maxValue = getCounters(counterPath) - logger.info('After run: %s disk space', physicalDisk(options.workDir)) + logger.info("After run: %s disk space", physicalDisk(options.workDir)) self.assertEqual(maxValue, 1) - @skipIf(SingleMachineBatchSystem.numCores < 4, 'Need at least four cores to run this test') + @skipIf( + SingleMachineBatchSystem.numCores < 4, + "Need at least four cores to run this test", + ) @slow def testNestedResourcesDoNotBlock(self): """ @@ -1098,39 +1296,80 @@ def testNestedResourcesDoNotBlock(self): Test that unavailability of cpus for one job that is scheduled does not block another job that can run. """ - tempDir = self._createTempDir('testFiles') + tempDir = self._createTempDir("testFiles") options = Job.Runner.getDefaultOptions(self._getTestJobStorePath()) options.workDir = tempDir options.maxCores = 4 from toil import physicalMemory + availableMemory = physicalMemory() options.batchSystem = self.batchSystemName - outFile = os.path.join(tempDir, 'counter') - open(outFile, 'w').close() + outFile = os.path.join(tempDir, "counter") + open(outFile, "w").close() root = Job() - blocker = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=30, writeVal='b', - cores=2, memory='1M', disk='1M') - firstJob = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=5, writeVal='fJ', - cores=1, memory='1M', disk='1M') - secondJob = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=10, - writeVal='sJ', cores=1, memory='1M', disk='1M') + blocker = Job.wrapFn( + _resourceBlockTestAuxFn, + outFile=outFile, + sleepTime=30, + writeVal="b", + cores=2, + memory="1M", + disk="1M", + ) + firstJob = Job.wrapFn( + _resourceBlockTestAuxFn, + outFile=outFile, + sleepTime=5, + writeVal="fJ", + cores=1, + memory="1M", + disk="1M", + ) + secondJob = Job.wrapFn( + _resourceBlockTestAuxFn, + outFile=outFile, + sleepTime=10, + writeVal="sJ", + cores=1, + memory="1M", + disk="1M", + ) # Should block off 50% of memory while waiting for it's 3 cores - firstJobChild = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=0, - writeVal='fJC', cores=3, memory=int(availableMemory // 2), disk='1M') + firstJobChild = Job.wrapFn( + _resourceBlockTestAuxFn, + outFile=outFile, + sleepTime=0, + writeVal="fJC", + cores=3, + memory=int(availableMemory // 2), + disk="1M", + ) # These two shouldn't be able to run before B because there should be only # (50% of memory - 1M) available (firstJobChild should be blocking 50%) - secondJobChild = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=5, - writeVal='sJC', cores=2, memory=int(availableMemory // 1.5), - disk='1M') - secondJobGrandChild = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=5, - writeVal='sJGC', cores=2, memory=int(availableMemory // 1.5), - disk='1M') + secondJobChild = Job.wrapFn( + _resourceBlockTestAuxFn, + outFile=outFile, + sleepTime=5, + writeVal="sJC", + cores=2, + memory=int(availableMemory // 1.5), + disk="1M", + ) + secondJobGrandChild = Job.wrapFn( + _resourceBlockTestAuxFn, + outFile=outFile, + sleepTime=5, + writeVal="sJGC", + cores=2, + memory=int(availableMemory // 1.5), + disk="1M", + ) root.addChild(blocker) root.addChild(firstJob) @@ -1160,9 +1399,11 @@ def testNestedResourcesDoNotBlock(self): outString = oFH.read() # The ordering of b, fJ and sJ is non-deterministic since they are scheduled at the same # time. We look for all possible permutations. - possibleStarts = tuple(''.join(x) for x in itertools.permutations(['b', 'fJ', 'sJ'])) + possibleStarts = tuple( + "".join(x) for x in itertools.permutations(["b", "fJ", "sJ"]) + ) assert outString.startswith(possibleStarts) - assert outString.endswith('sJCsJGCfJC') + assert outString.endswith("sJCsJGCfJC") def _resourceBlockTestAuxFn(outFile, sleepTime, writeVal): @@ -1172,7 +1413,7 @@ def _resourceBlockTestAuxFn(outFile, sleepTime, writeVal): :param int sleepTime: Time to sleep for :param str writeVal: Character to write """ - with open(outFile, 'a') as oFH: + with open(outFile, "a") as oFH: fcntl.flock(oFH, fcntl.LOCK_EX) oFH.write(writeVal) time.sleep(sleepTime) @@ -1184,9 +1425,10 @@ class MesosBatchSystemJobTest(hidden.AbstractBatchSystemJobTest, MesosTestSuppor """ Tests Toil workflow against the Mesos batch system """ + def getOptions(self, tempDir): options = super().getOptions(tempDir) - options.mesos_endpoint = 'localhost:5050' + options.mesos_endpoint = "localhost:5050" return options def getBatchSystemName(self): @@ -1227,12 +1469,13 @@ def count(delta, file_path): fcntl.flock(fd, fcntl.LOCK_EX) try: s = os.read(fd, 10) - value, maxValue = (int(i) for i in s.decode('utf-8').split(',')) + value, maxValue = (int(i) for i in s.decode("utf-8").split(",")) value += delta - if value > maxValue: maxValue = value + if value > maxValue: + maxValue = value os.lseek(fd, 0, 0) os.ftruncate(fd, 0) - os.write(fd, f'{value},{maxValue}'.encode()) + os.write(fd, f"{value},{maxValue}".encode()) finally: fcntl.flock(fd, fcntl.LOCK_UN) finally: @@ -1241,8 +1484,8 @@ def count(delta, file_path): def getCounters(path): - with open(path, 'r+') as f: - concurrentTasks, maxConcurrentTasks = (int(i) for i in f.read().split(',')) + with open(path, "r+") as f: + concurrentTasks, maxConcurrentTasks = (int(i) for i in f.read().split(",")) return concurrentTasks, maxConcurrentTasks @@ -1253,4 +1496,4 @@ def resetCounters(path): def get_omp_threads() -> str: - return os.environ['OMP_NUM_THREADS'] + return os.environ["OMP_NUM_THREADS"] diff --git a/src/toil/test/batchSystems/batch_system_plugin_test.py b/src/toil/test/batchSystems/batch_system_plugin_test.py index a5533bb7cb..a57ed97eb4 100644 --- a/src/toil/test/batchSystems/batch_system_plugin_test.py +++ b/src/toil/test/batchSystems/batch_system_plugin_test.py @@ -12,10 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. import logging -from typing import Optional, Dict, List, Type +from typing import Optional + from configargparse import ArgParser, ArgumentParser -from toil.batchSystems.abstractBatchSystem import (AbstractBatchSystem, UpdatedBatchJobInfo) +from toil.batchSystems.abstractBatchSystem import ( + AbstractBatchSystem, + UpdatedBatchJobInfo, +) from toil.batchSystems.cleanup_support import BatchSystemCleanupSupport from toil.batchSystems.options import OptionSetter from toil.batchSystems.registry import add_batch_system_factory @@ -31,16 +35,21 @@ class FakeBatchSystem(BatchSystemCleanupSupport): def supportsAutoDeployment(cls) -> bool: pass - def issueBatchJob(self, command: str, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int: + def issueBatchJob( + self, + command: str, + job_desc: JobDescription, + job_environment: Optional[dict[str, str]] = None, + ) -> int: pass - def killBatchJobs(self, jobIDs: List[int]) -> None: + def killBatchJobs(self, jobIDs: list[int]) -> None: pass - def getIssuedBatchJobIDs(self) -> List[int]: + def getIssuedBatchJobIDs(self) -> list[int]: pass - def getRunningBatchJobIDs(self) -> Dict[int, float]: + def getRunningBatchJobIDs(self) -> dict[int, float]: pass def getUpdatedBatchJob(self, maxWait: int) -> Optional[UpdatedBatchJobInfo]: @@ -57,13 +66,15 @@ def add_options(cls, parser: ArgumentParser) -> None: def setOptions(cls, setOption: OptionSetter) -> None: setOption("fake_argument") + class BatchSystemPluginTest(ToilTest): def test_batchsystem_plugin_installable(self): """ Test that installing a batch system plugin works. :return: """ - def fake_batch_system_factory() -> Type[AbstractBatchSystem]: + + def fake_batch_system_factory() -> type[AbstractBatchSystem]: return FakeBatchSystem add_batch_system_factory("fake", fake_batch_system_factory) diff --git a/src/toil/test/batchSystems/test_gridengine.py b/src/toil/test/batchSystems/test_gridengine.py index 816865fe3d..47560482ac 100644 --- a/src/toil/test/batchSystems/test_gridengine.py +++ b/src/toil/test/batchSystems/test_gridengine.py @@ -31,8 +31,9 @@ def __fake_config(self): """ config = Config() from uuid import uuid4 + config.workflowID = str(uuid4()) - config.cleanWorkDir = 'always' + config.cleanWorkDir = "always" return config def with_retries(self, operation, *args, **kwargs): @@ -45,13 +46,18 @@ def with_retries(self, operation, *args, **kwargs): def call_qstat_or_qacct(args, **_): # example outputs taken from https://2021.help.altair.com/2021.1/AltairGridEngine/8.7.0/UsersGuideGE.pdf qacct_info = {} - job_id_info = {1: {"failed": True, "exit_code": 0, "completed": True}, 2: {"failed": True, "exit_code": 2, "completed": True}, - 3: {"failed": False, "exit_code": 0, "completed": True}, 4: {"failed": False, "exit_code": 10, "completed": True}, - 5: {"failed": False, "exit_code": 0, "completed": False}} + job_id_info = { + 1: {"failed": True, "exit_code": 0, "completed": True}, + 2: {"failed": True, "exit_code": 2, "completed": True}, + 3: {"failed": False, "exit_code": 0, "completed": True}, + 4: {"failed": False, "exit_code": 10, "completed": True}, + 5: {"failed": False, "exit_code": 0, "completed": False}, + } for job_id, status_info in job_id_info.items(): failed = 1 if status_info["failed"] else 0 exit_status = status_info["exit_code"] - qacct_info[job_id] = textwrap.dedent(f"""\ + qacct_info[job_id] = textwrap.dedent( + f"""\ ============================================================== qname all.q hostname kailua @@ -78,7 +84,8 @@ def call_qstat_or_qacct(args, **_): ru_ixrss 0 ru_ismrss 0 ru_idrss 0 - """) + """ + ) if args[0] == "qstat": # This is guess for what qstat will return given a job. I'm unable to find an example for qstat. # This also assumes the second argument args[1] is -j, as that is what we try to use @@ -113,27 +120,36 @@ class GridEngineTest(ToilTest): def setUp(self): self.monkeypatch = pytest.MonkeyPatch() - self.worker = toil.batchSystems.gridengine.GridEngineBatchSystem.GridEngineThread( - newJobsQueue=Queue(), - updatedJobsQueue=Queue(), - killQueue=Queue(), - killedJobsQueue=Queue(), - boss=FakeBatchSystem()) + self.worker = ( + toil.batchSystems.gridengine.GridEngineBatchSystem.GridEngineThread( + newJobsQueue=Queue(), + updatedJobsQueue=Queue(), + killQueue=Queue(), + killedJobsQueue=Queue(), + boss=FakeBatchSystem(), + ) + ) ### ### Tests for coalesce_job_exit_codes for gridengine. ### def test_coalesce_job_exit_codes_one_exists(self): - self.monkeypatch.setattr(toil.batchSystems.gridengine, "call_command", call_qstat_or_qacct) - job_ids = ['1'] # FAILED + self.monkeypatch.setattr( + toil.batchSystems.gridengine, "call_command", call_qstat_or_qacct + ) + job_ids = ["1"] # FAILED expected_result = [1] result = self.worker.coalesce_job_exit_codes(job_ids) assert result == expected_result, f"{result} != {expected_result}" def test_coalesce_job_exit_codes_one_still_running(self): - self.monkeypatch.setattr(toil.batchSystems.gridengine, "call_command", call_qstat_or_qacct) - job_ids = ['5'] # Still running. We currently raise an exception when this happens + self.monkeypatch.setattr( + toil.batchSystems.gridengine, "call_command", call_qstat_or_qacct + ) + job_ids = [ + "5" + ] # Still running. We currently raise an exception when this happens try: self.worker.coalesce_job_exit_codes(job_ids) except ExceededRetryAttempts: @@ -142,18 +158,16 @@ def test_coalesce_job_exit_codes_one_still_running(self): raise RuntimeError("Test did not raise an exception!") def test_coalesce_job_exit_codes_many_all_exist(self): - self.monkeypatch.setattr(toil.batchSystems.gridengine, "call_command", call_qstat_or_qacct) - job_ids = ['1', # FAILED, - '2', # FAILED (with exit code that we ignore), - '3', # SUCCEEDED, - '4'] # EXIT CODE 10 + self.monkeypatch.setattr( + toil.batchSystems.gridengine, "call_command", call_qstat_or_qacct + ) + job_ids = [ + "1", # FAILED, + "2", # FAILED (with exit code that we ignore), + "3", # SUCCEEDED, + "4", + ] # EXIT CODE 10 # RUNNING and PENDING jobs should return None - expected_result = [ - 1, - 1, - 0, - 10 - ] + expected_result = [1, 1, 0, 10] result = self.worker.coalesce_job_exit_codes(job_ids) assert result == expected_result, f"{result} != {expected_result}" - diff --git a/src/toil/test/batchSystems/test_lsf_helper.py b/src/toil/test/batchSystems/test_lsf_helper.py index d383249788..36d55d513c 100644 --- a/src/toil/test/batchSystems/test_lsf_helper.py +++ b/src/toil/test/batchSystems/test_lsf_helper.py @@ -1,4 +1,5 @@ """lsfHelper.py shouldn't need a batch system and so the unit tests here should aim to run on any system.""" + from toil.batchSystems.lsfHelper import parse_mem_and_cmd_from_output from toil.test import ToilTest @@ -6,69 +7,77 @@ class LSFHelperTest(ToilTest): def test_parse_mem_and_cmd_from_output(self): # https://github.com/DataBiosphere/toil/pull/3475 - output = ('\nJob <2924748>, Job Name , User , Project , S' - '\n tatus , Queue , Job Priority <50>, Com' - '\n mand <_toil_worker CactusBarRecursion file:/hps/nobackup/p' - '\n roduction/ensembl/thiagogenez/pairwises/arabidopsis/run/jo' - '\n bstore/3 kind-CactusBarRecursion/instance-iu6wo56x --conte' - '\n xt gAShortenedh32xqlE51Yi4=>, Share group charged , Esub ' - '\nThu Mar 18 02:06:32: Submitted from host , CWD , S' - '\n pecified CWD , Output File , Error File , Requested Resources 4000)]' - '\n rusage[mem=4000:duration=480, numcpus=1:duration=480]>;' - '\nThu Mar 18 02:06:33: Started on , Execution Home , Execution CWD ;' - '\nThu Mar 18 17:07:47: Resource usage collected.' - '\n The CPU time used is 53936 seconds.' - '\n MEM: 344 Mbytes; SWAP: 1.3 Gbytes; NTHREAD: 5' - '\n PGID: 433168; PIDs: 433168 433177 433179 444026 ' - '\n' - '\n RUNLIMIT ' - '\n 10085.0 min' - '\n' - '\n CORELIMIT MEMLIMIT' - '\n 0 M 3.9 G ' - '\n' - '\n MEMORY USAGE:' - '\n MAX MEM: 2.5 Gbytes; AVG MEM: 343 Mbytes' - '\n' - '\n SCHEDULING PARAMETERS:' - '\n r15s r1m r15m ut pg io ls it tmp swp mem' - '\n loadSched - - - - 10.0 - - - 500M - 1000M ' - '\n loadStop - - - - - - - - - - - ' - '\n' - '\n availcpus ' - '\n loadSched 1.0 ' - '\n loadStop - ' - '\n' - '\n RESOURCE REQUIREMENT DETAILS:' - '\n Combined: select[((mem>4000)) && (type == local)] order[r15s:pg] rusage[mem=40' - '\n 00.00:duration=8h:decay=0,numcpus=1.00:duration=8h:decay=0' - '\n ] span[hosts=1]' - '\n Effective: select[(((mem>4000))) && (type == local)] order[r15s:pg] rusage[mem' - '\n =4000.00:duration=8h:decay=0,numcpus=1.00:duration=8h:deca' - '\n y=0] span[hosts=1] ' - '\n' - '\n') + output = ( + "\nJob <2924748>, Job Name , User , Project , S" + "\n tatus , Queue , Job Priority <50>, Com" + "\n mand <_toil_worker CactusBarRecursion file:/hps/nobackup/p" + "\n roduction/ensembl/thiagogenez/pairwises/arabidopsis/run/jo" + "\n bstore/3 kind-CactusBarRecursion/instance-iu6wo56x --conte" + "\n xt gAShortenedh32xqlE51Yi4=>, Share group charged , Esub " + "\nThu Mar 18 02:06:32: Submitted from host , CWD , S" + "\n pecified CWD , Output File , Error File , Requested Resources 4000)]" + "\n rusage[mem=4000:duration=480, numcpus=1:duration=480]>;" + "\nThu Mar 18 02:06:33: Started on , Execution Home , Execution CWD ;" + "\nThu Mar 18 17:07:47: Resource usage collected." + "\n The CPU time used is 53936 seconds." + "\n MEM: 344 Mbytes; SWAP: 1.3 Gbytes; NTHREAD: 5" + "\n PGID: 433168; PIDs: 433168 433177 433179 444026 " + "\n" + "\n RUNLIMIT " + "\n 10085.0 min" + "\n" + "\n CORELIMIT MEMLIMIT" + "\n 0 M 3.9 G " + "\n" + "\n MEMORY USAGE:" + "\n MAX MEM: 2.5 Gbytes; AVG MEM: 343 Mbytes" + "\n" + "\n SCHEDULING PARAMETERS:" + "\n r15s r1m r15m ut pg io ls it tmp swp mem" + "\n loadSched - - - - 10.0 - - - 500M - 1000M " + "\n loadStop - - - - - - - - - - - " + "\n" + "\n availcpus " + "\n loadSched 1.0 " + "\n loadStop - " + "\n" + "\n RESOURCE REQUIREMENT DETAILS:" + "\n Combined: select[((mem>4000)) && (type == local)] order[r15s:pg] rusage[mem=40" + "\n 00.00:duration=8h:decay=0,numcpus=1.00:duration=8h:decay=0" + "\n ] span[hosts=1]" + "\n Effective: select[(((mem>4000))) && (type == local)] order[r15s:pg] rusage[mem" + "\n =4000.00:duration=8h:decay=0,numcpus=1.00:duration=8h:deca" + "\n y=0] span[hosts=1] " + "\n" + "\n" + ) max_mem, command = parse_mem_and_cmd_from_output(output=output) assert len(max_mem.groups()) == 1 - expected_mem = '2.5 Gbytes' - assert max_mem.group(1) == expected_mem, f'Actual: {max_mem.group(1)}, Expected: "{expected_mem}"' + expected_mem = "2.5 Gbytes" + assert ( + max_mem.group(1) == expected_mem + ), f'Actual: {max_mem.group(1)}, Expected: "{expected_mem}"' assert len(command.groups()) == 1 - expected_command = ('_toil_worker CactusBarRecursion file:/hps/nobackup/production/ensembl/thiagogenez/' - 'pairwises/arabidopsis/run/jobstore/3 kind-CactusBarRecursion/instance-iu6wo56x ' - '--context gAShortenedh32xqlE51Yi4=') - assert command.group(1) == expected_command, f'Actual: {command.group(1)}, Expected: "{expected_command}"' + expected_command = ( + "_toil_worker CactusBarRecursion file:/hps/nobackup/production/ensembl/thiagogenez/" + "pairwises/arabidopsis/run/jobstore/3 kind-CactusBarRecursion/instance-iu6wo56x " + "--context gAShortenedh32xqlE51Yi4=" + ) + assert ( + command.group(1) == expected_command + ), f'Actual: {command.group(1)}, Expected: "{expected_command}"' print(command) - max_mem, command = parse_mem_and_cmd_from_output(output='') + max_mem, command = parse_mem_and_cmd_from_output(output="") assert max_mem == None assert command == None diff --git a/src/toil/test/batchSystems/test_slurm.py b/src/toil/test/batchSystems/test_slurm.py index 483e8fb030..b2818359e9 100644 --- a/src/toil/test/batchSystems/test_slurm.py +++ b/src/toil/test/batchSystems/test_slurm.py @@ -4,7 +4,10 @@ import pytest import toil.batchSystems.slurm -from toil.batchSystems.abstractBatchSystem import BatchJobExitReason, EXIT_STATUS_UNAVAILABLE_VALUE +from toil.batchSystems.abstractBatchSystem import ( + EXIT_STATUS_UNAVAILABLE_VALUE, + BatchJobExitReason, +) from toil.common import Config from toil.lib.misc import CalledProcessErrorStderr from toil.test import ToilTest @@ -12,6 +15,7 @@ # TODO: Come up with a better way to mock the commands then monkey-patching the # command-calling functions. + def call_sacct(args, **_) -> str: """ The arguments passed to `call_command` when executing `sacct` are: @@ -37,7 +41,7 @@ def call_sacct(args, **_) -> str: 789868: "789868|PENDING|0:0\n", 789869: "789869|COMPLETED|0:0\n789869.batch|COMPLETED|0:0\n789869.extern|COMPLETED|0:0\n", } - job_ids = [int(job_id) for job_id in args[3].split(',')] + job_ids = [int(job_id) for job_id in args[3].split(",")] stdout = "" # Glue the fake outputs for the request job-ids together in a single string for job_id in job_ids: @@ -53,7 +57,8 @@ def call_scontrol(args, **_) -> str: job_id = int(args[3]) if len(args) > 3 else None # Fake output per fake job-id. scontrol_info = { - 787204: textwrap.dedent("""\ + 787204: textwrap.dedent( + """\ JobId=787204 JobName=toil_job_6_CWLJob UserId=rapthor-mloose(54386) GroupId=rapthor-mloose(54038) MCS_label=N/A Priority=11067 Nice=0 Account=rapthor QOS=normal @@ -81,8 +86,10 @@ def call_scontrol(args, **_) -> str: StdOut=/home/rapthor-mloose/code/toil/cwl-v1.2/tmp/toil_19512746-a9f4-4b99-b9ff-48ca5c1b661c.6.787204.out.log Power= NtasksPerTRES:0 - """), - 789724: textwrap.dedent("""\ + """ + ), + 789724: textwrap.dedent( + """\ JobId=789724 JobName=run_prefactor-cwltool.sh UserId=rapthor-mloose(54386) GroupId=rapthor-mloose(54038) MCS_label=N/A Priority=7905 Nice=0 Account=rapthor QOS=normal @@ -110,8 +117,10 @@ def call_scontrol(args, **_) -> str: StdOut=/project/rapthor/Share/prefactor/L721962/slurm-789724.out Power= NtasksPerTRES:0 - """), - 789728: textwrap.dedent("""\ + """ + ), + 789728: textwrap.dedent( + """\ JobId=789728 JobName=sleep.sh UserId=rapthor-mloose(54386) GroupId=rapthor-mloose(54038) MCS_label=N/A Priority=8005 Nice=0 Account=rapthor QOS=normal @@ -138,26 +147,31 @@ def call_scontrol(args, **_) -> str: StdOut=/home/rapthor-mloose/tmp/slurm-789728.out Power= NtasksPerTRES:0 - """), + """ + ), } if job_id is not None: try: stdout = scontrol_info[job_id] except KeyError: - raise CalledProcessErrorStderr(1, "slurm_load_jobs error: Invalid job id specified") + raise CalledProcessErrorStderr( + 1, "slurm_load_jobs error: Invalid job id specified" + ) else: # Glue the fake outputs for the request job-ids together in a single string stdout = "" for value in scontrol_info.values(): - stdout += value + '\n' + stdout += value + "\n" return stdout + def call_sacct_raises(*_): """ Fake that the `sacct` command fails by raising a `CalledProcessErrorStderr` """ - raise CalledProcessErrorStderr(1, "sacct: error: Problem talking to the database: " - "Connection timed out") + raise CalledProcessErrorStderr( + 1, "sacct: error: Problem talking to the database: " "Connection timed out" + ) class FakeBatchSystem: @@ -169,7 +183,7 @@ def __init__(self): self.config = self.__fake_config() def getWaitDuration(self): - return 10; + return 10 def __fake_config(self): """ @@ -181,8 +195,9 @@ def __fake_config(self): """ config = Config() from uuid import uuid4 + config.workflowID = str(uuid4()) - config.cleanWorkDir = 'always' + config.cleanWorkDir = "always" return config @@ -198,7 +213,8 @@ def setUp(self): updatedJobsQueue=Queue(), killQueue=Queue(), killedJobsQueue=Queue(), - boss=FakeBatchSystem()) + boss=FakeBatchSystem(), + ) #### #### tests for _getJobDetailsFromSacct() @@ -218,15 +234,25 @@ def test_getJobDetailsFromSacct_one_not_exists(self): def test_getJobDetailsFromSacct_many_all_exist(self): self.monkeypatch.setattr(toil.batchSystems.slurm, "call_command", call_sacct) - expected_result = {754725: ("TIMEOUT", 0), 789456: ("FAILED", 1), 789724: ("RUNNING", 0), - 789868: ("PENDING", 0), 789869: ("COMPLETED", 0)} + expected_result = { + 754725: ("TIMEOUT", 0), + 789456: ("FAILED", 1), + 789724: ("RUNNING", 0), + 789868: ("PENDING", 0), + 789869: ("COMPLETED", 0), + } result = self.worker._getJobDetailsFromSacct(list(expected_result)) assert result == expected_result, f"{result} != {expected_result}" def test_getJobDetailsFromSacct_many_some_exist(self): self.monkeypatch.setattr(toil.batchSystems.slurm, "call_command", call_sacct) - expected_result = {609663: ("FAILED", 130), 767925: ("FAILED", 2), 1234: (None, None), - 1235: (None, None), 765096: ("FAILED", 137)} + expected_result = { + 609663: ("FAILED", 130), + 767925: ("FAILED", 2), + 1234: (None, None), + 1235: (None, None), + 765096: ("FAILED", 137), + } result = self.worker._getJobDetailsFromSacct(list(expected_result)) assert result == expected_result, f"{result} != {expected_result}" @@ -262,13 +288,21 @@ def test_getJobDetailsFromScontrol_one_not_exists(self): def test_getJobDetailsFromScontrol_many_all_exist(self): self.monkeypatch.setattr(toil.batchSystems.slurm, "call_command", call_scontrol) - expected_result = {787204: ("COMPLETED", 0), 789724: ("RUNNING", 0), 789728: ("PENDING", 0)} + expected_result = { + 787204: ("COMPLETED", 0), + 789724: ("RUNNING", 0), + 789728: ("PENDING", 0), + } result = self.worker._getJobDetailsFromScontrol(list(expected_result)) assert result == expected_result, f"{result} != {expected_result}" def test_getJobDetailsFromScontrol_many_some_exist(self): self.monkeypatch.setattr(toil.batchSystems.slurm, "call_command", call_scontrol) - expected_result = {787204: ("COMPLETED", 0), 789724: ("RUNNING", 0), 1234: (None, None)} + expected_result = { + 787204: ("COMPLETED", 0), + 789724: ("RUNNING", 0), + 1234: (None, None), + } result = self.worker._getJobDetailsFromScontrol(list(expected_result)) assert result == expected_result, f"{result} != {expected_result}" @@ -284,14 +318,14 @@ def test_getJobDetailsFromScontrol_many_none_exist(self): def test_getJobExitCode_job_exists(self): self.monkeypatch.setattr(toil.batchSystems.slurm, "call_command", call_sacct) - job_id = '785023' # FAILED + job_id = "785023" # FAILED expected_result = (127, BatchJobExitReason.FAILED) result = self.worker.getJobExitCode(job_id) assert result == expected_result, f"{result} != {expected_result}" def test_getJobExitCode_job_not_exists(self): self.monkeypatch.setattr(toil.batchSystems.slurm, "call_command", call_sacct) - job_id = '1234' # Non-existent + job_id = "1234" # Non-existent expected_result = None result = self.worker.getJobExitCode(job_id) assert result == expected_result, f"{result} != {expected_result}" @@ -301,10 +335,12 @@ def test_getJobExitCode_sacct_raises_job_exists(self): This test forces the use of `scontrol` to get job information, by letting `sacct` raise an exception. """ - self.monkeypatch.setattr(self.worker, "_getJobDetailsFromSacct", call_sacct_raises) + self.monkeypatch.setattr( + self.worker, "_getJobDetailsFromSacct", call_sacct_raises + ) self.monkeypatch.setattr(toil.batchSystems.slurm, "call_command", call_scontrol) - job_id = '787204' # COMPLETED - expected_result = (0, BatchJobExitReason.FINISHED) + job_id = "787204" # COMPLETED + expected_result = (0, BatchJobExitReason.FINISHED) result = self.worker.getJobExitCode(job_id) assert result == expected_result, f"{result} != {expected_result}" @@ -313,9 +349,11 @@ def test_getJobExitCode_sacct_raises_job_not_exists(self): This test forces the use of `scontrol` to get job information, by letting `sacct` raise an exception. Next, `scontrol` should also raise because it doesn't know the job. """ - self.monkeypatch.setattr(self.worker, "_getJobDetailsFromSacct", call_sacct_raises) + self.monkeypatch.setattr( + self.worker, "_getJobDetailsFromSacct", call_sacct_raises + ) self.monkeypatch.setattr(toil.batchSystems.slurm, "call_command", call_scontrol) - job_id = '1234' # Non-existent + job_id = "1234" # Non-existent try: _ = self.worker.getJobExitCode(job_id) except CalledProcessErrorStderr: @@ -329,50 +367,54 @@ def test_getJobExitCode_sacct_raises_job_not_exists(self): def test_coalesce_job_exit_codes_one_exists(self): self.monkeypatch.setattr(toil.batchSystems.slurm, "call_command", call_sacct) - job_ids = ['785023'] # FAILED - expected_result = [(127, BatchJobExitReason.FAILED)] + job_ids = ["785023"] # FAILED + expected_result = [(127, BatchJobExitReason.FAILED)] result = self.worker.coalesce_job_exit_codes(job_ids) assert result == expected_result, f"{result} != {expected_result}" def test_coalesce_job_exit_codes_one_not_exists(self): self.monkeypatch.setattr(toil.batchSystems.slurm, "call_command", call_sacct) - job_ids = ['1234'] # Non-existent + job_ids = ["1234"] # Non-existent expected_result = [None] result = self.worker.coalesce_job_exit_codes(job_ids) assert result == expected_result, f"{result} != {expected_result}" def test_coalesce_job_exit_codes_many_all_exist(self): self.monkeypatch.setattr(toil.batchSystems.slurm, "call_command", call_sacct) - job_ids = ['754725', # TIMEOUT, - '789456', # FAILED, - '789724', # RUNNING, - '789868', # PENDING, - '789869'] # COMPLETED + job_ids = [ + "754725", # TIMEOUT, + "789456", # FAILED, + "789724", # RUNNING, + "789868", # PENDING, + "789869", + ] # COMPLETED # RUNNING and PENDING jobs should return None expected_result = [ (EXIT_STATUS_UNAVAILABLE_VALUE, BatchJobExitReason.KILLED), (1, BatchJobExitReason.FAILED), None, None, - (0, BatchJobExitReason.FINISHED) + (0, BatchJobExitReason.FINISHED), ] result = self.worker.coalesce_job_exit_codes(job_ids) assert result == expected_result, f"{result} != {expected_result}" def test_coalesce_job_exit_codes_some_exists(self): self.monkeypatch.setattr(toil.batchSystems.slurm, "call_command", call_sacct) - job_ids = ['609663', # FAILED (SIGINT) - '767925', # FAILED, - '789724', # RUNNING, - '999999', # Non-existent, - '789869'] # COMPLETED + job_ids = [ + "609663", # FAILED (SIGINT) + "767925", # FAILED, + "789724", # RUNNING, + "999999", # Non-existent, + "789869", + ] # COMPLETED # RUNNING job should return None expected_result = [ (130, BatchJobExitReason.FAILED), (2, BatchJobExitReason.FAILED), None, None, - (0, BatchJobExitReason.FINISHED) + (0, BatchJobExitReason.FINISHED), ] result = self.worker.coalesce_job_exit_codes(job_ids) assert result == expected_result, f"{result} != {expected_result}" @@ -382,9 +424,11 @@ def test_coalesce_job_exit_codes_sacct_raises_job_exists(self): This test forces the use of `scontrol` to get job information, by letting `sacct` raise an exception. """ - self.monkeypatch.setattr(self.worker, "_getJobDetailsFromSacct", call_sacct_raises) + self.monkeypatch.setattr( + self.worker, "_getJobDetailsFromSacct", call_sacct_raises + ) self.monkeypatch.setattr(toil.batchSystems.slurm, "call_command", call_scontrol) - job_ids = ['787204'] # COMPLETED + job_ids = ["787204"] # COMPLETED expected_result = [(0, BatchJobExitReason.FINISHED)] result = self.worker.coalesce_job_exit_codes(job_ids) assert result == expected_result, f"{result} != {expected_result}" @@ -394,9 +438,11 @@ def test_coalesce_job_exit_codes_sacct_raises_job_not_exists(self): This test forces the use of `scontrol` to get job information, by letting `sacct` raise an exception. Next, `scontrol` should also raise because it doesn't know the job. """ - self.monkeypatch.setattr(self.worker, "_getJobDetailsFromSacct", call_sacct_raises) + self.monkeypatch.setattr( + self.worker, "_getJobDetailsFromSacct", call_sacct_raises + ) self.monkeypatch.setattr(toil.batchSystems.slurm, "call_command", call_scontrol) - job_ids = ['1234'] # Non-existent + job_ids = ["1234"] # Non-existent try: _ = self.worker.coalesce_job_exit_codes(job_ids) except CalledProcessErrorStderr: diff --git a/src/toil/test/cactus/test_cactus_integration.py b/src/toil/test/cactus/test_cactus_integration.py index c1d72a0c91..513c159556 100644 --- a/src/toil/test/cactus/test_cactus_integration.py +++ b/src/toil/test/cactus/test_cactus_integration.py @@ -30,29 +30,27 @@ def test_cactus_integration(self): ) self.leader = self.cluster.getLeader() - CACTUS_COMMIT_SHA = os.environ["CACTUS_COMMIT_SHA"] or "f5adf4013326322ae58ef1eccb8409b71d761583" # default cactus commit + CACTUS_COMMIT_SHA = ( + os.environ["CACTUS_COMMIT_SHA"] + or "f5adf4013326322ae58ef1eccb8409b71d761583" + ) # default cactus commit # command to install and run cactus on the cluster - cactus_command = ("python -m virtualenv --system-site-packages venv && " - ". venv/bin/activate && " - "git clone https://github.com/ComparativeGenomicsToolkit/cactus.git --recursive && " - "cd cactus && " - "git fetch origin && " - f"git checkout {CACTUS_COMMIT_SHA} && " - "git submodule update --init --recursive && " - "pip install --upgrade 'setuptools' pip && " - "pip install --upgrade . && " - "pip install --upgrade numpy psutil && " - "time cactus --batchSystem kubernetes --retryCount=3 " - f"--consCores 2 --binariesMode singularity --clean always {self.jobStore} " - "examples/evolverMammals.txt examples/evolverMammals.hal --root mr --defaultDisk 8G --logDebug") - - # run cactus - self.sshUtil( - [ - "bash", - "-c", - cactus_command - ] + cactus_command = ( + "python -m virtualenv --system-site-packages venv && " + ". venv/bin/activate && " + "git clone https://github.com/ComparativeGenomicsToolkit/cactus.git --recursive && " + "cd cactus && " + "git fetch origin && " + f"git checkout {CACTUS_COMMIT_SHA} && " + "git submodule update --init --recursive && " + "pip install --upgrade 'setuptools' pip && " + "pip install --upgrade . && " + "pip install --upgrade numpy psutil && " + "time cactus --batchSystem kubernetes --retryCount=3 " + f"--consCores 2 --binariesMode singularity --clean always {self.jobStore} " + "examples/evolverMammals.txt examples/evolverMammals.hal --root mr --defaultDisk 8G --logDebug" ) + # run cactus + self.sshUtil(["bash", "-c", cactus_command]) diff --git a/src/toil/test/cwl/cwlTest.py b/src/toil/test/cwl/cwlTest.py index c116cb16eb..1ff5c1c59b 100644 --- a/src/toil/test/cwl/cwlTest.py +++ b/src/toil/test/cwl/cwlTest.py @@ -23,16 +23,10 @@ import unittest import uuid import zipfile - from functools import partial from io import StringIO from pathlib import Path -from typing import (TYPE_CHECKING, - Callable, - Dict, - List, - Optional, - cast) +from typing import TYPE_CHECKING, Callable, Optional, cast from unittest.mock import Mock, call from urllib.request import urlretrieve @@ -46,32 +40,32 @@ from schema_salad.exceptions import ValidationException -from toil.cwl.utils import (DirectoryStructure, - download_structure, - visit_cwl_class_and_reduce, - visit_top_cwl_class) +from toil.cwl.utils import ( + DirectoryStructure, + download_structure, + visit_cwl_class_and_reduce, + visit_top_cwl_class, +) from toil.fileStores import FileID from toil.fileStores.abstractFileStore import AbstractFileStore from toil.lib.threading import cpu_count -from toil.provisioners import cluster_factory -from toil.test import (ToilTest, - needs_aws_ec2, - needs_aws_s3, - needs_cwl, - needs_docker, - needs_docker_cuda, - needs_env_var, - needs_fetchable_appliance, - needs_gridengine, - needs_kubernetes, - needs_local_cuda, - needs_lsf, - needs_mesos, - needs_online, - needs_slurm, - needs_torque, - needs_wes_server, - slow) +from toil.test import ( + ToilTest, + needs_aws_s3, + needs_cwl, + needs_docker, + needs_docker_cuda, + needs_gridengine, + needs_kubernetes, + needs_local_cuda, + needs_lsf, + needs_mesos, + needs_online, + needs_slurm, + needs_torque, + needs_wes_server, + slow, +) log = logging.getLogger(__name__) CONFORMANCE_TEST_TIMEOUT = 10000 @@ -86,7 +80,7 @@ def run_conformance_tests( selected_tests: Optional[str] = None, selected_tags: Optional[str] = None, skipped_tests: Optional[str] = None, - extra_args: Optional[List[str]] = None, + extra_args: Optional[list[str]] = None, must_support_all_features: bool = False, junit_file: Optional[str] = None, ) -> None: @@ -147,7 +141,7 @@ def run_conformance_tests( "--relax-path-checks", # Defaults to 20s but we can't start hundreds of nodejs processes that fast on our CI potatoes "--eval-timeout=600", - f"--caching={caching}" + f"--caching={caching}", ] if extra_args: @@ -181,14 +175,16 @@ def run_conformance_tests( cmd.extend(["--"] + args_passed_directly_to_runner) log.info("Running: '%s'", "' '".join(cmd)) - output_lines: List[str] = [] + output_lines: list[str] = [] try: - child = subprocess.Popen(cmd, cwd=workDir, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + child = subprocess.Popen( + cmd, cwd=workDir, stdout=subprocess.PIPE, stderr=subprocess.STDOUT + ) if child.stdout is not None: for line_bytes in child.stdout: # Pass through all the logs - line_text = line_bytes.decode('utf-8', errors='replace').rstrip() + line_text = line_bytes.decode("utf-8", errors="replace").rstrip() output_lines.append(line_text) log.info(line_text) @@ -197,7 +193,7 @@ def run_conformance_tests( log.info("CWL tests finished with exit code %s", child.returncode) if child.returncode != 0: # Act like check_output and raise an error. - raise subprocess.CalledProcessError(child.returncode, ' '.join(cmd)) + raise subprocess.CalledProcessError(child.returncode, " ".join(cmd)) finally: if job_store_override: # Clean up the job store we used for all the tests, if it is still there. @@ -218,13 +214,16 @@ def run_conformance_tests( only_unsupported = True break if (not only_unsupported) or must_support_all_features: - log.error("CWL tests gave unacceptable output:\n%s", '\n'.join(output_lines)) + log.error( + "CWL tests gave unacceptable output:\n%s", "\n".join(output_lines) + ) raise e log.info("Unsuccessful return code is OK") TesterFuncType = Callable[[str, str, "CWLObjectType"], None] + @needs_cwl class CWLWorkflowTest(ToilTest): """ @@ -253,6 +252,7 @@ def test_cwl_cmdline_input(self) -> None: Test that running a CWL workflow with inputs specified on the command line passes. """ from toil.cwl import cwltoil + cwlfile = "src/toil/test/cwl/conditional_wf.cwl" args = [cwlfile, "--message", "str", "--sleep", "2"] st = StringIO() @@ -264,7 +264,7 @@ def _tester( cwlfile: str, jobfile: str, expect: "CWLObjectType", - main_args: List[str] = [], + main_args: list[str] = [], out_name: str = "output", output_here: bool = False, ) -> None: @@ -274,13 +274,7 @@ def _tester( main_args = main_args[:] if not output_here: # Don't just dump output in the working directory. - main_args.extend( - [ - "--logDebug", - "--outdir", - self.outDir - ] - ) + main_args.extend(["--logDebug", "--outdir", self.outDir]) main_args.extend( [ os.path.join(self.rootDir, cwlfile), @@ -295,7 +289,12 @@ def _tester( self.assertEqual(out, expect) for k, v in expect.items(): - if isinstance(v, dict) and "class" in v and v["class"] == "File" and "path" in v: + if ( + isinstance(v, dict) + and "class" in v + and v["class"] == "File" + and "path" in v + ): # This is a top-level output file. # None of our output files should be executable. self.assertTrue(os.path.exists(v["path"])) @@ -445,7 +444,7 @@ def test_glob_dir_bypass_file_store(self) -> None: "src/toil/test/cwl/empty.json", self._expected_glob_dir_output(os.getcwd()), main_args=["--bypass-file-store"], - output_here=True + output_here=True, ) finally: # Clean up anything we made in the current directory. @@ -456,7 +455,7 @@ def test_glob_dir_bypass_file_store(self) -> None: @needs_slurm def test_slurm_node_memory(self) -> None: - from toil.cwl import cwltoil + pass # Run the workflow. This will either finish quickly and tell us the # memory we got, or take a long time because it requested a whole @@ -479,7 +478,9 @@ def test_slurm_node_memory(self) -> None: ] try: log.debug("Start test workflow") - child = subprocess.Popen(["toil-cwl-runner"] + main_args, stdout=subprocess.PIPE) + child = subprocess.Popen( + ["toil-cwl-runner"] + main_args, stdout=subprocess.PIPE + ) output, _ = child.communicate(timeout=60) except subprocess.TimeoutExpired: # The job didn't finish quickly; presumably waiting for a full node. @@ -513,7 +514,10 @@ def test_download_https(self) -> None: self.download("download_https.json", self._tester) def test_download_https_reference(self) -> None: - self.download("download_https.json", partial(self._tester, main_args=["--reference-inputs"])) + self.download( + "download_https.json", + partial(self._tester, main_args=["--reference-inputs"]), + ) def test_download_file(self) -> None: self.download("download_file.json", self._tester) @@ -524,7 +528,10 @@ def test_download_directory_s3(self) -> None: @needs_aws_s3 def test_download_directory_s3_reference(self) -> None: - self.download_directory("download_directory_s3.json", partial(self._tester, main_args=["--reference-inputs"])) + self.download_directory( + "download_directory_s3.json", + partial(self._tester, main_args=["--reference-inputs"]), + ) def test_download_directory_file(self) -> None: self.download_directory("download_directory_file.json", self._tester) @@ -569,7 +576,10 @@ def test_default_args(self) -> None: "src/toil/test/cwl/seqtk_seq.cwl", "src/toil/test/cwl/seqtk_seq_job.json", self._expected_seqtk_output(self.outDir), - main_args=["--default-container", "quay.io/biocontainers/seqtk:1.4--he4a0461_1"], + main_args=[ + "--default-container", + "quay.io/biocontainers/seqtk:1.4--he4a0461_1", + ], out_name="output1", ) @@ -633,7 +643,11 @@ def path_with_bogus_rev() -> str: # Force a failure by trying to use an incorrect version of `rev` from the PATH os.environ["PATH"] = path_with_bogus_rev() try: - subprocess.check_output(["toil-cwl-runner"] + cmd, env=os.environ.copy(), stderr=subprocess.STDOUT) + subprocess.check_output( + ["toil-cwl-runner"] + cmd, + env=os.environ.copy(), + stderr=subprocess.STDOUT, + ) self.fail("Expected problem job with incorrect PATH did not fail") except subprocess.CalledProcessError: pass @@ -643,13 +657,17 @@ def path_with_bogus_rev() -> str: cwltoil.main(cmd) # Should fail because previous job completed successfully try: - subprocess.check_output(["toil-cwl-runner"] + cmd, env=os.environ.copy(), stderr=subprocess.STDOUT) + subprocess.check_output( + ["toil-cwl-runner"] + cmd, + env=os.environ.copy(), + stderr=subprocess.STDOUT, + ) self.fail("Restart with missing directory did not fail") except subprocess.CalledProcessError: pass @needs_aws_s3 - def test_streamable(self, extra_args: Optional[List[str]] = None) -> None: + def test_streamable(self, extra_args: Optional[list[str]] = None) -> None: """ Test that a file with 'streamable'=True is a named pipe. This is a CWL1.2 feature. @@ -733,7 +751,6 @@ def test_preemptible_expression(self) -> None: except ValidationException as e: # Make sure we chastise the user appropriately. assert "expressions are not allowed" in str(e) - @staticmethod def _expected_seqtk_output(outDir: str) -> "CWLObjectType": @@ -790,7 +807,7 @@ def _expected_download_output(outDir: str) -> "CWLObjectType": "size": 0, "class": "File", "checksum": "sha1$da39a3ee5e6b4b0d3255bfef95601890afd80709", - "path": path + "path": path, } } @@ -817,9 +834,9 @@ def _expected_glob_dir_output(out_dir: str) -> "CWLObjectType": "checksum": "sha1$da39a3ee5e6b4b0d3255bfef95601890afd80709", "size": 0, "nameroot": "test", - "nameext": ".txt" + "nameext": ".txt", } - ] + ], } } @@ -852,7 +869,7 @@ def _expected_colon_output(outDir: str) -> "CWLObjectType": "size": 1111, "nameroot": "whale", "nameext": ".txt", - "path": f"{path}/whale.txt" + "path": f"{path}/whale.txt", } ], } @@ -920,7 +937,7 @@ def test_run_conformance( caching: bool = False, selected_tests: Optional[str] = None, skipped_tests: Optional[str] = None, - extra_args: Optional[List[str]] = None, + extra_args: Optional[list[str]] = None, ) -> None: run_conformance_tests( workDir=self.workDir, @@ -1048,7 +1065,7 @@ def test_run_conformance( caching: bool = False, batchSystem: Optional[str] = None, skipped_tests: Optional[str] = None, - extra_args: Optional[List[str]] = None, + extra_args: Optional[list[str]] = None, ) -> None: run_conformance_tests( workDir=self.cwlSpec, @@ -1122,7 +1139,7 @@ def test_run_conformance( batchSystem: Optional[str] = None, selected_tests: Optional[str] = None, skipped_tests: Optional[str] = None, - extra_args: Optional[List[str]] = None, + extra_args: Optional[list[str]] = None, must_support_all_features: bool = False, junit_file: Optional[str] = None, ) -> None: @@ -1146,9 +1163,7 @@ def test_run_conformance( def test_run_conformance_with_caching(self) -> None: self.test_run_conformance( caching=True, - junit_file = os.path.join( - self.rootDir, "caching-conformance-1.2.junit.xml" - ) + junit_file=os.path.join(self.rootDir, "caching-conformance-1.2.junit.xml"), ) @slow @@ -1160,10 +1175,11 @@ def test_run_conformance_with_in_place_update(self) -> None: features. """ self.test_run_conformance( - extra_args=["--bypass-file-store"], must_support_all_features=True, - junit_file = os.path.join( + extra_args=["--bypass-file-store"], + must_support_all_features=True, + junit_file=os.path.join( self.rootDir, "in-place-update-conformance-1.2.junit.xml" - ) + ), ) @slow @@ -1338,6 +1354,7 @@ def test_log_dir_echo_stderr(tmp_path: Path) -> None: output = open(result).read() assert output == "hello\n" + # TODO: It's not clear how this test tests filename conflict resolution; it # seems like it runs a python script to copy some files and makes sure the # workflow doesn't fail. @@ -1363,6 +1380,7 @@ def test_filename_conflict_resolution(tmp_path: Path) -> None: assert b"Finished toil run successfully" in stderr assert p.returncode == 0 + @needs_cwl @pytest.mark.cwl_small_log_dir def test_filename_conflict_resolution_3_or_more(tmp_path: Path) -> None: @@ -1372,15 +1390,16 @@ def test_filename_conflict_resolution_3_or_more(tmp_path: Path) -> None: f"--outdir={out_dir}", "--clean=always", ] - cwl = os.path.join( - os.path.dirname(__file__), "scatter_duplicate_outputs.cwl" - ) + cwl = os.path.join(os.path.dirname(__file__), "scatter_duplicate_outputs.cwl") cmd = [toil] + options + [cwl] p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdout, stderr = p.communicate() assert b"Finished toil run successfully" in stderr assert p.returncode == 0 - assert len(os.listdir(out_dir)) == 9, "All 9 files made by the scatter should be in the directory" + assert ( + len(os.listdir(out_dir)) == 9 + ), "All 9 files made by the scatter should be in the directory" + @needs_cwl @needs_docker @@ -1404,6 +1423,7 @@ def test_filename_conflict_detection(tmp_path: Path) -> None: assert b"File staging conflict" in stderr assert p.returncode != 0 + @needs_cwl @needs_docker @pytest.mark.cwl_small_log_dir @@ -1445,7 +1465,10 @@ def test_pick_value_with_one_null_value(caplog: pytest.LogCaptureFixture) -> Non with caplog.at_level(logging.WARNING, logger="toil.cwl.cwltoil"): cwltoil.main(args) for line in caplog.messages: - assert "You had a conditional step that did not run, but you did not use pickValue to handle the skipped input." not in line + assert ( + "You had a conditional step that did not run, but you did not use pickValue to handle the skipped input." + not in line + ) @needs_cwl @@ -1593,7 +1616,7 @@ def op_down(thing: "CWLObjectType") -> int: up_count = 0 up_child_count = 0 - def op_up(thing: "CWLObjectType", down_value: int, child_results: List[str]) -> str: + def op_up(thing: "CWLObjectType", down_value: int, child_results: list[str]) -> str: """ Check the down return value and the up return values, and count what we visit going up and what child relationships we have. @@ -1647,9 +1670,9 @@ def test_download_structure(tmp_path: Path) -> None: # These will be populated. # TODO: This cache seems unused. Remove it? # This maps filesystem path to CWL URI - index: Dict[str, str] = {} + index: dict[str, str] = {} # This maps CWL URI to filesystem path - existing: Dict[str, str] = {} + existing: dict[str, str] = {} # Do the download download_structure(file_store, index, existing, structure, to_dir) @@ -1703,15 +1726,19 @@ def test_download_structure(tmp_path: Path) -> None: any_order=True, ) + @needs_cwl @pytest.mark.timeout(300) def test_import_on_workers() -> None: - args = ["src/toil/test/cwl/download.cwl", - "src/toil/test/cwl/download_file.json", - "--runImportsOnWorkers", - "--importWorkersDisk=10MiB", - "--realTimeLogging=True", - "--logLevel=INFO", "--logColors=False"] + args = [ + "src/toil/test/cwl/download.cwl", + "src/toil/test/cwl/download_file.json", + "--runImportsOnWorkers", + "--importWorkersDisk=10MiB", + "--realTimeLogging=True", + "--logLevel=INFO", + "--logColors=False", + ] from toil.cwl import cwltoil detector = ImportWorkersMessageHandler() @@ -1727,9 +1754,12 @@ def test_import_on_workers() -> None: # StreamHandler is generic, _typeshed doesn't exist at runtime, do a bit of typing trickery, see https://github.com/python/typeshed/issues/5680 if TYPE_CHECKING: from _typeshed import SupportsWrite + _stream_handler = logging.StreamHandler[SupportsWrite[str]] else: _stream_handler = logging.StreamHandler + + class ImportWorkersMessageHandler(_stream_handler): """ Detect the import workers log message and set a flag. @@ -1741,5 +1771,7 @@ def __init__(self) -> None: super().__init__(sys.stderr) def emit(self, record: logging.LogRecord) -> None: - if (record.msg % record.args).startswith("Issued job 'CWLImportJob' CWLImportJob"): + if (record.msg % record.args).startswith( + "Issued job 'CWLImportJob' CWLImportJob" + ): self.detected = True diff --git a/src/toil/test/cwl/mock_mpi/fake_mpi_run.py b/src/toil/test/cwl/mock_mpi/fake_mpi_run.py index a44caccd1e..73c5ff5409 100755 --- a/src/toil/test/cwl/mock_mpi/fake_mpi_run.py +++ b/src/toil/test/cwl/mock_mpi/fake_mpi_run.py @@ -2,16 +2,19 @@ import argparse import subprocess import sys -from typing import List from configargparse import ArgumentParser def make_parser(): p = ArgumentParser() - p.add_argument("progargs", nargs=argparse.REMAINDER, help="The program and its arguments") + p.add_argument( + "progargs", nargs=argparse.REMAINDER, help="The program and its arguments" + ) p.add_argument("--num", type=int, help="number of times to run the application") - p.add_argument("--no-fail", help="add this flag to actually work", action="store_true") + p.add_argument( + "--no-fail", help="add this flag to actually work", action="store_true" + ) return p @@ -22,10 +25,12 @@ def __init__(self): else: self.indata = sys.stdin.read().encode(sys.stdin.encoding) - def run_once(self, args: List[str]): - subprocess.run(args, input=self.indata, stdout=sys.stdout, stderr=sys.stderr).check_returncode() + def run_once(self, args: list[str]): + subprocess.run( + args, input=self.indata, stdout=sys.stdout, stderr=sys.stderr + ).check_returncode() - def run_many(self, n: int, args: List[str]): + def run_many(self, n: int, args: list[str]): for i in range(n): self.run_once(args) diff --git a/src/toil/test/docs/scripts/example_alwaysfail.py b/src/toil/test/docs/scripts/example_alwaysfail.py index fc3826997c..92b479523f 100644 --- a/src/toil/test/docs/scripts/example_alwaysfail.py +++ b/src/toil/test/docs/scripts/example_alwaysfail.py @@ -30,8 +30,8 @@ def main(): def explode(job): - sys.stderr.write('Something somewhere has gone terribly wrong\n') - raise RuntimeError('Boom!') + sys.stderr.write("Something somewhere has gone terribly wrong\n") + raise RuntimeError("Boom!") if __name__ == "__main__": diff --git a/src/toil/test/docs/scripts/example_cachingbenchmark.py b/src/toil/test/docs/scripts/example_cachingbenchmark.py index 463a3f46be..50d7bc7c62 100755 --- a/src/toil/test/docs/scripts/example_cachingbenchmark.py +++ b/src/toil/test/docs/scripts/example_cachingbenchmark.py @@ -32,10 +32,13 @@ def main(): - parser = ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser = ArgumentParser( + description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter + ) - parser.add_argument('--minSleep', type=int, default=1, - help="Minimum seconds to sleep") + parser.add_argument( + "--minSleep", type=int, default=1, help="Minimum seconds to sleep" + ) Job.Runner.addToilOptions(parser) @@ -53,8 +56,10 @@ def main(): def root(job, options): # Make a file with job.fileStore.writeGlobalFileStream() as (stream, file_id): - stream.write(b"This is a test of the Toil file caching system. " - b"Had this been an actual file, its contents would have been more interesting.") + stream.write( + b"This is a test of the Toil file caching system. " + b"Had this been an actual file, its contents would have been more interesting." + ) child_rvs = [] for i in range(100): @@ -65,14 +70,16 @@ def root(job, options): return job.addFollowOnJobFn(report, child_rvs).rv() -def poll(job, options, file_id, number, cores=0.1, disk='200M', memory='512M'): +def poll(job, options, file_id, number, cores=0.1, disk="200M", memory="512M"): # Wait a random amount of time before grabbing the file for others to cache it time.sleep(random.randint(options.minSleep, options.minSleep + 10)) # Read the file. Don't accept a symlink because then we might just have the # filestore's copy, even if caching is not happening. - local_file = job.fileStore.readGlobalFile(file_id, cache=True, mutable=False, symlink=False) + local_file = job.fileStore.readGlobalFile( + file_id, cache=True, mutable=False, symlink=False + ) # Wait a random amount of after before grabbing the file for others to use it time.sleep(random.randint(options.minSleep, options.minSleep + 10)) @@ -83,7 +90,9 @@ def poll(job, options, file_id, number, cores=0.1, disk='200M', memory='512M'): # Check what machine we are hostname = socket.gethostname() - RealtimeLogger.info(f'Job {number} on host {hostname} sees file at device {stats.st_dev} inode {stats.st_ino}') + RealtimeLogger.info( + f"Job {number} on host {hostname} sees file at device {stats.st_dev} inode {stats.st_ino}" + ) # Return a tuple representing our view of the file. # Drop hostname since hostnames are unique per pod. @@ -96,12 +105,12 @@ def report(job, views): for v in views: counts[v] += 1 - report = [f'{len(counts)} distinct views, most frequent:'] + report = [f"{len(counts)} distinct views, most frequent:"] for view, count in counts.most_common(10): - report.append(f'{view}: {count}') + report.append(f"{view}: {count}") - return '\n'.join(report) + return "\n".join(report) if __name__ == "__main__": diff --git a/src/toil/test/docs/scripts/tutorial_docker.py b/src/toil/test/docs/scripts/tutorial_docker.py index c002881fde..da75081492 100644 --- a/src/toil/test/docs/scripts/tutorial_docker.py +++ b/src/toil/test/docs/scripts/tutorial_docker.py @@ -5,10 +5,9 @@ from toil.lib.docker import apiDockerCall from toil.lib.io import mkdtemp -align = Job.wrapJobFn(apiDockerCall, - image='ubuntu', - working_dir=os.getcwd(), - parameters=['ls', '-lha']) +align = Job.wrapJobFn( + apiDockerCall, image="ubuntu", working_dir=os.getcwd(), parameters=["ls", "-lha"] +) if __name__ == "__main__": jobstore: str = mkdtemp("tutorial_docker") diff --git a/src/toil/test/docs/scripts/tutorial_dynamic.py b/src/toil/test/docs/scripts/tutorial_dynamic.py index 76f7e82d49..bd1c4d2412 100644 --- a/src/toil/test/docs/scripts/tutorial_dynamic.py +++ b/src/toil/test/docs/scripts/tutorial_dynamic.py @@ -7,8 +7,8 @@ def binaryStringFn(job, depth, message=""): if depth > 0: - job.addChildJobFn(binaryStringFn, depth-1, message + "0") - job.addChildJobFn(binaryStringFn, depth-1, message + "1") + job.addChildJobFn(binaryStringFn, depth - 1, message + "0") + job.addChildJobFn(binaryStringFn, depth - 1, message + "1") else: job.log(f"Binary string: {message}") diff --git a/src/toil/test/docs/scripts/tutorial_managing2.py b/src/toil/test/docs/scripts/tutorial_managing2.py index f1eb748dc1..db2e8faeef 100644 --- a/src/toil/test/docs/scripts/tutorial_managing2.py +++ b/src/toil/test/docs/scripts/tutorial_managing2.py @@ -6,14 +6,16 @@ def globalFileStoreJobFn(job): - job.log("The following example exercises all the methods provided " - "by the toil.fileStores.abstractFileStore.AbstractFileStore class") + job.log( + "The following example exercises all the methods provided " + "by the toil.fileStores.abstractFileStore.AbstractFileStore class" + ) # Create a local temporary file. scratchFile = job.fileStore.getLocalTempFile() # Write something in the scratch file. - with open(scratchFile, 'w') as fH: + with open(scratchFile, "w") as fH: fH.write("What a tangled web we weave") # Write a copy of the file into the file-store; fileID is the key that can be used to retrieve the file. diff --git a/src/toil/test/docs/scripts/tutorial_promises2.py b/src/toil/test/docs/scripts/tutorial_promises2.py index 99bd28341d..1e69925a9f 100644 --- a/src/toil/test/docs/scripts/tutorial_promises2.py +++ b/src/toil/test/docs/scripts/tutorial_promises2.py @@ -7,8 +7,10 @@ def binaryStrings(job, depth, message=""): if depth > 0: - s = [job.addChildJobFn(binaryStrings, depth - 1, message + "0").rv(), - job.addChildJobFn(binaryStrings, depth - 1, message + "1").rv()] + s = [ + job.addChildJobFn(binaryStrings, depth - 1, message + "0").rv(), + job.addChildJobFn(binaryStrings, depth - 1, message + "1").rv(), + ] return job.addFollowOnFn(merge, s).rv() return [message] diff --git a/src/toil/test/docs/scripts/tutorial_requirements.py b/src/toil/test/docs/scripts/tutorial_requirements.py index 8cfaa5ab28..6aeb0ef633 100644 --- a/src/toil/test/docs/scripts/tutorial_requirements.py +++ b/src/toil/test/docs/scripts/tutorial_requirements.py @@ -6,12 +6,20 @@ def parentJob(job): - downloadJob = Job.wrapJobFn(stageFn, "file://" + os.path.realpath(__file__), cores=0.1, memory='32M', disk='1M') + downloadJob = Job.wrapJobFn( + stageFn, + "file://" + os.path.realpath(__file__), + cores=0.1, + memory="32M", + disk="1M", + ) job.addChild(downloadJob) - analysis = Job.wrapJobFn(analysisJob, - fileStoreID=downloadJob.rv(0), - disk=PromisedRequirement(downloadJob.rv(1))) + analysis = Job.wrapJobFn( + analysisJob, + fileStoreID=downloadJob.rv(0), + disk=PromisedRequirement(downloadJob.rv(1)), + ) job.addFollowOn(analysis) diff --git a/src/toil/test/docs/scripts/tutorial_staging.py b/src/toil/test/docs/scripts/tutorial_staging.py index 57a1431ed9..17c278228a 100644 --- a/src/toil/test/docs/scripts/tutorial_staging.py +++ b/src/toil/test/docs/scripts/tutorial_staging.py @@ -11,9 +11,12 @@ def __init__(self, id): self.inputFileID = id def run(self, fileStore): - with fileStore.readGlobalFileStream(self.inputFileID, encoding='utf-8') as fi: - with fileStore.writeGlobalFileStream(encoding='utf-8') as (fo, outputFileID): - fo.write(fi.read() + 'World!') + with fileStore.readGlobalFileStream(self.inputFileID, encoding="utf-8") as fi: + with fileStore.writeGlobalFileStream(encoding="utf-8") as ( + fo, + outputFileID, + ): + fo.write(fi.read() + "World!") return outputFileID @@ -26,10 +29,17 @@ def run(self, fileStore): with Toil(options) as toil: if not toil.options.restart: - ioFileDirectory = os.path.join(os.path.dirname(os.path.abspath(__file__)), "stagingExampleFiles") - inputFileID = toil.importFile("file://" + os.path.abspath(os.path.join(ioFileDirectory, "in.txt"))) + ioFileDirectory = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "stagingExampleFiles" + ) + inputFileID = toil.importFile( + "file://" + os.path.abspath(os.path.join(ioFileDirectory, "in.txt")) + ) outputFileID = toil.start(HelloWorld(inputFileID)) else: outputFileID = toil.restart() - toil.exportFile(outputFileID, "file://" + os.path.abspath(os.path.join(ioFileDirectory, "out.txt"))) + toil.exportFile( + outputFileID, + "file://" + os.path.abspath(os.path.join(ioFileDirectory, "out.txt")), + ) diff --git a/src/toil/test/docs/scripts/tutorial_stats.py b/src/toil/test/docs/scripts/tutorial_stats.py index 76702dad77..a8dab81a1d 100644 --- a/src/toil/test/docs/scripts/tutorial_stats.py +++ b/src/toil/test/docs/scripts/tutorial_stats.py @@ -1,12 +1,10 @@ -import os +import math +import time +from multiprocessing import Process from toil.common import Toil from toil.job import Job -import math -import time - -from multiprocessing import Process def think(seconds): start = time.time() @@ -14,6 +12,7 @@ def think(seconds): # Use CPU math.sqrt(123456) + class TimeWaster(Job): def __init__(self, time_to_think, time_to_waste, space_to_waste, *args, **kwargs): self.time_to_think = time_to_think @@ -24,10 +23,10 @@ def __init__(self, time_to_think, time_to_waste, space_to_waste, *args, **kwargs def run(self, fileStore): # Waste some space file_path = fileStore.getLocalTempFile() - with open(file_path, 'w') as stream: + with open(file_path, "w") as stream: for i in range(self.space_to_waste): stream.write("X") - + # Do some "useful" compute processes = [] for core_number in range(max(1, self.cores)): @@ -45,11 +44,10 @@ def run(self, fileStore): def main(): options = Job.Runner.getDefaultArgumentParser().parse_args() - job1 = TimeWaster(0, 0, 0, displayName='doNothing') - job2 = TimeWaster(10, 0, 4096, displayName='efficientJob') - job3 = TimeWaster(10, 0, 1024, cores=4, displayName='multithreadedJob') - job4 = TimeWaster(1, 9, 65536, displayName='inefficientJob') - + job1 = TimeWaster(0, 0, 0, displayName="doNothing") + job2 = TimeWaster(10, 0, 4096, displayName="efficientJob") + job3 = TimeWaster(10, 0, 1024, cores=4, displayName="multithreadedJob") + job4 = TimeWaster(1, 9, 65536, displayName="inefficientJob") job1.addChild(job2) job1.addChild(job3) @@ -62,5 +60,5 @@ def main(): toil.restart() -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/toil/test/docs/scriptsTest.py b/src/toil/test/docs/scriptsTest.py index 99deed467c..411f8bee5f 100644 --- a/src/toil/test/docs/scriptsTest.py +++ b/src/toil/test/docs/scriptsTest.py @@ -4,19 +4,19 @@ import subprocess import sys import unittest -import pytest -from typing import List +import pytest -pkg_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) # noqa +pkg_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) # noqa sys.path.insert(0, pkg_root) # noqa -from toil.test import ToilTest, needs_cwl, needs_docker +from toil.test import ToilTest, needs_docker from toil.version import python class ToilDocumentationTest(ToilTest): """Tests for scripts in the toil tutorials.""" + @classmethod def setUpClass(cls): super(ToilTest, cls).setUpClass() @@ -25,7 +25,7 @@ def setUpClass(cls): def tearDown(self) -> None: super(ToilTest, self).tearDown() - jobstores = ['/mnt/ephemeral/workspace/toil-pull-requests/toilWorkflowRun'] + jobstores = ["/mnt/ephemeral/workspace/toil-pull-requests/toilWorkflowRun"] for jobstore in jobstores: if os.path.exists(jobstore): shutil.rmtree(jobstore) @@ -33,19 +33,24 @@ def tearDown(self) -> None: unittest.TestCase.tearDown(self) """Just check the exit code""" - def checkExitCode(self, script, extra_args: List[str] = []): + + def checkExitCode(self, script, extra_args: list[str] = []): program = os.path.join(self.directory, "scripts", script) - process = subprocess.Popen([python, program, "file:my-jobstore", "--clean=always"] + extra_args, - stdout=subprocess.PIPE, stderr=subprocess.PIPE) + process = subprocess.Popen( + [python, program, "file:my-jobstore", "--clean=always"] + extra_args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) stdout, stderr = process.communicate() if isinstance(stdout, bytes): - stdout = stdout.decode('utf-8') - stderr = stderr.decode('utf-8') + stdout = stdout.decode("utf-8") + stderr = stderr.decode("utf-8") if not process.returncode == 0: raise RuntimeError(stderr) - return stdout + ' ' + stderr + return stdout + " " + stderr """Check the exit code and the output""" + def checkExpectedOut(self, script, expectedOutput): outerr = self.checkExitCode(script) @@ -54,6 +59,7 @@ def checkExpectedOut(self, script, expectedOutput): self.assertGreater(index, -1, f"Expected:\n{expectedOutput}\nOutput:\n{outerr}") """Check the exit code and look for a pattern""" + def checkExpectedPattern(self, script, expectedPattern): outerr = self.checkExitCode(script) @@ -77,16 +83,24 @@ def testEncapsulation2(self): self.checkExitCode("tutorial_encapsulation2.py") def testHelloworld(self): - self.checkExpectedOut("tutorial_helloworld.py", "Hello, world!, here's a message: You did it!\n") + self.checkExpectedOut( + "tutorial_helloworld.py", "Hello, world!, here's a message: You did it!\n" + ) def testInvokeworkflow(self): - self.checkExpectedOut("tutorial_invokeworkflow.py", "Hello, world!, here's a message: Woot\n") + self.checkExpectedOut( + "tutorial_invokeworkflow.py", "Hello, world!, here's a message: Woot\n" + ) def testInvokeworkflow2(self): - self.checkExpectedOut("tutorial_invokeworkflow2.py", "Hello, world!, I have a message: Woot!\n") + self.checkExpectedOut( + "tutorial_invokeworkflow2.py", "Hello, world!, I have a message: Woot!\n" + ) def testJobFunctions(self): - self.checkExpectedOut("tutorial_jobfunctions.py", "Hello world, I have a message: Woot!\n") + self.checkExpectedOut( + "tutorial_jobfunctions.py", "Hello world, I have a message: Woot!\n" + ) def testManaging(self): self.checkExitCode("tutorial_managing.py") @@ -95,39 +109,51 @@ def testManaging2(self): self.checkExitCode("tutorial_managing2.py") def testMultiplejobs(self): - self.checkExpectedPattern("tutorial_multiplejobs.py", - "Hello world, I have a message: first.*Hello world, I have a message: " - "second or third.*Hello world, I have a message: second or third.*Hello world," - " I have a message: last") + self.checkExpectedPattern( + "tutorial_multiplejobs.py", + "Hello world, I have a message: first.*Hello world, I have a message: " + "second or third.*Hello world, I have a message: second or third.*Hello world," + " I have a message: last", + ) def testMultiplejobs2(self): - self.checkExpectedPattern("tutorial_multiplejobs2.py", - "Hello world, I have a message: first.*Hello world, I have a message: " - "second or third.*Hello world, I have a message: second or third.*Hello world," - " I have a message: last") + self.checkExpectedPattern( + "tutorial_multiplejobs2.py", + "Hello world, I have a message: first.*Hello world, I have a message: " + "second or third.*Hello world, I have a message: second or third.*Hello world," + " I have a message: last", + ) def testMultiplejobs3(self): - self.checkExpectedPattern("tutorial_multiplejobs3.py", - "Hello world, I have a message: first.*Hello world, I have a message: " - "second or third.*Hello world, I have a message: second or third.*Hello world," - " I have a message: last") + self.checkExpectedPattern( + "tutorial_multiplejobs3.py", + "Hello world, I have a message: first.*Hello world, I have a message: " + "second or third.*Hello world, I have a message: second or third.*Hello world," + " I have a message: last", + ) @pytest.mark.timeout(1200) def testPromises2(self): - self.checkExpectedOut("tutorial_promises2.py", - "['00000', '00001', '00010', '00011', '00100', '00101', '00110', '00111'," - " '01000', '01001', '01010', '01011', '01100', '01101', '01110', '01111'," - " '10000', '10001', '10010', '10011', '10100', '10101', '10110', '10111'," - " '11000', '11001', '11010', '11011', '11100', '11101', '11110', '11111']") + self.checkExpectedOut( + "tutorial_promises2.py", + "['00000', '00001', '00010', '00011', '00100', '00101', '00110', '00111'," + " '01000', '01001', '01010', '01011', '01100', '01101', '01110', '01111'," + " '10000', '10001', '10010', '10011', '10100', '10101', '10110', '10111'," + " '11000', '11001', '11010', '11011', '11100', '11101', '11110', '11111']", + ) def testQuickstart(self): - self.checkExpectedOut("tutorial_quickstart.py", "Hello, world!, here's a message: Woot\n") + self.checkExpectedOut( + "tutorial_quickstart.py", "Hello, world!, here's a message: Woot\n" + ) def testRequirements(self): self.checkExitCode("tutorial_requirements.py") def testArguments(self): - self.checkExpectedOut("tutorial_arguments.py", "Hello, world!, here's a message: Woot") + self.checkExpectedOut( + "tutorial_arguments.py", "Hello, world!, here's a message: Woot" + ) @needs_docker def testDocker(self): diff --git a/src/toil/test/jobStores/jobStoreTest.py b/src/toil/test/jobStores/jobStoreTest.py index 33453f92d7..1c88c071ab 100644 --- a/src/toil/test/jobStores/jobStoreTest.py +++ b/src/toil/test/jobStores/jobStoreTest.py @@ -28,7 +28,7 @@ from queue import Queue from tempfile import mkstemp from threading import Thread -from typing import Any, Tuple +from typing import Any from urllib.request import Request, urlopen import pytest @@ -37,20 +37,21 @@ from toil.common import Config, Toil from toil.fileStores import FileID from toil.job import Job, JobDescription, TemporaryID -from toil.jobStores.abstractJobStore import (NoSuchFileException, - NoSuchJobException) +from toil.jobStores.abstractJobStore import NoSuchFileException, NoSuchJobException from toil.jobStores.fileJobStore import FileJobStore from toil.lib.io import mkdtemp from toil.lib.memoize import memoize from toil.lib.retry import retry from toil.statsAndLogging import StatsAndLogging -from toil.test import (ToilTest, - make_tests, - needs_aws_s3, - needs_encryption, - needs_google_project, - needs_google_storage, - slow) +from toil.test import ( + ToilTest, + make_tests, + needs_aws_s3, + needs_encryption, + needs_google_project, + needs_google_storage, + slow, +) # noinspection PyPackageRequirements # (installed by `make prepare`) @@ -82,11 +83,11 @@ class Test(ToilTest, metaclass=ABCMeta): def setUpClass(cls): super().setUpClass() logging.basicConfig(level=logging.DEBUG) - logging.getLogger('boto').setLevel(logging.CRITICAL) - logging.getLogger('boto').setLevel(logging.WARNING) - logging.getLogger('boto3.resources').setLevel(logging.WARNING) - logging.getLogger('botocore.auth').setLevel(logging.WARNING) - logging.getLogger('botocore.hooks').setLevel(logging.WARNING) + logging.getLogger("boto").setLevel(logging.CRITICAL) + logging.getLogger("boto").setLevel(logging.WARNING) + logging.getLogger("boto3.resources").setLevel(logging.WARNING) + logging.getLogger("botocore.auth").setLevel(logging.WARNING) + logging.getLogger("botocore.hooks").setLevel(logging.WARNING) # The use of @memoize ensures that we only have one instance of per class even with the # generative import/export tests attempts to instantiate more. This in turn enables us to @@ -113,7 +114,7 @@ def _createJobStore(self): def setUp(self): super().setUp() - self.namePrefix = 'jobstore-test-' + str(uuid.uuid4()) + self.namePrefix = "jobstore-test-" + str(uuid.uuid4()) self.config = self._createConfig() # Jobstores to be used in testing. @@ -127,10 +128,16 @@ def setUp(self): self.jobstore_resumed_noconfig.resume() # Requirements for jobs to be created. - self.arbitraryRequirements = {'memory': 1, 'disk': 2, 'cores': 1, 'preemptible': False} + self.arbitraryRequirements = { + "memory": 1, + "disk": 2, + "cores": 1, + "preemptible": False, + } # Function to make an arbitrary new job - self.arbitraryJob = lambda: JobDescription(jobName='arbitrary', - requirements=self.arbitraryRequirements) + self.arbitraryJob = lambda: JobDescription( + jobName="arbitrary", requirements=self.arbitraryRequirements + ) self.parentJobReqs = dict(memory=12, cores=34, disk=35, preemptible=True) self.childJobReqs1 = dict(memory=23, cores=45, disk=46, preemptible=True) @@ -143,8 +150,12 @@ def tearDown(self): def testInitialState(self): """Ensure proper handling of nonexistent files.""" - self.assertFalse(self.jobstore_initialized.job_exists('nonexistentFile')) - self.assertRaises(NoSuchJobException, self.jobstore_initialized.load_job, 'nonexistentFile') + self.assertFalse(self.jobstore_initialized.job_exists("nonexistentFile")) + self.assertRaises( + NoSuchJobException, + self.jobstore_initialized.load_job, + "nonexistentFile", + ) def testJobCreation(self): """ @@ -157,8 +168,9 @@ def testJobCreation(self): jobstore = self.jobstore_initialized # Create a job and verify its existence/properties - job = JobDescription(requirements=self.parentJobReqs, - jobName='test1', unitName='onParent') + job = JobDescription( + requirements=self.parentJobReqs, jobName="test1", unitName="onParent" + ) self.assertTrue(isinstance(job.jobStoreID, TemporaryID)) jobstore.assign_job_id(job) self.assertFalse(isinstance(job.jobStoreID, TemporaryID)) @@ -167,12 +179,12 @@ def testJobCreation(self): self.assertEqual(created, job) self.assertTrue(jobstore.job_exists(job.jobStoreID)) - self.assertEqual(job.memory, self.parentJobReqs['memory']) - self.assertEqual(job.cores, self.parentJobReqs['cores']) - self.assertEqual(job.disk, self.parentJobReqs['disk']) - self.assertEqual(job.preemptible, self.parentJobReqs['preemptible']) - self.assertEqual(job.jobName, 'test1') - self.assertEqual(job.unitName, 'onParent') + self.assertEqual(job.memory, self.parentJobReqs["memory"]) + self.assertEqual(job.cores, self.parentJobReqs["cores"]) + self.assertEqual(job.disk, self.parentJobReqs["disk"]) + self.assertEqual(job.preemptible, self.parentJobReqs["preemptible"]) + self.assertEqual(job.jobName, "test1") + self.assertEqual(job.unitName, "onParent") def testConfigEquality(self): """ @@ -191,8 +203,9 @@ def testJobLoadEquality(self): """Tests that a job created via one JobStore instance can be loaded from another.""" # Create a job on the first jobstore. - jobDesc1 = JobDescription(requirements=self.parentJobReqs, - jobName='test1', unitName='onJS1') + jobDesc1 = JobDescription( + requirements=self.parentJobReqs, jobName="test1", unitName="onJS1" + ) self.jobstore_initialized.assign_job_id(jobDesc1) self.jobstore_initialized.create_job(jobDesc1) @@ -203,11 +216,13 @@ def testJobLoadEquality(self): def testChildLoadingEquality(self): """Test that loading a child job operates as expected.""" - job = JobDescription(requirements=self.parentJobReqs, - jobName='test1', unitName='onParent') + job = JobDescription( + requirements=self.parentJobReqs, jobName="test1", unitName="onParent" + ) - childJob = JobDescription(requirements=self.childJobReqs1, - jobName='test2', unitName='onChild1') + childJob = JobDescription( + requirements=self.childJobReqs1, jobName="test2", unitName="onChild1" + ) self.jobstore_initialized.assign_job_id(job) self.jobstore_initialized.assign_job_id(childJob) self.jobstore_initialized.create_job(job) @@ -215,7 +230,10 @@ def testChildLoadingEquality(self): job.addChild(childJob.jobStoreID) self.jobstore_initialized.update_job(job) - self.assertEqual(self.jobstore_initialized.load_job(list(job.allSuccessors())[0])._body, childJob._body) + self.assertEqual( + self.jobstore_initialized.load_job(list(job.allSuccessors())[0])._body, + childJob._body, + ) def testPersistantFilesToDelete(self): """ @@ -230,28 +248,35 @@ def testPersistantFilesToDelete(self): """ # Create a job. - job = JobDescription(requirements=self.parentJobReqs, - jobName='test1', unitName='onJS1') + job = JobDescription( + requirements=self.parentJobReqs, jobName="test1", unitName="onJS1" + ) self.jobstore_initialized.assign_job_id(job) self.jobstore_initialized.create_job(job) - job.filesToDelete = ['1', '2'] + job.filesToDelete = ["1", "2"] self.jobstore_initialized.update_job(job) - self.assertEqual(self.jobstore_initialized.load_job(job.jobStoreID).filesToDelete, ['1', '2']) + self.assertEqual( + self.jobstore_initialized.load_job(job.jobStoreID).filesToDelete, + ["1", "2"], + ) def testUpdateBehavior(self): """Tests the proper behavior during updating jobs.""" jobstore1 = self.jobstore_initialized jobstore2 = self.jobstore_resumed_noconfig - job1 = JobDescription(requirements=self.parentJobReqs, - jobName='test1', unitName='onParent') + job1 = JobDescription( + requirements=self.parentJobReqs, jobName="test1", unitName="onParent" + ) - childJob1 = JobDescription(requirements=self.childJobReqs1, - jobName='test2', unitName='onChild1') + childJob1 = JobDescription( + requirements=self.childJobReqs1, jobName="test2", unitName="onChild1" + ) - childJob2 = JobDescription(requirements=self.childJobReqs2, - jobName='test3', unitName='onChild2') + childJob2 = JobDescription( + requirements=self.childJobReqs2, jobName="test3", unitName="onChild2" + ) jobstore1.assign_job_id(job1) jobstore1.create_job(job1) @@ -270,7 +295,9 @@ def testUpdateBehavior(self): # Check equivalence between jobstore1 and jobstore2. # While job1 and job2 share a jobStoreID, job1 has not been "refreshed" to show the newly added child jobs. - self.assertNotEqual(sorted(job2.allSuccessors()), sorted(job1.allSuccessors())) + self.assertNotEqual( + sorted(job2.allSuccessors()), sorted(job1.allSuccessors()) + ) # Reload parent job on jobstore, "refreshing" the job. job1 = jobstore1.load_job(job1.jobStoreID) @@ -287,18 +314,21 @@ def testJobDeletions(self): """Tests the consequences of deleting jobs.""" # A local jobstore object for testing. jobstore = self.jobstore_initialized - job = JobDescription(requirements=self.parentJobReqs, - jobName='test1', unitName='onJob') + job = JobDescription( + requirements=self.parentJobReqs, jobName="test1", unitName="onJob" + ) # Create job jobstore.assign_job_id(job) jobstore.create_job(job) # Create child Jobs - child1 = JobDescription(requirements=self.childJobReqs1, - jobName='test2', unitName='onChild1') + child1 = JobDescription( + requirements=self.childJobReqs1, jobName="test2", unitName="onChild1" + ) - child2 = JobDescription(requirements=self.childJobReqs2, - jobName='test3', unitName='onChild2') + child2 = JobDescription( + requirements=self.childJobReqs2, jobName="test3", unitName="onChild2" + ) # Add children to parent. jobstore.assign_job_id(child1) @@ -320,7 +350,10 @@ def testJobDeletions(self): # jobs that show up are a subset of all existing jobs. If we had deleted jobs before # this we would have to worry about ghost jobs appearing and this assertion would not # be valid - self.assertTrue({j.jobStoreID for j in (childJobs + [job])} >= {j.jobStoreID for j in jobstore.jobs()}) + self.assertTrue( + {j.jobStoreID for j in (childJobs + [job])} + >= {j.jobStoreID for j in jobstore.jobs()} + ) # Test job deletions # First delete parent, this should have no effect on the children @@ -333,12 +366,14 @@ def testJobDeletions(self): self.assertTrue(jobstore.job_exists(childJob.jobStoreID)) jobstore.delete_job(childJob.jobStoreID) self.assertFalse(jobstore.job_exists(childJob.jobStoreID)) - self.assertRaises(NoSuchJobException, jobstore.load_job, childJob.jobStoreID) + self.assertRaises( + NoSuchJobException, jobstore.load_job, childJob.jobStoreID + ) try: - with jobstore.read_shared_file_stream('missing') as _: + with jobstore.read_shared_file_stream("missing") as _: pass - self.fail('Expecting NoSuchFileException') + self.fail("Expecting NoSuchFileException") except NoSuchFileException: pass @@ -347,36 +382,40 @@ def testSharedFiles(self): jobstore1 = self.jobstore_initialized jobstore2 = self.jobstore_resumed_noconfig - bar = b'bar' + bar = b"bar" - with jobstore1.write_shared_file_stream('foo') as f: + with jobstore1.write_shared_file_stream("foo") as f: f.write(bar) # ... read that file on worker, ... - with jobstore2.read_shared_file_stream('foo') as f: + with jobstore2.read_shared_file_stream("foo") as f: self.assertEqual(bar, f.read()) # ... and read it again on jobstore1. - with jobstore1.read_shared_file_stream('foo') as f: + with jobstore1.read_shared_file_stream("foo") as f: self.assertEqual(bar, f.read()) - with jobstore1.write_shared_file_stream('nonEncrypted', encrypted=False) as f: + with jobstore1.write_shared_file_stream( + "nonEncrypted", encrypted=False + ) as f: f.write(bar) - self.assertUrl(jobstore1.get_shared_public_url('nonEncrypted')) - self.assertRaises(NoSuchFileException, jobstore1.get_shared_public_url, 'missing') + self.assertUrl(jobstore1.get_shared_public_url("nonEncrypted")) + self.assertRaises( + NoSuchFileException, jobstore1.get_shared_public_url, "missing" + ) def testReadWriteSharedFilesTextMode(self): """Checks if text mode is compatible for shared file streams.""" jobstore1 = self.jobstore_initialized jobstore2 = self.jobstore_resumed_noconfig - bar = 'bar' + bar = "bar" - with jobstore1.write_shared_file_stream('foo', encoding='utf-8') as f: + with jobstore1.write_shared_file_stream("foo", encoding="utf-8") as f: f.write(bar) - with jobstore2.read_shared_file_stream('foo', encoding='utf-8') as f: + with jobstore2.read_shared_file_stream("foo", encoding="utf-8") as f: self.assertEqual(bar, f.read()) - with jobstore1.read_shared_file_stream('foo', encoding='utf-8') as f: + with jobstore1.read_shared_file_stream("foo", encoding="utf-8") as f: self.assertEqual(bar, f.read()) def testReadWriteFileStreamTextMode(self): @@ -386,19 +425,22 @@ def testReadWriteFileStreamTextMode(self): jobstore.assign_job_id(job) jobstore.create_job(job) - foo = 'foo' - bar = 'bar' + foo = "foo" + bar = "bar" - with jobstore.write_file_stream(job.jobStoreID, encoding='utf-8') as (f, fileID): + with jobstore.write_file_stream(job.jobStoreID, encoding="utf-8") as ( + f, + fileID, + ): f.write(foo) - with jobstore.read_file_stream(fileID, encoding='utf-8') as f: + with jobstore.read_file_stream(fileID, encoding="utf-8") as f: self.assertEqual(foo, f.read()) - with jobstore.update_file_stream(fileID, encoding='utf-8') as f: + with jobstore.update_file_stream(fileID, encoding="utf-8") as f: f.write(bar) - with jobstore.read_file_stream(fileID, encoding='utf-8') as f: + with jobstore.read_file_stream(fileID, encoding="utf-8") as f: self.assertEqual(bar, f.read()) def testPerJobFiles(self): @@ -407,19 +449,22 @@ def testPerJobFiles(self): jobstore2 = self.jobstore_resumed_noconfig # Create jobNodeOnJS1 - jobOnJobStore1 = JobDescription(requirements=self.parentJobReqs, - jobName='test1', unitName='onJobStore1') + jobOnJobStore1 = JobDescription( + requirements=self.parentJobReqs, jobName="test1", unitName="onJobStore1" + ) # First recreate job jobstore1.assign_job_id(jobOnJobStore1) jobstore1.create_job(jobOnJobStore1) - fileOne = jobstore2.get_empty_file_store_id(jobOnJobStore1.jobStoreID, cleanup=True) + fileOne = jobstore2.get_empty_file_store_id( + jobOnJobStore1.jobStoreID, cleanup=True + ) # Check file exists self.assertTrue(jobstore2.file_exists(fileOne)) self.assertTrue(jobstore1.file_exists(fileOne)) - one = b'one' - two = b'two' - three = b'three' + one = b"one" + two = b"two" + three = b"three" # ... write to the file on jobstore2, ... with jobstore2.update_file_stream(fileOne) as f: f.write(one) @@ -431,20 +476,22 @@ def testPerJobFiles(self): fh, path = mkstemp() try: os.close(fh) - tmpPath = path + '.read-only' + tmpPath = path + ".read-only" jobstore1.read_file(fileOne, tmpPath) try: shutil.copyfile(tmpPath, path) finally: os.unlink(tmpPath) - with open(path, 'rb+') as f: + with open(path, "rb+") as f: self.assertEqual(f.read(), one) # Write a different string to the local file ... f.seek(0) f.truncate(0) f.write(two) # ... and create a second file from the local file. - fileTwo = jobstore1.write_file(path, jobOnJobStore1.jobStoreID, cleanup=True) + fileTwo = jobstore1.write_file( + path, jobOnJobStore1.jobStoreID, cleanup=True + ) with jobstore2.read_file_stream(fileTwo) as f: self.assertEqual(f.read(), two) # Now update the first file from the local file ... @@ -454,7 +501,9 @@ def testPerJobFiles(self): finally: os.unlink(path) # Create a third file to test the last remaining method. - with jobstore2.write_file_stream(jobOnJobStore1.jobStoreID, cleanup=True) as (f, fileThree): + with jobstore2.write_file_stream( + jobOnJobStore1.jobStoreID, cleanup=True + ) as (f, fileThree): f.write(three) with jobstore1.read_file_stream(fileThree) as f: self.assertEqual(f.read(), three) @@ -465,11 +514,11 @@ def testPerJobFiles(self): # for store in jobstore2, jobstore1: self.assertFalse(store.file_exists(fileOne)) - self.assertRaises(NoSuchFileException, store.read_file, fileOne, '') + self.assertRaises(NoSuchFileException, store.read_file, fileOne, "") try: with store.read_file_stream(fileOne) as _: pass - self.fail('Expecting NoSuchFileException') + self.fail("Expecting NoSuchFileException") except NoSuchFileException: pass @@ -478,16 +527,17 @@ def testStatsAndLogging(self): jobstore1 = self.jobstore_initialized jobstore2 = self.jobstore_resumed_noconfig - jobOnJobStore1 = JobDescription(requirements=self.parentJobReqs, - jobName='test1', unitName='onJobStore1') + jobOnJobStore1 = JobDescription( + requirements=self.parentJobReqs, jobName="test1", unitName="onJobStore1" + ) jobstore1.assign_job_id(jobOnJobStore1) jobstore1.create_job(jobOnJobStore1) # Test stats and logging stats = None - one = b'one' - two = b'two' + one = b"one" + two = b"two" # Allows stats to be read/written to/from in read/writeStatsAndLogging. def callback(f2): @@ -504,7 +554,9 @@ def callback(f2): jobstore2.write_logs(one) self.assertEqual(1, jobstore1.read_logs(callback)) self.assertEqual({one}, stats) - self.assertEqual(0, jobstore1.read_logs(callback)) # read_logs purges saved stats etc + self.assertEqual( + 0, jobstore1.read_logs(callback) + ) # read_logs purges saved stats etc jobstore2.write_logs(one) jobstore2.write_logs(two) @@ -528,19 +580,21 @@ def callback(f2): def testWriteLogFiles(self): """Test writing log files.""" - jobNames = ['testStatsAndLogging_writeLogFiles'] - jobLogList = ['string', b'bytes', '', b'newline\n'] + jobNames = ["testStatsAndLogging_writeLogFiles"] + jobLogList = ["string", b"bytes", "", b"newline\n"] config = self._createConfig() - setattr(config, 'writeLogs', self._createTempDir()) - setattr(config, 'writeLogsGzip', None) + setattr(config, "writeLogs", self._createTempDir()) + setattr(config, "writeLogsGzip", None) StatsAndLogging.writeLogFiles(jobNames, jobLogList, config) - jobLogFile = os.path.join(config.writeLogs, jobNames[0] + '_000.log') + jobLogFile = os.path.join(config.writeLogs, jobNames[0] + "_000.log") # The log directory should get exactly one file, names after this # easy job name with no replacements needed. - self.assertEqual(os.listdir(config.writeLogs), [os.path.basename(jobLogFile)]) + self.assertEqual( + os.listdir(config.writeLogs), [os.path.basename(jobLogFile)] + ) self.assertTrue(os.path.isfile(jobLogFile)) with open(jobLogFile) as f: - self.assertEqual(f.read(), 'string\nbytes\n\nnewline\n') + self.assertEqual(f.read(), "string\nbytes\n\nnewline\n") def testBatchCreate(self): """Test creation of many jobs.""" @@ -549,8 +603,11 @@ def testBatchCreate(self): jobs = [] with jobstore.batch(): for i in range(100): - overlargeJob = JobDescription(requirements=jobRequirements, - jobName='test-overlarge', unitName='onJobStore') + overlargeJob = JobDescription( + requirements=jobRequirements, + jobName="test-overlarge", + unitName="onJobStore", + ) jobstore.assign_job_id(overlargeJob) jobstore.create_job(overlargeJob) jobs.append(overlargeJob) @@ -618,16 +675,16 @@ def _externalStore(self): try: store = self.externalStoreCache[self] except KeyError: - logger.debug('Creating new external store for %s', self) + logger.debug("Creating new external store for %s", self) store = self.externalStoreCache[self] = self._createExternalStore() else: - logger.debug('Reusing external store for %s', self) + logger.debug("Reusing external store for %s", self) return store @classmethod def cleanUpExternalStores(cls): for test, store in cls.externalStoreCache.items(): - logger.debug('Cleaning up external store for %s.', test) + logger.debug("Cleaning up external store for %s.", test) test._cleanUpExternalStore(store) mpTestPartSize = 5 << 20 @@ -637,9 +694,11 @@ def makeImportExportTests(cls): testClasses = [FileJobStoreTest, AWSJobStoreTest, GoogleJobStoreTest] - activeTestClassesByName = {testCls.__name__: testCls - for testCls in testClasses - if not getattr(testCls, '__unittest_skip__', False)} + activeTestClassesByName = { + testCls.__name__: testCls + for testCls in testClasses + if not getattr(testCls, "__unittest_skip__", False) + } def testImportExportFile(self, otherCls, size, moveExports): """ @@ -659,7 +718,7 @@ def testImportExportFile(self, otherCls, size, moveExports): # The string in otherCls() is arbitrary as long as it returns a class that has access # to ._externalStore() and ._prepareTestFile() - other = otherCls('testSharedFiles') + other = otherCls("testSharedFiles") store = other._externalStore() srcUrl, srcMd5 = other._prepareTestFile(store, size) @@ -674,9 +733,11 @@ def testImportExportFile(self, otherCls, size, moveExports): self.jobstore_initialized.export_file(jobStoreFileID, dstUrl) self.assertEqual(fileMD5, other._hashTestFile(dstUrl)) - if otherCls.__name__ == 'FileJobStoreTest': + if otherCls.__name__ == "FileJobStoreTest": if isinstance(self.jobstore_initialized, FileJobStore): - jobStorePath = self.jobstore_initialized._get_file_path_from_id(jobStoreFileID) + jobStorePath = self.jobstore_initialized._get_file_path_from_id( + jobStoreFileID + ) jobStoreHasLink = os.path.islink(jobStorePath) if self.jobstore_initialized.moveExports: # Ensure the export performed a move / link @@ -690,14 +751,20 @@ def testImportExportFile(self, otherCls, size, moveExports): os.remove(srcUrl[7:]) os.remove(dstUrl[7:]) - make_tests(testImportExportFile, cls, otherCls=activeTestClassesByName, - size=dict(zero=0, - one=1, - oneMiB=2 ** 20, - partSizeMinusOne=cls.mpTestPartSize - 1, - partSize=cls.mpTestPartSize, - partSizePlusOne=cls.mpTestPartSize + 1), - moveExports={'deactivated': None, 'activated': True}) + make_tests( + testImportExportFile, + cls, + otherCls=activeTestClassesByName, + size=dict( + zero=0, + one=1, + oneMiB=2**20, + partSizeMinusOne=cls.mpTestPartSize - 1, + partSize=cls.mpTestPartSize, + partSizePlusOne=cls.mpTestPartSize + 1, + ), + moveExports={"deactivated": None, "activated": True}, + ) def testImportSharedFile(self, otherCls): """ @@ -708,33 +775,36 @@ def testImportSharedFile(self, otherCls): """ # Prepare test file in other job store self.jobstore_initialized.part_size = cls.mpTestPartSize - other = otherCls('testSharedFiles') + other = otherCls("testSharedFiles") store = other._externalStore() srcUrl, srcMd5 = other._prepareTestFile(store, 42) # Import into job store under test - self.assertIsNone(self.jobstore_initialized.import_file(srcUrl, shared_file_name='foo')) - with self.jobstore_initialized.read_shared_file_stream('foo') as f: + self.assertIsNone( + self.jobstore_initialized.import_file( + srcUrl, shared_file_name="foo" + ) + ) + with self.jobstore_initialized.read_shared_file_stream("foo") as f: fileMD5 = hashlib.md5(f.read()).hexdigest() self.assertEqual(fileMD5, srcMd5) - if otherCls.__name__ == 'FileJobStoreTest': # Remove local Files + if otherCls.__name__ == "FileJobStoreTest": # Remove local Files os.remove(srcUrl[7:]) - make_tests(testImportSharedFile, - cls, - otherCls=activeTestClassesByName) + make_tests(testImportSharedFile, cls, otherCls=activeTestClassesByName) def testImportHttpFile(self): - '''Test importing a file over HTTP.''' - http = socketserver.TCPServer(('', 0), StubHttpRequestHandler) + """Test importing a file over HTTP.""" + http = socketserver.TCPServer(("", 0), StubHttpRequestHandler) try: httpThread = threading.Thread(target=http.serve_forever) httpThread.start() try: assignedPort = http.server_address[1] - url = 'http://localhost:%d' % assignedPort + url = "http://localhost:%d" % assignedPort with self.jobstore_initialized.read_file_stream( - self.jobstore_initialized.import_file(url)) as readable: + self.jobstore_initialized.import_file(url) + ) as readable: f1 = readable.read() f2 = StubHttpRequestHandler.fileContents if isinstance(f1, bytes) and not isinstance(f2, bytes): @@ -749,20 +819,25 @@ def testImportHttpFile(self): http.server_close() def testImportFtpFile(self): - '''Test importing a file over FTP''' - ftpfile = {'name': 'foo', 'content': 'foo bar baz qux'} + """Test importing a file over FTP""" + ftpfile = {"name": "foo", "content": "foo bar baz qux"} ftp = FTPStubServer(0) ftp.run() try: ftp.add_file(**ftpfile) assignedPort = ftp.server.server_address[1] - url = 'ftp://user1:passwd@localhost:%d/%s' % (assignedPort, ftpfile['name']) - with self.jobstore_initialized.read_file_stream(self.jobstore_initialized.import_file(url)) as readable: + url = "ftp://user1:passwd@localhost:%d/%s" % ( + assignedPort, + ftpfile["name"], + ) + with self.jobstore_initialized.read_file_stream( + self.jobstore_initialized.import_file(url) + ) as readable: imported_content = readable.read() # python 2/3 string/bytestring compat if isinstance(imported_content, bytes): - imported_content = imported_content.decode('utf-8') - self.assertEqual(imported_content, ftpfile['content']) + imported_content = imported_content.decode("utf-8") + self.assertEqual(imported_content, ftpfile["content"]) finally: ftp.stop() @@ -778,12 +853,19 @@ def testFileDeletion(self): job = self.arbitraryJob() self.jobstore_initialized.assign_job_id(job) self.jobstore_initialized.create_job(job) - fileIDs = [self.jobstore_initialized.get_empty_file_store_id(job.jobStoreID, cleanup=True) for _ in - range(0, numFiles)] + fileIDs = [ + self.jobstore_initialized.get_empty_file_store_id( + job.jobStoreID, cleanup=True + ) + for _ in range(0, numFiles) + ] self.jobstore_initialized.delete_job(job.jobStoreID) for fileID in fileIDs: # NB: the fooStream() methods return context managers - self.assertRaises(NoSuchFileException, self.jobstore_initialized.read_file_stream(fileID).__enter__) + self.assertRaises( + NoSuchFileException, + self.jobstore_initialized.read_file_stream(fileID).__enter__, + ) @slow def testMultipartUploads(self): @@ -818,9 +900,10 @@ def checksumThreadFn(): checksumThread.start() try: # Should not block. On Linux, /dev/random blocks when it's running low on entropy - with open('/dev/urandom', 'rb') as readable: - with self.jobstore_initialized.write_file_stream(job.jobStoreID, cleanup=True) as ( - writable, fileId): + with open("/dev/urandom", "rb") as readable: + with self.jobstore_initialized.write_file_stream( + job.jobStoreID, cleanup=True + ) as (writable, fileId): for i in range(int(partSize * partsPerFile / bufSize)): buf = readable.read(bufSize) checksumQueue.put(buf) @@ -845,13 +928,15 @@ def checksumThreadFn(): checksum = hashlib.md5() fh, path = mkstemp() try: - with os.fdopen(fh, 'wb+') as writable: - with open('/dev/urandom', 'rb') as readable: + with os.fdopen(fh, "wb+") as writable: + with open("/dev/urandom", "rb") as readable: for i in range(int(partSize * partsPerFile / bufSize)): buf = readable.read(bufSize) writable.write(buf) checksum.update(buf) - fileId = self.jobstore_initialized.write_file(path, job.jobStoreID, cleanup=True) + fileId = self.jobstore_initialized.write_file( + path, job.jobStoreID, cleanup=True + ) finally: os.unlink(path) before = checksum.hexdigest() @@ -869,14 +954,18 @@ def checksumThreadFn(): self.jobstore_initialized.delete_job(job.jobStoreID) def testZeroLengthFiles(self): - '''Test reading and writing of empty files.''' + """Test reading and writing of empty files.""" job = self.arbitraryJob() self.jobstore_initialized.assign_job_id(job) self.jobstore_initialized.create_job(job) - nullFile = self.jobstore_initialized.write_file('/dev/null', job.jobStoreID, cleanup=True) + nullFile = self.jobstore_initialized.write_file( + "/dev/null", job.jobStoreID, cleanup=True + ) with self.jobstore_initialized.read_file_stream(nullFile) as f: assert not f.read() - with self.jobstore_initialized.write_file_stream(job.jobStoreID, cleanup=True) as (f, nullStream): + with self.jobstore_initialized.write_file_stream( + job.jobStoreID, cleanup=True + ) as (f, nullStream): pass with self.jobstore_initialized.read_file_stream(nullStream) as f: assert not f.read() @@ -884,12 +973,12 @@ def testZeroLengthFiles(self): @slow def testLargeFile(self): - '''Test the reading and writing of large files.''' + """Test the reading and writing of large files.""" # Write a large file. dirPath = self._createTempDir() - filePath = os.path.join(dirPath, 'large') + filePath = os.path.join(dirPath, "large") hashIn = hashlib.md5() - with open(filePath, 'wb') as f: + with open(filePath, "wb") as f: for i in range(0, 10): buf = os.urandom(self._partSize()) f.write(buf) @@ -899,7 +988,9 @@ def testLargeFile(self): job = self.arbitraryJob() self.jobstore_initialized.assign_job_id(job) self.jobstore_initialized.create_job(job) - jobStoreFileID = self.jobstore_initialized.write_file(filePath, job.jobStoreID, cleanup=True) + jobStoreFileID = self.jobstore_initialized.write_file( + filePath, job.jobStoreID, cleanup=True + ) # Remove the local file. os.unlink(filePath) @@ -909,7 +1000,7 @@ def testLargeFile(self): # Reread the file to confirm success. hashOut = hashlib.md5() - with open(filePath, 'rb') as f: + with open(filePath, "rb") as f: while True: buf = f.read(self._partSize()) if not buf: @@ -927,8 +1018,8 @@ def fetch_url(self, url: str) -> None: def assertUrl(self, url): - prefix, path = url.split(':', 1) - if prefix == 'file': + prefix, path = url.split(":", 1) + if prefix == "file": self.assertTrue(os.path.exists(path)) else: try: @@ -966,8 +1057,7 @@ def testCleanCache(self): self.assertEqual(len(list(jobstore.jobs())), 101) # See how long it takes to clean with cache - jobCache = {job.jobStoreID: job - for job in jobstore.jobs()} + jobCache = {job.jobStoreID: job for job in jobstore.jobs()} cacheStart = time.time() jobstore.clean(jobCache) cacheEnd = time.time() @@ -982,13 +1072,15 @@ def testCleanCache(self): # NB: the 'thread' method seems to be needed here to actually # ensure the timeout is raised, probably because the only # "live" thread doesn't hold the GIL. - @pytest.mark.timeout(45, method='thread') + @pytest.mark.timeout(45, method="thread") def testPartialReadFromStream(self): """Test whether readFileStream will deadlock on a partial read.""" job = self.arbitraryJob() self.jobstore_initialized.assign_job_id(job) self.jobstore_initialized.create_job(job) - with self.jobstore_initialized.write_file_stream(job.jobStoreID, cleanup=True) as (f, fileID): + with self.jobstore_initialized.write_file_stream( + job.jobStoreID, cleanup=True + ) as (f, fileID): # Write enough data to make sure the writer thread # will get blocked on the write. Technically anything # greater than the pipe buffer size plus the libc @@ -996,7 +1088,7 @@ def testPartialReadFromStream(self): # but this gives us a lot of extra room just to be sure. # python 3 requires self.fileContents to be a bytestring - a = b'a' + a = b"a" f.write(a * 300000) with self.jobstore_initialized.read_file_stream(fileID) as f: self.assertEqual(f.read(1), a) @@ -1069,9 +1161,9 @@ def tearDown(self): def _createConfig(self): config = super()._createConfig() - sseKeyFile = os.path.join(self.sseKeyDir, 'keyFile') - with open(sseKeyFile, 'w') as f: - f.write('01234567890123456789012345678901') + sseKeyFile = os.path.join(self.sseKeyDir, "keyFile") + with open(sseKeyFile, "w") as f: + f.write("01234567890123456789012345678901") config.sseKey = sseKeyFile # config.attrib['sse_key'] = sseKeyFile return config @@ -1081,9 +1173,11 @@ def testEncrypted(self): Create an encrypted file. Read it in encrypted mode then try with encryption off to ensure that it fails. """ - phrase = b'This file is encrypted.' - fileName = 'foo' - with self.jobstore_initialized.write_shared_file_stream(fileName, encrypted=True) as f: + phrase = b"This file is encrypted." + fileName = "foo" + with self.jobstore_initialized.write_shared_file_stream( + fileName, encrypted=True + ) as f: f.write(phrase) with self.jobstore_initialized.read_shared_file_stream(fileName) as f: self.assertEqual(phrase, f.read()) @@ -1094,7 +1188,9 @@ def testEncrypted(self): with self.jobstore_initialized.read_shared_file_stream(fileName) as f: self.assertEqual(phrase, f.read()) except AssertionError as e: - self.assertEqual("Content is encrypted but no key was provided.", e.args[0]) + self.assertEqual( + "Content is encrypted but no key was provided.", e.args[0] + ) else: self.fail("Read encryption content with encryption off.") @@ -1110,21 +1206,21 @@ def _corruptJobStore(self): shutil.rmtree(self.jobstore_initialized.jobStoreDir) def _prepareTestFile(self, dirPath, size=None): - fileName = 'testfile_%s' % uuid.uuid4() + fileName = "testfile_%s" % uuid.uuid4() localFilePath = dirPath + fileName - url = 'file://%s' % localFilePath + url = "file://%s" % localFilePath if size is None: return url else: content = os.urandom(size) - with open(localFilePath, 'wb') as writable: + with open(localFilePath, "wb") as writable: writable.write(content) return url, hashlib.md5(content).hexdigest() def _hashTestFile(self, url): localFilePath = FileJobStore._extract_path_from_url(urlparse.urlparse(url)) - with open(localFilePath, 'rb') as f: + with open(localFilePath, "rb") as f: return hashlib.md5(f.read()).hexdigest() def _createExternalStore(self): @@ -1141,7 +1237,9 @@ def testPreserveFileName(self): job = self.arbitraryJob() self.jobstore_initialized.assign_job_id(job) self.jobstore_initialized.create_job(job) - fileID = self.jobstore_initialized.write_file(path, job.jobStoreID, cleanup=True) + fileID = self.jobstore_initialized.write_file( + path, job.jobStoreID, cleanup=True + ) self.assertTrue(fileID.endswith(os.path.basename(path))) finally: os.unlink(path) @@ -1152,12 +1250,19 @@ def test_jobstore_init_preserves_symlink_path(self): original_filestore = None try: original_filestore = self._createExternalStore() - dir_symlinked_to_original_filestore = f'{original_filestore}-am-i-real' + dir_symlinked_to_original_filestore = f"{original_filestore}-am-i-real" os.symlink(original_filestore, dir_symlinked_to_original_filestore) - filejobstore_using_symlink = FileJobStore(dir_symlinked_to_original_filestore, fanOut=2) - self.assertEqual(dir_symlinked_to_original_filestore, filejobstore_using_symlink.jobStoreDir) + filejobstore_using_symlink = FileJobStore( + dir_symlinked_to_original_filestore, fanOut=2 + ) + self.assertEqual( + dir_symlinked_to_original_filestore, + filejobstore_using_symlink.jobStoreDir, + ) finally: - if dir_symlinked_to_original_filestore and os.path.exists(dir_symlinked_to_original_filestore): + if dir_symlinked_to_original_filestore and os.path.exists( + dir_symlinked_to_original_filestore + ): os.unlink(dir_symlinked_to_original_filestore) if original_filestore and os.path.exists(original_filestore): shutil.rmtree(original_filestore) @@ -1171,21 +1276,23 @@ def test_jobstore_does_not_leak_symlinks(self): try: # Grab a temp directory to make files in. Make sure it's on the # same device as everything else. - temp_dir = os.path.abspath(self.namePrefix + '-import') + temp_dir = os.path.abspath(self.namePrefix + "-import") os.mkdir(temp_dir) - to_import = os.path.join(temp_dir, 'import-me') - with open(to_import, 'w') as f: - f.write('test') + to_import = os.path.join(temp_dir, "import-me") + with open(to_import, "w") as f: + f.write("test") # And a temp directory next to the job store to download to - download_dir = os.path.abspath(self.namePrefix + '-dl') + download_dir = os.path.abspath(self.namePrefix + "-dl") os.mkdir(download_dir) # Import it as a symlink - file_id = self.jobstore_initialized.import_file('file://' + to_import, symlink=True) + file_id = self.jobstore_initialized.import_file( + "file://" + to_import, symlink=True + ) # Take it out as a hard link or copy - download_to = os.path.join(download_dir, 'downloaded') + download_to = os.path.join(download_dir, "downloaded") self.jobstore_initialized.read_file(file_id, download_to) # Make sure it isn't a symlink @@ -1210,7 +1317,9 @@ def test_file_link_imports(self): for link_imports in [True, False]: self.jobstore_initialized.linkImports = link_imports # Import into job store under test - jobStoreFileID = self.jobstore_initialized.import_file(srcUrl, symlink=symlink) + jobStoreFileID = self.jobstore_initialized.import_file( + srcUrl, symlink=symlink + ) self.assertTrue(isinstance(jobStoreFileID, FileID)) with self.jobstore_initialized.read_file_stream(jobStoreFileID) as f: # gets abs path @@ -1230,14 +1339,16 @@ def test_symlink_read_control(self): """ Test that files are read by symlink when expected """ - + for should_link in (False, True): # Configure a jobstore to symlink out reads or not, as appropriate config = self._createConfig() config.symlink_job_store_reads = should_link - store = FileJobStore(self.namePrefix + ("-link" if should_link else "-nolink")) + store = FileJobStore( + self.namePrefix + ("-link" if should_link else "-nolink") + ) store.initialize(config) - + # Put something in the job store src_url, _ = self._prepareTestFile(self._externalStore(), 1) file_id = store.import_file(src_url, symlink=False) @@ -1245,23 +1356,23 @@ def test_symlink_read_control(self): # Read it out, accepting a symlink dest_dir = self._createTempDir() dest_path = os.path.join(dest_dir, "file.dat") - store.read_file(file_id, dest_path, symlink = True) + store.read_file(file_id, dest_path, symlink=True) # Make sure we get a symlink exactly when configured to assert os.path.exists(dest_path) assert os.path.islink(dest_path) == should_link - @needs_google_project @needs_google_storage @pytest.mark.xfail class GoogleJobStoreTest(AbstractJobStoreTest.Test): - projectID = os.getenv('TOIL_GOOGLE_PROJECTID') + projectID = os.getenv("TOIL_GOOGLE_PROJECTID") headers = {"x-goog-project-id": projectID} def _createJobStore(self): from toil.jobStores.googleJobStore import GoogleJobStore + return GoogleJobStore(GoogleJobStoreTest.projectID + ":" + self.namePrefix) def _corruptJobStore(self): @@ -1271,24 +1382,31 @@ def _corruptJobStore(self): def _prepareTestFile(self, bucket, size=None): from toil.jobStores.googleJobStore import GoogleJobStore - fileName = 'testfile_%s' % uuid.uuid4() - url = f'gs://{bucket.name}/{fileName}' + + fileName = "testfile_%s" % uuid.uuid4() + url = f"gs://{bucket.name}/{fileName}" if size is None: return url - with open('/dev/urandom', 'rb') as readable: + with open("/dev/urandom", "rb") as readable: contents = str(readable.read(size)) - GoogleJobStore._write_to_url(BytesIO(bytes(contents, 'utf-8')), urlparse.urlparse(url)) + GoogleJobStore._write_to_url( + BytesIO(bytes(contents, "utf-8")), urlparse.urlparse(url) + ) return url, hashlib.md5(contents.encode()).hexdigest() def _hashTestFile(self, url): from toil.jobStores.googleJobStore import GoogleJobStore - contents = GoogleJobStore._get_blob_from_url(urlparse.urlparse(url)).download_as_string() + + contents = GoogleJobStore._get_blob_from_url( + urlparse.urlparse(url) + ).download_as_string() return hashlib.md5(contents).hexdigest() @google_retry def _createExternalStore(self): from google.cloud import storage - bucketName = ("import-export-test-" + str(uuid.uuid4())) + + bucketName = "import-export-test-" + str(uuid.uuid4()) storageClient = storage.Client() return storageClient.create_bucket(bucketName) @@ -1307,11 +1425,13 @@ def _cleanUpExternalStore(self, bucket): class AWSJobStoreTest(AbstractJobStoreTest.Test): def _createJobStore(self): from toil.jobStores.aws.jobStore import AWSJobStore + partSize = self._partSize() - return AWSJobStore(self.awsRegion() + ':' + self.namePrefix, partSize=partSize) + return AWSJobStore(self.awsRegion() + ":" + self.namePrefix, partSize=partSize) def _corruptJobStore(self): from toil.jobStores.aws.jobStore import AWSJobStore + assert isinstance(self.jobstore_initialized, AWSJobStore) # type hinting self.jobstore_initialized.destroy() @@ -1327,47 +1447,63 @@ def testSDBDomainsDeletedOnFailedJobstoreBucketCreation(self): from toil.lib.aws.session import establish_boto3_session from toil.lib.aws.utils import retry_s3 - externalAWSLocation = 'us-west-1' - for testRegion in 'us-east-1', 'us-west-2': + externalAWSLocation = "us-west-1" + for testRegion in "us-east-1", "us-west-2": # We run this test twice, once with the default s3 server us-east-1 as the test region # and once with another server (us-west-2). The external server is always us-west-1. # This incidentally tests that the BucketLocationConflictException is thrown when using # both the default, and a non-default server. testJobStoreUUID = str(uuid.uuid4()) # Create the bucket at the external region - bucketName = 'domain-test-' + testJobStoreUUID + '--files' - client = establish_boto3_session().client('s3', region_name=externalAWSLocation) - resource = establish_boto3_session().resource('s3', region_name=externalAWSLocation) + bucketName = "domain-test-" + testJobStoreUUID + "--files" + client = establish_boto3_session().client( + "s3", region_name=externalAWSLocation + ) + resource = establish_boto3_session().resource( + "s3", region_name=externalAWSLocation + ) for attempt in retry_s3(delays=(2, 5, 10, 30, 60), timeout=600): with attempt: # Create the bucket at the home region - client.create_bucket(Bucket=bucketName, - CreateBucketConfiguration={'LocationConstraint': externalAWSLocation}) - - owner_tag = os.environ.get('TOIL_OWNER_TAG') + client.create_bucket( + Bucket=bucketName, + CreateBucketConfiguration={ + "LocationConstraint": externalAWSLocation + }, + ) + + owner_tag = os.environ.get("TOIL_OWNER_TAG") if owner_tag: for attempt in retry_s3(delays=(1, 1, 2, 4, 8, 16), timeout=33): with attempt: bucket_tagging = resource.BucketTagging(bucketName) - bucket_tagging.put(Tagging={'TagSet': [{'Key': 'Owner', 'Value': owner_tag}]}) - - options = Job.Runner.getDefaultOptions('aws:' + testRegion + ':domain-test-' + testJobStoreUUID) - options.logLevel = 'DEBUG' + bucket_tagging.put( + Tagging={"TagSet": [{"Key": "Owner", "Value": owner_tag}]} + ) + + options = Job.Runner.getDefaultOptions( + "aws:" + testRegion + ":domain-test-" + testJobStoreUUID + ) + options.logLevel = "DEBUG" try: with Toil(options) as toil: pass except BucketLocationConflictException: # Catch the expected BucketLocationConflictException and ensure that the bound # domains don't exist in SDB. - sdb = establish_boto3_session().client(region_name=self.awsRegion(), service_name="sdb") + sdb = establish_boto3_session().client( + region_name=self.awsRegion(), service_name="sdb" + ) next_token = None allDomainNames = [] while True: if next_token is None: domains = sdb.list_domains(MaxNumberOfDomains=100) else: - domains = sdb.list_domains(MaxNumberOfDomains=100, NextToken=next_token) + domains = sdb.list_domains( + MaxNumberOfDomains=100, NextToken=next_token + ) allDomainNames.extend(domains["DomainNames"]) next_token = domains.get("NextToken") if next_token is None: @@ -1382,7 +1518,10 @@ def testSDBDomainsDeletedOnFailedJobstoreBucketCreation(self): client.delete_bucket(Bucket=bucketName) except ClientError as e: # The actual HTTP code of the error is in status. - if e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 404: + if ( + e.response.get("ResponseMetadata", {}).get("HTTPStatusCode") + == 404 + ): # The bucket doesn't exist; maybe a failed delete actually succeeded. pass else: @@ -1391,25 +1530,29 @@ def testSDBDomainsDeletedOnFailedJobstoreBucketCreation(self): @slow def testInlinedFiles(self): from toil.jobStores.aws.jobStore import AWSJobStore + jobstore = self.jobstore_initialized for encrypted in (True, False): n = AWSJobStore.FileInfo.maxInlinedSize() sizes = (1, n // 2, n - 1, n, n + 1, 2 * n) for size in chain(sizes, islice(reversed(sizes), 1)): s = os.urandom(size) - with jobstore.write_shared_file_stream('foo') as f: + with jobstore.write_shared_file_stream("foo") as f: f.write(s) - with jobstore.read_shared_file_stream('foo') as f: + with jobstore.read_shared_file_stream("foo") as f: self.assertEqual(s, f.read()) def testOverlargeJob(self): jobstore = self.jobstore_initialized jobRequirements = dict(memory=12, cores=34, disk=35, preemptible=True) - overlargeJob = JobDescription(requirements=jobRequirements, - jobName='test-overlarge', unitName='onJobStore') + overlargeJob = JobDescription( + requirements=jobRequirements, + jobName="test-overlarge", + unitName="onJobStore", + ) # Make the pickled size of the job larger than 256K - with open("/dev/urandom", 'rb') as random: + with open("/dev/urandom", "rb") as random: overlargeJob.jobName = str(random.read(512 * 1024)) jobstore.assign_job_id(overlargeJob) jobstore.create_job(overlargeJob) @@ -1421,33 +1564,39 @@ def testOverlargeJob(self): jobstore.delete_job(overlargeJob.jobStoreID) def testMultiThreadImportFile(self) -> None: - """ Tests that importFile is thread-safe.""" + """Tests that importFile is thread-safe.""" from concurrent.futures.thread import ThreadPoolExecutor from toil.lib.threading import cpu_count - threads: Tuple[int, ...] = (2, cpu_count()) if cpu_count() > 2 else (2, ) + threads: tuple[int, ...] = (2, cpu_count()) if cpu_count() > 2 else (2,) num_of_files: int = 5 size: int = 1 << 16 + 1 # The string in otherCls() is arbitrary as long as it returns a class that has access # to ._externalStore() and ._prepareTestFile() - other: AbstractJobStoreTest.Test = AWSJobStoreTest('testSharedFiles') + other: AbstractJobStoreTest.Test = AWSJobStoreTest("testSharedFiles") store: Any = other._externalStore() # prepare test files to import - logger.debug(f'Preparing {num_of_files} test files for testMultiThreadImportFile().') + logger.debug( + f"Preparing {num_of_files} test files for testMultiThreadImportFile()." + ) test_files = [other._prepareTestFile(store, size) for _ in range(num_of_files)] for thread_count in threads: - with self.subTest(f'Testing threaded importFile with "{thread_count}" threads.'): + with self.subTest( + f'Testing threaded importFile with "{thread_count}" threads.' + ): results = [] with ThreadPoolExecutor(max_workers=thread_count) as executor: for url, expected_md5 in test_files: # run jobStore.importFile() asynchronously - future = executor.submit(self.jobstore_initialized.import_file, url) + future = executor.submit( + self.jobstore_initialized.import_file, url + ) results.append((future, expected_md5)) self.assertEqual(len(results), num_of_files) @@ -1457,33 +1606,39 @@ def testMultiThreadImportFile(self) -> None: self.assertIsInstance(file_id, FileID) with self.jobstore_initialized.read_file_stream(file_id) as f: - self.assertEqual(hashlib.md5(f.read()).hexdigest(), expected_md5) + self.assertEqual( + hashlib.md5(f.read()).hexdigest(), expected_md5 + ) def _prepareTestFile(self, bucket, size=None): from toil.lib.aws.utils import retry_s3 - file_name = 'testfile_%s' % uuid.uuid4() - url = f's3://{bucket.name}/{file_name}' + file_name = "testfile_%s" % uuid.uuid4() + url = f"s3://{bucket.name}/{file_name}" if size is None: return url - with open('/dev/urandom', 'rb') as readable: + with open("/dev/urandom", "rb") as readable: for attempt in retry_s3(): with attempt: bucket.put_object(Key=file_name, Body=str(readable.read(size))) - return url, hashlib.md5(bucket.Object(file_name).get().get('Body').read()).hexdigest() + return ( + url, + hashlib.md5(bucket.Object(file_name).get().get("Body").read()).hexdigest(), + ) def _hashTestFile(self, url: str) -> str: from toil.jobStores.aws.jobStore import AWSJobStore from toil.lib.aws.utils import get_object_for_url + str(AWSJobStore) # to prevent removal of that import key = get_object_for_url(urlparse.urlparse(url), existing=True) - contents = key.get().get('Body').read() + contents = key.get().get("Body").read() return hashlib.md5(contents).hexdigest() def _createExternalStore(self): """A S3.Bucket instance is returned""" from toil.jobStores.aws.jobStore import establish_boto3_session - from toil.lib.aws.utils import retry_s3, create_s3_bucket + from toil.lib.aws.utils import create_s3_bucket, retry_s3 resource = establish_boto3_session().resource( "s3", region_name=self.awsRegion() @@ -1514,6 +1669,7 @@ def _largeLogEntrySize(self): def _batchDeletionSize(self): from toil.jobStores.aws.jobStore import AWSJobStore + return AWSJobStore.itemsPerBatchDelete @@ -1521,15 +1677,10 @@ def _batchDeletionSize(self): class InvalidAWSJobStoreTest(ToilTest): def testInvalidJobStoreName(self): from toil.jobStores.aws.jobStore import AWSJobStore - self.assertRaises(ValueError, - AWSJobStore, - 'us-west-2:a--b') - self.assertRaises(ValueError, - AWSJobStore, - 'us-west-2:' + ('a' * 100)) - self.assertRaises(ValueError, - AWSJobStore, - 'us-west-2:a_b') + + self.assertRaises(ValueError, AWSJobStore, "us-west-2:a--b") + self.assertRaises(ValueError, AWSJobStore, "us-west-2:" + ("a" * 100)) + self.assertRaises(ValueError, AWSJobStore, "us-west-2:a_b") @needs_aws_s3 @@ -1540,14 +1691,14 @@ class EncryptedAWSJobStoreTest(AWSJobStoreTest, AbstractEncryptedJobStoreTest.Te class StubHttpRequestHandler(http.server.SimpleHTTPRequestHandler): - fileContents = 'A good programmer looks both ways before crossing a one-way street' + fileContents = "A good programmer looks both ways before crossing a one-way street" def do_GET(self): self.send_response(200) self.send_header("Content-type", "text/plain") self.send_header("Content-length", len(self.fileContents)) self.end_headers() - self.fileContents = self.fileContents.encode('utf-8') + self.fileContents = self.fileContents.encode("utf-8") self.wfile.write(self.fileContents) diff --git a/src/toil/test/lib/aws/test_iam.py b/src/toil/test/lib/aws/test_iam.py index 43b77ab6d0..f2a75e69ed 100644 --- a/src/toil/test/lib/aws/test_iam.py +++ b/src/toil/test/lib/aws/test_iam.py @@ -13,9 +13,9 @@ # limitations under the License. import json import logging -import boto3 - from uuid import uuid4 + +import boto3 from moto import mock_aws from toil.lib.aws import iam @@ -29,23 +29,45 @@ class IAMTest(ToilTest): """Check that given permissions and associated functions perform correctly""" def test_permissions_iam(self): - granted_perms = {'*': {'Action': ['ec2:*', 'iam:*', 's3:*', 'sdb:*'], 'NotAction': []}} - assert iam.policy_permissions_allow(granted_perms, iam.CLUSTER_LAUNCHING_PERMISSIONS) is True - granted_perms = {'*': {'Action': [], 'NotAction': ['s3:*']}} - assert iam.policy_permissions_allow(granted_perms, iam.CLUSTER_LAUNCHING_PERMISSIONS) is True + granted_perms = { + "*": {"Action": ["ec2:*", "iam:*", "s3:*", "sdb:*"], "NotAction": []} + } + assert ( + iam.policy_permissions_allow( + granted_perms, iam.CLUSTER_LAUNCHING_PERMISSIONS + ) + is True + ) + granted_perms = {"*": {"Action": [], "NotAction": ["s3:*"]}} + assert ( + iam.policy_permissions_allow( + granted_perms, iam.CLUSTER_LAUNCHING_PERMISSIONS + ) + is True + ) def test_negative_permissions_iam(self): - granted_perms = {'*': {'Action': ['ec2:*', 's3:*', 'sdb:*'], 'NotAction': []}} - assert iam.policy_permissions_allow(granted_perms, iam.CLUSTER_LAUNCHING_PERMISSIONS) is False - granted_perms = {'*': {'Action': [], 'NotAction': ['iam:*', 'ec2:*']}} - assert iam.policy_permissions_allow(granted_perms, iam.CLUSTER_LAUNCHING_PERMISSIONS) is False + granted_perms = {"*": {"Action": ["ec2:*", "s3:*", "sdb:*"], "NotAction": []}} + assert ( + iam.policy_permissions_allow( + granted_perms, iam.CLUSTER_LAUNCHING_PERMISSIONS + ) + is False + ) + granted_perms = {"*": {"Action": [], "NotAction": ["iam:*", "ec2:*"]}} + assert ( + iam.policy_permissions_allow( + granted_perms, iam.CLUSTER_LAUNCHING_PERMISSIONS + ) + is False + ) def test_wildcard_handling(self): - assert iam.permission_matches_any("iam:CreateRole", ['iam:Create**']) is True - assert iam.permission_matches_any("iam:GetUser", ['iam:???????']) is True - assert iam.permission_matches_any("iam:ListRoleTags", ['iam:*?*Tags']) is True + assert iam.permission_matches_any("iam:CreateRole", ["iam:Create**"]) is True + assert iam.permission_matches_any("iam:GetUser", ["iam:???????"]) is True + assert iam.permission_matches_any("iam:ListRoleTags", ["iam:*?*Tags"]) is True assert iam.permission_matches_any("iam:*", ["*"]) is True - assert iam.permission_matches_any("ec2:*", ['iam:*']) is False + assert iam.permission_matches_any("ec2:*", ["iam:*"]) is False @mock_aws def test_get_policy_permissions(self): @@ -56,68 +78,77 @@ def test_get_policy_permissions(self): mock_iam.create_user(UserName=user_name) group_name = "default_group" - mock_iam.create_group( - GroupName=group_name - ) + mock_iam.create_group(GroupName=group_name) - mock_iam.add_user_to_group( - GroupName=group_name, - UserName=user_name - ) + mock_iam.add_user_to_group(GroupName=group_name, UserName=user_name) policy_response = mock_iam.create_policy( PolicyName="test_iam_createrole", - PolicyDocument=json.dumps({ - "Version": "2012-10-17", # represents version language - "Statement": [ - {"Effect": "Allow", "Action": "iam:CreateRole", "Resource": "*"} - ] - }) + PolicyDocument=json.dumps( + { + "Version": "2012-10-17", # represents version language + "Statement": [ + {"Effect": "Allow", "Action": "iam:CreateRole", "Resource": "*"} + ], + } + ), ) # attached user policy mock_iam.attach_user_policy( - UserName=user_name, - PolicyArn=policy_response["Policy"]["Arn"] + UserName=user_name, PolicyArn=policy_response["Policy"]["Arn"] ) # inline user policy mock_iam.put_user_policy( UserName=user_name, PolicyName="test_iam_createinstanceprofile", - PolicyDocument=json.dumps({ - "Version": "2012-10-17", # represents version language - "Statement": [ - {"Effect": "Allow", "Action": "iam:CreateInstanceProfile", "Resource": "*"} - ] - }) + PolicyDocument=json.dumps( + { + "Version": "2012-10-17", # represents version language + "Statement": [ + { + "Effect": "Allow", + "Action": "iam:CreateInstanceProfile", + "Resource": "*", + } + ], + } + ), ) # group policies policy_response = mock_iam.create_policy( PolicyName="test_iam_taginstanceprofile", - PolicyDocument=json.dumps({ - "Version": "2012-10-17", # represents version language - "Statement": [ - {"Effect": "Allow", "Action": "iam:TagInstanceProfile", "Resource": "*"} - ] - }) + PolicyDocument=json.dumps( + { + "Version": "2012-10-17", # represents version language + "Statement": [ + { + "Effect": "Allow", + "Action": "iam:TagInstanceProfile", + "Resource": "*", + } + ], + } + ), ) # attached group policy mock_iam.attach_group_policy( - GroupName=group_name, - PolicyArn=policy_response["Policy"]["Arn"] + GroupName=group_name, PolicyArn=policy_response["Policy"]["Arn"] ) # inline group policy mock_iam.put_group_policy( GroupName=group_name, PolicyName="test_iam_deleterole", - PolicyDocument=json.dumps({ - "Version": "2012-10-17", # represents version language - "Statement": [ - {"Effect": "Allow", "Action": "iam:DeleteRole", "Resource": "*"} - ] - }) + PolicyDocument=json.dumps( + { + "Version": "2012-10-17", # represents version language + "Statement": [ + {"Effect": "Allow", "Action": "iam:DeleteRole", "Resource": "*"} + ], + } + ), ) actions_collection = iam.get_policy_permissions("us-west-2") @@ -125,30 +156,42 @@ def test_get_policy_permissions(self): actions_set = set(actions_collection["*"]["Action"]) notactions_set = set(actions_collection["*"]["NotAction"]) - expected_actions = {"iam:CreateRole", "iam:CreateInstanceProfile", "iam:TagInstanceProfile", "iam:DeleteRole"} + expected_actions = { + "iam:CreateRole", + "iam:CreateInstanceProfile", + "iam:TagInstanceProfile", + "iam:DeleteRole", + } assert actions_set == expected_actions assert notactions_set == set() def test_create_delete_iam_role(self): - region = 'us-west-2' + region = "us-west-2" role_name = f'test{str(uuid4()).replace("-", "")}' - with self.subTest('Create role w/policies.'): - ec2_role_policy_document = json.dumps({ - "Version": "2012-10-17", - "Statement": [{ - "Effect": "Allow", - "Principal": {"Service": ["ec2.amazonaws.com"]}, - "Action": ["sts:AssumeRole"]} - ]}) - policy = dict(s3_deny=dict(Version="2012-10-17", Statement=[dict(Effect="Deny", Resource="*", Action="s3:*")])) + with self.subTest("Create role w/policies."): + ec2_role_policy_document = json.dumps( + { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": {"Service": ["ec2.amazonaws.com"]}, + "Action": ["sts:AssumeRole"], + } + ], + } + ) + policy = dict( + s3_deny=dict( + Version="2012-10-17", + Statement=[dict(Effect="Deny", Resource="*", Action="s3:*")], + ) + ) iam.create_iam_role( role_name=role_name, assume_role_policy_document=ec2_role_policy_document, policies=policy, - region=region - ) - with self.subTest('Delete role w/policies.'): - iam.delete_iam_role( - role_name=role_name, - region=region + region=region, ) + with self.subTest("Delete role w/policies."): + iam.delete_iam_role(role_name=role_name, region=region) diff --git a/src/toil/test/lib/aws/test_s3.py b/src/toil/test/lib/aws/test_s3.py index 7ba5c9e77b..55da3c6e85 100644 --- a/src/toil/test/lib/aws/test_s3.py +++ b/src/toil/test/lib/aws/test_s3.py @@ -58,16 +58,22 @@ def test_create_bucket(self) -> None: self.assertEqual(get_bucket_region(bucket_name), "us-east-1") # Make sure all the bucket location getting strategies work on a bucket we created - self.assertEqual(get_bucket_region(bucket_name, only_strategies = {1}), "us-east-1") - self.assertEqual(get_bucket_region(bucket_name, only_strategies = {2}), "us-east-1") - self.assertEqual(get_bucket_region(bucket_name, only_strategies = {3}), "us-east-1") + self.assertEqual( + get_bucket_region(bucket_name, only_strategies={1}), "us-east-1" + ) + self.assertEqual( + get_bucket_region(bucket_name, only_strategies={2}), "us-east-1" + ) + self.assertEqual( + get_bucket_region(bucket_name, only_strategies={3}), "us-east-1" + ) def test_get_bucket_location_public_bucket(self) -> None: """ Test getting buket location for a bucket we don't own. """ - bucket_name = 'spacenet-dataset' + bucket_name = "spacenet-dataset" # This bucket happens to live in us-east-1 self.assertEqual(get_bucket_region(bucket_name), "us-east-1") diff --git a/src/toil/test/lib/aws/test_utils.py b/src/toil/test/lib/aws/test_utils.py index d70dce17e0..931ab63570 100644 --- a/src/toil/test/lib/aws/test_utils.py +++ b/src/toil/test/lib/aws/test_utils.py @@ -21,23 +21,25 @@ logger = logging.getLogger(__name__) logging.basicConfig(level=logging.DEBUG) + class TagGenerationTest(ToilTest): """ Test for tag generation from environment variables """ + def test_build_tag(self): environment = dict() environment["TOIL_OWNER_TAG"] = "😀" environment["TOIL_AWS_TAGS"] = None tag_dict = build_tag_dict_from_env(environment) - assert(tag_dict == {'Owner': '😀'}) + assert tag_dict == {"Owner": "😀"} def test_empty_aws_tags(self): environment = dict() environment["TOIL_OWNER_TAG"] = None environment["TOIL_AWS_TAGS"] = "{}" tag_dict = build_tag_dict_from_env(environment) - assert (tag_dict == dict()) + assert tag_dict == dict() def test_incorrect_json_object(self): with pytest.raises(SystemExit): @@ -58,7 +60,4 @@ def test_build_tag_with_tags(self): environment["TOIL_OWNER_TAG"] = "😀" environment["TOIL_AWS_TAGS"] = '{"1": "2", " ":")"}' tag_dict = build_tag_dict_from_env(environment) - assert(tag_dict == {'Owner': '😀', '1': '2', ' ': ')'}) - - - + assert tag_dict == {"Owner": "😀", "1": "2", " ": ")"} diff --git a/src/toil/test/lib/dockerTest.py b/src/toil/test/lib/dockerTest.py index a9bdee4fa9..23d92739c3 100644 --- a/src/toil/test/lib/dockerTest.py +++ b/src/toil/test/lib/dockerTest.py @@ -22,12 +22,14 @@ from toil.common import Toil from toil.exceptions import FailedJobsException from toil.job import Job -from toil.lib.docker import (FORGO, - RM, - STOP, - apiDockerCall, - containerIsRunning, - dockerKill) +from toil.lib.docker import ( + FORGO, + RM, + STOP, + apiDockerCall, + containerIsRunning, + dockerKill, +) from toil.test import ToilTest, needs_docker, slow logger = logging.getLogger(__name__) @@ -47,15 +49,12 @@ class DockerTest(ToilTest): removed during tear down. Otherwise, left-over files will not be removed. """ + def setUp(self): - self.tempDir = self._createTempDir(purpose='tempDir') - self.dockerTestLogLevel = 'INFO' - - def testDockerClean(self, - caching=False, - detached=True, - rm=True, - deferParam=None): + self.tempDir = self._createTempDir(purpose="tempDir") + self.dockerTestLogLevel = "INFO" + + def testDockerClean(self, caching=False, detached=True, rm=True, deferParam=None): """ Run the test container that creates a file in the work dir, and sleeps for 5 minutes. @@ -73,29 +72,25 @@ def testDockerClean(self, # detached X R E X # Neither X R E X - data_dir = os.path.join(self.tempDir, 'data') - working_dir = os.path.join(self.tempDir, 'working') - test_file = os.path.join(working_dir, 'test.txt') + data_dir = os.path.join(self.tempDir, "data") + working_dir = os.path.join(self.tempDir, "working") + test_file = os.path.join(working_dir, "test.txt") os.makedirs(data_dir, exist_ok=True) os.makedirs(working_dir, exist_ok=True) - options = Job.Runner.getDefaultOptions(os.path.join(self.tempDir, - 'jobstore')) + options = Job.Runner.getDefaultOptions(os.path.join(self.tempDir, "jobstore")) options.logLevel = self.dockerTestLogLevel options.workDir = working_dir - options.clean = 'always' + options.clean = "always" options.retryCount = 0 # we're expecting the job to fail so don't retry! options.caching = caching # No base64 logic since it might create a name starting with a `-`. container_name = uuid.uuid4().hex - A = Job.wrapJobFn(_testDockerCleanFn, - working_dir, - detached, - rm, - deferParam, - container_name) + A = Job.wrapJobFn( + _testDockerCleanFn, working_dir, detached, rm, deferParam, container_name + ) try: with Toil(options) as toil: toil.start(A) @@ -110,18 +105,21 @@ def testDockerClean(self, if (rm and (deferParam != FORGO)) or deferParam == RM or deferParam is None: # These containers should not exist - assert containerIsRunning(container_name) is None, \ - 'Container was not removed.' + assert ( + containerIsRunning(container_name) is None + ), "Container was not removed." elif deferParam == STOP: # These containers should exist but be non-running - assert containerIsRunning(container_name) == False, \ - 'Container was not stopped.' + assert ( + containerIsRunning(container_name) == False + ), "Container was not stopped." else: # These containers will be running - assert containerIsRunning(container_name) == True, \ - 'Container was not running.' + assert ( + containerIsRunning(container_name) == True + ), "Container was not running." finally: # Clean up try: @@ -131,121 +129,97 @@ def testDockerClean(self, pass def testDockerClean_CRx_FORGO(self): - self.testDockerClean(caching=False, detached=False, rm=True, - deferParam=FORGO) + self.testDockerClean(caching=False, detached=False, rm=True, deferParam=FORGO) def testDockerClean_CRx_STOP(self): - self.testDockerClean(caching=False, detached=False, rm=True, - deferParam=STOP) + self.testDockerClean(caching=False, detached=False, rm=True, deferParam=STOP) def testDockerClean_CRx_RM(self): - self.testDockerClean(caching=False, detached=False, rm=True, - deferParam=RM) + self.testDockerClean(caching=False, detached=False, rm=True, deferParam=RM) @slow def testDockerClean_CRx_None(self): - self.testDockerClean(caching=False, detached=False, rm=True, - deferParam=None) + self.testDockerClean(caching=False, detached=False, rm=True, deferParam=None) @slow def testDockerClean_CxD_FORGO(self): - self.testDockerClean(caching=False, detached=True, rm=False, - deferParam=FORGO) + self.testDockerClean(caching=False, detached=True, rm=False, deferParam=FORGO) @slow def testDockerClean_CxD_STOP(self): - self.testDockerClean(caching=False, detached=True, rm=False, - deferParam=STOP) + self.testDockerClean(caching=False, detached=True, rm=False, deferParam=STOP) @slow def testDockerClean_CxD_RM(self): - self.testDockerClean(caching=False, detached=True, rm=False, - deferParam=RM) + self.testDockerClean(caching=False, detached=True, rm=False, deferParam=RM) @slow def testDockerClean_CxD_None(self): - self.testDockerClean(caching=False, detached=True, rm=False, - deferParam=None) + self.testDockerClean(caching=False, detached=True, rm=False, deferParam=None) @slow def testDockerClean_Cxx_FORGO(self): - self.testDockerClean(caching=False, detached=False, rm=False, - deferParam=FORGO) + self.testDockerClean(caching=False, detached=False, rm=False, deferParam=FORGO) @slow def testDockerClean_Cxx_STOP(self): - self.testDockerClean(caching=False, detached=False, rm=False, - deferParam=STOP) + self.testDockerClean(caching=False, detached=False, rm=False, deferParam=STOP) @slow def testDockerClean_Cxx_RM(self): - self.testDockerClean(caching=False, detached=False, rm=False, - deferParam=RM) + self.testDockerClean(caching=False, detached=False, rm=False, deferParam=RM) @slow def testDockerClean_Cxx_None(self): - self.testDockerClean(caching=False, detached=False, rm=False, - deferParam=None) + self.testDockerClean(caching=False, detached=False, rm=False, deferParam=None) @slow def testDockerClean_xRx_FORGO(self): - self.testDockerClean(caching=True, detached=False, rm=True, - deferParam=FORGO) + self.testDockerClean(caching=True, detached=False, rm=True, deferParam=FORGO) @slow def testDockerClean_xRx_STOP(self): - self.testDockerClean(caching=True, detached=False, rm=True, - deferParam=STOP) + self.testDockerClean(caching=True, detached=False, rm=True, deferParam=STOP) @slow def testDockerClean_xRx_RM(self): - self.testDockerClean(caching=True, detached=False, rm=True, - deferParam=RM) + self.testDockerClean(caching=True, detached=False, rm=True, deferParam=RM) @slow def testDockerClean_xRx_None(self): - self.testDockerClean(caching=True, detached=False, rm=True, - deferParam=None) + self.testDockerClean(caching=True, detached=False, rm=True, deferParam=None) @slow def testDockerClean_xxD_FORGO(self): - self.testDockerClean(caching=True, detached=True, rm=False, - deferParam=FORGO) + self.testDockerClean(caching=True, detached=True, rm=False, deferParam=FORGO) @slow def testDockerClean_xxD_STOP(self): - self.testDockerClean(caching=True, detached=True, rm=False, - deferParam=STOP) + self.testDockerClean(caching=True, detached=True, rm=False, deferParam=STOP) @slow def testDockerClean_xxD_RM(self): - self.testDockerClean(caching=True, detached=True, rm=False, - deferParam=RM) + self.testDockerClean(caching=True, detached=True, rm=False, deferParam=RM) @slow def testDockerClean_xxD_None(self): - self.testDockerClean(caching=True, detached=True, rm=False, - deferParam=None) + self.testDockerClean(caching=True, detached=True, rm=False, deferParam=None) @slow def testDockerClean_xxx_FORGO(self): - self.testDockerClean(caching=True, detached=False, rm=False, - deferParam=FORGO) + self.testDockerClean(caching=True, detached=False, rm=False, deferParam=FORGO) @slow def testDockerClean_xxx_STOP(self): - self.testDockerClean(caching=True, detached=False, rm=False, - deferParam=STOP) + self.testDockerClean(caching=True, detached=False, rm=False, deferParam=STOP) @slow def testDockerClean_xxx_RM(self): - self.testDockerClean(caching=True, detached=False, rm=False, - deferParam=RM) + self.testDockerClean(caching=True, detached=False, rm=False, deferParam=RM) @slow def testDockerClean_xxx_None(self): - self.testDockerClean(caching=True, detached=False, rm=False, - deferParam=None) + self.testDockerClean(caching=True, detached=False, rm=False, deferParam=None) def testDockerPipeChain(self, caching=False): r""" @@ -255,16 +229,16 @@ def testDockerPipeChain(self, caching=False): ex: ``parameters=[ ['printf', 'x\n y\n'], ['wc', '-l'] ]`` should execute: ``printf 'x\n y\n' | wc -l`` """ - options = Job.Runner.getDefaultOptions(os.path.join(self.tempDir, 'jobstore')) + options = Job.Runner.getDefaultOptions(os.path.join(self.tempDir, "jobstore")) options.logLevel = self.dockerTestLogLevel options.workDir = self.tempDir - options.clean = 'always' + options.clean = "always" options.caching = caching A = Job.wrapJobFn(_testDockerPipeChainFn) rv = Job.Runner.startToil(A, options) - logger.info('Container pipeline result: %s', repr(rv)) - rv = rv.decode('utf-8') - assert rv.strip() == '2' + logger.info("Container pipeline result: %s", repr(rv)) + rv = rv.decode("utf-8") + assert rv.strip() == "2" def testDockerPipeChainErrorDetection(self, caching=False): """ @@ -273,10 +247,10 @@ def testDockerPipeChainErrorDetection(self, caching=False): silently missed. This tests to make sure that the piping API for dockerCall() throws an exception if non-last commands in the chain fail. """ - options = Job.Runner.getDefaultOptions(os.path.join(self.tempDir, 'jobstore')) + options = Job.Runner.getDefaultOptions(os.path.join(self.tempDir, "jobstore")) options.logLevel = self.dockerTestLogLevel options.workDir = self.tempDir - options.clean = 'always' + options.clean = "always" options.caching = caching A = Job.wrapJobFn(_testDockerPipeChainErrorFn) rv = Job.Runner.startToil(A, options) @@ -291,19 +265,21 @@ def testNonCachingDockerChainErrorDetection(self): def testDockerLogs(self, stream=False, demux=False): """Test for the different log outputs when deatch=False.""" - working_dir = os.path.join(self.tempDir, 'working') - script_file = os.path.join(working_dir, 'script.sh') + working_dir = os.path.join(self.tempDir, "working") + script_file = os.path.join(working_dir, "script.sh") os.makedirs(working_dir, exist_ok=True) - options = Job.Runner.getDefaultOptions(os.path.join(self.tempDir, 'jobstore')) + options = Job.Runner.getDefaultOptions(os.path.join(self.tempDir, "jobstore")) options.logLevel = self.dockerTestLogLevel options.workDir = working_dir - options.clean = 'always' - A = Job.wrapJobFn(_testDockerLogsFn, - working_dir=working_dir, - script_file=script_file, - stream=stream, - demux=demux) + options.clean = "always" + A = Job.wrapJobFn( + _testDockerLogsFn, + working_dir=working_dir, + script_file=script_file, + stream=stream, + demux=demux, + ) try: rv = Job.Runner.startToil(A, options) @@ -324,12 +300,9 @@ def testDockerLogs_Demux_Stream(self): self.testDockerLogs(stream=True, demux=True) -def _testDockerCleanFn(job, - working_dir, - detached=None, - rm=None, - deferParam=None, - containerName=None): +def _testDockerCleanFn( + job, working_dir, detached=None, rm=None, deferParam=None, containerName=None +): """ Test function for test docker_clean. Runs a container with given flags and then dies leaving behind a zombie container. @@ -340,11 +313,12 @@ def _testDockerCleanFn(job, :param int deferParam: See `deferParam=` in :func:`dockerCall` :param str containerName: See `container_name=` in :func:`dockerCall` """ + def killSelf(): - test_file = os.path.join(working_dir, 'test.txt') + test_file = os.path.join(working_dir, "test.txt") # Kill the worker once we are sure the docker container is started while not os.path.exists(test_file): - logger.debug('Waiting on the file created by spooky_container.') + logger.debug("Waiting on the file created by spooky_container.") time.sleep(1) # By the time we reach here, we are sure the container is running. time.sleep(1) @@ -354,48 +328,48 @@ def killSelf(): # Make it a daemon thread so that thread failure doesn't hang tests. t.daemon = True t.start() - apiDockerCall(job, - image='quay.io/ucsc_cgl/spooky_test', - working_dir=working_dir, - deferParam=deferParam, - containerName=containerName, - detach=detached, - remove=rm, - privileged=True) + apiDockerCall( + job, + image="quay.io/ucsc_cgl/spooky_test", + working_dir=working_dir, + deferParam=deferParam, + containerName=containerName, + detach=detached, + remove=rm, + privileged=True, + ) def _testDockerPipeChainFn(job): """Return the result of a simple pipe chain. Should be 2.""" - parameters = [['printf', 'x\n y\n'], ['wc', '-l']] - return apiDockerCall(job, - image='quay.io/ucsc_cgl/ubuntu:20.04', - parameters=parameters, - privileged=True) + parameters = [["printf", "x\n y\n"], ["wc", "-l"]] + return apiDockerCall( + job, + image="quay.io/ucsc_cgl/ubuntu:20.04", + parameters=parameters, + privileged=True, + ) def _testDockerPipeChainErrorFn(job): """Return True if the command exit 1 | wc -l raises a ContainerError.""" - parameters = [['exit', '1'], ['wc', '-l']] + parameters = [["exit", "1"], ["wc", "-l"]] try: - apiDockerCall(job, - image='quay.io/ucsc_cgl/spooky_test', - parameters=parameters) + apiDockerCall(job, image="quay.io/ucsc_cgl/spooky_test", parameters=parameters) except ContainerError: return True return False -def _testDockerLogsFn(job, - working_dir, - script_file, - stream=False, - demux=False): +def _testDockerLogsFn(job, working_dir, script_file, stream=False, demux=False): """Return True if the test succeeds. Otherwise Exception is raised.""" # we write a script file because the redirection operator, '>&2', is wrapped # in quotes when passed as parameters. import textwrap - bash_script = textwrap.dedent(''' + + bash_script = textwrap.dedent( + """ #!/bin/bash echo hello stdout ; echo hello stderr >&2 ; @@ -403,27 +377,30 @@ def _testDockerLogsFn(job, echo hello stderr >&2 ; echo hello stdout ; echo hello stdout ; - ''') + """ + ) - with open(script_file, 'w') as file: + with open(script_file, "w") as file: file.write(bash_script) - out = apiDockerCall(job, - image='quay.io/ucsc_cgl/ubuntu:20.04', - working_dir=working_dir, - parameters=[script_file], - volumes={working_dir: {'bind': working_dir, 'mode': 'rw'}}, - entrypoint="/bin/bash", - stdout=True, - stderr=True, - stream=stream, - demux=demux) + out = apiDockerCall( + job, + image="quay.io/ucsc_cgl/ubuntu:20.04", + working_dir=working_dir, + parameters=[script_file], + volumes={working_dir: {"bind": working_dir, "mode": "rw"}}, + entrypoint="/bin/bash", + stdout=True, + stderr=True, + stream=stream, + demux=demux, + ) # we check the output length because order is not guaranteed. if stream: if demux: # a generator with tuples of (stdout, stderr) - assert hasattr(out, '__iter__') + assert hasattr(out, "__iter__") for _ in range(6): stdout, stderr = next(out) if stdout: @@ -435,7 +412,7 @@ def _testDockerLogsFn(job, assert False else: # a generator with bytes - assert hasattr(out, '__iter__') + assert hasattr(out, "__iter__") for _ in range(6): assert len(next(out)) == 13 else: diff --git a/src/toil/test/lib/test_conversions.py b/src/toil/test/lib/test_conversions.py index 1bd59a6a47..963bf037cf 100644 --- a/src/toil/test/lib/test_conversions.py +++ b/src/toil/test/lib/test_conversions.py @@ -13,9 +13,7 @@ # limitations under the License. import logging -from toil.lib.conversions import (convert_units, - hms_duration_to_seconds, - human2bytes) +from toil.lib.conversions import convert_units, hms_duration_to_seconds, human2bytes from toil.test import ToilTest logger = logging.getLogger(__name__) @@ -79,135 +77,135 @@ def test_convert(self): "11234234 KB": "0.0112 TB", "11234234 MB": "11.2342 TB", "11234234 GB": "11234.2340 TB", - "11234234 TB": "11234234.0000 TB" + "11234234 TB": "11234234.0000 TB", } results = {} for i in (0, 0.1, 0.5, 0.9, 1, 7, 7.42423, 10, 100, 1000, 11234234): - for src_unit in ['B', 'KB', 'MB', 'GB', 'TB']: - for dst_unit in ['B', 'KB', 'MB', 'GB', 'TB']: + for src_unit in ["B", "KB", "MB", "GB", "TB"]: + for dst_unit in ["B", "KB", "MB", "GB", "TB"]: converted = convert_units(i, src_unit, dst_unit) - results[f'{i} {src_unit}'] = f'{converted:.4f} {dst_unit}' + results[f"{i} {src_unit}"] = f"{converted:.4f} {dst_unit}" self.assertEqual(results, expected_conversions) def test_human2bytes(self): expected_results = { - '0 b': 0, - '0 Ki': 0, - '0 Mi': 0, - '0 Gi': 0, - '0 Ti': 0, - '0 K': 0, - '0 M': 0, - '0 G': 0, - '0 T': 0, - '0.1 b': 0, - '0.1 Ki': 102, - '0.1 Mi': 104857, - '0.1 Gi': 107374182, - '0.1 Ti': 109951162777, - '0.1 K': 100, - '0.1 M': 100000, - '0.1 G': 100000000, - '0.1 T': 100000000000, - '0.5 b': 0, - '0.5 Ki': 512, - '0.5 Mi': 524288, - '0.5 Gi': 536870912, - '0.5 Ti': 549755813888, - '0.5 K': 500, - '0.5 M': 500000, - '0.5 G': 500000000, - '0.5 T': 500000000000, - '0.9 b': 0, - '0.9 Ki': 921, - '0.9 Mi': 943718, - '0.9 Gi': 966367641, - '0.9 Ti': 989560464998, - '0.9 K': 900, - '0.9 M': 900000, - '0.9 G': 900000000, - '0.9 T': 900000000000, - '1 b': 1, - '1 Ki': 1024, - '1 Mi': 1048576, - '1 Gi': 1073741824, - '1 Ti': 1099511627776, - '1 K': 1000, - '1 M': 1000000, - '1 G': 1000000000, - '1 T': 1000000000000, - '7 b': 7, - '7 Ki': 7168, - '7 Mi': 7340032, - '7 Gi': 7516192768, - '7 Ti': 7696581394432, - '7 K': 7000, - '7 M': 7000000, - '7 G': 7000000000, - '7 T': 7000000000000, - '7.42423 b': 7, - '7.42423 Ki': 7602, - '7.42423 Mi': 7784869, - '7.42423 Gi': 7971706261, - '7.42423 Ti': 8163027212283, - '7.42423 K': 7424, - '7.42423 M': 7424230, - '7.42423 G': 7424230000, - '7.42423 T': 7424230000000, - '10 b': 10, - '10 Ki': 10240, - '10 Mi': 10485760, - '10 Gi': 10737418240, - '10 Ti': 10995116277760, - '10 K': 10000, - '10 M': 10000000, - '10 G': 10000000000, - '10 T': 10000000000000, - '100 b': 100, - '100 Ki': 102400, - '100 Mi': 104857600, - '100 Gi': 107374182400, - '100 Ti': 109951162777600, - '100 K': 100000, - '100 M': 100000000, - '100 G': 100000000000, - '100 T': 100000000000000, - '1000 b': 1000, - '1000 Ki': 1024000, - '1000 Mi': 1048576000, - '1000 Gi': 1073741824000, - '1000 Ti': 1099511627776000, - '1000 K': 1000000, - '1000 M': 1000000000, - '1000 G': 1000000000000, - '1000 T': 1000000000000000, - '11234234 b': 11234234, - '11234234 Ki': 11503855616, - '11234234 Mi': 11779948150784, - '11234234 Gi': 12062666906402816, - '11234234 Ti': 12352170912156483584, - '11234234 K': 11234234000, - '11234234 M': 11234234000000, - '11234234 G': 11234234000000000, - '11234234 T': 11234234000000000000 + "0 b": 0, + "0 Ki": 0, + "0 Mi": 0, + "0 Gi": 0, + "0 Ti": 0, + "0 K": 0, + "0 M": 0, + "0 G": 0, + "0 T": 0, + "0.1 b": 0, + "0.1 Ki": 102, + "0.1 Mi": 104857, + "0.1 Gi": 107374182, + "0.1 Ti": 109951162777, + "0.1 K": 100, + "0.1 M": 100000, + "0.1 G": 100000000, + "0.1 T": 100000000000, + "0.5 b": 0, + "0.5 Ki": 512, + "0.5 Mi": 524288, + "0.5 Gi": 536870912, + "0.5 Ti": 549755813888, + "0.5 K": 500, + "0.5 M": 500000, + "0.5 G": 500000000, + "0.5 T": 500000000000, + "0.9 b": 0, + "0.9 Ki": 921, + "0.9 Mi": 943718, + "0.9 Gi": 966367641, + "0.9 Ti": 989560464998, + "0.9 K": 900, + "0.9 M": 900000, + "0.9 G": 900000000, + "0.9 T": 900000000000, + "1 b": 1, + "1 Ki": 1024, + "1 Mi": 1048576, + "1 Gi": 1073741824, + "1 Ti": 1099511627776, + "1 K": 1000, + "1 M": 1000000, + "1 G": 1000000000, + "1 T": 1000000000000, + "7 b": 7, + "7 Ki": 7168, + "7 Mi": 7340032, + "7 Gi": 7516192768, + "7 Ti": 7696581394432, + "7 K": 7000, + "7 M": 7000000, + "7 G": 7000000000, + "7 T": 7000000000000, + "7.42423 b": 7, + "7.42423 Ki": 7602, + "7.42423 Mi": 7784869, + "7.42423 Gi": 7971706261, + "7.42423 Ti": 8163027212283, + "7.42423 K": 7424, + "7.42423 M": 7424230, + "7.42423 G": 7424230000, + "7.42423 T": 7424230000000, + "10 b": 10, + "10 Ki": 10240, + "10 Mi": 10485760, + "10 Gi": 10737418240, + "10 Ti": 10995116277760, + "10 K": 10000, + "10 M": 10000000, + "10 G": 10000000000, + "10 T": 10000000000000, + "100 b": 100, + "100 Ki": 102400, + "100 Mi": 104857600, + "100 Gi": 107374182400, + "100 Ti": 109951162777600, + "100 K": 100000, + "100 M": 100000000, + "100 G": 100000000000, + "100 T": 100000000000000, + "1000 b": 1000, + "1000 Ki": 1024000, + "1000 Mi": 1048576000, + "1000 Gi": 1073741824000, + "1000 Ti": 1099511627776000, + "1000 K": 1000000, + "1000 M": 1000000000, + "1000 G": 1000000000000, + "1000 T": 1000000000000000, + "11234234 b": 11234234, + "11234234 Ki": 11503855616, + "11234234 Mi": 11779948150784, + "11234234 Gi": 12062666906402816, + "11234234 Ti": 12352170912156483584, + "11234234 K": 11234234000, + "11234234 M": 11234234000000, + "11234234 G": 11234234000000000, + "11234234 T": 11234234000000000000, } results = {} for i in (0, 0.1, 0.5, 0.9, 1, 7, 7.42423, 10, 100, 1000, 11234234): - for src_unit in ['b', 'Ki', 'Mi', 'Gi', 'Ti', 'K', 'M', 'G', 'T']: - results[f'{i} {src_unit}'] = human2bytes(f'{i} {src_unit}') + for src_unit in ["b", "Ki", "Mi", "Gi", "Ti", "K", "M", "G", "T"]: + results[f"{i} {src_unit}"] = human2bytes(f"{i} {src_unit}") self.assertEqual(results, expected_results) def test_hms_duration_to_seconds(self): expected_results = { - '0:0:0' : 0.0, - '00:00:00' : 0.0, - '1:1:1' : 3661.0, - '20:14:33' : 72873.0, - '72:80:112' : 264112.0, + "0:0:0": 0.0, + "00:00:00": 0.0, + "1:1:1": 3661.0, + "20:14:33": 72873.0, + "72:80:112": 264112.0, } results = {} for key in expected_results.keys(): - results[key] = hms_duration_to_seconds(f'{key}') - + results[key] = hms_duration_to_seconds(f"{key}") + self.assertEqual(results, expected_results) diff --git a/src/toil/test/lib/test_ec2.py b/src/toil/test/lib/test_ec2.py index 67f1ae7129..16efac4dee 100644 --- a/src/toil/test/lib/test_ec2.py +++ b/src/toil/test/lib/test_ec2.py @@ -16,15 +16,18 @@ import pytest -from toil.lib.aws.ami import (aws_marketplace_flatcar_ami_search, - feed_flatcar_ami_release, - flatcar_release_feed_amis, - get_flatcar_ami) +from toil.lib.aws.ami import ( + aws_marketplace_flatcar_ami_search, + feed_flatcar_ami_release, + flatcar_release_feed_amis, + get_flatcar_ami, +) from toil.test import ToilTest, needs_aws_ec2, needs_online logger = logging.getLogger(__name__) logging.basicConfig(level=logging.DEBUG) + @needs_online class FlatcarFeedTest(ToilTest): """Test accessing the Flatcar AMI release feed, independent of the AWS API""" @@ -35,62 +38,68 @@ class FlatcarFeedTest(ToilTest): def test_parse_archive_feed(self): """Make sure we can get a Flatcar release from the Internet Archive.""" - amis = list(flatcar_release_feed_amis('us-west-2', 'amd64', 'archive')) + amis = list(flatcar_release_feed_amis("us-west-2", "amd64", "archive")) for ami in amis: - self.assertEqual(len(ami), len('ami-02b46c73fed689d1c')) - self.assertTrue(ami.startswith('ami-')) - + self.assertEqual(len(ami), len("ami-02b46c73fed689d1c")) + self.assertTrue(ami.startswith("ami-")) + def test_parse_beta_feed(self): """Make sure we can get a Flatcar release from the beta channel.""" - amis = list(flatcar_release_feed_amis('us-west-2', 'amd64', 'beta')) + amis = list(flatcar_release_feed_amis("us-west-2", "amd64", "beta")) for ami in amis: - self.assertEqual(len(ami), len('ami-02b46c73fed689d1c')) - self.assertTrue(ami.startswith('ami-')) - + self.assertEqual(len(ami), len("ami-02b46c73fed689d1c")) + self.assertTrue(ami.startswith("ami-")) + def test_parse_stable_feed(self): """Make sure we can get a Flatcar release from the stable channel.""" - amis = list(flatcar_release_feed_amis('us-west-2', 'amd64', 'stable')) + amis = list(flatcar_release_feed_amis("us-west-2", "amd64", "stable")) for ami in amis: - self.assertEqual(len(ami), len('ami-02b46c73fed689d1c')) - self.assertTrue(ami.startswith('ami-')) - + self.assertEqual(len(ami), len("ami-02b46c73fed689d1c")) + self.assertTrue(ami.startswith("ami-")) + + @needs_aws_ec2 class AMITest(ToilTest): @classmethod def setUpClass(cls): from toil.lib.aws.session import establish_boto3_session - session = establish_boto3_session(region_name='us-west-2') - cls.ec2_client = session.client('ec2') + + session = establish_boto3_session(region_name="us-west-2") + cls.ec2_client = session.client("ec2") def test_fetch_flatcar(self): - with self.subTest('Test flatcar AMI from user is prioritized.'): - os.environ['TOIL_AWS_AMI'] = 'overridden' + with self.subTest("Test flatcar AMI from user is prioritized."): + os.environ["TOIL_AWS_AMI"] = "overridden" ami = get_flatcar_ami(self.ec2_client) - self.assertEqual(ami, 'overridden') - del os.environ['TOIL_AWS_AMI'] + self.assertEqual(ami, "overridden") + del os.environ["TOIL_AWS_AMI"] - with self.subTest('Test flatcar AMI returns an AMI-looking AMI.'): + with self.subTest("Test flatcar AMI returns an AMI-looking AMI."): ami = get_flatcar_ami(self.ec2_client) - self.assertEqual(len(ami), len('ami-02b46c73fed689d1c')) - self.assertTrue(ami.startswith('ami-')) + self.assertEqual(len(ami), len("ami-02b46c73fed689d1c")) + self.assertTrue(ami.startswith("ami-")) - with self.subTest('Test feed_flatcar_ami_release() returns an AMI-looking AMI.'): - ami = feed_flatcar_ami_release(self.ec2_client, source='archive') - self.assertTrue(ami is None or len(ami) == len('ami-02b46c73fed689d1c')) - self.assertTrue(ami is None or ami.startswith('ami-')) + with self.subTest( + "Test feed_flatcar_ami_release() returns an AMI-looking AMI." + ): + ami = feed_flatcar_ami_release(self.ec2_client, source="archive") + self.assertTrue(ami is None or len(ami) == len("ami-02b46c73fed689d1c")) + self.assertTrue(ami is None or ami.startswith("ami-")) - with self.subTest('Test aws_marketplace_flatcar_ami_search() returns an AMI-looking AMI.'): + with self.subTest( + "Test aws_marketplace_flatcar_ami_search() returns an AMI-looking AMI." + ): ami = aws_marketplace_flatcar_ami_search(self.ec2_client) - self.assertEqual(len(ami), len('ami-02b46c73fed689d1c')) - self.assertTrue(ami.startswith('ami-')) + self.assertEqual(len(ami), len("ami-02b46c73fed689d1c")) + self.assertTrue(ami.startswith("ami-")) # TODO: This will fail until https://github.com/flatcar/Flatcar/issues/962 is fixed @pytest.mark.xfail def test_fetch_arm_flatcar(self): """Test flatcar AMI finder architecture parameter.""" amis = set() - for arch in ['amd64', 'arm64']: + for arch in ["amd64", "arm64"]: ami = get_flatcar_ami(self.ec2_client, architecture=arch) - self.assertTrue(ami.startswith('ami-')) + self.assertTrue(ami.startswith("ami-")) amis.add(ami) self.assertTrue(len(amis) == 2) diff --git a/src/toil/test/lib/test_misc.py b/src/toil/test/lib/test_misc.py index 3382219389..87db174282 100644 --- a/src/toil/test/lib/test_misc.py +++ b/src/toil/test/lib/test_misc.py @@ -32,6 +32,7 @@ def test_get_user_name(self): apparent_user_name = get_user_name() self.assertEqual(apparent_user_name, real_user_name) + class UserNameUnvailableTest(ToilTest): """ Make sure we can get something for a user name when user names are not @@ -42,9 +43,12 @@ def setUp(self): super().setUp() # Monkey patch getpass.getuser to fail self.original_getuser = getpass.getuser + def fake_getuser(): - raise KeyError('Fake key error') + raise KeyError("Fake key error") + getpass.getuser = fake_getuser + def tearDown(self): # Fix the module we hacked up getpass.getuser = self.original_getuser @@ -54,7 +58,8 @@ def test_get_user_name(self): apparent_user_name = get_user_name() # Make sure we got something self.assertTrue(isinstance(apparent_user_name, str)) - self.assertNotEqual(apparent_user_name, '') + self.assertNotEqual(apparent_user_name, "") + class UserNameVeryBrokenTest(ToilTest): """ @@ -66,9 +71,12 @@ def setUp(self): super().setUp() # Monkey patch getpass.getuser to fail self.original_getuser = getpass.getuser + def fake_getuser(): - raise RuntimeError('Fake error that we did not anticipate') + raise RuntimeError("Fake error that we did not anticipate") + getpass.getuser = fake_getuser + def tearDown(self): # Fix the module we hacked up getpass.getuser = self.original_getuser @@ -78,5 +86,4 @@ def test_get_user_name(self): apparent_user_name = get_user_name() # Make sure we got something self.assertTrue(isinstance(apparent_user_name, str)) - self.assertNotEqual(apparent_user_name, '') - + self.assertNotEqual(apparent_user_name, "") diff --git a/src/toil/test/mesos/MesosDataStructuresTest.py b/src/toil/test/mesos/MesosDataStructuresTest.py index fef22c1453..cf70d6a87c 100644 --- a/src/toil/test/mesos/MesosDataStructuresTest.py +++ b/src/toil/test/mesos/MesosDataStructuresTest.py @@ -21,15 +21,19 @@ class DataStructuresTest(ToilTest): def _getJob(self, cores=1, memory=1000, disk=5000, preemptible=True): from toil.batchSystems.mesos import MesosShape, ToilJob - resources = MesosShape(wallTime=0, cores=cores, memory=memory, disk=disk, preemptible=preemptible) + resources = MesosShape( + wallTime=0, cores=cores, memory=memory, disk=disk, preemptible=preemptible + ) - job = ToilJob(jobID=str(uuid.uuid4()), - name=str(uuid.uuid4()), - resources=resources, - command="do nothing", - userScript=None, - environment=None, - workerCleanupInfo=None) + job = ToilJob( + jobID=str(uuid.uuid4()), + name=str(uuid.uuid4()), + resources=resources, + command="do nothing", + userScript=None, + environment=None, + workerCleanupInfo=None, + ) return job def testJobQueue(self, testJobs=1000): @@ -39,15 +43,24 @@ def testJobQueue(self, testJobs=1000): non-preemptible jobs groups first, with priority given to large jobs. """ from toil.batchSystems.mesos import JobQueue + jobQueue = JobQueue() for jobNum in range(0, testJobs): - testJob = self._getJob(cores=random.choice(list(range(10))), preemptible=random.choice([True, False])) + testJob = self._getJob( + cores=random.choice(list(range(10))), + preemptible=random.choice([True, False]), + ) jobQueue.insertJob(testJob, testJob.resources) sortedTypes = jobQueue.sortedTypes self.assertGreaterEqual(20, len(sortedTypes)) - self.assertTrue(all(sortedTypes[i] <= sortedTypes[i + 1] for i in range(len(sortedTypes) - 1))) + self.assertTrue( + all( + sortedTypes[i] <= sortedTypes[i + 1] + for i in range(len(sortedTypes) - 1) + ) + ) preemptible = sortedTypes.pop(0).preemptible for jtype in sortedTypes: diff --git a/src/toil/test/mesos/helloWorld.py b/src/toil/test/mesos/helloWorld.py index 6afed019bc..5859493c43 100644 --- a/src/toil/test/mesos/helloWorld.py +++ b/src/toil/test/mesos/helloWorld.py @@ -23,14 +23,15 @@ childMessage = "The child job is now running!" parentMessage = "The parent job is now running!" + def hello_world(job): job.fileStore.log_to_leader(parentMessage) - with open('foo_bam.txt', 'w') as handle: - handle.write('\nThis is a triumph...\n') + with open("foo_bam.txt", "w") as handle: + handle.write("\nThis is a triumph...\n") # Assign FileStoreID to a given file - foo_bam = job.fileStore.writeGlobalFile('foo_bam.txt') + foo_bam = job.fileStore.writeGlobalFile("foo_bam.txt") # Spawn child job.addChildJobFn(hello_world_child, foo_bam, memory=100, cores=0.5, disk="3G") @@ -43,13 +44,13 @@ def hello_world_child(job, hw): # NOTE: path and the udpated file are stored to /tmp # If we want to SAVE our changes to this tmp file, we must write it out. with open(path) as r: - with open('bar_bam.txt', 'w') as handle: + with open("bar_bam.txt", "w") as handle: for line in r.readlines(): handle.write(line) # Assign FileStoreID to a given file # can also use: job.updateGlobalFile() given the FileStoreID instantiation. - job.fileStore.writeGlobalFile('bar_bam.txt') + job.fileStore.writeGlobalFile("bar_bam.txt") def main(): @@ -67,5 +68,5 @@ def main(): toil.start(i) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/toil/test/mesos/stress.py b/src/toil/test/mesos/stress.py index a0e23d5f36..a02eb8fa82 100644 --- a/src/toil/test/mesos/stress.py +++ b/src/toil/test/mesos/stress.py @@ -17,58 +17,63 @@ from toil.job import Job -def touchFile( fileStore ): +def touchFile(fileStore): with fileStore.writeGlobalFileStream() as (f, id): - f.write( "This is a triumph" ) + f.write("This is a triumph") + class LongTestJob(Job): def __init__(self, numJobs): - Job.__init__(self, memory=100000, cores=0.01) + Job.__init__(self, memory=100000, cores=0.01) self.numJobs = numJobs def run(self, fileStore): - for i in range(0,self.numJobs): + for i in range(0, self.numJobs): self.addChild(HelloWorldJob(i)) self.addFollowOn(LongTestFollowOn()) + class LongTestFollowOn(Job): def __init__(self): - Job.__init__(self, memory=1000000, cores=0.01) + Job.__init__(self, memory=1000000, cores=0.01) def run(self, fileStore): - touchFile( fileStore ) + touchFile(fileStore) -class HelloWorldJob(Job): - def __init__(self,i): - Job.__init__(self, memory=100000, cores=0.01) - self.i=i +class HelloWorldJob(Job): + def __init__(self, i): + Job.__init__(self, memory=100000, cores=0.01) + self.i = i def run(self, fileStore): - touchFile( fileStore ) + touchFile(fileStore) self.addFollowOn(HelloWorldFollowOn(self.i)) + class HelloWorldFollowOn(Job): - def __init__(self,i): - Job.__init__(self, memory=200000, cores=0.01) + def __init__(self, i): + Job.__init__(self, memory=200000, cores=0.01) self.i = i def run(self, fileStore): - touchFile( fileStore) + touchFile(fileStore) + def main(numJobs): # Boilerplate -- startToil requires options parser = ArgumentParser() Job.Runner.addToilOptions(parser) - options = parser.parse_args( args=['./toilTest'] ) - options.batchSystem="mesos" - options.mesos_endpoint="localhost:5050" + options = parser.parse_args(args=["./toilTest"]) + options.batchSystem = "mesos" + options.mesos_endpoint = "localhost:5050" # Launch first toil Job - i = LongTestJob( numJobs ) - Job.Runner.startToil(i, options ) + i = LongTestJob(numJobs) + Job.Runner.startToil(i, options) + -if __name__=="__main__": +if __name__ == "__main__": main(numJobs=5) diff --git a/src/toil/test/options/options.py b/src/toil/test/options/options.py index c3c307490c..a5fbd5a761 100644 --- a/src/toil/test/options/options.py +++ b/src/toil/test/options/options.py @@ -1,6 +1,6 @@ from configargparse import ArgParser -from toil.common import addOptions, Toil +from toil.common import Toil, addOptions from toil.test import ToilTest @@ -8,6 +8,7 @@ class OptionsTest(ToilTest): """ Class to test functionality of all Toil options """ + def test_default_caching_slurm(self): """ Test to ensure that caching will be set to false when running on Slurm @@ -30,7 +31,11 @@ def test_caching_option_priority(self): addOptions(parser, jobstore_as_flag=True, wdl=False, cwl=False) # the kubernetes batchsystem (and I think all batchsystems including singlemachine) return False # for default_caching - test_args = ["--jobstore=example-jobstore", "--batchSystem=kubernetes", "--caching=True"] + test_args = [ + "--jobstore=example-jobstore", + "--batchSystem=kubernetes", + "--caching=True", + ] options = parser.parse_args(test_args) with Toil(options) as toil: caching_value = toil.config.caching diff --git a/src/toil/test/provisioners/aws/awsProvisionerTest.py b/src/toil/test/provisioners/aws/awsProvisionerTest.py index 791c7cfeb1..cae25e4dd6 100644 --- a/src/toil/test/provisioners/aws/awsProvisionerTest.py +++ b/src/toil/test/provisioners/aws/awsProvisionerTest.py @@ -19,10 +19,9 @@ from abc import abstractmethod from inspect import getsource from textwrap import dedent -from typing import TYPE_CHECKING, List, Optional +from typing import TYPE_CHECKING, Optional from uuid import uuid4 -import botocore.exceptions import pytest from toil.provisioners import cluster_factory @@ -62,11 +61,13 @@ class AWSProvisionerBenchTest(ToilTest): # Needs to talk to EC2 for image discovery @needs_aws_ec2 def test_AMI_finding(self): - for zone in ['us-west-2a', 'eu-central-1a', 'sa-east-1b']: - provisioner = AWSProvisioner('fakename', 'mesos', zone, 10000, None, None, enable_fuse=False) + for zone in ["us-west-2a", "eu-central-1a", "sa-east-1b"]: + provisioner = AWSProvisioner( + "fakename", "mesos", zone, 10000, None, None, enable_fuse=False + ) ami = provisioner._discoverAMI() # Make sure we got an AMI and it looks plausible - assert(ami.startswith('ami-')) + assert ami.startswith("ami-") @needs_aws_ec2 def test_read_write_global_files(self): @@ -74,8 +75,16 @@ def test_read_write_global_files(self): Make sure the `_write_file_to_cloud()` and `_read_file_from_cloud()` functions of the AWS provisioner work as intended. """ - provisioner = AWSProvisioner(f'aws-provisioner-test-{uuid4()}', 'mesos', 'us-west-2a', 50, None, None, enable_fuse=False) - key = 'config/test.txt' + provisioner = AWSProvisioner( + f"aws-provisioner-test-{uuid4()}", + "mesos", + "us-west-2a", + 50, + None, + None, + enable_fuse=False, + ) + key = "config/test.txt" contents = b"Hello, this is a test." try: @@ -96,23 +105,23 @@ class AbstractAWSAutoscaleTest(AbstractClusterTest): def __init__(self, methodName): super().__init__(methodName=methodName) self.instanceTypes = ["m5a.large"] - self.clusterName = 'aws-provisioner-test-' + str(uuid4()) - self.numWorkers = ['2'] + self.clusterName = "aws-provisioner-test-" + str(uuid4()) + self.numWorkers = ["2"] self.numSamples = 2 self.spotBid = 0.15 # We can't dump our user script right in /tmp or /home, because hot # deploy refuses to zip up those whole directories. So we make sure to # have a subdirectory to upload the script to. - self.scriptDir = '/tmp/t' + self.scriptDir = "/tmp/t" # Where should we put our virtualenv? - self.venvDir = '/tmp/venv' + self.venvDir = "/tmp/venv" # Where should we put our data to work on? # Must exist in the Toil container; the leader will try to rsync to it # (for the SSE key) and not create it. - self.dataDir = '/tmp' + self.dataDir = "/tmp" # What filename should we use for our script (without path)? # Can be changed by derived tests. - self.scriptName = 'test_script.py' + self.scriptName = "test_script.py" def script(self): """ @@ -127,14 +136,25 @@ def data(self, filename): return os.path.join(self.dataDir, filename) def rsyncUtil(self, src, dest): - subprocess.check_call(['toil', 'rsync-cluster', '--insecure', '-p=aws', '-z', self.zone, self.clusterName] + [src, dest]) + subprocess.check_call( + [ + "toil", + "rsync-cluster", + "--insecure", + "-p=aws", + "-z", + self.zone, + self.clusterName, + ] + + [src, dest] + ) def getRootVolID(self) -> str: - instances: List["InstanceTypeDef"] = self.cluster._get_nodes_in_cluster_boto3() + instances: list["InstanceTypeDef"] = self.cluster._get_nodes_in_cluster_boto3() instances.sort(key=lambda x: x.get("LaunchTime")) leader: "InstanceTypeDef" = instances[0] # assume leader was launched first - bdm: Optional[List["InstanceBlockDeviceMappingTypeDef"]] = leader.get( + bdm: Optional[list["InstanceBlockDeviceMappingTypeDef"]] = leader.get( "BlockDeviceMappings" ) assert bdm is not None @@ -142,7 +162,9 @@ def getRootVolID(self) -> str: for device in bdm: if device["DeviceName"] == "/dev/xvda": root_block_device = device["Ebs"] - assert root_block_device is not None # There should be a device named "/dev/xvda" + assert ( + root_block_device is not None + ) # There should be a device named "/dev/xvda" assert root_block_device.get("VolumeId") is not None return root_block_device["VolumeId"] @@ -155,18 +177,19 @@ def putScript(self, content: str): """ Helper method for _getScript to inject a script file at the configured script path, from text. """ - cluster = cluster_factory(provisioner='aws', zone=self.zone, clusterName=self.clusterName) + cluster = cluster_factory( + provisioner="aws", zone=self.zone, clusterName=self.clusterName + ) leader = cluster.getLeader() - self.sshUtil(['mkdir', '-p', self.scriptDir]) + self.sshUtil(["mkdir", "-p", self.scriptDir]) - with tempfile.NamedTemporaryFile(mode='w') as t: + with tempfile.NamedTemporaryFile(mode="w") as t: # use appliance ssh method instead of sshutil so we can specify input param t.write(content) # This works to make writes visible on non-Windows t.flush() - leader.injectFile(t.name, self.script(), 'toil_leader') - + leader.injectFile(t.name, self.script(), "toil_leader") @abstractmethod def _runScript(self, toilOptions): @@ -185,31 +208,41 @@ def _test(self, preemptibleJobs=False): self.launchCluster() # get the leader so we know the IP address - we don't need to wait since create cluster # already insures the leader is running - self.cluster = cluster_factory(provisioner='aws', zone=self.zone, clusterName=self.clusterName) + self.cluster = cluster_factory( + provisioner="aws", zone=self.zone, clusterName=self.clusterName + ) self.leader = self.cluster.getLeader() - self.sshUtil(['mkdir', '-p', self.scriptDir]) - self.sshUtil(['mkdir', '-p', self.dataDir]) + self.sshUtil(["mkdir", "-p", self.scriptDir]) + self.sshUtil(["mkdir", "-p", self.dataDir]) assert len(self.cluster._getRoleNames()) == 1 # --never-download prevents silent upgrades to pip, wheel and setuptools - venv_command = ['virtualenv', '--system-site-packages', '--python', exactPython, '--never-download', self.venvDir] + venv_command = [ + "virtualenv", + "--system-site-packages", + "--python", + exactPython, + "--never-download", + self.venvDir, + ] self.sshUtil(venv_command) - log.info('Set up script...') + log.info("Set up script...") self._getScript() - toilOptions = [self.jobStore, - '--workDir=/var/lib/toil', - '--clean=always', - '--retryCount=2', - '--logDebug', - '--logFile=' + os.path.join(self.scriptDir, 'sort.log') - ] + toilOptions = [ + self.jobStore, + "--workDir=/var/lib/toil", + "--clean=always", + "--retryCount=2", + "--logDebug", + "--logFile=" + os.path.join(self.scriptDir, "sort.log"), + ] if preemptibleJobs: - toilOptions.extend(['--defaultPreemptible']) + toilOptions.extend(["--defaultPreemptible"]) - log.info('Run script...') + log.info("Run script...") self._runScript(toilOptions) assert len(self.cluster._getRoleNames()) == 1 @@ -218,7 +251,7 @@ def _test(self, preemptibleJobs=False): self.cluster.destroyCluster() boto3_ec2: "EC2Client" = self.aws.client(region=self.region, service_name="ec2") volume_filter: "FilterTypeDef" = {"Name": "volume-id", "Values": [volumeID]} - volumes: Optional[List["VolumeTypeDef"]] = None + volumes: Optional[list["VolumeTypeDef"]] = None for attempt in range(6): # https://github.com/BD2KGenomics/toil/issues/1567 # retry this for up to 1 minute until the volume disappears @@ -228,7 +261,7 @@ def _test(self, preemptibleJobs=False): break time.sleep(10) if volumes is None or len(volumes) > 0: - self.fail('Volume with ID %s was not cleaned up properly' % volumeID) + self.fail("Volume with ID %s was not cleaned up properly" % volumeID) assert len(self.cluster._getRoleNames()) == 0 @@ -239,34 +272,49 @@ def _test(self, preemptibleJobs=False): class AWSAutoscaleTest(AbstractAWSAutoscaleTest): def __init__(self, name): super().__init__(name) - self.clusterName = 'provisioner-test-' + str(uuid4()) + self.clusterName = "provisioner-test-" + str(uuid4()) self.requestedLeaderStorage = 80 - self.scriptName = 'sort.py' + self.scriptName = "sort.py" def setUp(self): super().setUp() - self.jobStore = f'aws:{self.awsRegion()}:autoscale-{uuid4()}' + self.jobStore = f"aws:{self.awsRegion()}:autoscale-{uuid4()}" def _getScript(self): fileToSort = os.path.join(os.getcwd(), str(uuid4())) - with open(fileToSort, 'w') as f: + with open(fileToSort, "w") as f: # Fixme: making this file larger causes the test to hang - f.write('01234567890123456789012345678901') - self.rsyncUtil(os.path.join(self._projectRootPath(), 'src/toil/test/sort/sort.py'), ':' + self.script()) - self.rsyncUtil(fileToSort, ':' + self.data('sortFile')) + f.write("01234567890123456789012345678901") + self.rsyncUtil( + os.path.join(self._projectRootPath(), "src/toil/test/sort/sort.py"), + ":" + self.script(), + ) + self.rsyncUtil(fileToSort, ":" + self.data("sortFile")) os.unlink(fileToSort) def _runScript(self, toilOptions): - toilOptions.extend(['--provisioner=aws', '--batchSystem=mesos', - '--nodeTypes=' + ",".join(self.instanceTypes), - '--maxNodes=' + ",".join(self.numWorkers)]) - runCommand = [self.python(), self.script(), '--fileToSort=' + self.data('sortFile'), '--sseKey=' + self.data('sortFile')] + toilOptions.extend( + [ + "--provisioner=aws", + "--batchSystem=mesos", + "--nodeTypes=" + ",".join(self.instanceTypes), + "--maxNodes=" + ",".join(self.numWorkers), + ] + ) + runCommand = [ + self.python(), + self.script(), + "--fileToSort=" + self.data("sortFile"), + "--sseKey=" + self.data("sortFile"), + ] runCommand.extend(toilOptions) self.sshUtil(runCommand) def launchCluster(self): # add arguments to test that we can specify leader storage - self.createClusterUtil(args=['--leaderStorage', str(self.requestedLeaderStorage)]) + self.createClusterUtil( + args=["--leaderStorage", str(self.requestedLeaderStorage)] + ) def getRootVolID(self) -> str: """ @@ -289,21 +337,21 @@ def getRootVolID(self) -> str: @needs_aws_ec2 def testAutoScale(self): self.instanceTypes = ["m5a.large"] - self.numWorkers = ['2'] + self.numWorkers = ["2"] self._test() @integrative @needs_aws_ec2 def testSpotAutoScale(self): self.instanceTypes = ["m5a.large:%f" % self.spotBid] - self.numWorkers = ['2'] + self.numWorkers = ["2"] self._test(preemptibleJobs=True) @integrative @needs_aws_ec2 def testSpotAutoScaleBalancingTypes(self): self.instanceTypes = ["m5.large/m5a.large:%f" % self.spotBid] - self.numWorkers = ['2'] + self.numWorkers = ["2"] self._test(preemptibleJobs=True) @@ -312,23 +360,35 @@ def testSpotAutoScaleBalancingTypes(self): @pytest.mark.timeout(2400) class AWSStaticAutoscaleTest(AWSAutoscaleTest): """Runs the tests on a statically provisioned cluster with autoscaling enabled.""" + def __init__(self, name): super().__init__(name) self.requestedNodeStorage = 20 def launchCluster(self): from toil.lib.ec2 import wait_instances_running - self.createClusterUtil(args=['--leaderStorage', str(self.requestedLeaderStorage), - '--nodeTypes', ",".join(self.instanceTypes), - '-w', ",".join(self.numWorkers), - '--nodeStorage', str(self.requestedLeaderStorage)]) - self.cluster = cluster_factory(provisioner='aws', zone=self.zone, clusterName=self.clusterName) + self.createClusterUtil( + args=[ + "--leaderStorage", + str(self.requestedLeaderStorage), + "--nodeTypes", + ",".join(self.instanceTypes), + "-w", + ",".join(self.numWorkers), + "--nodeStorage", + str(self.requestedLeaderStorage), + ] + ) + + self.cluster = cluster_factory( + provisioner="aws", zone=self.zone, clusterName=self.clusterName + ) # We need to wait a little bit here because the workers might not be # visible to EC2 read requests immediately after the create returns, # which is the last thing that starting the cluster does. time.sleep(10) - nodes: List["InstanceTypeDef"] = self.cluster._get_nodes_in_cluster_boto3() + nodes: list["InstanceTypeDef"] = self.cluster._get_nodes_in_cluster_boto3() nodes.sort(key=lambda x: x.get("LaunchTime")) # assuming that leader is first workers = nodes[1:] @@ -341,7 +401,7 @@ def launchCluster(self): worker: "InstanceTypeDef" = next(wait_instances_running(boto3_ec2, [worker])) - bdm: Optional[List["InstanceBlockDeviceMappingTypeDef"]] = worker.get( + bdm: Optional[list["InstanceBlockDeviceMappingTypeDef"]] = worker.get( "BlockDeviceMappings" ) assert bdm is not None @@ -350,7 +410,9 @@ def launchCluster(self): if device["DeviceName"] == "/dev/xvda": root_block_device = device["Ebs"] assert root_block_device is not None - assert root_block_device.get("VolumeId") is not None # TypedDicts cannot have runtime type checks + assert ( + root_block_device.get("VolumeId") is not None + ) # TypedDicts cannot have runtime type checks volume_filter: "FilterTypeDef" = { "Name": "volume-id", @@ -365,10 +427,19 @@ def launchCluster(self): def _runScript(self, toilOptions): # Autoscale even though we have static nodes - toilOptions.extend(['--provisioner=aws', '--batchSystem=mesos', - '--nodeTypes=' + ",".join(self.instanceTypes), - '--maxNodes=' + ",".join(self.numWorkers)]) - runCommand = [self.python(), self.script(), '--fileToSort=' + self.data('sortFile')] + toilOptions.extend( + [ + "--provisioner=aws", + "--batchSystem=mesos", + "--nodeTypes=" + ",".join(self.instanceTypes), + "--maxNodes=" + ",".join(self.numWorkers), + ] + ) + runCommand = [ + self.python(), + self.script(), + "--fileToSort=" + self.data("sortFile"), + ] runCommand.extend(toilOptions) self.sshUtil(runCommand) @@ -377,23 +448,39 @@ def _runScript(self, toilOptions): @pytest.mark.timeout(1200) class AWSManagedAutoscaleTest(AWSAutoscaleTest): """Runs the tests on a self-scaling Kubernetes cluster.""" + def __init__(self, name): super().__init__(name) self.requestedNodeStorage = 20 def launchCluster(self): - self.createClusterUtil(args=['--leaderStorage', str(self.requestedLeaderStorage), - '--nodeTypes', ",".join(self.instanceTypes), - '--workers', ",".join([f'0-{c}' for c in self.numWorkers]), - '--nodeStorage', str(self.requestedLeaderStorage), - '--clusterType', 'kubernetes']) + self.createClusterUtil( + args=[ + "--leaderStorage", + str(self.requestedLeaderStorage), + "--nodeTypes", + ",".join(self.instanceTypes), + "--workers", + ",".join([f"0-{c}" for c in self.numWorkers]), + "--nodeStorage", + str(self.requestedLeaderStorage), + "--clusterType", + "kubernetes", + ] + ) - self.cluster = cluster_factory(provisioner='aws', zone=self.zone, clusterName=self.clusterName) + self.cluster = cluster_factory( + provisioner="aws", zone=self.zone, clusterName=self.clusterName + ) def _runScript(self, toilOptions): # Don't use the provisioner, and use Kubernetes instead of Mesos - toilOptions.extend(['--batchSystem=kubernetes']) - runCommand = [self.python(), self.script(), '--fileToSort=' + self.data('sortFile')] + toilOptions.extend(["--batchSystem=kubernetes"]) + runCommand = [ + self.python(), + self.script(), + "--fileToSort=" + self.data("sortFile"), + ] runCommand.extend(toilOptions) self.sshUtil(runCommand) @@ -404,37 +491,51 @@ def _runScript(self, toilOptions): class AWSAutoscaleTestMultipleNodeTypes(AbstractAWSAutoscaleTest): def __init__(self, name): super().__init__(name) - self.clusterName = 'provisioner-test-' + str(uuid4()) + self.clusterName = "provisioner-test-" + str(uuid4()) def setUp(self): super().setUp() - self.jobStore = f'aws:{self.awsRegion()}:autoscale-{uuid4()}' + self.jobStore = f"aws:{self.awsRegion()}:autoscale-{uuid4()}" def _getScript(self): - sseKeyFile = os.path.join(os.getcwd(), 'keyFile') - with open(sseKeyFile, 'w') as f: - f.write('01234567890123456789012345678901') - self.rsyncUtil(os.path.join(self._projectRootPath(), 'src/toil/test/sort/sort.py'), ':' + self.script()) - self.rsyncUtil(sseKeyFile, ':' + self.data('keyFile')) + sseKeyFile = os.path.join(os.getcwd(), "keyFile") + with open(sseKeyFile, "w") as f: + f.write("01234567890123456789012345678901") + self.rsyncUtil( + os.path.join(self._projectRootPath(), "src/toil/test/sort/sort.py"), + ":" + self.script(), + ) + self.rsyncUtil(sseKeyFile, ":" + self.data("keyFile")) os.unlink(sseKeyFile) def _runScript(self, toilOptions): # Set memory requirements so that sort jobs can be run # on small instances, but merge jobs must be run on large # instances - toilOptions.extend(['--provisioner=aws', '--batchSystem=mesos', - '--nodeTypes=' + ",".join(self.instanceTypes), - '--maxNodes=' + ",".join(self.numWorkers)]) - runCommand = [self.python(), self.script(), '--fileToSort=/home/s3am/bin/asadmin', '--sortMemory=0.6G', '--mergeMemory=3.0G'] + toilOptions.extend( + [ + "--provisioner=aws", + "--batchSystem=mesos", + "--nodeTypes=" + ",".join(self.instanceTypes), + "--maxNodes=" + ",".join(self.numWorkers), + ] + ) + runCommand = [ + self.python(), + self.script(), + "--fileToSort=/home/s3am/bin/asadmin", + "--sortMemory=0.6G", + "--mergeMemory=3.0G", + ] runCommand.extend(toilOptions) - runCommand.append('--sseKey=' + self.data('keyFile')) + runCommand.append("--sseKey=" + self.data("keyFile")) self.sshUtil(runCommand) @integrative @needs_aws_ec2 def testAutoScale(self): self.instanceTypes = ["t2.small", "m5a.large"] - self.numWorkers = ['2', '1'] + self.numWorkers = ["2", "1"] self._test() @@ -443,16 +544,17 @@ def testAutoScale(self): @pytest.mark.timeout(1200) class AWSRestartTest(AbstractAWSAutoscaleTest): """This test insures autoscaling works on a restarted Toil run.""" + def __init__(self, name): super().__init__(name) - self.clusterName = 'restart-test-' + str(uuid4()) - self.scriptName = 'restartScript.py' + self.clusterName = "restart-test-" + str(uuid4()) + self.scriptName = "restartScript.py" def setUp(self): super().setUp() - self.instanceTypes = ['t2.small'] - self.numWorkers = ['1'] - self.jobStore = f'aws:{self.awsRegion()}:restart-{uuid4()}' + self.instanceTypes = ["t2.small"] + self.numWorkers = ["1"] + self.jobStore = f"aws:{self.awsRegion()}:restart-{uuid4()}" def _getScript(self): def restartScript(): @@ -463,38 +565,56 @@ def restartScript(): from toil.job import Job def f0(job): - if 'FAIL' in os.environ: - raise RuntimeError('failed on purpose') + if "FAIL" in os.environ: + raise RuntimeError("failed on purpose") - if __name__ == '__main__': + if __name__ == "__main__": parser = ArgumentParser() Job.Runner.addToilOptions(parser) options = parser.parse_args() - rootJob = Job.wrapJobFn(f0, cores=0.5, memory='50 M', disk='50 M') + rootJob = Job.wrapJobFn(f0, cores=0.5, memory="50 M", disk="50 M") Job.Runner.startToil(rootJob, options) - script = dedent('\n'.join(getsource(restartScript).split('\n')[1:])) + script = dedent("\n".join(getsource(restartScript).split("\n")[1:])) self.putScript(script) def _runScript(self, toilOptions): # Use the provisioner in the workflow - toilOptions.extend(['--provisioner=aws', '--batchSystem=mesos', - '--nodeTypes=' + ",".join(self.instanceTypes), - '--maxNodes=' + ",".join(self.numWorkers)]) + toilOptions.extend( + [ + "--provisioner=aws", + "--batchSystem=mesos", + "--nodeTypes=" + ",".join(self.instanceTypes), + "--maxNodes=" + ",".join(self.numWorkers), + ] + ) # clean = onSuccess - disallowedOptions = ['--clean=always', '--retryCount=2'] - newOptions = [option for option in toilOptions if option not in disallowedOptions] + disallowedOptions = ["--clean=always", "--retryCount=2"] + newOptions = [ + option for option in toilOptions if option not in disallowedOptions + ] try: # include a default memory - on restart the minimum memory requirement is the default, usually 2 GB - command = [self.python(), self.script(), '--setEnv', 'FAIL=true', '--defaultMemory=50000000'] + command = [ + self.python(), + self.script(), + "--setEnv", + "FAIL=true", + "--defaultMemory=50000000", + ] command.extend(newOptions) self.sshUtil(command) except subprocess.CalledProcessError: pass else: - self.fail('Command succeeded when we expected failure') + self.fail("Command succeeded when we expected failure") with timeLimit(600): - command = [self.python(), self.script(), '--restart', '--defaultMemory=50000000'] + command = [ + self.python(), + self.script(), + "--restart", + "--defaultMemory=50000000", + ] command.extend(toilOptions) self.sshUtil(command) @@ -508,14 +628,17 @@ def testAutoScaledCluster(self): class PreemptibleDeficitCompensationTest(AbstractAWSAutoscaleTest): def __init__(self, name): super().__init__(name) - self.clusterName = 'deficit-test-' + str(uuid4()) - self.scriptName = 'userScript.py' + self.clusterName = "deficit-test-" + str(uuid4()) + self.scriptName = "userScript.py" def setUp(self): super().setUp() - self.instanceTypes = ['m5a.large:0.01', "m5a.large"] # instance needs to be available on the spot market - self.numWorkers = ['1', '1'] - self.jobStore = f'aws:{self.awsRegion()}:deficit-{uuid4()}' + self.instanceTypes = [ + "m5a.large:0.01", + "m5a.large", + ] # instance needs to be available on the spot market + self.numWorkers = ["1", "1"] + self.jobStore = f"aws:{self.awsRegion()}:deficit-{uuid4()}" def test(self): self._test(preemptibleJobs=True) @@ -532,10 +655,10 @@ def userScript(): # we will observe a deficit of preemptible nodes that the non-preemptible scaler will # compensate for by spinning up non-preemptible nodes instead. # - def job(job, disk='10M', cores=1, memory='10M', preemptible=True): + def job(job, disk="10M", cores=1, memory="10M", preemptible=True): pass - if __name__ == '__main__': + if __name__ == "__main__": options = Job.Runner.getDefaultArgumentParser().parse_args() with Toil(options) as toil: if toil.config.restart: @@ -543,14 +666,19 @@ def job(job, disk='10M', cores=1, memory='10M', preemptible=True): else: toil.start(Job.wrapJobFn(job)) - script = dedent('\n'.join(getsource(userScript).split('\n')[1:])) + script = dedent("\n".join(getsource(userScript).split("\n")[1:])) self.putScript(script) def _runScript(self, toilOptions): - toilOptions.extend(['--provisioner=aws', '--batchSystem=mesos', - '--nodeTypes=' + ",".join(self.instanceTypes), - '--maxNodes=' + ",".join(self.numWorkers)]) - toilOptions.extend(['--preemptibleCompensation=1.0']) + toilOptions.extend( + [ + "--provisioner=aws", + "--batchSystem=mesos", + "--nodeTypes=" + ",".join(self.instanceTypes), + "--maxNodes=" + ",".join(self.numWorkers), + ] + ) + toilOptions.extend(["--preemptibleCompensation=1.0"]) command = [self.python(), self.script()] command.extend(toilOptions) self.sshUtil(command) diff --git a/src/toil/test/provisioners/clusterScalerTest.py b/src/toil/test/provisioners/clusterScalerTest.py index 7556a0aae5..dc7cb4e7eb 100644 --- a/src/toil/test/provisioners/clusterScalerTest.py +++ b/src/toil/test/provisioners/clusterScalerTest.py @@ -21,62 +21,55 @@ from collections import defaultdict from queue import Empty, Queue from threading import Event, Thread -from typing import List, Optional, Set, Tuple +from typing import Optional from unittest.mock import MagicMock -from toil.batchSystems.abstractBatchSystem import (AbstractBatchSystem, - AbstractScalableBatchSystem, - NodeInfo) +from toil.batchSystems.abstractBatchSystem import ( + AbstractBatchSystem, + AbstractScalableBatchSystem, + NodeInfo, +) from toil.common import Config -from toil.options.common import defaultTargetTime from toil.job import JobDescription from toil.lib.conversions import human2bytes as h2b +from toil.options.common import defaultTargetTime from toil.provisioners.abstractProvisioner import AbstractProvisioner, Shape -from toil.provisioners.clusterScaler import (BinPackedFit, - ClusterScaler, - NodeReservation, - ScalerThread) +from toil.provisioners.clusterScaler import ( + BinPackedFit, + ClusterScaler, + NodeReservation, + ScalerThread, +) from toil.provisioners.node import Node from toil.test import ToilTest, slow logger = logging.getLogger(__name__) # simplified c4.8xlarge (preemptible) -c4_8xlarge_preemptible = Shape(wallTime=3600, - memory=h2b('60G'), - cores=36, - disk=h2b('100G'), - preemptible=True) +c4_8xlarge_preemptible = Shape( + wallTime=3600, memory=h2b("60G"), cores=36, disk=h2b("100G"), preemptible=True +) # simplified c4.8xlarge (non-preemptible) -c4_8xlarge = Shape(wallTime=3600, - memory=h2b('60G'), - cores=36, - disk=h2b('100G'), - preemptible=False) +c4_8xlarge = Shape( + wallTime=3600, memory=h2b("60G"), cores=36, disk=h2b("100G"), preemptible=False +) # simplified r3.8xlarge (non-preemptible) -r3_8xlarge = Shape(wallTime=3600, - memory=h2b('260G'), - cores=32, - disk=h2b('600G'), - preemptible=False) +r3_8xlarge = Shape( + wallTime=3600, memory=h2b("260G"), cores=32, disk=h2b("600G"), preemptible=False +) # simplified r5.2xlarge (non-preemptible) -r5_2xlarge = Shape(wallTime=3600, - memory=h2b('64Gi'), - cores=8, - disk=h2b('50G'), - preemptible=False) +r5_2xlarge = Shape( + wallTime=3600, memory=h2b("64Gi"), cores=8, disk=h2b("50G"), preemptible=False +) # simplified r5.4xlarge (non-preemptible) -r5_4xlarge = Shape(wallTime=3600, - memory=h2b('128Gi'), - cores=16, - disk=h2b('50G'), - preemptible=False) +r5_4xlarge = Shape( + wallTime=3600, memory=h2b("128Gi"), cores=16, disk=h2b("50G"), preemptible=False +) # simplified t2.micro (non-preemptible) -t2_micro = Shape(wallTime=3600, - memory=h2b('1G'), - cores=1, - disk=h2b('8G'), - preemptible=False) +t2_micro = Shape( + wallTime=3600, memory=h2b("1G"), cores=1, disk=h2b("8G"), preemptible=False +) + class BinPackingTest(ToilTest): def setUp(self): @@ -85,56 +78,104 @@ def setUp(self): def testPackingOneShape(self): """Pack one shape and check that the resulting reservations look sane.""" - self.bpf.nodeReservations[c4_8xlarge_preemptible] = [NodeReservation(c4_8xlarge_preemptible)] - self.bpf.addJobShape(Shape(wallTime=1000, - cores=2, - memory=h2b('1G'), - disk=h2b('2G'), - preemptible=True)) + self.bpf.nodeReservations[c4_8xlarge_preemptible] = [ + NodeReservation(c4_8xlarge_preemptible) + ] + self.bpf.addJobShape( + Shape( + wallTime=1000, + cores=2, + memory=h2b("1G"), + disk=h2b("2G"), + preemptible=True, + ) + ) self.assertEqual(self.bpf.nodeReservations[r3_8xlarge], []) - self.assertEqual([x.shapes() for x in self.bpf.nodeReservations[c4_8xlarge_preemptible]], - [[Shape(wallTime=1000, - memory=h2b('59G'), - cores=34, - disk=h2b('98G'), - preemptible=True), - Shape(wallTime=2600, - memory=h2b('60G'), - cores=36, - disk=h2b('100G'), - preemptible=True)]]) + self.assertEqual( + [x.shapes() for x in self.bpf.nodeReservations[c4_8xlarge_preemptible]], + [ + [ + Shape( + wallTime=1000, + memory=h2b("59G"), + cores=34, + disk=h2b("98G"), + preemptible=True, + ), + Shape( + wallTime=2600, + memory=h2b("60G"), + cores=36, + disk=h2b("100G"), + preemptible=True, + ), + ] + ], + ) def testSorting(self): """ Test that sorting is correct: preemptible, then memory, then cores, then disk, then wallTime. """ - shapeList = [c4_8xlarge_preemptible, r3_8xlarge, c4_8xlarge, c4_8xlarge, - t2_micro, t2_micro, c4_8xlarge, r3_8xlarge, r3_8xlarge, t2_micro] + shapeList = [ + c4_8xlarge_preemptible, + r3_8xlarge, + c4_8xlarge, + c4_8xlarge, + t2_micro, + t2_micro, + c4_8xlarge, + r3_8xlarge, + r3_8xlarge, + t2_micro, + ] shapeList.sort() - assert shapeList == [c4_8xlarge_preemptible, - t2_micro, t2_micro, t2_micro, - c4_8xlarge, c4_8xlarge, c4_8xlarge, - r3_8xlarge, r3_8xlarge, r3_8xlarge] + assert shapeList == [ + c4_8xlarge_preemptible, + t2_micro, + t2_micro, + t2_micro, + c4_8xlarge, + c4_8xlarge, + c4_8xlarge, + r3_8xlarge, + r3_8xlarge, + r3_8xlarge, + ] def testAddingInitialNode(self): """Pack one shape when no nodes are available and confirm that we fit one node properly.""" - self.bpf.addJobShape(Shape(wallTime=1000, - cores=2, - memory=h2b('1G'), - disk=h2b('2G'), - preemptible=True)) - self.assertEqual([x.shapes() for x in self.bpf.nodeReservations[c4_8xlarge_preemptible]], - [[Shape(wallTime=1000, - memory=h2b('59G'), - cores=34, - disk=h2b('98G'), - preemptible=True), - Shape(wallTime=2600, - memory=h2b('60G'), - cores=36, - disk=h2b('100G'), - preemptible=True)]]) + self.bpf.addJobShape( + Shape( + wallTime=1000, + cores=2, + memory=h2b("1G"), + disk=h2b("2G"), + preemptible=True, + ) + ) + self.assertEqual( + [x.shapes() for x in self.bpf.nodeReservations[c4_8xlarge_preemptible]], + [ + [ + Shape( + wallTime=1000, + memory=h2b("59G"), + cores=34, + disk=h2b("98G"), + preemptible=True, + ), + Shape( + wallTime=2600, + memory=h2b("60G"), + cores=36, + disk=h2b("100G"), + preemptible=True, + ), + ] + ], + ) def testLowTargetTime(self): """ @@ -150,11 +191,13 @@ def testLowTargetTime(self): Each job is parametrized to take 300 seconds, so (the minimum of) 1 of them should fit into each node's 0 second window, so we expect 1000 nodes. """ - allocation = self.run1000JobsOnMicros(jobCores=1, - jobMem=h2b('1G'), - jobDisk=h2b('1G'), - jobTime=300, - globalTargetTime=0) + allocation = self.run1000JobsOnMicros( + jobCores=1, + jobMem=h2b("1G"), + jobDisk=h2b("1G"), + jobTime=300, + globalTargetTime=0, + ) self.assertEqual(allocation, {t2_micro: 1000}) def testHighTargetTime(self): @@ -170,11 +213,13 @@ def testHighTargetTime(self): Each job is parametrized to take 300 seconds, so 12 of them should fit into each node's 3600 second window. 1000/12 = 83.33, so we expect 84 nodes. """ - allocation = self.run1000JobsOnMicros(jobCores=1, - jobMem=h2b('1G'), - jobDisk=h2b('1G'), - jobTime=300, - globalTargetTime=3600) + allocation = self.run1000JobsOnMicros( + jobCores=1, + jobMem=h2b("1G"), + jobDisk=h2b("1G"), + jobTime=300, + globalTargetTime=3600, + ) self.assertEqual(allocation, {t2_micro: 84}) def testZeroResourceJobs(self): @@ -188,11 +233,9 @@ def testZeroResourceJobs(self): Since all jobs should pack cpu/disk/mem-wise on a t2.micro, we expect only one t2.micro to be provisioned. If we raise this, as in testLowTargetTime, it will launch 1000 t2.micros. """ - allocation = self.run1000JobsOnMicros(jobCores=0, - jobMem=0, - jobDisk=0, - jobTime=300, - globalTargetTime=0) + allocation = self.run1000JobsOnMicros( + jobCores=0, jobMem=0, jobDisk=0, jobTime=300, globalTargetTime=0 + ) self.assertEqual(allocation, {t2_micro: 1}) def testLongRunningJobs(self): @@ -206,11 +249,13 @@ def testLongRunningJobs(self): Despite setting globalTargetTime=3600, this should launch 1000 t2.micros because each job's estimated runtime (30000 seconds) extends well beyond 3600 seconds. """ - allocation = self.run1000JobsOnMicros(jobCores=1, - jobMem=h2b('1G'), - jobDisk=h2b('1G'), - jobTime=30000, - globalTargetTime=3600) + allocation = self.run1000JobsOnMicros( + jobCores=1, + jobMem=h2b("1G"), + jobDisk=h2b("1G"), + jobTime=30000, + globalTargetTime=3600, + ) self.assertEqual(allocation, {t2_micro: 1000}) def run1000JobsOnMicros(self, jobCores, jobMem, jobDisk, jobTime, globalTargetTime): @@ -221,11 +266,15 @@ def run1000JobsOnMicros(self, jobCores, jobMem, jobDisk, jobTime, globalTargetTi bpf = BinPackedFit(node_shapes_for_testing, targetTime=globalTargetTime) for _ in range(1000): - bpf.addJobShape(Shape(wallTime=jobTime, - memory=jobMem, - cores=jobCores, - disk=jobDisk, - preemptible=False)) + bpf.addJobShape( + Shape( + wallTime=jobTime, + memory=jobMem, + cores=jobCores, + disk=jobDisk, + preemptible=False, + ) + ) return bpf.getRequiredNodes() def testPathologicalCase(self): @@ -238,20 +287,30 @@ def testPathologicalCase(self): the future. """ # Add one job that partially fills an r3.8xlarge for 1000 hours - self.bpf.addJobShape(Shape(wallTime=3600000, - memory=h2b('10G'), - cores=0, - disk=h2b('10G'), - preemptible=False)) + self.bpf.addJobShape( + Shape( + wallTime=3600000, + memory=h2b("10G"), + cores=0, + disk=h2b("10G"), + preemptible=False, + ) + ) for _ in range(500): # Add 500 CPU-hours worth of jobs that fill an r3.8xlarge - self.bpf.addJobShape(Shape(wallTime=3600, - memory=h2b('26G'), - cores=32, - disk=h2b('60G'), - preemptible=False)) + self.bpf.addJobShape( + Shape( + wallTime=3600, + memory=h2b("26G"), + cores=32, + disk=h2b("60G"), + preemptible=False, + ) + ) # Hopefully we didn't assign just one node to cover all those jobs. - self.assertNotEqual(self.bpf.getRequiredNodes(), {r3_8xlarge: 1, c4_8xlarge_preemptible: 0}) + self.assertNotEqual( + self.bpf.getRequiredNodes(), {r3_8xlarge: 1, c4_8xlarge_preemptible: 0} + ) def testJobTooLargeForAllNodes(self): """ @@ -259,14 +318,17 @@ def testJobTooLargeForAllNodes(self): warning, but definitely not crash. """ # Takes more RAM than an r3.8xlarge - largerThanR3 = Shape(wallTime=3600, - memory=h2b('360G'), - cores=32, - disk=h2b('600G'), - preemptible=False) + largerThanR3 = Shape( + wallTime=3600, + memory=h2b("360G"), + cores=32, + disk=h2b("600G"), + preemptible=False, + ) self.bpf.addJobShape(largerThanR3) # If we got here we didn't crash. + class ClusterScalerTest(ToilTest): def setUp(self): super().setUp() @@ -279,7 +341,9 @@ def setUp(self): # It is also a full mock provisioner, so configure it to be that as well self.provisioner = self.leader # Pretend that Shapes are actually strings we can use for instance type names. - self.provisioner.setAutoscaledNodeTypes([({t}, None) for t in self.config.nodeTypes]) + self.provisioner.setAutoscaledNodeTypes( + [({t}, None) for t in self.config.nodeTypes] + ) def testRounding(self): """ @@ -299,8 +363,8 @@ def testRounding(self): self.assertEqual(scaler._round(123456789101112.13), 123456789101112) # Decimals other than X.5 round to the side they are closer to - self.assertEqual(scaler._round(1E-10), 0) - self.assertEqual(scaler._round(0.5 + 1E-15), 1) + self.assertEqual(scaler._round(1e-10), 0) + self.assertEqual(scaler._round(0.5 + 1e-15), 1) self.assertEqual(scaler._round(-0.9), -1) self.assertEqual(scaler._round(-0.4), 0) @@ -322,17 +386,30 @@ def testMaxNodes(self): self.config.betaInertia = 0.0 self.config.maxNodes = [2, 3] scaler = ClusterScaler(self.provisioner, self.leader, self.config) - jobShapes = [Shape(wallTime=3600, - cores=2, - memory=h2b('1G'), - disk=h2b('2G'), - preemptible=True)] * 1000 - jobShapes.extend([Shape(wallTime=3600, - cores=2, - memory=h2b('1G'), - disk=h2b('2G'), - preemptible=False)] * 1000) - estimatedNodeCounts, could_not_fit = scaler.getEstimatedNodeCounts(jobShapes, defaultdict(int)) + jobShapes = [ + Shape( + wallTime=3600, + cores=2, + memory=h2b("1G"), + disk=h2b("2G"), + preemptible=True, + ) + ] * 1000 + jobShapes.extend( + [ + Shape( + wallTime=3600, + cores=2, + memory=h2b("1G"), + disk=h2b("2G"), + preemptible=False, + ) + ] + * 1000 + ) + estimatedNodeCounts, could_not_fit = scaler.getEstimatedNodeCounts( + jobShapes, defaultdict(int) + ) self.assertEqual(estimatedNodeCounts[r3_8xlarge], 2) self.assertEqual(estimatedNodeCounts[c4_8xlarge_preemptible], 3) self.assertEqual(len(could_not_fit), 0) @@ -345,7 +422,9 @@ def testMinNodes(self): self.config.minNodes = [2, 3] scaler = ClusterScaler(self.provisioner, self.leader, self.config) jobShapes = [] - estimatedNodeCounts, could_not_fit = scaler.getEstimatedNodeCounts(jobShapes, defaultdict(int)) + estimatedNodeCounts, could_not_fit = scaler.getEstimatedNodeCounts( + jobShapes, defaultdict(int) + ) self.assertEqual(estimatedNodeCounts[r3_8xlarge], 2) self.assertEqual(estimatedNodeCounts[c4_8xlarge_preemptible], 3) self.assertEqual(len(could_not_fit), 0) @@ -367,7 +446,9 @@ def testPreemptibleDeficitResponse(self): # the same type. That is the only situation where # preemptibleCompensation applies. self.config.nodeTypes = [c4_8xlarge_preemptible, c4_8xlarge] - self.provisioner.setAutoscaledNodeTypes([({t}, None) for t in self.config.nodeTypes]) + self.provisioner.setAutoscaledNodeTypes( + [({t}, None) for t in self.config.nodeTypes] + ) scaler = ClusterScaler(self.provisioner, self.leader, self.config) # Simulate a situation where a previous run caused a @@ -375,16 +456,24 @@ def testPreemptibleDeficitResponse(self): scaler.preemptibleNodeDeficit[c4_8xlarge] = 5 # Add a bunch of preemptible jobs (so the bin-packing # estimate for the non-preemptible node should still be 0) - jobShapes = [Shape(wallTime=3600, - cores=2, - memory=h2b('1G'), - disk=h2b('2G'), - preemptible=True)] * 1000 - estimatedNodeCounts, could_not_fit = scaler.getEstimatedNodeCounts(jobShapes, defaultdict(int)) + jobShapes = [ + Shape( + wallTime=3600, + cores=2, + memory=h2b("1G"), + disk=h2b("2G"), + preemptible=True, + ) + ] * 1000 + estimatedNodeCounts, could_not_fit = scaler.getEstimatedNodeCounts( + jobShapes, defaultdict(int) + ) # We don't care about the estimated size of the preemptible # nodes. All we want to know is if we responded to the deficit # properly: 0.5 * 5 (preemptibleCompensation * the deficit) = 3 (rounded up). - self.assertEqual(estimatedNodeCounts[self.provisioner.node_shapes_for_testing[1]], 3) + self.assertEqual( + estimatedNodeCounts[self.provisioner.node_shapes_for_testing[1]], 3 + ) self.assertEqual(len(could_not_fit), 0) def testPreemptibleDeficitIsSet(self): @@ -404,7 +493,9 @@ def testPreemptibleDeficitIsSet(self): # the same type. That is the only situation where # preemptibleCompensation applies. self.config.nodeTypes = [c4_8xlarge_preemptible, c4_8xlarge] - self.provisioner.setAutoscaledNodeTypes([({t}, None) for t in self.config.nodeTypes]) + self.provisioner.setAutoscaledNodeTypes( + [({t}, None) for t in self.config.nodeTypes] + ) scaler = ClusterScaler(self.provisioner, self.leader, self.config) estimatedNodeCounts = {c4_8xlarge_preemptible: 5, c4_8xlarge: 0} scaler.updateClusterSize(estimatedNodeCounts) @@ -427,18 +518,30 @@ def testNoLaunchingIfDeltaAlreadyMet(self): scaler = ClusterScaler(self.provisioner, self.leader, self.config) # Pretend there is one ignored worker in the cluster self.provisioner.getProvisionedWorkers = MagicMock( - return_value=[Node('127.0.0.1', '127.0.0.1', 'testNode', - datetime.datetime.now().isoformat(), - nodeType=c4_8xlarge, preemptible=True)]) - scaler.ignoredNodes.add('127.0.0.1') + return_value=[ + Node( + "127.0.0.1", + "127.0.0.1", + "testNode", + datetime.datetime.now().isoformat(), + nodeType=c4_8xlarge, + preemptible=True, + ) + ] + ) + scaler.ignoredNodes.add("127.0.0.1") # Exercise the updateClusterSize logic self.provisioner.addNodes = MagicMock() scaler.updateClusterSize({c4_8xlarge: 1}) - self.assertFalse(self.provisioner.addNodes.called, - "addNodes was called when no new nodes were needed") - self.assertEqual(len(scaler.ignoredNodes), 0, - "The scaler didn't unignore an ignored node when " - "scaling up") + self.assertFalse( + self.provisioner.addNodes.called, + "addNodes was called when no new nodes were needed", + ) + self.assertEqual( + len(scaler.ignoredNodes), + 0, + "The scaler didn't unignore an ignored node when " "scaling up", + ) def testBetaInertia(self): # This is really high, but makes things easy to calculate. @@ -466,25 +569,29 @@ def test_overhead_accounting_large(self): # If the job needs 100% of the memory of the instance type, it won't # fit and will need a bigger node. - self._check_job_estimate([(c4_8xlarge, 0), (r3_8xlarge, 1)], memory=h2b('60G')) + self._check_job_estimate([(c4_8xlarge, 0), (r3_8xlarge, 1)], memory=h2b("60G")) # If the job needs 98% of the memory of the instance type, it won't # fit and will need a bigger node. - self._check_job_estimate([(c4_8xlarge, 0), (r3_8xlarge, 1)], memory=int(h2b('60G') * 0.98)) + self._check_job_estimate( + [(c4_8xlarge, 0), (r3_8xlarge, 1)], memory=int(h2b("60G") * 0.98) + ) # If the job needs 90% of the memory of the instance type, it will fit. - self._check_job_estimate([(c4_8xlarge, 1), (r3_8xlarge, 0)], memory=int(h2b('60G') * 0.90)) + self._check_job_estimate( + [(c4_8xlarge, 1), (r3_8xlarge, 0)], memory=int(h2b("60G") * 0.90) + ) # If the job needs 100% of the disk of the instance type, it won't # fit and will need a bigger node. - self._check_job_estimate([(c4_8xlarge, 0), (r3_8xlarge, 1)], disk=h2b('100G')) + self._check_job_estimate([(c4_8xlarge, 0), (r3_8xlarge, 1)], disk=h2b("100G")) # If the job needs all but 7G of the disk of the instance type, it won't # fit and will need a bigger node. - self._check_job_estimate([(c4_8xlarge, 0), (r3_8xlarge, 1)], disk=h2b('93G')) + self._check_job_estimate([(c4_8xlarge, 0), (r3_8xlarge, 1)], disk=h2b("93G")) # If the job leaves 10% and 10G of the disk free, it fits - self._check_job_estimate([(c4_8xlarge, 1), (r3_8xlarge, 0)], disk=h2b('90G')) + self._check_job_estimate([(c4_8xlarge, 1), (r3_8xlarge, 0)], disk=h2b("90G")) def test_overhead_accounting_small(self): """ @@ -499,11 +606,13 @@ def test_overhead_accounting_small(self): # If the job needs 100% of the memory of the instance type, it won't # fit and will need a bigger node. - self._check_job_estimate([(t2_micro, 0), (r3_8xlarge, 1)], memory=h2b('1G')) + self._check_job_estimate([(t2_micro, 0), (r3_8xlarge, 1)], memory=h2b("1G")) # If the job needs all but 100M of the memory of the instance type, it # won't fit and will need a bigger node. - self._check_job_estimate([(t2_micro, 0), (r3_8xlarge, 1)], memory=h2b('1G') - h2b('100M')) + self._check_job_estimate( + [(t2_micro, 0), (r3_8xlarge, 1)], memory=h2b("1G") - h2b("100M") + ) # If the job needs no more than 90% of the memory on the node *and* # leaves at least 384M free for overhead, we can rely on it fitting on a 1G @@ -512,12 +621,14 @@ def test_overhead_accounting_small(self): Shape( wallTime=3600, cores=1, - memory=h2b('1G') - h2b('384M'), - disk=h2b('2G'), - preemptible=True + memory=h2b("1G") - h2b("384M"), + disk=h2b("2G"), + preemptible=True, ) ] - self._check_job_estimate([(t2_micro, 1), (r3_8xlarge, 0)], memory=h2b('1G') - h2b('384M')) + self._check_job_estimate( + [(t2_micro, 1), (r3_8xlarge, 0)], memory=h2b("1G") - h2b("384M") + ) def test_overhead_accounting_observed(self): """ @@ -536,9 +647,13 @@ def test_overhead_accounting_observed(self): # not clear if Mesos is thinking in actual GB or GiB here. # A 62.5Gi job is sent to the larger node - self._check_job_estimate([(r5_2xlarge, 0), (r5_4xlarge, 1)], memory=h2b('62.5 Gi')) + self._check_job_estimate( + [(r5_2xlarge, 0), (r5_4xlarge, 1)], memory=h2b("62.5 Gi") + ) - def _check_job_estimate(self, nodes: List[Tuple[Shape, int]], cores=1, memory=1, disk=1) -> None: + def _check_job_estimate( + self, nodes: list[tuple[Shape, int]], cores=1, memory=1, disk=1 + ) -> None: """ Make sure that a job with the given requirements, when run on the given nodes, produces the given numbers of them. @@ -553,23 +668,20 @@ def _check_job_estimate(self, nodes: List[Tuple[Shape, int]], cores=1, memory=1, jobs = [ Shape( - wallTime=3600, - cores=cores, - memory=memory, - disk=disk, - preemptible=True + wallTime=3600, cores=cores, memory=memory, disk=disk, preemptible=True ) ] - logger.debug('Try and fit jobs: %s', jobs) + logger.debug("Try and fit jobs: %s", jobs) counts, could_not_fit = scaler.getEstimatedNodeCounts(jobs, defaultdict(int)) for node, count in nodes: seen_count = counts.get(node, 0) if seen_count != count: - logger.error('Saw %s/%s instances of node %s', seen_count, count, node) + logger.error("Saw %s/%s instances of node %s", seen_count, count, node) self.assertEqual(seen_count, count) self.assertEqual(len(could_not_fit), 0) + class ScalerThreadTest(ToilTest): def _testClusterScaling(self, config, numJobs, numPreemptibleJobs, jobShape): """ @@ -587,49 +699,77 @@ def _testClusterScaling(self, config, numJobs, numPreemptibleJobs, jobShape): clusterScaler.start() try: # Add 100 jobs to complete - list(map(lambda x: mock.addJob(jobShape=jobShape), - list(range(numJobs)))) - list(map(lambda x: mock.addJob(jobShape=jobShape, preemptible=True), - list(range(numPreemptibleJobs)))) + list(map(lambda x: mock.addJob(jobShape=jobShape), list(range(numJobs)))) + list( + map( + lambda x: mock.addJob(jobShape=jobShape, preemptible=True), + list(range(numPreemptibleJobs)), + ) + ) # Add some completed jobs for preemptible in (True, False): - if preemptible and numPreemptibleJobs > 0 or not preemptible and numJobs > 0: + if ( + preemptible + and numPreemptibleJobs > 0 + or not preemptible + and numJobs > 0 + ): # Add 1000 random jobs for _ in range(1000): x = mock.getNodeShape(nodeType=jobShape) - iJ = JobDescription(requirements=dict( - memory=random.randrange(1, x.memory), - cores=random.randrange(1, x.cores), - disk=random.randrange(1, x.disk), - preemptible=preemptible), - jobName='testClusterScaling', unitName='') - clusterScaler.addCompletedJob(iJ, random.choice(list(range(1, x.wallTime)))) + iJ = JobDescription( + requirements=dict( + memory=random.randrange(1, x.memory), + cores=random.randrange(1, x.cores), + disk=random.randrange(1, x.disk), + preemptible=preemptible, + ), + jobName="testClusterScaling", + unitName="", + ) + clusterScaler.addCompletedJob( + iJ, random.choice(list(range(1, x.wallTime))) + ) startTime = time.time() # Wait while the cluster processes the jobs - while (mock.getNumberOfJobsIssued(preemptible=False) > 0 - or mock.getNumberOfJobsIssued(preemptible=True) > 0 - or mock.getNumberOfNodes() > 0 or mock.getNumberOfNodes(preemptible=True) > 0): - logger.debug("Running, non-preemptible queue size: %s, non-preemptible workers: %s, " - "preemptible queue size: %s, preemptible workers: %s" % - (mock.getNumberOfJobsIssued(preemptible=False), - mock.getNumberOfNodes(preemptible=False), - mock.getNumberOfJobsIssued(preemptible=True), - mock.getNumberOfNodes(preemptible=True))) + while ( + mock.getNumberOfJobsIssued(preemptible=False) > 0 + or mock.getNumberOfJobsIssued(preemptible=True) > 0 + or mock.getNumberOfNodes() > 0 + or mock.getNumberOfNodes(preemptible=True) > 0 + ): + logger.debug( + "Running, non-preemptible queue size: %s, non-preemptible workers: %s, " + "preemptible queue size: %s, preemptible workers: %s" + % ( + mock.getNumberOfJobsIssued(preemptible=False), + mock.getNumberOfNodes(preemptible=False), + mock.getNumberOfJobsIssued(preemptible=True), + mock.getNumberOfNodes(preemptible=True), + ) + ) clusterScaler.check() time.sleep(0.5) - logger.debug("We waited %s for cluster to finish" % (time.time() - startTime)) + logger.debug( + "We waited %s for cluster to finish" % (time.time() - startTime) + ) finally: clusterScaler.shutdown() mock.shutDown() # Print some info about the autoscaling - logger.debug("Total-jobs: %s: Max-workers: %s, " - "Total-worker-time: %s, Worker-time-per-job: %s" % - (mock.totalJobs, sum(mock.maxWorkers.values()), - mock.totalWorkerTime, - mock.totalWorkerTime // mock.totalJobs if mock.totalJobs > 0 else 0.0)) + logger.debug( + "Total-jobs: %s: Max-workers: %s, " + "Total-worker-time: %s, Worker-time-per-job: %s" + % ( + mock.totalJobs, + sum(mock.maxWorkers.values()), + mock.totalWorkerTime, + mock.totalWorkerTime // mock.totalJobs if mock.totalJobs > 0 else 0.0, + ) + ) @slow def testClusterScaling(self): @@ -640,15 +780,15 @@ def testClusterScaling(self): config = Config() # Make defaults dummy values - config.defaultMemory = h2b('1Gi') + config.defaultMemory = h2b("1Gi") config.defaultCores = 1 - config.defaultDisk = h2b('1Gi') + config.defaultDisk = h2b("1Gi") # No preemptible nodes/jobs config.maxPreemptibleNodes = [] # No preemptible nodes # Non-preemptible parameters - config.nodeTypes = [Shape(20, h2b('10Gi'), 10, h2b('100Gi'), False)] + config.nodeTypes = [Shape(20, h2b("10Gi"), 10, h2b("100Gi"), False)] config.minNodes = [0] config.maxNodes = [10] @@ -657,27 +797,31 @@ def testClusterScaling(self): config.betaInertia = 0.1 config.scaleInterval = 3 - self._testClusterScaling(config, numJobs=100, numPreemptibleJobs=0, - jobShape=Shape(20, h2b('7Gi'), 10, h2b('80Gi'), False)) + self._testClusterScaling( + config, + numJobs=100, + numPreemptibleJobs=0, + jobShape=Shape(20, h2b("7Gi"), 10, h2b("80Gi"), False), + ) @slow def testClusterScalingMultipleNodeTypes(self): - small_node = Shape(20, h2b('5Gi'), 10, h2b('20Gi'), False) - small_job = Shape(20, h2b('3Gi'), 10, h2b('4Gi'), False) - medium_node = Shape(20, h2b('10Gi'), 10, h2b('20Gi'), False) - medium_job = Shape(20, h2b('7Gi'), 10, h2b('4Gi'), False) - large_node = Shape(20, h2b('20Gi'), 10, h2b('20Gi'), False) - large_job = Shape(20, h2b('16Gi'), 10, h2b('4Gi'), False) + small_node = Shape(20, h2b("5Gi"), 10, h2b("20Gi"), False) + small_job = Shape(20, h2b("3Gi"), 10, h2b("4Gi"), False) + medium_node = Shape(20, h2b("10Gi"), 10, h2b("20Gi"), False) + medium_job = Shape(20, h2b("7Gi"), 10, h2b("4Gi"), False) + large_node = Shape(20, h2b("20Gi"), 10, h2b("20Gi"), False) + large_job = Shape(20, h2b("16Gi"), 10, h2b("4Gi"), False) numJobs = 100 config = Config() # Make defaults dummy values - config.defaultMemory = h2b('1Gi') + config.defaultMemory = h2b("1Gi") config.defaultCores = 1 - config.defaultDisk = h2b('1Gi') + config.defaultDisk = h2b("1Gi") # No preemptible nodes/jobs config.preemptibleNodeTypes = [] @@ -707,12 +851,18 @@ def testClusterScalingMultipleNodeTypes(self): # Add medium completed jobs for i in range(1000): - iJ = JobDescription(requirements=dict( - memory=random.choice(range(small_job.memory, medium_job.memory)), - cores=medium_job.cores, - disk=large_job.disk, - preemptible=False), - jobName='testClusterScaling', unitName='') + iJ = JobDescription( + requirements=dict( + memory=random.choice( + range(small_job.memory, medium_job.memory) + ), + cores=medium_job.cores, + disk=large_job.disk, + preemptible=False, + ), + jobName="testClusterScaling", + unitName="", + ) clusterScaler.addCompletedJob(iJ, random.choice(range(1, 10))) while mock.getNumberOfJobsIssued() > 0 or mock.getNumberOfNodes() > 0: @@ -739,15 +889,15 @@ def testClusterScalingWithPreemptibleJobs(self): """ config = Config() - node_shape = Shape(20, h2b('10Gi'), 10, h2b('20Gi'), False) - preemptible_node_shape = Shape(20, h2b('10Gi'), 10, h2b('20Gi'), True) - job_shape = Shape(20, h2b('7Gi'), 10, h2b('2Gi'), False) - preemptible_job_shape = Shape(20, h2b('7Gi'), 10, h2b('2Gi'), True) + node_shape = Shape(20, h2b("10Gi"), 10, h2b("20Gi"), False) + preemptible_node_shape = Shape(20, h2b("10Gi"), 10, h2b("20Gi"), True) + job_shape = Shape(20, h2b("7Gi"), 10, h2b("2Gi"), False) + preemptible_job_shape = Shape(20, h2b("7Gi"), 10, h2b("2Gi"), True) # Make defaults dummy values - config.defaultMemory = h2b('1Gi') + config.defaultMemory = h2b("1Gi") config.defaultCores = 1 - config.defaultDisk = h2b('1Gi') + config.defaultDisk = h2b("1Gi") # non-preemptible node parameters config.nodeTypes = [node_shape, preemptible_node_shape] @@ -759,13 +909,16 @@ def testClusterScalingWithPreemptibleJobs(self): config.betaInertia = 0.9 config.scaleInterval = 3 - self._testClusterScaling(config, numJobs=100, numPreemptibleJobs=100, jobShape=job_shape) + self._testClusterScaling( + config, numJobs=100, numPreemptibleJobs=100, jobShape=job_shape + ) class MockBatchSystemAndProvisioner(AbstractScalableBatchSystem, AbstractProvisioner): """Mimics a leader, job batcher, provisioner and scalable batch system.""" + def __init__(self, config, secondsPerJob): - super().__init__(clusterName='clusterName', clusterType='mesos') + super().__init__(clusterName="clusterName", clusterType="mesos") # To mimic parallel preemptible and non-preemptible queues # for jobs we create two parallel instances of the following class self.config = config @@ -797,8 +950,8 @@ def shutDown(self): # Stub out all AbstractBatchSystem methods since they are never called for name, value in AbstractBatchSystem.__dict__.items(): - if getattr(value, '__isabstractmethod__', False): - exec('def %s(): pass' % name) + if getattr(value, "__isabstractmethod__", False): + exec("def %s(): pass" % name) # Without this, the class would end up with .name and .value attributes del name, value @@ -813,7 +966,7 @@ def unignoreNode(self, nodeAddress): pass def supportedClusterTypes(self): - return {'mesos'} + return {"mesos"} def createClusterSettings(self): pass @@ -822,7 +975,9 @@ def readClusterSettings(self): pass # AbstractProvisioner methods - def setAutoscaledNodeTypes(self, node_types: List[Tuple[Set[Shape], Optional[float]]]): + def setAutoscaledNodeTypes( + self, node_types: list[tuple[set[Shape], Optional[float]]] + ): self.node_shapes_for_testing = sorted(it for t in node_types for it in t[0]) super().setAutoscaledNodeTypes(node_types) @@ -856,18 +1011,25 @@ def addJob(self, jobShape, preemptible=False): """ self.totalJobs += 1 jobID = uuid.uuid4() - self.jobBatchSystemIDToIssuedJob[jobID] = JobDescription(requirements={"memory": jobShape.memory, - "cores": jobShape.cores, - "disk": jobShape.disk, - "preemptible": preemptible}, - jobName=f'job{self.totalJobs}') + self.jobBatchSystemIDToIssuedJob[jobID] = JobDescription( + requirements={ + "memory": jobShape.memory, + "cores": jobShape.cores, + "disk": jobShape.disk, + "preemptible": preemptible, + }, + jobName=f"job{self.totalJobs}", + ) self.jobQueue.put(jobID) # JobBatcher functionality def getNumberOfJobsIssued(self, preemptible=None): if preemptible is not None: - jobList = [job for job in list(self.jobQueue.queue) if - self.jobBatchSystemIDToIssuedJob[job].preemptible == preemptible] + jobList = [ + job + for job in list(self.jobQueue.queue) + if self.jobBatchSystemIDToIssuedJob[job].preemptible == preemptible + ] return len(jobList) else: return self.jobQueue.qsize() @@ -883,13 +1045,19 @@ def getNodes(self, preemptible: Optional[bool] = False, timeout: int = 600): for node in self.nodesToWorker: if node.preemptible == preemptible: worker = self.nodesToWorker[node] - nodes[node.privateIP] = NodeInfo(coresTotal=0, coresUsed=0, requestedCores=1, - memoryTotal=0, memoryUsed=0, requestedMemory=1, - workers=1 if worker.busyEvent.is_set() else 0) + nodes[node.privateIP] = NodeInfo( + coresTotal=0, + coresUsed=0, + requestedCores=1, + memoryTotal=0, + memoryUsed=0, + requestedMemory=1, + workers=1 if worker.busyEvent.is_set() else 0, + ) return nodes # AbstractProvisioner functionality - def addNodes(self, nodeTypes: Set[str], numNodes, preemptible) -> int: + def addNodes(self, nodeTypes: set[str], numNodes, preemptible) -> int: nodeType = next(iter(nodeTypes)) self._addNodes(numNodes=numNodes, nodeType=nodeType, preemptible=preemptible) return self.getNumberOfNodes(nodeType=nodeType, preemptible=preemptible) @@ -902,8 +1070,17 @@ def getNodeShape(self, nodeType, preemptible=False): def getWorkersInCluster(self, nodeShape): return self.workers[nodeShape] - def launchCluster(self, leaderNodeType, keyName, userTags=None, - vpcSubnet=None, leaderStorage=50, nodeStorage=50, botoPath=None, **kwargs): + def launchCluster( + self, + leaderNodeType, + keyName, + userTags=None, + vpcSubnet=None, + leaderStorage=50, + nodeStorage=50, + botoPath=None, + **kwargs, + ): pass def destroyCluster(self) -> None: @@ -912,7 +1089,6 @@ def destroyCluster(self) -> None: def getLeader(self): pass - def _leaderFn(self): while self.running: updatedJobID = None @@ -955,14 +1131,28 @@ def stop(self): return time.time() - self.startTime for _ in range(numNodes): - node = Node('127.0.0.1', uuid.uuid4(), 'testNode', datetime.datetime.now().isoformat()+'Z', nodeType=nodeType, - preemptible=preemptible) - self.nodesToWorker[node] = Worker(self.jobQueue, self.updatedJobsQueue, self.secondsPerJob) + node = Node( + "127.0.0.1", + uuid.uuid4(), + "testNode", + datetime.datetime.now().isoformat() + "Z", + nodeType=nodeType, + preemptible=preemptible, + ) + self.nodesToWorker[node] = Worker( + self.jobQueue, self.updatedJobsQueue, self.secondsPerJob + ) self.workers[nodeShape].append(self.nodesToWorker[node]) - self.maxWorkers[nodeShape] = max(self.maxWorkers[nodeShape], len(self.workers[nodeShape])) + self.maxWorkers[nodeShape] = max( + self.maxWorkers[nodeShape], len(self.workers[nodeShape]) + ) def _removeNodes(self, nodes): - logger.debug("Removing nodes. %s workers and %s to terminate.", len(self.nodesToWorker), len(nodes)) + logger.debug( + "Removing nodes. %s workers and %s to terminate.", + len(self.nodesToWorker), + len(nodes), + ) for node in nodes: try: nodeShape = self.getNodeShape(node.nodeType, node.preemptible) diff --git a/src/toil/test/provisioners/clusterTest.py b/src/toil/test/provisioners/clusterTest.py index 022c28820f..d6428312fb 100644 --- a/src/toil/test/provisioners/clusterTest.py +++ b/src/toil/test/provisioners/clusterTest.py @@ -15,16 +15,21 @@ import os import subprocess import time - +from typing import Optional from uuid import uuid4 -from typing import Optional, List -from toil.lib.retry import retry -from toil.test import ToilTest, needs_aws_ec2, needs_fetchable_appliance, slow, needs_env_var from toil.lib.aws import zone_to_region from toil.lib.aws.session import AWSConnectionManager +from toil.lib.retry import retry from toil.provisioners import cluster_factory from toil.provisioners.aws import get_best_aws_zone +from toil.test import ( + ToilTest, + needs_aws_ec2, + needs_env_var, + needs_fetchable_appliance, + slow, +) log = logging.getLogger(__name__) @@ -34,31 +39,33 @@ class AbstractClusterTest(ToilTest): def __init__(self, methodName: str) -> None: super().__init__(methodName=methodName) - self.keyName = os.getenv('TOIL_AWS_KEYNAME').strip() or 'id_rsa' - self.clusterName = f'aws-provisioner-test-{uuid4()}' - self.leaderNodeType = 't2.medium' - self.clusterType = 'mesos' + self.keyName = os.getenv("TOIL_AWS_KEYNAME").strip() or "id_rsa" + self.clusterName = f"aws-provisioner-test-{uuid4()}" + self.leaderNodeType = "t2.medium" + self.clusterType = "mesos" self.zone = get_best_aws_zone() - assert self.zone is not None, "Could not determine AWS availability zone to test in; is TOIL_AWS_ZONE set?" + assert ( + self.zone is not None + ), "Could not determine AWS availability zone to test in; is TOIL_AWS_ZONE set?" self.region = zone_to_region(self.zone) # Get connection to AWS self.aws = AWSConnectionManager() # Where should we put our virtualenv? - self.venvDir = '/tmp/venv' + self.venvDir = "/tmp/venv" def python(self) -> str: """ Return the full path to the venv Python on the leader. """ - return os.path.join(self.venvDir, 'bin/python') + return os.path.join(self.venvDir, "bin/python") def pip(self) -> str: """ Return the full path to the venv pip on the leader. """ - return os.path.join(self.venvDir, 'bin/pip') + return os.path.join(self.venvDir, "bin/pip") def destroyCluster(self) -> None: """ @@ -66,7 +73,9 @@ def destroyCluster(self) -> None: Succeeds if the cluster does not currently exist. """ - subprocess.check_call(['toil', 'destroy-cluster', '-p=aws', '-z', self.zone, self.clusterName]) + subprocess.check_call( + ["toil", "destroy-cluster", "-p=aws", "-z", self.zone, self.clusterName] + ) def setUp(self) -> None: """ @@ -82,71 +91,83 @@ def tearDown(self) -> None: # Note that teardown will run even if the test crashes. super().tearDown() self.destroyCluster() - subprocess.check_call(['toil', 'clean', self.jobStore]) + subprocess.check_call(["toil", "clean", self.jobStore]) - def sshUtil(self, command: List[str]) -> None: + def sshUtil(self, command: list[str]) -> None: """ Run the given command on the cluster. Raise subprocess.CalledProcessError if it fails. """ - cmd = ['toil', 'ssh-cluster', '--insecure', '-p=aws', '-z', self.zone, self.clusterName] + command + cmd = [ + "toil", + "ssh-cluster", + "--insecure", + "-p=aws", + "-z", + self.zone, + self.clusterName, + ] + command log.info("Running %s.", str(cmd)) p = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE) # Put in non-blocking mode. See https://stackoverflow.com/a/59291466 os.set_blocking(p.stdout.fileno(), False) os.set_blocking(p.stderr.fileno(), False) - out_buffer = b'' - err_buffer = b'' + out_buffer = b"" + err_buffer = b"" loops_since_line = 0 running = True while running: # While the process is running, see if it stopped - running = (p.poll() is None) + running = p.poll() is None # Also collect its output out_data = p.stdout.read() if out_data: out_buffer += out_data - while out_buffer.find(b'\n') != -1: + while out_buffer.find(b"\n") != -1: # And log every full line - cut = out_buffer.find(b'\n') - log.info('STDOUT: %s', out_buffer[0:cut].decode('utf-8', errors='ignore')) + cut = out_buffer.find(b"\n") + log.info( + "STDOUT: %s", out_buffer[0:cut].decode("utf-8", errors="ignore") + ) loops_since_line = 0 - out_buffer = out_buffer[cut+1:] + out_buffer = out_buffer[cut + 1 :] # Same for the error err_data = p.stderr.read() if err_data: err_buffer += err_data - while err_buffer.find(b'\n') != -1: - cut = err_buffer.find(b'\n') - log.info('STDERR: %s', err_buffer[0:cut].decode('utf-8', errors='ignore')) + while err_buffer.find(b"\n") != -1: + cut = err_buffer.find(b"\n") + log.info( + "STDERR: %s", err_buffer[0:cut].decode("utf-8", errors="ignore") + ) loops_since_line = 0 - err_buffer = err_buffer[cut+1:] + err_buffer = err_buffer[cut + 1 :] loops_since_line += 1 if loops_since_line > 60: - log.debug('...waiting...') + log.debug("...waiting...") loops_since_line = 0 time.sleep(1) # At the end, log the last lines if out_buffer: - log.info('STDOUT: %s', out_buffer.decode('utf-8', errors='ignore')) + log.info("STDOUT: %s", out_buffer.decode("utf-8", errors="ignore")) if err_buffer: - log.info('STDERR: %s', err_buffer.decode('utf-8', errors='ignore')) + log.info("STDERR: %s", err_buffer.decode("utf-8", errors="ignore")) if p.returncode != 0: # It failed log.error("Failed to run %s.", str(cmd)) - raise subprocess.CalledProcessError(p.returncode, ' '.join(cmd)) + raise subprocess.CalledProcessError(p.returncode, " ".join(cmd)) @retry(errors=[subprocess.CalledProcessError], intervals=[1, 1]) def rsync_util(self, from_file: str, to_file: str) -> None: @@ -155,18 +176,38 @@ def rsync_util(self, from_file: str, to_file: str) -> None: The cluster-side path should have a ':' in front of it. """ - cmd = ['toil', 'rsync-cluster', '--insecure', '-p=aws', '-z', self.zone, self.clusterName, from_file, to_file] + cmd = [ + "toil", + "rsync-cluster", + "--insecure", + "-p=aws", + "-z", + self.zone, + self.clusterName, + from_file, + to_file, + ] log.info("Running %s.", str(cmd)) subprocess.check_call(cmd) @retry(errors=[subprocess.CalledProcessError], intervals=[1, 1]) - def createClusterUtil(self, args: Optional[List[str]]=None) -> None: + def createClusterUtil(self, args: Optional[list[str]] = None) -> None: args = [] if args is None else args - command = ['toil', 'launch-cluster', '-p=aws', '-z', self.zone, f'--keyPairName={self.keyName}', - f'--leaderNodeType={self.leaderNodeType}', f'--clusterType={self.clusterType}', '--logDebug', self.clusterName] + args - - log.debug('Launching cluster: %s', command) + command = [ + "toil", + "launch-cluster", + "-p=aws", + "-z", + self.zone, + f"--keyPairName={self.keyName}", + f"--leaderNodeType={self.leaderNodeType}", + f"--clusterType={self.clusterType}", + "--logDebug", + self.clusterName, + ] + args + + log.debug("Launching cluster: %s", command) # Try creating the cluster subprocess.check_call(command) @@ -181,9 +222,10 @@ def launchCluster(self) -> None: @slow class CWLOnARMTest(AbstractClusterTest): """Run the CWL 1.2 conformance tests on ARM specifically.""" + def __init__(self, methodName: str) -> None: super().__init__(methodName=methodName) - self.clusterName = f'cwl-test-{uuid4()}' + self.clusterName = f"cwl-test-{uuid4()}" self.leaderNodeType = "t4g.2xlarge" self.clusterType = "kubernetes" # We need to be running in a directory which Flatcar and the Toil Appliance both have @@ -247,8 +289,5 @@ def test_cwl_on_arm(self) -> None: # Bring it back to be an artifact. self.rsync_util( f":{self.cwl_test_dir}/toil/conformance-1.2.junit.xml", - os.path.join( - self._projectRootPath(), - "arm-conformance-1.2.junit.xml" - ) + os.path.join(self._projectRootPath(), "arm-conformance-1.2.junit.xml"), ) diff --git a/src/toil/test/provisioners/gceProvisionerTest.py b/src/toil/test/provisioners/gceProvisionerTest.py index d3bfb5d764..a2852e0561 100644 --- a/src/toil/test/provisioners/gceProvisionerTest.py +++ b/src/toil/test/provisioners/gceProvisionerTest.py @@ -19,13 +19,15 @@ import pytest -from toil.test import (ToilTest, - integrative, - needs_fetchable_appliance, - needs_google_project, - needs_google_storage, - slow, - timeLimit) +from toil.test import ( + ToilTest, + integrative, + needs_fetchable_appliance, + needs_google_project, + needs_google_storage, + slow, + timeLimit, +) from toil.version import exactPython log = logging.getLogger(__name__) @@ -37,50 +39,62 @@ @needs_fetchable_appliance @slow class AbstractGCEAutoscaleTest(ToilTest): - projectID = os.getenv('TOIL_GOOGLE_PROJECTID') + projectID = os.getenv("TOIL_GOOGLE_PROJECTID") def sshUtil(self, command): - baseCommand = ['toil', 'ssh-cluster', '--insecure', '-p=gce', self.clusterName] + baseCommand = ["toil", "ssh-cluster", "--insecure", "-p=gce", self.clusterName] callCommand = baseCommand + command subprocess.check_call(callCommand) def rsyncUtil(self, src, dest): - baseCommand = ['toil', 'rsync-cluster', '--insecure', '-p=gce', self.clusterName] + baseCommand = [ + "toil", + "rsync-cluster", + "--insecure", + "-p=gce", + self.clusterName, + ] callCommand = baseCommand + [src, dest] subprocess.check_call(callCommand) def destroyClusterUtil(self): - callCommand = ['toil', 'destroy-cluster', '-p=gce', self.clusterName] + callCommand = ["toil", "destroy-cluster", "-p=gce", self.clusterName] subprocess.check_call(callCommand) def createClusterUtil(self, args=None): if args is None: args = [] - callCommand = ['toil', 'launch-cluster', self.clusterName, '-p=gce', '--keyPairName=%s' % self.keyName, - '--leaderNodeType=%s' % self.leaderInstanceType, '--zone=%s' % self.googleZone] + callCommand = [ + "toil", + "launch-cluster", + self.clusterName, + "-p=gce", + "--keyPairName=%s" % self.keyName, + "--leaderNodeType=%s" % self.leaderInstanceType, + "--zone=%s" % self.googleZone, + ] if self.botoDir is not None: - callCommand += ['--boto=%s' % self.botoDir] + callCommand += ["--boto=%s" % self.botoDir] callCommand = callCommand + args if args else callCommand - log.info("createClusterUtil: %s" % ''.join(callCommand)) + log.info("createClusterUtil: %s" % "".join(callCommand)) subprocess.check_call(callCommand) def cleanJobStoreUtil(self): - callCommand = ['toil', 'clean', self.jobStore] + callCommand = ["toil", "clean", self.jobStore] subprocess.check_call(callCommand) def __init__(self, methodName): super().__init__(methodName=methodName) # TODO: add TOIL_GOOGLE_KEYNAME to needs_google_project or ssh with SA account - self.keyName = os.getenv('TOIL_GOOGLE_KEYNAME') + self.keyName = os.getenv("TOIL_GOOGLE_KEYNAME") # TODO: remove this when switching to google jobstore - self.botoDir = os.getenv('TOIL_BOTO_DIR') + self.botoDir = os.getenv("TOIL_BOTO_DIR") # TODO: get this from SA account or add an environment variable - self.googleZone = 'us-west1-a' + self.googleZone = "us-west1-a" - - self.leaderInstanceType = 'n1-standard-1' + self.leaderInstanceType = "n1-standard-1" self.instanceTypes = ["n1-standard-2"] - self.numWorkers = ['2'] + self.numWorkers = ["2"] self.numSamples = 2 self.spotBid = 0.15 @@ -92,7 +106,7 @@ def tearDown(self): self.destroyClusterUtil() self.cleanJobStoreUtil() - #def getMatchingRoles(self, clusterName): + # def getMatchingRoles(self, clusterName): # ctx = AWSProvisioner._buildContext(clusterName) # roles = list(ctx.local_roles()) # return roles @@ -107,7 +121,6 @@ def _getScript(self): """ raise NotImplementedError() - @abstractmethod def _runScript(self, toilOptions): """ @@ -127,82 +140,103 @@ def _test(self, preemptibleJobs=False): self.launchCluster() # TODO: What is the point of this test? - #assert len(self.getMatchingRoles(self.clusterName)) == 1 + # assert len(self.getMatchingRoles(self.clusterName)) == 1 # TODO: Add a check of leader and node storage size if set. # --never-download prevents silent upgrades to pip, wheel and setuptools - venv_command = ['virtualenv', '--system-site-packages', '--never-download', - '--python', exactPython, '/home/venv'] + venv_command = [ + "virtualenv", + "--system-site-packages", + "--never-download", + "--python", + exactPython, + "/home/venv", + ] self.sshUtil(venv_command) self._getScript() - toilOptions = [self.jobStore, - '--batchSystem=mesos', - '--workDir=/var/lib/toil', - '--clean=always', - '--retryCount=2', - '--clusterStats=/home/', - '--logDebug', - '--logFile=/home/sort.log', - '--provisioner=gce'] - - toilOptions.extend(['--nodeTypes=' + ",".join(self.instanceTypes), - '--maxNodes=%s' % ",".join(self.numWorkers)]) + toilOptions = [ + self.jobStore, + "--batchSystem=mesos", + "--workDir=/var/lib/toil", + "--clean=always", + "--retryCount=2", + "--clusterStats=/home/", + "--logDebug", + "--logFile=/home/sort.log", + "--provisioner=gce", + ] + + toilOptions.extend( + [ + "--nodeTypes=" + ",".join(self.instanceTypes), + "--maxNodes=%s" % ",".join(self.numWorkers), + ] + ) if preemptibleJobs: - toilOptions.extend(['--defaultPreemptible']) + toilOptions.extend(["--defaultPreemptible"]) self._runScript(toilOptions) - #TODO: Does this just check if it is still running? - #assert len(self.getMatchingRoles(self.clusterName)) == 1 + # TODO: Does this just check if it is still running? + # assert len(self.getMatchingRoles(self.clusterName)) == 1 - checkStatsCommand = ['/home/venv/bin/python', '-c', - 'import json; import os; ' - 'json.load(open("/home/" + [f for f in os.listdir("/home/") ' - 'if f.endswith(".json")].pop()))' - ] + checkStatsCommand = [ + "/home/venv/bin/python", + "-c", + "import json; import os; " + 'json.load(open("/home/" + [f for f in os.listdir("/home/") ' + 'if f.endswith(".json")].pop()))', + ] self.sshUtil(checkStatsCommand) - # TODO: Add a check to make sure everything is cleaned up. - @pytest.mark.timeout(1600) class GCEAutoscaleTest(AbstractGCEAutoscaleTest): def __init__(self, name): super().__init__(name) - self.clusterName = 'provisioner-test-' + str(uuid4()) + self.clusterName = "provisioner-test-" + str(uuid4()) self.requestedLeaderStorage = 80 def setUp(self): super().setUp() - self.jobStore = f'google:{self.projectID}:autoscale-{uuid4()}' + self.jobStore = f"google:{self.projectID}:autoscale-{uuid4()}" def _getScript(self): # TODO: Isn't this the key file? fileToSort = os.path.join(os.getcwd(), str(uuid4())) - with open(fileToSort, 'w') as f: + with open(fileToSort, "w") as f: # Fixme: making this file larger causes the test to hang - f.write('01234567890123456789012345678901') - self.rsyncUtil(os.path.join(self._projectRootPath(), 'src/toil/test/sort/sort.py'), ':/home/sort.py') - self.rsyncUtil(fileToSort, ':/home/sortFile') + f.write("01234567890123456789012345678901") + self.rsyncUtil( + os.path.join(self._projectRootPath(), "src/toil/test/sort/sort.py"), + ":/home/sort.py", + ) + self.rsyncUtil(fileToSort, ":/home/sortFile") os.unlink(fileToSort) def _runScript(self, toilOptions): - runCommand = ['/home/venv/bin/python', '/home/sort.py', '--fileToSort=/home/sortFile'] + runCommand = [ + "/home/venv/bin/python", + "/home/sort.py", + "--fileToSort=/home/sortFile", + ] #'--sseKey=/home/sortFile'] runCommand.extend(toilOptions) - log.info("_runScript: %s" % ''.join(runCommand)) + log.info("_runScript: %s" % "".join(runCommand)) self.sshUtil(runCommand) def launchCluster(self): # add arguments to test that we can specify leader storage - self.createClusterUtil(args=['--leaderStorage', str(self.requestedLeaderStorage)]) + self.createClusterUtil( + args=["--leaderStorage", str(self.requestedLeaderStorage)] + ) # TODO: aren't these checks inherited? @integrative @@ -210,7 +244,7 @@ def launchCluster(self): @needs_google_storage def testAutoScale(self): self.instanceTypes = ["n1-standard-2"] - self.numWorkers = ['2'] + self.numWorkers = ["2"] self._test() @integrative @@ -220,7 +254,7 @@ def testSpotAutoScale(self): self.instanceTypes = ["n1-standard-2:%f" % self.spotBid] # Some spot workers have a stopped state after being started, strangely. # This could be the natural preemption process, but it seems too rapid. - self.numWorkers = ['3'] # Try 3 to account for a stopped node. + self.numWorkers = ["3"] # Try 3 to account for a stopped node. self._test(preemptibleJobs=True) @@ -229,35 +263,49 @@ class GCEStaticAutoscaleTest(GCEAutoscaleTest): """ Runs the tests on a statically provisioned cluster with autoscaling enabled. """ + def __init__(self, name): super().__init__(name) self.requestedNodeStorage = 20 def launchCluster(self): - self.createClusterUtil(args=['--leaderStorage', str(self.requestedLeaderStorage), - '--nodeTypes', ",".join(self.instanceTypes), '-w', ",".join(self.numWorkers), - '--nodeStorage', str(self.requestedLeaderStorage)]) + self.createClusterUtil( + args=[ + "--leaderStorage", + str(self.requestedLeaderStorage), + "--nodeTypes", + ",".join(self.instanceTypes), + "-w", + ",".join(self.numWorkers), + "--nodeStorage", + str(self.requestedLeaderStorage), + ] + ) # TODO: check the number of workers and their storage - #nodes = AWSProvisioner._getNodesInCluster(ctx, self.clusterName, both=True) - #nodes.sort(key=lambda x: x.launch_time) + # nodes = AWSProvisioner._getNodesInCluster(ctx, self.clusterName, both=True) + # nodes.sort(key=lambda x: x.launch_time) # assuming that leader is first - #workers = nodes[1:] + # workers = nodes[1:] # test that two worker nodes were created - #self.assertEqual(2, len(workers)) + # self.assertEqual(2, len(workers)) # test that workers have expected storage size # just use the first worker - #worker = workers[0] - #worker = next(wait_instances_running(ctx.ec2, [worker])) - #rootBlockDevice = worker.block_device_mapping["/dev/xvda"] - #self.assertTrue(isinstance(rootBlockDevice, BlockDeviceType)) - #rootVolume = ctx.ec2.get_all_volumes(volume_ids=[rootBlockDevice.volume_id])[0] - #self.assertGreaterEqual(rootVolume.size, self.requestedNodeStorage) + # worker = workers[0] + # worker = next(wait_instances_running(ctx.ec2, [worker])) + # rootBlockDevice = worker.block_device_mapping["/dev/xvda"] + # self.assertTrue(isinstance(rootBlockDevice, BlockDeviceType)) + # rootVolume = ctx.ec2.get_all_volumes(volume_ids=[rootBlockDevice.volume_id])[0] + # self.assertGreaterEqual(rootVolume.size, self.requestedNodeStorage) def _runScript(self, toilOptions): - runCommand = ['/home/venv/bin/python', '/home/sort.py', '--fileToSort=/home/sortFile'] + runCommand = [ + "/home/venv/bin/python", + "/home/sort.py", + "--fileToSort=/home/sortFile", + ] runCommand.extend(toilOptions) - log.info("_runScript: %s" % ''.join(runCommand)) + log.info("_runScript: %s" % "".join(runCommand)) self.sshUtil(runCommand) @@ -266,28 +314,37 @@ class GCEAutoscaleTestMultipleNodeTypes(AbstractGCEAutoscaleTest): def __init__(self, name): super().__init__(name) - self.clusterName = 'provisioner-test-' + str(uuid4()) + self.clusterName = "provisioner-test-" + str(uuid4()) def setUp(self): super().setUp() - self.jobStore = f'google:{self.projectID}:multinode-{uuid4()}' + self.jobStore = f"google:{self.projectID}:multinode-{uuid4()}" def _getScript(self): - sseKeyFile = os.path.join(os.getcwd(), 'keyFile') - with open(sseKeyFile, 'w') as f: - f.write('01234567890123456789012345678901') - self.rsyncUtil(os.path.join(self._projectRootPath(), 'src/toil/test/sort/sort.py'), ':/home/sort.py') - self.rsyncUtil(sseKeyFile, ':/home/keyFile') + sseKeyFile = os.path.join(os.getcwd(), "keyFile") + with open(sseKeyFile, "w") as f: + f.write("01234567890123456789012345678901") + self.rsyncUtil( + os.path.join(self._projectRootPath(), "src/toil/test/sort/sort.py"), + ":/home/sort.py", + ) + self.rsyncUtil(sseKeyFile, ":/home/keyFile") os.unlink(sseKeyFile) def _runScript(self, toilOptions): - #Set memory requirements so that sort jobs can be run + # Set memory requirements so that sort jobs can be run # on small instances, but merge jobs must be run on large # instances - runCommand = ['/home/venv/bin/python', '/home/sort.py', '--fileToSort=/home/s3am/bin/asadmin', '--sortMemory=0.6G', '--mergeMemory=3.0G'] + runCommand = [ + "/home/venv/bin/python", + "/home/sort.py", + "--fileToSort=/home/s3am/bin/asadmin", + "--sortMemory=0.6G", + "--mergeMemory=3.0G", + ] runCommand.extend(toilOptions) - #runCommand.append('--sseKey=/home/keyFile') - log.info("_runScript: %s" % ''.join(runCommand)) + # runCommand.append('--sseKey=/home/keyFile') + log.info("_runScript: %s" % "".join(runCommand)) self.sshUtil(runCommand) @integrative @@ -295,9 +352,10 @@ def _runScript(self, toilOptions): @needs_google_storage def testAutoScale(self): self.instanceTypes = ["n1-standard-2", "n1-standard-4"] - self.numWorkers = ['2','1'] + self.numWorkers = ["2", "1"] self._test() + @pytest.mark.timeout(1800) class GCERestartTest(AbstractGCEAutoscaleTest): """ @@ -306,37 +364,53 @@ class GCERestartTest(AbstractGCEAutoscaleTest): def __init__(self, name): super().__init__(name) - self.clusterName = 'restart-test-' + str(uuid4()) + self.clusterName = "restart-test-" + str(uuid4()) def setUp(self): super().setUp() - self.instanceTypes = ['n1-standard-1'] - self.numWorkers = ['1'] + self.instanceTypes = ["n1-standard-1"] + self.numWorkers = ["1"] self.scriptName = "/home/restartScript.py" # TODO: replace this with a google job store - zone = 'us-west-2' - self.jobStore = f'google:{self.projectID}:restart-{uuid4()}' + zone = "us-west-2" + self.jobStore = f"google:{self.projectID}:restart-{uuid4()}" def _getScript(self): - self.rsyncUtil(os.path.join(self._projectRootPath(), 'src/toil/test/provisioners/restartScript.py'), - ':'+self.scriptName) - + self.rsyncUtil( + os.path.join( + self._projectRootPath(), "src/toil/test/provisioners/restartScript.py" + ), + ":" + self.scriptName, + ) def _runScript(self, toilOptions): # clean = onSuccess - disallowedOptions = ['--clean=always', '--retryCount=2'] - newOptions = [option for option in toilOptions if option not in disallowedOptions] + disallowedOptions = ["--clean=always", "--retryCount=2"] + newOptions = [ + option for option in toilOptions if option not in disallowedOptions + ] try: # include a default memory - on restart the minimum memory requirement is the default, usually 2 GB - command = ['/home/venv/bin/python', self.scriptName, '-e', 'FAIL=true', '--defaultMemory=50000000'] + command = [ + "/home/venv/bin/python", + self.scriptName, + "-e", + "FAIL=true", + "--defaultMemory=50000000", + ] command.extend(newOptions) self.sshUtil(command) except subprocess.CalledProcessError: pass else: - self.fail('Command succeeded when we expected failure') + self.fail("Command succeeded when we expected failure") with timeLimit(1200): - command = ['/home/venv/bin/python', self.scriptName, '--restart', '--defaultMemory=50000000'] + command = [ + "/home/venv/bin/python", + self.scriptName, + "--restart", + "--defaultMemory=50000000", + ] command.extend(toilOptions) self.sshUtil(command) diff --git a/src/toil/test/provisioners/provisionerTest.py b/src/toil/test/provisioners/provisionerTest.py index 32a710d932..5add60c8d2 100644 --- a/src/toil/test/provisioners/provisionerTest.py +++ b/src/toil/test/provisioners/provisionerTest.py @@ -25,21 +25,33 @@ class ProvisionerTest(ToilTest): def test_node_type_parsing(self) -> None: assert parse_node_types(None) == [] - assert parse_node_types('') == [] - assert parse_node_types('red beans') == [({'red beans'}, None)] - assert parse_node_types('red beans,rice') == [({'red beans'}, None), ({'rice'}, None)] - assert parse_node_types('red beans/black beans,rice') == [({'red beans', 'black beans'}, None), ({'rice'}, None)] - assert parse_node_types('frankfurters:0.05') == [({'frankfurters'}, 0.05)] - assert parse_node_types('red beans/black beans:999,rice,red beans/black beans') == [({'red beans', 'black beans'}, 999), ({'rice'}, None), ({'red beans', 'black beans'}, None)] + assert parse_node_types("") == [] + assert parse_node_types("red beans") == [({"red beans"}, None)] + assert parse_node_types("red beans,rice") == [ + ({"red beans"}, None), + ({"rice"}, None), + ] + assert parse_node_types("red beans/black beans,rice") == [ + ({"red beans", "black beans"}, None), + ({"rice"}, None), + ] + assert parse_node_types("frankfurters:0.05") == [({"frankfurters"}, 0.05)] + assert parse_node_types( + "red beans/black beans:999,rice,red beans/black beans" + ) == [ + ({"red beans", "black beans"}, 999), + ({"rice"}, None), + ({"red beans", "black beans"}, None), + ] with pytest.raises(ValueError): - parse_node_types('your thoughts:penny') + parse_node_types("your thoughts:penny") with pytest.raises(ValueError) as err: - parse_node_types(',,,') - assert 'empty' in str(err.value) + parse_node_types(",,,") + assert "empty" in str(err.value) with pytest.raises(ValueError): - parse_node_types('now hear this:') + parse_node_types("now hear this:") with pytest.raises(ValueError) as err: - parse_node_types('miles I will walk:500:500') - assert 'multiple' in str(err.value) + parse_node_types("miles I will walk:500:500") + assert "multiple" in str(err.value) with pytest.raises(ValueError): - parse_node_types('red beans:500/black beans:500,rice') + parse_node_types("red beans:500/black beans:500,rice") diff --git a/src/toil/test/provisioners/restartScript.py b/src/toil/test/provisioners/restartScript.py index 462c741491..2e352ae3eb 100644 --- a/src/toil/test/provisioners/restartScript.py +++ b/src/toil/test/provisioners/restartScript.py @@ -6,12 +6,13 @@ def f0(job): - if 'FAIL' in os.environ: - raise RuntimeError('failed on purpose') + if "FAIL" in os.environ: + raise RuntimeError("failed on purpose") -if __name__ == '__main__': + +if __name__ == "__main__": parser = ArgumentParser() Job.Runner.addToilOptions(parser) options = parser.parse_args() - rootJob = Job.wrapJobFn(f0, cores=0.5, memory='50 M', disk='50 M') + rootJob = Job.wrapJobFn(f0, cores=0.5, memory="50 M", disk="50 M") Job.Runner.startToil(rootJob, options) diff --git a/src/toil/test/server/serverTest.py b/src/toil/test/server/serverTest.py index b3688552d4..d77241a9d6 100644 --- a/src/toil/test/server/serverTest.py +++ b/src/toil/test/server/serverTest.py @@ -38,6 +38,7 @@ logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) + @needs_server class ToilServerUtilsTest(ToilTest): """ @@ -72,6 +73,7 @@ def test_workflow_canceling_recovery(self): # Make sure it is now CANCELED due to timeout self.assertEqual(state_machine.get_current_state(), "CANCELED") + class hidden: # Hide abstract tests from the test loader @@ -91,7 +93,6 @@ def get_state_store(self) -> AbstractStateStore: raise NotImplementedError() - def test_state_store(self) -> None: """ Make sure that the state store under test can store and load keys. @@ -100,45 +101,46 @@ def test_state_store(self) -> None: store = self.get_state_store() # Should start None - self.assertEqual(store.get('id1', 'key1'), None) + self.assertEqual(store.get("id1", "key1"), None) # Should hold a value - store.set('id1', 'key1', 'value1') - self.assertEqual(store.get('id1', 'key1'), 'value1') + store.set("id1", "key1", "value1") + self.assertEqual(store.get("id1", "key1"), "value1") # Should distinguish by ID and key - self.assertEqual(store.get('id2', 'key1'), None) - self.assertEqual(store.get('id1', 'key2'), None) + self.assertEqual(store.get("id2", "key1"), None) + self.assertEqual(store.get("id1", "key2"), None) - store.set('id2', 'key1', 'value2') - store.set('id1', 'key2', 'value3') - self.assertEqual(store.get('id1', 'key1'), 'value1') - self.assertEqual(store.get('id2', 'key1'), 'value2') - self.assertEqual(store.get('id1', 'key2'), 'value3') + store.set("id2", "key1", "value2") + store.set("id1", "key2", "value3") + self.assertEqual(store.get("id1", "key1"), "value1") + self.assertEqual(store.get("id2", "key1"), "value2") + self.assertEqual(store.get("id1", "key2"), "value3") # Should allow replacement - store.set('id1', 'key1', 'value4') - self.assertEqual(store.get('id1', 'key1'), 'value4') - self.assertEqual(store.get('id2', 'key1'), 'value2') - self.assertEqual(store.get('id1', 'key2'), 'value3') + store.set("id1", "key1", "value4") + self.assertEqual(store.get("id1", "key1"), "value4") + self.assertEqual(store.get("id2", "key1"), "value2") + self.assertEqual(store.get("id1", "key2"), "value3") # Should show up in another state store store2 = self.get_state_store() - self.assertEqual(store2.get('id1', 'key1'), 'value4') - self.assertEqual(store2.get('id2', 'key1'), 'value2') - self.assertEqual(store2.get('id1', 'key2'), 'value3') + self.assertEqual(store2.get("id1", "key1"), "value4") + self.assertEqual(store2.get("id2", "key1"), "value2") + self.assertEqual(store2.get("id1", "key2"), "value3") # Should allow clearing - store.set('id1', 'key1', None) - self.assertEqual(store.get('id1', 'key1'), None) - self.assertEqual(store.get('id2', 'key1'), 'value2') - self.assertEqual(store.get('id1', 'key2'), 'value3') + store.set("id1", "key1", None) + self.assertEqual(store.get("id1", "key1"), None) + self.assertEqual(store.get("id2", "key1"), "value2") + self.assertEqual(store.get("id1", "key2"), "value3") + + store.set("id2", "key1", None) + store.set("id1", "key2", None) + self.assertEqual(store.get("id1", "key1"), None) + self.assertEqual(store.get("id2", "key1"), None) + self.assertEqual(store.get("id1", "key2"), None) - store.set('id2', 'key1', None) - store.set('id1', 'key2', None) - self.assertEqual(store.get('id1', 'key1'), None) - self.assertEqual(store.get('id2', 'key1'), None) - self.assertEqual(store.get('id1', 'key2'), None) class FileStateStoreTest(hidden.AbstractStateStoreTest): """ @@ -160,6 +162,7 @@ def get_state_store(self) -> AbstractStateStore: return FileStateStore(self.state_store_dir) + class FileStateStoreURLTest(hidden.AbstractStateStoreTest): """ Test file-based state storage using URLs instead of local paths. @@ -169,7 +172,7 @@ class FileStateStoreURLTest(hidden.AbstractStateStoreTest): def setUp(self) -> None: super().setUp() - self.state_store_dir = 'file://' + self._createTempDir() + self.state_store_dir = "file://" + self._createTempDir() def get_state_store(self) -> AbstractStateStore: """ @@ -180,6 +183,7 @@ def get_state_store(self) -> AbstractStateStore: return FileStateStore(self.state_store_dir) + @needs_aws_s3 class BucketUsingTest(ToilTest): """ @@ -191,8 +195,8 @@ class BucketUsingTest(ToilTest): from mypy_boto3_s3.service_resource import Bucket region: Optional[str] - s3_resource: Optional['S3ServiceResource'] - bucket: Optional['Bucket'] + s3_resource: Optional["S3ServiceResource"] + bucket: Optional["Bucket"] bucket_name: Optional[str] @classmethod @@ -215,10 +219,12 @@ def setUpClass(cls) -> None: @classmethod def tearDownClass(cls) -> None: from toil.lib.aws.utils import delete_s3_bucket + if cls.bucket_name: delete_s3_bucket(cls.s3_resource, cls.bucket_name, cls.region) super().tearDownClass() + class AWSStateStoreTest(hidden.AbstractStateStoreTest, BucketUsingTest): """Test AWS-based state storage.""" @@ -233,7 +239,7 @@ def get_state_store(self) -> AbstractStateStore: from toil.server.utils import S3StateStore - return S3StateStore('s3://' + self.bucket_name + '/' + self.bucket_path) + return S3StateStore("s3://" + self.bucket_name + "/" + self.bucket_path) def test_state_store_paths(self) -> None: """ @@ -248,16 +254,18 @@ def test_state_store_paths(self) -> None: store = self.get_state_store() # Should hold a value - store.set('testid', 'testkey', 'testvalue') - self.assertEqual(store.get('testid', 'testkey'), 'testvalue') - - expected_url = urlparse('s3://' + self.bucket_name + '/' + - os.path.join(self.bucket_path, 'testid', 'testkey')) + store.set("testid", "testkey", "testvalue") + self.assertEqual(store.get("testid", "testkey"), "testvalue") + + expected_url = urlparse( + "s3://" + + self.bucket_name + + "/" + + os.path.join(self.bucket_path, "testid", "testkey") + ) obj = get_object_for_url(expected_url, True) - self.assertEqual(obj.content_length, len('testvalue')) - - + self.assertEqual(obj.content_length, len("testvalue")) @needs_server @@ -281,8 +289,11 @@ def setUp(self) -> None: self.temp_dir = self._createTempDir() from toil.server.app import create_app, parser_with_server_options + parser = parser_with_server_options() - args = parser.parse_args(self._server_args + ["--work_dir", os.path.join(self.temp_dir, "workflows")]) + args = parser.parse_args( + self._server_args + ["--work_dir", os.path.join(self.temp_dir, "workflows")] + ) # Make the FlaskApp server_app = create_app(args) @@ -291,7 +302,8 @@ def setUp(self) -> None: self.app: Flask = server_app.app self.app.testing = True - self.example_cwl = textwrap.dedent(""" + self.example_cwl = textwrap.dedent( + """ cwlVersion: v1.0 class: CommandLineTool baseCommand: echo @@ -304,9 +316,11 @@ def setUp(self) -> None: outputs: output: type: stdout - """) + """ + ) - self.slow_cwl = textwrap.dedent(""" + self.slow_cwl = textwrap.dedent( + """ cwlVersion: v1.0 class: CommandLineTool baseCommand: sleep @@ -319,7 +333,8 @@ def setUp(self) -> None: outputs: output: type: stdout - """) + """ + ) def tearDown(self) -> None: super().tearDown() @@ -339,7 +354,7 @@ def _check_successful_log(self, client: "FlaskClient", run_id: str) -> None: The workflow should succeed, it should have some tasks, and they should have all succeeded. """ rv = self._fetch_run_log(client, run_id) - logger.debug('Log info: %s', rv.json) + logger.debug("Log info: %s", rv.json) run_log = rv.json.get("run_log") self.assertEqual(type(run_log), dict) if "exit_code" in run_log: @@ -359,7 +374,7 @@ def _check_successful_log(self, client: "FlaskClient", run_id: str) -> None: def _report_log(self, client: "FlaskClient", run_id: str) -> None: """Report the log for the given workflow run.""" rv = self._fetch_run_log(client, run_id) - logger.debug(f'Report log response: {rv.json}') + logger.debug(f"Report log response: {rv.json}") run_log = rv.json.get("run_log") self.assertEqual(type(run_log), dict) self.assertEqual(type(run_log.get("stdout")), str) @@ -380,21 +395,24 @@ def _report_absolute_url(self, client: "FlaskClient", url: str): logger.info("Fetch %s", url) rv = client.get(url) self.assertEqual(rv.status_code, 200) - logger.info("Got %s:\n%s", url, rv.data.decode('utf-8')) + logger.info("Got %s:\n%s", url, rv.data.decode("utf-8")) def _start_slow_workflow(self, client: "FlaskClient") -> str: """ Start a slow workflow and return its ID. """ - rv = client.post("/ga4gh/wes/v1/runs", data={ - "workflow_url": "slow.cwl", - "workflow_type": "CWL", - "workflow_type_version": "v1.0", - "workflow_params": json.dumps({"delay": "5"}), - "workflow_attachment": [ - (BytesIO(self.slow_cwl.encode()), "slow.cwl"), - ], - }) + rv = client.post( + "/ga4gh/wes/v1/runs", + data={ + "workflow_url": "slow.cwl", + "workflow_type": "CWL", + "workflow_type_version": "v1.0", + "workflow_params": json.dumps({"delay": "5"}), + "workflow_attachment": [ + (BytesIO(self.slow_cwl.encode()), "slow.cwl"), + ], + }, + ) # workflow is submitted successfully self.assertEqual(rv.status_code, 200) self.assertTrue(rv.is_json) @@ -415,16 +433,30 @@ def _poll_status(self, client: "FlaskClient", run_id: str) -> str: self.assertEqual(rv.json.get("run_id"), run_id) self.assertIn("state", rv.json) state = rv.json.get("state") - self.assertIn(state, ["UNKNOWN", "QUEUED", "INITIALIZING", "RUNNING", - "PAUSED", "COMPLETE", "EXECUTOR_ERROR", "SYSTEM_ERROR", - "CANCELED", "CANCELING"]) + self.assertIn( + state, + [ + "UNKNOWN", + "QUEUED", + "INITIALIZING", + "RUNNING", + "PAUSED", + "COMPLETE", + "EXECUTOR_ERROR", + "SYSTEM_ERROR", + "CANCELED", + "CANCELING", + ], + ) return state def _cancel_workflow(self, client: "FlaskClient", run_id: str) -> None: rv = client.post(f"/ga4gh/wes/v1/runs/{run_id}/cancel") self.assertEqual(rv.status_code, 200) - def _wait_for_status(self, client: "FlaskClient", run_id: str, target_status: str) -> None: + def _wait_for_status( + self, client: "FlaskClient", run_id: str, target_status: str + ) -> None: """ Wait for the given workflow run to reach the given state. If it reaches a different terminal state, raise an exception. @@ -457,19 +489,19 @@ class ToilWESServerBenchTest(AbstractToilWESServerTest): """ def test_home(self) -> None: - """ Test the homepage endpoint.""" + """Test the homepage endpoint.""" with self.app.test_client() as client: rv = client.get("/") self.assertEqual(rv.status_code, 302) def test_health(self) -> None: - """ Test the health check endpoint.""" + """Test the health check endpoint.""" with self.app.test_client() as client: rv = client.get("/engine/v1/status") self.assertEqual(rv.status_code, 200) def test_get_service_info(self) -> None: - """ Test the GET /service-info endpoint.""" + """Test the GET /service-info endpoint.""" with self.app.test_client() as client: rv = client.get("/ga4gh/wes/v1/service-info") self.assertEqual(rv.status_code, 200) @@ -487,12 +519,15 @@ def test_get_service_info(self) -> None: self.assertIn("system_state_counts", service_info) self.assertIn("tags", service_info) + class ToilWESServerWorkflowTest(AbstractToilWESServerTest): """ Tests of the WES server running workflows. """ - def run_zip_workflow(self, zip_path: str, include_message: bool = True, include_params: bool = True) -> None: + def run_zip_workflow( + self, zip_path: str, include_message: bool = True, include_params: bool = True + ) -> None: """ We have several zip file tests; this submits a zip file and makes sure it ran OK. @@ -505,11 +540,13 @@ def run_zip_workflow(self, zip_path: str, include_message: bool = True, include_ post_data = { "workflow_url": "file://" + zip_path, "workflow_type": "CWL", - "workflow_type_version": "v1.0" + "workflow_type_version": "v1.0", } if include_params or include_message: # We need workflow_params too - post_data["workflow_params"] = json.dumps({"message": "Hello, world!"} if include_message else {}) + post_data["workflow_params"] = json.dumps( + {"message": "Hello, world!"} if include_message else {} + ) with self.app.test_client() as client: rv = client.post("/ga4gh/wes/v1/runs", data=post_data) # workflow is submitted successfully @@ -526,28 +563,37 @@ def run_zip_workflow(self, zip_path: str, include_message: bool = True, include_ def test_run_workflow_relative_url_no_attachments_fails(self) -> None: """Test run example CWL workflow from relative workflow URL but with no attachments.""" with self.app.test_client() as client: - rv = client.post("/ga4gh/wes/v1/runs", data={ - "workflow_url": "example.cwl", - "workflow_type": "CWL", - "workflow_type_version": "v1.0", - "workflow_params": "{}" - }) + rv = client.post( + "/ga4gh/wes/v1/runs", + data={ + "workflow_url": "example.cwl", + "workflow_type": "CWL", + "workflow_type_version": "v1.0", + "workflow_params": "{}", + }, + ) self.assertEqual(rv.status_code, 400) self.assertTrue(rv.is_json) - self.assertEqual(rv.json.get("msg"), "Relative 'workflow_url' but missing 'workflow_attachment'") + self.assertEqual( + rv.json.get("msg"), + "Relative 'workflow_url' but missing 'workflow_attachment'", + ) def test_run_workflow_relative_url(self) -> None: """Test run example CWL workflow from relative workflow URL.""" with self.app.test_client() as client: - rv = client.post("/ga4gh/wes/v1/runs", data={ - "workflow_url": "example.cwl", - "workflow_type": "CWL", - "workflow_type_version": "v1.0", - "workflow_params": json.dumps({"message": "Hello, world!"}), - "workflow_attachment": [ - (BytesIO(self.example_cwl.encode()), "example.cwl"), - ], - }) + rv = client.post( + "/ga4gh/wes/v1/runs", + data={ + "workflow_url": "example.cwl", + "workflow_type": "CWL", + "workflow_type_version": "v1.0", + "workflow_params": json.dumps({"message": "Hello, world!"}), + "workflow_attachment": [ + (BytesIO(self.example_cwl.encode()), "example.cwl"), + ], + }, + ) # workflow is submitted successfully self.assertEqual(rv.status_code, 200) self.assertTrue(rv.is_json) @@ -560,12 +606,15 @@ def test_run_workflow_relative_url(self) -> None: def test_run_workflow_https_url(self) -> None: """Test run example CWL workflow from the Internet.""" with self.app.test_client() as client: - rv = client.post("/ga4gh/wes/v1/runs", data={ - "workflow_url": "https://raw.githubusercontent.com/DataBiosphere/toil/4cb5bb3871ac21a9793f638b83775926ed94a226/src/toil/test/cwl/echo.cwl", - "workflow_type": "CWL", - "workflow_type_version": "v1.2", - "workflow_params": json.dumps({"message": "Hello, world!"}), - }) + rv = client.post( + "/ga4gh/wes/v1/runs", + data={ + "workflow_url": "https://raw.githubusercontent.com/DataBiosphere/toil/4cb5bb3871ac21a9793f638b83775926ed94a226/src/toil/test/cwl/echo.cwl", + "workflow_type": "CWL", + "workflow_type_version": "v1.2", + "workflow_params": json.dumps({"message": "Hello, world!"}), + }, + ) # workflow is submitted successfully self.assertEqual(rv.status_code, 200) self.assertTrue(rv.is_json) @@ -578,57 +627,63 @@ def test_run_workflow_https_url(self) -> None: def test_run_workflow_single_file_zip(self) -> None: """Test run example CWL workflow from single-file ZIP.""" workdir = self._createTempDir() - zip_path = os.path.abspath(os.path.join(workdir, 'workflow.zip')) - with zipfile.ZipFile(zip_path, 'w') as zip_file: - zip_file.writestr('example.cwl', self.example_cwl) + zip_path = os.path.abspath(os.path.join(workdir, "workflow.zip")) + with zipfile.ZipFile(zip_path, "w") as zip_file: + zip_file.writestr("example.cwl", self.example_cwl) self.run_zip_workflow(zip_path) def test_run_workflow_multi_file_zip(self) -> None: """Test run example CWL workflow from multi-file ZIP.""" workdir = self._createTempDir() - zip_path = os.path.abspath(os.path.join(workdir, 'workflow.zip')) - with zipfile.ZipFile(zip_path, 'w') as zip_file: - zip_file.writestr('main.cwl', self.example_cwl) - zip_file.writestr('distraction.cwl', "Don't mind me") + zip_path = os.path.abspath(os.path.join(workdir, "workflow.zip")) + with zipfile.ZipFile(zip_path, "w") as zip_file: + zip_file.writestr("main.cwl", self.example_cwl) + zip_file.writestr("distraction.cwl", "Don't mind me") self.run_zip_workflow(zip_path) def test_run_workflow_manifest_zip(self) -> None: """Test run example CWL workflow from ZIP with manifest.""" workdir = self._createTempDir() - zip_path = os.path.abspath(os.path.join(workdir, 'workflow.zip')) - with zipfile.ZipFile(zip_path, 'w') as zip_file: - zip_file.writestr('actual.cwl', self.example_cwl) - zip_file.writestr('distraction.cwl', self.example_cwl) - zip_file.writestr('MANIFEST.json', json.dumps({"mainWorkflowURL": "actual.cwl"})) + zip_path = os.path.abspath(os.path.join(workdir, "workflow.zip")) + with zipfile.ZipFile(zip_path, "w") as zip_file: + zip_file.writestr("actual.cwl", self.example_cwl) + zip_file.writestr("distraction.cwl", self.example_cwl) + zip_file.writestr( + "MANIFEST.json", json.dumps({"mainWorkflowURL": "actual.cwl"}) + ) self.run_zip_workflow(zip_path) - def test_run_workflow_inputs_zip(self) -> None: """Test run example CWL workflow from ZIP without manifest but with inputs.""" workdir = self._createTempDir() - zip_path = os.path.abspath(os.path.join(workdir, 'workflow.zip')) - with zipfile.ZipFile(zip_path, 'w') as zip_file: - zip_file.writestr('main.cwl', self.example_cwl) - zip_file.writestr('inputs.json', json.dumps({"message": "Hello, world!"})) + zip_path = os.path.abspath(os.path.join(workdir, "workflow.zip")) + with zipfile.ZipFile(zip_path, "w") as zip_file: + zip_file.writestr("main.cwl", self.example_cwl) + zip_file.writestr("inputs.json", json.dumps({"message": "Hello, world!"})) self.run_zip_workflow(zip_path, include_message=False) def test_run_workflow_manifest_and_inputs_zip(self) -> None: """Test run example CWL workflow from ZIP with manifest and inputs.""" workdir = self._createTempDir() - zip_path = os.path.abspath(os.path.join(workdir, 'workflow.zip')) - with zipfile.ZipFile(zip_path, 'w') as zip_file: - zip_file.writestr('actual.cwl', self.example_cwl) - zip_file.writestr('data.json', json.dumps({"message": "Hello, world!"})) - zip_file.writestr('MANIFEST.json', json.dumps({"mainWorkflowURL": "actual.cwl", "inputFileURLs": ["data.json"]})) + zip_path = os.path.abspath(os.path.join(workdir, "workflow.zip")) + with zipfile.ZipFile(zip_path, "w") as zip_file: + zip_file.writestr("actual.cwl", self.example_cwl) + zip_file.writestr("data.json", json.dumps({"message": "Hello, world!"})) + zip_file.writestr( + "MANIFEST.json", + json.dumps( + {"mainWorkflowURL": "actual.cwl", "inputFileURLs": ["data.json"]} + ), + ) self.run_zip_workflow(zip_path, include_message=False) def test_run_workflow_no_params_zip(self) -> None: """Test run example CWL workflow from ZIP without workflow_params.""" workdir = self._createTempDir() - zip_path = os.path.abspath(os.path.join(workdir, 'workflow.zip')) - with zipfile.ZipFile(zip_path, 'w') as zip_file: - zip_file.writestr('main.cwl', self.example_cwl) - zip_file.writestr('inputs.json', json.dumps({"message": "Hello, world!"})) + zip_path = os.path.abspath(os.path.join(workdir, "workflow.zip")) + with zipfile.ZipFile(zip_path, "w") as zip_file: + zip_file.writestr("main.cwl", self.example_cwl) + zip_file.writestr("inputs.json", json.dumps({"message": "Hello, world!"})) # Don't even bother sending workflow_params self.run_zip_workflow(zip_path, include_message=False, include_params=False) @@ -670,8 +725,10 @@ def test_run_and_cancel_workflows(self) -> None: cancel_seconds = cancel_complete - cancel_sent logger.info("Cancellation took %s seconds to complete", cancel_seconds) from toil.server.wes.tasks import WAIT_FOR_DEATH_TIMEOUT + self.assertLess(cancel_seconds, WAIT_FOR_DEATH_TIMEOUT) + @needs_celery_broker class ToilWESServerCeleryWorkflowTest(ToilWESServerWorkflowTest): """ @@ -685,8 +742,11 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._server_args = [] + @needs_celery_broker -class ToilWESServerCeleryS3StateWorkflowTest(ToilWESServerWorkflowTest, BucketUsingTest): +class ToilWESServerCeleryS3StateWorkflowTest( + ToilWESServerWorkflowTest, BucketUsingTest +): """ Test the server with Celery and state stored in S3. """ @@ -696,5 +756,6 @@ def setUp(self) -> None: self._server_args = ["--state_store", "s3://" + self.bucket_name + "/state"] super().setUp() + if __name__ == "__main__": unittest.main() diff --git a/src/toil/test/sort/restart_sort.py b/src/toil/test/sort/restart_sort.py index c1b40ca9ea..2393507fbd 100644 --- a/src/toil/test/sort/restart_sort.py +++ b/src/toil/test/sort/restart_sort.py @@ -29,7 +29,7 @@ defaultLines = 1000 defaultLineLen = 50 -sortMemory = '600M' +sortMemory = "600M" def setup(job, inputFile, N, downCheckpoints, options): @@ -38,12 +38,16 @@ def setup(job, inputFile, N, downCheckpoints, options): Returns the FileID of the sorted file """ RealtimeLogger.info("Starting the merge sort") - return job.addChildJobFn(down, - inputFile, N, 'root', - downCheckpoints, - options = options, - preemptible=True, - memory=sortMemory).rv() + return job.addChildJobFn( + down, + inputFile, + N, + "root", + downCheckpoints, + options=options, + preemptible=True, + memory=sortMemory, + ).rv() def down(job, inputFileStoreID, N, path, downCheckpoints, options, memory=sortMemory): @@ -61,34 +65,57 @@ def down(job, inputFileStoreID, N, path, downCheckpoints, options, memory=sortMe length = os.path.getsize(inputFile) if length > N: # We will subdivide the file - RealtimeLogger.critical("Splitting file: %s of size: %s" - % (inputFileStoreID, length)) + RealtimeLogger.critical( + "Splitting file: %s of size: %s" % (inputFileStoreID, length) + ) # Split the file into two copies midPoint = getMidPoint(inputFile, 0, length) t1 = job.fileStore.getLocalTempFile() - with open(t1, 'w') as fH: - fH.write(copySubRangeOfFile(inputFile, 0, midPoint+1)) + with open(t1, "w") as fH: + fH.write(copySubRangeOfFile(inputFile, 0, midPoint + 1)) t2 = job.fileStore.getLocalTempFile() - with open(t2, 'w') as fH: - fH.write(copySubRangeOfFile(inputFile, midPoint+1, length)) + with open(t2, "w") as fH: + fH.write(copySubRangeOfFile(inputFile, midPoint + 1, length)) # Call down recursively. By giving the rv() of the two jobs as inputs to the follow-on job, up, # we communicate the dependency without hindering concurrency. - result = job.addFollowOnJobFn(up, - job.addChildJobFn(down, job.fileStore.writeGlobalFile(t1), N, path + '/0', - downCheckpoints, checkpoint=downCheckpoints, options=options, - preemptible=True, memory=options.sortMemory).rv(), - job.addChildJobFn(down, job.fileStore.writeGlobalFile(t2), N, path + '/1', - downCheckpoints, checkpoint=downCheckpoints, options=options, - preemptible=True, memory=options.mergeMemory).rv(), - path + '/up', preemptible=True, options=options, memory=options.sortMemory).rv() + result = job.addFollowOnJobFn( + up, + job.addChildJobFn( + down, + job.fileStore.writeGlobalFile(t1), + N, + path + "/0", + downCheckpoints, + checkpoint=downCheckpoints, + options=options, + preemptible=True, + memory=options.sortMemory, + ).rv(), + job.addChildJobFn( + down, + job.fileStore.writeGlobalFile(t2), + N, + path + "/1", + downCheckpoints, + checkpoint=downCheckpoints, + options=options, + preemptible=True, + memory=options.mergeMemory, + ).rv(), + path + "/up", + preemptible=True, + options=options, + memory=options.sortMemory, + ).rv() else: # We can sort this bit of the file - RealtimeLogger.critical("Sorting file: %s of size: %s" - % (inputFileStoreID, length)) + RealtimeLogger.critical( + "Sorting file: %s of size: %s" % (inputFileStoreID, length) + ) # Sort the copy and write back to the fileStore - shutil.copyfile(inputFile, inputFile + '.sort') - sort(inputFile + '.sort') - result = job.fileStore.writeGlobalFile(inputFile + '.sort') + shutil.copyfile(inputFile, inputFile + ".sort") + sort(inputFile + ".sort") + result = job.fileStore.writeGlobalFile(inputFile + ".sort") RealtimeLogger.info("Down job finished: %s" % path) return result @@ -102,13 +129,15 @@ def up(job, inputFileID1, inputFileID2, path, options, memory=sortMemory): RealtimeLogger.info("Up job starting: %s" % path) with job.fileStore.writeGlobalFileStream() as (fileHandle, outputFileStoreID): - fileHandle = codecs.getwriter('utf-8')(fileHandle) + fileHandle = codecs.getwriter("utf-8")(fileHandle) with job.fileStore.readGlobalFileStream(inputFileID1) as inputFileHandle1: - inputFileHandle1 = codecs.getreader('utf-8')(inputFileHandle1) + inputFileHandle1 = codecs.getreader("utf-8")(inputFileHandle1) with job.fileStore.readGlobalFileStream(inputFileID2) as inputFileHandle2: - inputFileHandle2 = codecs.getreader('utf-8')(inputFileHandle2) - RealtimeLogger.info("Merging %s and %s to %s" - % (inputFileID1, inputFileID2, outputFileStoreID)) + inputFileHandle2 = codecs.getreader("utf-8")(inputFileHandle2) + RealtimeLogger.info( + "Merging %s and %s to %s" + % (inputFileID1, inputFileID2, outputFileStoreID) + ) merge(inputFileHandle1, inputFileHandle2, fileHandle) # Cleanup up the input files - these deletes will occur after the completion is successful. job.fileStore.deleteGlobalFile(inputFileID1) @@ -126,7 +155,7 @@ def sort(file): lines.sort() - with open(file, 'w') as f: + with open(file, "w") as f: for line in lines: f.write(line) @@ -181,9 +210,12 @@ def getMidPoint(file, fileStart, fileEnd): def makeFileToSort(fileName, lines=defaultLines, lineLen=defaultLineLen): - with open(fileName, 'w') as f: + with open(fileName, "w") as f: for _ in range(lines): - line = "".join(random.choice('actgACTGNXYZ') for _ in range(lineLen - 1)) + '\n' + line = ( + "".join(random.choice("actgACTGNXYZ") for _ in range(lineLen - 1)) + + "\n" + ) f.write(line) @@ -192,25 +224,51 @@ def main(options=None): # deal with command line arguments parser = ArgumentParser() Job.Runner.addToilOptions(parser) - parser.add_argument('--numLines', default=defaultLines, help='Number of lines in file to sort.', type=int) - parser.add_argument('--lineLength', default=defaultLineLen, help='Length of lines in file to sort.', type=int) + parser.add_argument( + "--numLines", + default=defaultLines, + help="Number of lines in file to sort.", + type=int, + ) + parser.add_argument( + "--lineLength", + default=defaultLineLen, + help="Length of lines in file to sort.", + type=int, + ) parser.add_argument("--fileToSort", help="The file you wish to sort") parser.add_argument("--outputFile", help="Where the sorted output will go") - parser.add_argument("--overwriteOutput", help="Write over the output file if it already exists.", default=True) - parser.add_argument("--N", dest="N", - help="The threshold below which a serial sort function is used to sort file. " - "All lines must of length less than or equal to N or program will fail", - default=10000) - parser.add_argument('--downCheckpoints', action='store_true', - help='If this option is set, the workflow will make checkpoints on its way through' - 'the recursive "down" part of the sort') - parser.add_argument("--sortMemory", dest="sortMemory", - help="Memory for jobs that sort chunks of the file.", - default=None) - - parser.add_argument("--mergeMemory", dest="mergeMemory", - help="Memory for jobs that collate results.", - default=None) + parser.add_argument( + "--overwriteOutput", + help="Write over the output file if it already exists.", + default=True, + ) + parser.add_argument( + "--N", + dest="N", + help="The threshold below which a serial sort function is used to sort file. " + "All lines must of length less than or equal to N or program will fail", + default=10000, + ) + parser.add_argument( + "--downCheckpoints", + action="store_true", + help="If this option is set, the workflow will make checkpoints on its way through" + 'the recursive "down" part of the sort', + ) + parser.add_argument( + "--sortMemory", + dest="sortMemory", + help="Memory for jobs that sort chunks of the file.", + default=None, + ) + + parser.add_argument( + "--mergeMemory", + dest="mergeMemory", + help="Memory for jobs that collate results.", + default=None, + ) options = parser.parse_args() if not hasattr(options, "sortMemory") or not options.sortMemory: @@ -221,19 +279,25 @@ def main(options=None): # do some input verification sortedFileName = options.outputFile or "sortedFile.txt" if not options.overwriteOutput and os.path.exists(sortedFileName): - print(f'Output file {sortedFileName} already exists. ' - f'Delete it to run the sort example again or use --overwriteOutput=True') + print( + f"Output file {sortedFileName} already exists. " + f"Delete it to run the sort example again or use --overwriteOutput=True" + ) exit() fileName = options.fileToSort if options.fileToSort is None: # make the file ourselves - fileName = 'fileToSort.txt' + fileName = "fileToSort.txt" if os.path.exists(fileName): - print(f'Sorting existing file: {fileName}') + print(f"Sorting existing file: {fileName}") else: - print(f'No sort file specified. Generating one automatically called: {fileName}.') - makeFileToSort(fileName=fileName, lines=options.numLines, lineLen=options.lineLength) + print( + f"No sort file specified. Generating one automatically called: {fileName}." + ) + makeFileToSort( + fileName=fileName, lines=options.numLines, lineLen=options.lineLength + ) else: if not os.path.exists(options.fileToSort): raise RuntimeError("File to sort does not exist: %s" % options.fileToSort) @@ -241,24 +305,29 @@ def main(options=None): if int(options.N) <= 0: raise RuntimeError("Invalid value of N: %s" % options.N) - # Now we are ready to run with Toil(options) as workflow: - sortedFileURL = 'file://' + os.path.abspath(sortedFileName) - #raise Exception('test') + sortedFileURL = "file://" + os.path.abspath(sortedFileName) + # raise Exception('test') if not workflow.options.restart: - sortFileURL = 'file://' + os.path.abspath(fileName) + sortFileURL = "file://" + os.path.abspath(fileName) sortFileID = workflow.importFile(sortFileURL) - sortedFileID = workflow.start(Job.wrapJobFn(setup, - sortFileID, - int(options.N), - options.downCheckpoints, - options=options, - memory=sortMemory)) + sortedFileID = workflow.start( + Job.wrapJobFn( + setup, + sortFileID, + int(options.N), + options.downCheckpoints, + options=options, + memory=sortMemory, + ) + ) """ The else block is removed here to test that the job store is not destroyed when attempting to resume without restart(). """ -if __name__ == '__main__': + + +if __name__ == "__main__": main() diff --git a/src/toil/test/sort/sort.py b/src/toil/test/sort/sort.py index 8c80f47413..a7a3d63081 100755 --- a/src/toil/test/sort/sort.py +++ b/src/toil/test/sort/sort.py @@ -27,7 +27,7 @@ defaultLines = 1000 defaultLineLen = 50 -sortMemory = '600M' +sortMemory = "600M" def setup(job, inputFile, N, downCheckpoints, options): @@ -36,12 +36,16 @@ def setup(job, inputFile, N, downCheckpoints, options): Returns the FileID of the sorted file """ RealtimeLogger.info("Starting the merge sort") - return job.addChildJobFn(down, - inputFile, N, 'root', - downCheckpoints, - options = options, - preemptible=True, - memory=sortMemory).rv() + return job.addChildJobFn( + down, + inputFile, + N, + "root", + downCheckpoints, + options=options, + preemptible=True, + memory=sortMemory, + ).rv() def down(job, inputFileStoreID, N, path, downCheckpoints, options, memory=sortMemory): @@ -59,34 +63,57 @@ def down(job, inputFileStoreID, N, path, downCheckpoints, options, memory=sortMe length = os.path.getsize(inputFile) if length > N: # We will subdivide the file - RealtimeLogger.critical("Splitting file: %s of size: %s" - % (inputFileStoreID, length)) + RealtimeLogger.critical( + "Splitting file: %s of size: %s" % (inputFileStoreID, length) + ) # Split the file into two copies midPoint = getMidPoint(inputFile, 0, length) t1 = job.fileStore.getLocalTempFile() - with open(t1, 'w') as fH: - fH.write(copySubRangeOfFile(inputFile, 0, midPoint+1)) + with open(t1, "w") as fH: + fH.write(copySubRangeOfFile(inputFile, 0, midPoint + 1)) t2 = job.fileStore.getLocalTempFile() - with open(t2, 'w') as fH: - fH.write(copySubRangeOfFile(inputFile, midPoint+1, length)) + with open(t2, "w") as fH: + fH.write(copySubRangeOfFile(inputFile, midPoint + 1, length)) # Call down recursively. By giving the rv() of the two jobs as inputs to the follow-on job, up, # we communicate the dependency without hindering concurrency. - result = job.addFollowOnJobFn(up, - job.addChildJobFn(down, job.fileStore.writeGlobalFile(t1), N, path + '/0', - downCheckpoints, checkpoint=downCheckpoints, options=options, - preemptible=True, memory=options.sortMemory).rv(), - job.addChildJobFn(down, job.fileStore.writeGlobalFile(t2), N, path + '/1', - downCheckpoints, checkpoint=downCheckpoints, options=options, - preemptible=True, memory=options.mergeMemory).rv(), - path + '/up', preemptible=True, options=options, memory=options.sortMemory).rv() + result = job.addFollowOnJobFn( + up, + job.addChildJobFn( + down, + job.fileStore.writeGlobalFile(t1), + N, + path + "/0", + downCheckpoints, + checkpoint=downCheckpoints, + options=options, + preemptible=True, + memory=options.sortMemory, + ).rv(), + job.addChildJobFn( + down, + job.fileStore.writeGlobalFile(t2), + N, + path + "/1", + downCheckpoints, + checkpoint=downCheckpoints, + options=options, + preemptible=True, + memory=options.mergeMemory, + ).rv(), + path + "/up", + preemptible=True, + options=options, + memory=options.sortMemory, + ).rv() else: # We can sort this bit of the file - RealtimeLogger.critical("Sorting file: %s of size: %s" - % (inputFileStoreID, length)) + RealtimeLogger.critical( + "Sorting file: %s of size: %s" % (inputFileStoreID, length) + ) # Sort the copy and write back to the fileStore - shutil.copyfile(inputFile, inputFile + '.sort') - sort(inputFile + '.sort') - result = job.fileStore.writeGlobalFile(inputFile + '.sort') + shutil.copyfile(inputFile, inputFile + ".sort") + sort(inputFile + ".sort") + result = job.fileStore.writeGlobalFile(inputFile + ".sort") RealtimeLogger.info("Down job finished: %s" % path) return result @@ -100,13 +127,15 @@ def up(job, inputFileID1, inputFileID2, path, options, memory=sortMemory): RealtimeLogger.info("Up job starting: %s" % path) with job.fileStore.writeGlobalFileStream() as (fileHandle, outputFileStoreID): - fileHandle = codecs.getwriter('utf-8')(fileHandle) + fileHandle = codecs.getwriter("utf-8")(fileHandle) with job.fileStore.readGlobalFileStream(inputFileID1) as inputFileHandle1: - inputFileHandle1 = codecs.getreader('utf-8')(inputFileHandle1) + inputFileHandle1 = codecs.getreader("utf-8")(inputFileHandle1) with job.fileStore.readGlobalFileStream(inputFileID2) as inputFileHandle2: - inputFileHandle2 = codecs.getreader('utf-8')(inputFileHandle2) - RealtimeLogger.info("Merging %s and %s to %s" - % (inputFileID1, inputFileID2, outputFileStoreID)) + inputFileHandle2 = codecs.getreader("utf-8")(inputFileHandle2) + RealtimeLogger.info( + "Merging %s and %s to %s" + % (inputFileID1, inputFileID2, outputFileStoreID) + ) merge(inputFileHandle1, inputFileHandle2, fileHandle) # Cleanup up the input files - these deletes will occur after the completion is successful. job.fileStore.deleteGlobalFile(inputFileID1) @@ -124,7 +153,7 @@ def sort(file): lines.sort() - with open(file, 'w') as f: + with open(file, "w") as f: for line in lines: f.write(line) @@ -179,9 +208,12 @@ def getMidPoint(file, fileStart, fileEnd): def makeFileToSort(fileName, lines=defaultLines, lineLen=defaultLineLen): - with open(fileName, 'w') as f: + with open(fileName, "w") as f: for _ in range(lines): - line = "".join(random.choice('actgACTGNXYZ') for _ in range(lineLen - 1)) + '\n' + line = ( + "".join(random.choice("actgACTGNXYZ") for _ in range(lineLen - 1)) + + "\n" + ) f.write(line) @@ -190,25 +222,51 @@ def main(options=None): # deal with command line arguments parser = ArgumentParser() Job.Runner.addToilOptions(parser) - parser.add_argument('--numLines', default=defaultLines, help='Number of lines in file to sort.', type=int) - parser.add_argument('--lineLength', default=defaultLineLen, help='Length of lines in file to sort.', type=int) + parser.add_argument( + "--numLines", + default=defaultLines, + help="Number of lines in file to sort.", + type=int, + ) + parser.add_argument( + "--lineLength", + default=defaultLineLen, + help="Length of lines in file to sort.", + type=int, + ) parser.add_argument("--fileToSort", help="The file you wish to sort") parser.add_argument("--outputFile", help="Where the sorted output will go") - parser.add_argument("--overwriteOutput", help="Write over the output file if it already exists.", default=True) - parser.add_argument("--N", dest="N", - help="The threshold below which a serial sort function is used to sort file. " - "All lines must of length less than or equal to N or program will fail", - default=10000) - parser.add_argument('--downCheckpoints', action='store_true', - help='If this option is set, the workflow will make checkpoints on its way through' - 'the recursive "down" part of the sort') - parser.add_argument("--sortMemory", dest="sortMemory", - help="Memory for jobs that sort chunks of the file.", - default=None) - - parser.add_argument("--mergeMemory", dest="mergeMemory", - help="Memory for jobs that collate results.", - default=None) + parser.add_argument( + "--overwriteOutput", + help="Write over the output file if it already exists.", + default=True, + ) + parser.add_argument( + "--N", + dest="N", + help="The threshold below which a serial sort function is used to sort file. " + "All lines must of length less than or equal to N or program will fail", + default=10000, + ) + parser.add_argument( + "--downCheckpoints", + action="store_true", + help="If this option is set, the workflow will make checkpoints on its way through" + 'the recursive "down" part of the sort', + ) + parser.add_argument( + "--sortMemory", + dest="sortMemory", + help="Memory for jobs that sort chunks of the file.", + default=None, + ) + + parser.add_argument( + "--mergeMemory", + dest="mergeMemory", + help="Memory for jobs that collate results.", + default=None, + ) options = parser.parse_args() if not hasattr(options, "sortMemory") or not options.sortMemory: @@ -219,19 +277,25 @@ def main(options=None): # do some input verification sortedFileName = options.outputFile or "sortedFile.txt" if not options.overwriteOutput and os.path.exists(sortedFileName): - print(f'Output file {sortedFileName} already exists. ' - f'Delete it to run the sort example again or use --overwriteOutput=True') + print( + f"Output file {sortedFileName} already exists. " + f"Delete it to run the sort example again or use --overwriteOutput=True" + ) exit() fileName = options.fileToSort if options.fileToSort is None: # make the file ourselves - fileName = 'fileToSort.txt' + fileName = "fileToSort.txt" if os.path.exists(fileName): - print(f'Sorting existing file: {fileName}') + print(f"Sorting existing file: {fileName}") else: - print(f'No sort file specified. Generating one automatically called: {fileName}.') - makeFileToSort(fileName=fileName, lines=options.numLines, lineLen=options.lineLength) + print( + f"No sort file specified. Generating one automatically called: {fileName}." + ) + makeFileToSort( + fileName=fileName, lines=options.numLines, lineLen=options.lineLength + ) else: if not os.path.exists(options.fileToSort): raise RuntimeError("File to sort does not exist: %s" % options.fileToSort) @@ -241,20 +305,24 @@ def main(options=None): # Now we are ready to run with Toil(options) as workflow: - sortedFileURL = 'file://' + os.path.abspath(sortedFileName) + sortedFileURL = "file://" + os.path.abspath(sortedFileName) if not workflow.options.restart: - sortFileURL = 'file://' + os.path.abspath(fileName) + sortFileURL = "file://" + os.path.abspath(fileName) sortFileID = workflow.importFile(sortFileURL) - sortedFileID = workflow.start(Job.wrapJobFn(setup, - sortFileID, - int(options.N), - options.downCheckpoints, - options=options, - memory=sortMemory)) + sortedFileID = workflow.start( + Job.wrapJobFn( + setup, + sortFileID, + int(options.N), + options.downCheckpoints, + options=options, + memory=sortMemory, + ) + ) else: sortedFileID = workflow.restart() workflow.exportFile(sortedFileID, sortedFileURL) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/toil/test/sort/sortTest.py b/src/toil/test/sort/sortTest.py index 7b93ffb141..e17201b763 100755 --- a/src/toil/test/sort/sortTest.py +++ b/src/toil/test/sort/sortTest.py @@ -25,29 +25,35 @@ from toil.common import Toil from toil.exceptions import FailedJobsException from toil.job import Job -from toil.jobStores.abstractJobStore import (JobStoreExistsException, - NoSuchJobStoreException) +from toil.jobStores.abstractJobStore import ( + JobStoreExistsException, + NoSuchJobStoreException, +) from toil.lib.bioio import root_logger -from toil.test import (ToilTest, - needs_aws_ec2, - needs_google_project, - needs_google_storage, - needs_gridengine, - needs_mesos, - needs_torque, - slow) -from toil.test.sort.sort import (copySubRangeOfFile, - getMidPoint, - main, - makeFileToSort, - merge, - sort) +from toil.test import ( + ToilTest, + needs_aws_ec2, + needs_google_project, + needs_google_storage, + needs_gridengine, + needs_mesos, + needs_torque, + slow, +) +from toil.test.sort.sort import ( + copySubRangeOfFile, + getMidPoint, + main, + makeFileToSort, + merge, + sort, +) logger = logging.getLogger(__name__) -defaultLineLen = int(os.environ.get('TOIL_TEST_SORT_LINE_LEN', 10)) -defaultLines = int(os.environ.get('TOIL_TEST_SORT_LINES', 10)) -defaultN = int(os.environ.get('TOIL_TEST_SORT_N', defaultLineLen * defaultLines / 5)) +defaultLineLen = int(os.environ.get("TOIL_TEST_SORT_LINE_LEN", 10)) +defaultLines = int(os.environ.get("TOIL_TEST_SORT_LINES", 10)) +defaultN = int(os.environ.get("TOIL_TEST_SORT_N", defaultLineLen * defaultLines / 5)) @contextmanager @@ -67,10 +73,11 @@ class SortTest(ToilTest, MesosTestSupport): Tests Toil by sorting a file in parallel on various combinations of job stores and batch systems. """ + def setUp(self): super().setUp() - self.tempDir = self._createTempDir(purpose='tempDir') - self.outputFile = os.path.join(self.tempDir, 'sortedFile.txt') + self.tempDir = self._createTempDir(purpose="tempDir") + self.outputFile = os.path.join(self.tempDir, "sortedFile.txt") self.inputFile = os.path.join(self.tempDir, "fileToSort.txt") def tearDown(self): @@ -78,9 +85,19 @@ def tearDown(self): shutil.rmtree(self.tempDir) ToilTest.tearDown(self) - def _toilSort(self, jobStoreLocator, batchSystem, - lines=defaultLines, N=defaultN, testNo=1, lineLen=defaultLineLen, - retryCount=2, badWorker=0.5, downCheckpoints=False, caching=True): + def _toilSort( + self, + jobStoreLocator, + batchSystem, + lines=defaultLines, + N=defaultN, + testNo=1, + lineLen=defaultLineLen, + retryCount=2, + badWorker=0.5, + downCheckpoints=False, + caching=True, + ): """ Generate a file consisting of the given number of random lines, each line of the given length. Sort the file with Toil by splitting the file recursively until each part is less @@ -112,8 +129,8 @@ def _toilSort(self, jobStoreLocator, batchSystem, options.caching = caching # This is required because mesos_endpoint now defaults to the IP of the machine # that is starting the workflow while the mesos *tests* run locally. - if batchSystem == 'mesos': - options.mesos_endpoint = 'localhost:5050' + if batchSystem == "mesos": + options.mesos_endpoint = "localhost:5050" options.downCheckpoints = downCheckpoints options.N = N options.outputFile = self.outputFile @@ -166,18 +183,22 @@ def _toilSort(self, jobStoreLocator, batchSystem, except FailedJobsException as e: i = e.numberOfFailedJobs if totalTrys > 32: # p(fail after this many restarts) = 0.5**32 - self.fail('Exceeded a reasonable number of restarts') + self.fail("Exceeded a reasonable number of restarts") totalTrys += 1 finally: - subprocess.check_call([resolveEntryPoint('toil'), 'clean', jobStoreLocator]) + subprocess.check_call( + [resolveEntryPoint("toil"), "clean", jobStoreLocator] + ) # final test to make sure the jobStore was actually deleted - self.assertRaises(NoSuchJobStoreException, Toil.resumeJobStore, jobStoreLocator) - - + self.assertRaises( + NoSuchJobStoreException, Toil.resumeJobStore, jobStoreLocator + ) @needs_aws_ec2 def testAwsSingle(self): - self._toilSort(jobStoreLocator=self._awsJobStore(), batchSystem='single_machine') + self._toilSort( + jobStoreLocator=self._awsJobStore(), batchSystem="single_machine" + ) @needs_aws_ec2 @needs_mesos @@ -192,14 +213,18 @@ def testAwsMesos(self): def testFileMesos(self): self._startMesos() try: - self._toilSort(jobStoreLocator=self._getTestJobStorePath(), batchSystem="mesos") + self._toilSort( + jobStoreLocator=self._getTestJobStorePath(), batchSystem="mesos" + ) finally: self._stopMesos() @needs_google_project @needs_google_storage def testGoogleSingle(self): - self._toilSort(jobStoreLocator=self._googleJobStore(), batchSystem="single_machine") + self._toilSort( + jobStoreLocator=self._googleJobStore(), batchSystem="single_machine" + ) @needs_google_project @needs_google_storage @@ -212,29 +237,46 @@ def testGoogleMesos(self): self._stopMesos() def testFileSingle(self): - self._toilSort(jobStoreLocator=self._getTestJobStorePath(), batchSystem='single_machine') + self._toilSort( + jobStoreLocator=self._getTestJobStorePath(), batchSystem="single_machine" + ) def testFileSingleNonCaching(self): - self._toilSort(jobStoreLocator=self._getTestJobStorePath(), batchSystem='single_machine', - caching=False) + self._toilSort( + jobStoreLocator=self._getTestJobStorePath(), + batchSystem="single_machine", + caching=False, + ) def testFileSingleCheckpoints(self): - self._toilSort(jobStoreLocator=self._getTestJobStorePath(), batchSystem='single_machine', - retryCount=2, downCheckpoints=True) + self._toilSort( + jobStoreLocator=self._getTestJobStorePath(), + batchSystem="single_machine", + retryCount=2, + downCheckpoints=True, + ) def testFileSingle10000(self): - self._toilSort(jobStoreLocator=self._getTestJobStorePath(), batchSystem='single_machine', - lines=10000, N=10000) + self._toilSort( + jobStoreLocator=self._getTestJobStorePath(), + batchSystem="single_machine", + lines=10000, + N=10000, + ) @needs_gridengine - @unittest.skip('GridEngine does not support shared caching') + @unittest.skip("GridEngine does not support shared caching") def testFileGridEngine(self): - self._toilSort(jobStoreLocator=self._getTestJobStorePath(), batchSystem='gridengine') + self._toilSort( + jobStoreLocator=self._getTestJobStorePath(), batchSystem="gridengine" + ) @needs_torque - @unittest.skip('PBS/Torque does not support shared caching') + @unittest.skip("PBS/Torque does not support shared caching") def testFileTorqueEngine(self): - self._toilSort(jobStoreLocator=self._getTestJobStorePath(), batchSystem='torque') + self._toilSort( + jobStoreLocator=self._getTestJobStorePath(), batchSystem="torque" + ) testNo = 5 @@ -258,7 +300,7 @@ def testMerge(self): makeFileToSort(tempFile2) sort(tempFile1) sort(tempFile2) - with open(tempFile3, 'w') as fileHandle: + with open(tempFile3, "w") as fileHandle: with open(tempFile1) as tempFileHandle1: with open(tempFile2) as tempFileHandle2: merge(tempFileHandle1, tempFileHandle2, fileHandle) @@ -277,7 +319,7 @@ def testCopySubRangeOfFile(self): assert fileSize > 0 fileStart = random.choice(range(0, fileSize)) fileEnd = random.choice(range(fileStart, fileSize)) - with open(outputFile, 'w') as f: + with open(outputFile, "w") as f: f.write(copySubRangeOfFile(tempFile, fileStart, fileEnd)) with open(outputFile) as f: l = f.read() @@ -294,11 +336,11 @@ def testGetMidPoint(self): midPoint = getMidPoint(self.inputFile, 0, fileSize) print(f"The mid point is {midPoint} of a file of {fileSize} bytes.") assert midPoint < fileSize - assert sorted_contents[midPoint] == '\n' + assert sorted_contents[midPoint] == "\n" assert midPoint >= 0 def _awsJobStore(self): - return f'aws:{self.awsRegion()}:sort-test-{uuid4()}' + return f"aws:{self.awsRegion()}:sort-test-{uuid4()}" def _googleJobStore(self): return f'google:{os.getenv("TOIL_GOOGLE_PROJECTID")}:sort-test-{uuid4()}' diff --git a/src/toil/test/src/autoDeploymentTest.py b/src/toil/test/src/autoDeploymentTest.py index e5d54f6a80..380f6a2564 100644 --- a/src/toil/test/src/autoDeploymentTest.py +++ b/src/toil/test/src/autoDeploymentTest.py @@ -5,10 +5,7 @@ from toil.exceptions import FailedJobsException from toil.lib.iterables import concat -from toil.test import (ApplianceTestSupport, - needs_local_appliance, - needs_mesos, - slow) +from toil.test import ApplianceTestSupport, needs_local_appliance, needs_mesos, slow from toil.version import exactPython logger = logging.getLogger(__name__) @@ -35,34 +32,38 @@ def _venvApplianceCluster(self): Creates an appliance cluster with a virtualenv at './venv' on the leader and a temporary directory on the host mounted at /data in the leader and worker containers. """ - dataDirPath = self._createTempDir(purpose='data') - with self._applianceCluster(mounts={dataDirPath: '/data'}) as (leader, worker): - leader.runOnAppliance('virtualenv', - '--system-site-packages', - '--never-download', # prevent silent upgrades to pip etc - '--python', exactPython, - 'venv') - leader.runOnAppliance('venv/bin/pip', 'list') # For diagnostic purposes + dataDirPath = self._createTempDir(purpose="data") + with self._applianceCluster(mounts={dataDirPath: "/data"}) as (leader, worker): + leader.runOnAppliance( + "virtualenv", + "--system-site-packages", + "--never-download", # prevent silent upgrades to pip etc + "--python", + exactPython, + "venv", + ) + leader.runOnAppliance("venv/bin/pip", "list") # For diagnostic purposes yield leader, worker # TODO: Are we sure the python in the appliance we are testing is the same # as the one we are testing from? If not, how can we get the version it is? - sitePackages = f'venv/lib/{exactPython}/site-packages' + sitePackages = f"venv/lib/{exactPython}/site-packages" def testRestart(self): """ Test whether auto-deployment works on restart. """ with self._venvApplianceCluster() as (leader, worker): + def userScript(): from toil.common import Toil from toil.job import Job # noinspection PyUnusedLocal - def job(job, disk='10M', cores=1, memory='10M'): + def job(job, disk="10M", cores=1, memory="10M"): assert False - if __name__ == '__main__': + if __name__ == "__main__": options = Job.Runner.getDefaultArgumentParser().parse_args() with Toil(options) as toil: if toil.config.restart: @@ -72,26 +73,30 @@ def job(job, disk='10M', cores=1, memory='10M'): userScript = self._getScriptSource(userScript) - leader.deployScript(path=self.sitePackages, - packagePath='foo.bar', - script=userScript) - - pythonArgs = ['venv/bin/python', '-m', 'foo.bar'] - toilArgs = ['--logDebug', - '--batchSystem=mesos', - '--mesosEndpoint=localhost:5050', - '--defaultMemory=10M', - '/data/jobstore'] + leader.deployScript( + path=self.sitePackages, packagePath="foo.bar", script=userScript + ) + + pythonArgs = ["venv/bin/python", "-m", "foo.bar"] + toilArgs = [ + "--logDebug", + "--batchSystem=mesos", + "--mesosEndpoint=localhost:5050", + "--defaultMemory=10M", + "/data/jobstore", + ] command = concat(pythonArgs, toilArgs) - self.assertRaises(subprocess.CalledProcessError, leader.runOnAppliance, *command) + self.assertRaises( + subprocess.CalledProcessError, leader.runOnAppliance, *command + ) # Deploy an updated version of the script ... - userScript = userScript.replace('assert False', 'assert True') - leader.deployScript(path=self.sitePackages, - packagePath='foo.bar', - script=userScript) + userScript = userScript.replace("assert False", "assert True") + leader.deployScript( + path=self.sitePackages, packagePath="foo.bar", script=userScript + ) # ... and restart Toil. - command = concat(pythonArgs, '--restart', toilArgs) + command = concat(pythonArgs, "--restart", toilArgs) leader.runOnAppliance(*command) def testSplitRootPackages(self): @@ -109,11 +114,11 @@ def testSplitRootPackages(self): def libraryModule(): # noinspection PyUnusedLocal def libraryJob(job): - open('/data/foo.txt', 'w').close() + open("/data/foo.txt", "w").close() - leader.deployScript(path=self.sitePackages, - packagePath='toil_lib.foo', - script=libraryModule) + leader.deployScript( + path=self.sitePackages, packagePath="toil_lib.foo", script=libraryModule + ) # Deploy the user script def userScript(): @@ -124,12 +129,14 @@ def userScript(): from toil.job import Job # noinspection PyUnusedLocal - def job(job, disk='10M', cores=1, memory='10M'): + def job(job, disk="10M", cores=1, memory="10M"): # Double the requirements to prevent chaining as chaining might hide problems # in auto-deployment code. - job.addChildJobFn(libraryJob, disk='20M', cores=cores, memory=memory) + job.addChildJobFn( + libraryJob, disk="20M", cores=cores, memory=memory + ) - if __name__ == '__main__': + if __name__ == "__main__": options = Job.Runner.getDefaultArgumentParser().parse_args() with Toil(options) as toil: if toil.config.restart: @@ -137,24 +144,32 @@ def job(job, disk='10M', cores=1, memory='10M'): else: toil.start(Job.wrapJobFn(job)) - leader.deployScript(path=self.sitePackages, - packagePath='toil_script.bar', - script=userScript) + leader.deployScript( + path=self.sitePackages, packagePath="toil_script.bar", script=userScript + ) # Assert that output file isn't there - worker.runOnAppliance('test', '!', '-f', '/data/foo.txt') + worker.runOnAppliance("test", "!", "-f", "/data/foo.txt") # Just being paranoid - self.assertRaises(subprocess.CalledProcessError, - worker.runOnAppliance, 'test', '-f', '/data/foo.txt') - leader.runOnAppliance('venv/bin/python', - '-m', 'toil_script.bar', - '--logDebug', - '--batchSystem=mesos', - '--mesosEndpoint=localhost:5050', - '--defaultMemory=10M', - '/data/jobstore') + self.assertRaises( + subprocess.CalledProcessError, + worker.runOnAppliance, + "test", + "-f", + "/data/foo.txt", + ) + leader.runOnAppliance( + "venv/bin/python", + "-m", + "toil_script.bar", + "--logDebug", + "--batchSystem=mesos", + "--mesosEndpoint=localhost:5050", + "--defaultMemory=10M", + "/data/jobstore", + ) # Assert that out output file is there - worker.runOnAppliance('test', '-f', '/data/foo.txt') + worker.runOnAppliance("test", "-f", "/data/foo.txt") def testUserTypesInJobFunctionArgs(self): """ @@ -165,6 +180,7 @@ def testUserTypesInJobFunctionArgs(self): revealed https://github.com/BD2KGenomics/toil/issues/1278. """ with self._venvApplianceCluster() as (leader, worker): + def userScript(): from toil.common import Toil from toil.job import Job @@ -174,10 +190,10 @@ class X: pass # noinspection PyUnusedLocal - def job(job, x, disk='10M', cores=1, memory='10M'): + def job(job, x, disk="10M", cores=1, memory="10M"): return x - if __name__ == '__main__': + if __name__ == "__main__": options = Job.Runner.getDefaultArgumentParser().parse_args() x = X() with Toil(options) as toil: @@ -187,23 +203,28 @@ def job(job, x, disk='10M', cores=1, memory='10M'): # translation from __main__ to foo.bar is a side effect of auto-deployment. assert r.__class__ is not X import foo.bar + assert r.__class__ is foo.bar.X # Assert that a copy was made. This is a side effect of pickling/unpickling. assert x is not r userScript = self._getScriptSource(userScript) - leader.deployScript(path=self.sitePackages, - packagePath='foo.bar', - script=userScript) - - leader.runOnAppliance('venv/bin/python', '-m', 'foo.bar', - '--logDebug', - '--batchSystem=mesos', - '--mesosEndpoint=localhost:5050', - '--defaultMemory=10M', - '--defaultDisk=10M', - '/data/jobstore') + leader.deployScript( + path=self.sitePackages, packagePath="foo.bar", script=userScript + ) + + leader.runOnAppliance( + "venv/bin/python", + "-m", + "foo.bar", + "--logDebug", + "--batchSystem=mesos", + "--mesosEndpoint=localhost:5050", + "--defaultMemory=10M", + "--defaultDisk=10M", + "/data/jobstore", + ) def testDeferralWithConcurrentEncapsulation(self): """ @@ -248,13 +269,16 @@ def testDeferralWithConcurrentEncapsulation(self): `Encapsulated` has two children to ensure that `Follow-on` is run in a separate worker. """ with self._venvApplianceCluster() as (leader, worker): + def userScript(): from toil.common import Toil from toil.job import Job def root(rootJob): def nullFile(): - return rootJob.fileStore.jobStore.import_file('file:///dev/null') + return rootJob.fileStore.jobStore.import_file( + "file:///dev/null" + ) startFile = nullFile() endFile = nullFile() @@ -290,7 +314,7 @@ def encapsulated(job, startFile): def last(job, endFile): job.fileStore.jobStore.delete_file(endFile) - if __name__ == '__main__': + if __name__ == "__main__": options = Job.Runner.getDefaultArgumentParser().parse_args() with Toil(options) as toil: rootJob = Job.wrapJobFn(root) @@ -298,18 +322,22 @@ def last(job, endFile): userScript = self._getScriptSource(userScript) - leader.deployScript(path=self.sitePackages, - packagePath='foo.bar', - script=userScript) - - leader.runOnAppliance('venv/bin/python', '-m', 'foo.bar', - '--logDebug', - '--batchSystem=mesos', - '--mesosEndpoint=localhost:5050', - '--retryCount=0', - '--defaultMemory=10M', - '--defaultDisk=10M', - '/data/jobstore') + leader.deployScript( + path=self.sitePackages, packagePath="foo.bar", script=userScript + ) + + leader.runOnAppliance( + "venv/bin/python", + "-m", + "foo.bar", + "--logDebug", + "--batchSystem=mesos", + "--mesosEndpoint=localhost:5050", + "--retryCount=0", + "--defaultMemory=10M", + "--defaultDisk=10M", + "/data/jobstore", + ) def testDeferralWithFailureAndEncapsulation(self): """ @@ -348,6 +376,7 @@ def testDeferralWithFailureAndEncapsulation(self): jobs have been executed by that worker. """ with self._venvApplianceCluster() as (leader, worker): + def userScript(): import os import time @@ -359,7 +388,9 @@ def userScript(): def root(rootJob): def nullFile(): - return rootJob.fileStore.jobStore.import_file('file:///dev/null') + return rootJob.fileStore.jobStore.import_file( + "file:///dev/null" + ) startFile = nullFile() endFile = nullFile() @@ -380,10 +411,10 @@ def deferredFile(config): Return path to a file at the root of the job store, exploiting the fact that the job store is shared between leader and worker container. """ - prefix = 'file:' + prefix = "file:" locator = config.jobStore assert locator.startswith(prefix) - return os.path.join(locator[len(prefix):], 'testDeferredFile') + return os.path.join(locator[len(prefix) :], "testDeferredFile") def deferred(deferredFilePath): """ @@ -425,6 +456,7 @@ def trigger(job, endFile): finds the left-overs of the `deferring` job. """ import errno + jobStore = job.fileStore.jobStore with jobStore.read_file_stream(endFile) as fH: pid = int(fH.read()) @@ -452,36 +484,44 @@ def tryUnlink(deferredFilePath): else: raise - if __name__ == '__main__': + if __name__ == "__main__": import errno + options = Job.Runner.getDefaultArgumentParser().parse_args() with Toil(options) as toil: deferredFilePath = deferredFile(toil.config) - open(deferredFilePath, 'w').close() + open(deferredFilePath, "w").close() try: assert os.path.exists(deferredFilePath) try: toil.start(Job.wrapJobFn(root)) except FailedJobsException as e: - assert e.numberOfFailedJobs == 2 # `root` and `deferring` - assert not os.path.exists(deferredFilePath), \ - 'Apparently, the deferred function did not run.' + assert ( + e.numberOfFailedJobs == 2 + ) # `root` and `deferring` + assert not os.path.exists( + deferredFilePath + ), "Apparently, the deferred function did not run." else: - assert False, 'Workflow should not have succeeded.' + assert False, "Workflow should not have succeeded." finally: tryUnlink(deferredFilePath) userScript = self._getScriptSource(userScript) - leader.deployScript(path=self.sitePackages, - packagePath='foo.bar', - script=userScript) - - leader.runOnAppliance('venv/bin/python', '-m', 'foo.bar', - '--logDebug', - '--batchSystem=mesos', - '--mesosEndpoint=localhost:5050', - '--retryCount=0', - '--defaultMemory=10M', - '--defaultDisk=10M', - '/data/jobstore') + leader.deployScript( + path=self.sitePackages, packagePath="foo.bar", script=userScript + ) + + leader.runOnAppliance( + "venv/bin/python", + "-m", + "foo.bar", + "--logDebug", + "--batchSystem=mesos", + "--mesosEndpoint=localhost:5050", + "--retryCount=0", + "--defaultMemory=10M", + "--defaultDisk=10M", + "/data/jobstore", + ) diff --git a/src/toil/test/src/busTest.py b/src/toil/test/src/busTest.py index 6ede621b5b..9b6328fd1b 100644 --- a/src/toil/test/src/busTest.py +++ b/src/toil/test/src/busTest.py @@ -17,10 +17,12 @@ from threading import Thread, current_thread from toil.batchSystems.abstractBatchSystem import BatchJobExitReason -from toil.bus import (JobCompletedMessage, - JobIssuedMessage, - MessageBus, - replay_message_bus) +from toil.bus import ( + JobCompletedMessage, + JobIssuedMessage, + MessageBus, + replay_message_bus, +) from toil.common import Toil from toil.exceptions import FailedJobsException from toil.job import Job @@ -28,8 +30,9 @@ logger = logging.getLogger(__name__) + class MessageBusTest(ToilTest): - + def test_enum_ints_in_file(self) -> None: """ Make sure writing bus messages to files works with enums. @@ -44,7 +47,7 @@ def test_enum_ints_in_file(self) -> None: # Make sure stuff goes away in the right order del handler_to_keep_alive del bus - + for line in open(bus_file): logger.debug("Bus line: %s", line) @@ -107,17 +110,19 @@ def test_restart_without_bus_path(self) -> None: Test the ability to restart a workflow when the message bus path used by the previous attempt is gone. """ - temp_dir = self._createTempDir(purpose='tempDir') + temp_dir = self._createTempDir(purpose="tempDir") job_store = self._getTestJobStorePath() - bus_holder_dir = os.path.join(temp_dir, 'bus_holder') + bus_holder_dir = os.path.join(temp_dir, "bus_holder") os.mkdir(bus_holder_dir) start_options = Job.Runner.getDefaultOptions(job_store) - start_options.logLevel = 'DEBUG' + start_options.logLevel = "DEBUG" start_options.retryCount = 0 start_options.clean = "never" - start_options.write_messages = os.path.abspath(os.path.join(bus_holder_dir, 'messagebus.txt')) + start_options.write_messages = os.path.abspath( + os.path.join(bus_holder_dir, "messagebus.txt") + ) root = Job.wrapJobFn(failing_job_fn) @@ -128,17 +133,17 @@ def test_restart_without_bus_path(self) -> None: except FailedJobsException: pass - logger.info('First attempt successfully failed, removing message bus log') + logger.info("First attempt successfully failed, removing message bus log") # Get rid of the bus os.unlink(start_options.write_messages) os.rmdir(bus_holder_dir) - logger.info('Making second attempt') + logger.info("Making second attempt") # Set up options without a specific bus path restart_options = Job.Runner.getDefaultOptions(job_store) - restart_options.logLevel = 'DEBUG' + restart_options.logLevel = "DEBUG" restart_options.retryCount = 0 restart_options.clean = "never" restart_options.restart = True @@ -150,14 +155,11 @@ def test_restart_without_bus_path(self) -> None: except FailedJobsException: pass - logger.info('Second attempt successfully failed') + logger.info("Second attempt successfully failed") def failing_job_fn(job: Job) -> None: """ This function is guaranteed to fail. """ - raise RuntimeError('Job attempted to run but failed') - - - + raise RuntimeError("Job attempted to run but failed") diff --git a/src/toil/test/src/checkpointTest.py b/src/toil/test/src/checkpointTest.py index 1ee9a2da09..6cee0f7897 100644 --- a/src/toil/test/src/checkpointTest.py +++ b/src/toil/test/src/checkpointTest.py @@ -63,8 +63,10 @@ def testCheckpointedRestartSucceeds(self): except FailedJobsException: self.fail("Checkpointed workflow restart doesn't clean failures.") + class CheckRetryCount(Job): """Fail N times, succeed on the next try.""" + def __init__(self, numFailuresBeforeSuccess): super().__init__(checkpoint=True) self.numFailuresBeforeSuccess = numFailuresBeforeSuccess @@ -73,11 +75,11 @@ def getNumRetries(self, fileStore): """Mark a retry in the fileStore, and return the number of retries so far.""" try: with fileStore.jobStore.read_shared_file_stream("checkpointRun") as f: - timesRun = int(f.read().decode('utf-8')) + timesRun = int(f.read().decode("utf-8")) except NoSuchFileException: timesRun = 0 with fileStore.jobStore.write_shared_file_stream("checkpointRun") as f: - f.write(str(timesRun + 1).encode('utf-8')) + f.write(str(timesRun + 1).encode("utf-8")) return timesRun def run(self, fileStore): @@ -86,10 +88,12 @@ def run(self, fileStore): if retryCount < self.numFailuresBeforeSuccess: self.addChild(AlwaysFail()) + class AlwaysFail(Job): def run(self, fileStore): raise RuntimeError(":(") + class CheckpointFailsFirstTime(Job): def __init__(self): super().__init__(checkpoint=True) @@ -97,8 +101,10 @@ def __init__(self): def run(self, fileStore): self.addChild(FailOnce()) + class FailOnce(Job): """Fail the first time the workflow is run, but succeed thereafter.""" + def run(self, fileStore): if fileStore.jobStore.config.workflowAttemptNumber < 1: raise RuntimeError("first time around") diff --git a/src/toil/test/src/deferredFunctionTest.py b/src/toil/test/src/deferredFunctionTest.py index 06befbd13d..dc8cda4d83 100644 --- a/src/toil/test/src/deferredFunctionTest.py +++ b/src/toil/test/src/deferredFunctionTest.py @@ -30,28 +30,29 @@ class DeferredFunctionTest(ToilTest, metaclass=ABCMeta): """Test the deferred function system.""" + # This determines what job store type to use. - jobStoreType = 'file' + jobStoreType = "file" def _getTestJobStore(self): - if self.jobStoreType == 'file': + if self.jobStoreType == "file": return self._getTestJobStorePath() - elif self.jobStoreType == 'aws': - return f'aws:{self.awsRegion()}:cache-tests-{uuid4()}' - elif self.jobStoreType == 'google': - projectID = os.getenv('TOIL_GOOGLE_PROJECTID') - return f'google:{projectID}:cache-tests-{str(uuid4())}' + elif self.jobStoreType == "aws": + return f"aws:{self.awsRegion()}:cache-tests-{uuid4()}" + elif self.jobStoreType == "google": + projectID = os.getenv("TOIL_GOOGLE_PROJECTID") + return f"google:{projectID}:cache-tests-{str(uuid4())}" else: - raise RuntimeError('Illegal job store type.') + raise RuntimeError("Illegal job store type.") def setUp(self): super().setUp() testDir = self._createTempDir() self.options = Job.Runner.getDefaultOptions(self._getTestJobStore()) - self.options.logLevel = 'INFO' + self.options.logLevel = "INFO" self.options.workDir = testDir - self.options.clean = 'always' - self.options.logFile = os.path.join(testDir, 'logFile') + self.options.clean = "always" + self.options.logFile = os.path.join(testDir, "logFile") # Tests for the various defer possibilities def testDeferredFunctionRunsWithMethod(self): @@ -84,11 +85,11 @@ def _testDeferredFunctionRuns(self, callableFn): :param function callableFn: The function to use in the test. :return: None """ - workdir = self._createTempDir(purpose='nonLocalDir') + workdir = self._createTempDir(purpose="nonLocalDir") nonLocalFile1 = os.path.join(workdir, str(uuid4())) nonLocalFile2 = os.path.join(workdir, str(uuid4())) - open(nonLocalFile1, 'w').close() - open(nonLocalFile2, 'w').close() + open(nonLocalFile1, "w").close() + open(nonLocalFile2, "w").close() assert os.path.exists(nonLocalFile1) assert os.path.exists(nonLocalFile2) A = Job.wrapJobFn(callableFn, files=(nonLocalFile1, nonLocalFile2)) @@ -114,15 +115,16 @@ def testDeferredFunctionRunsWithFailures(self): where a deferred function fails (since the first file doesn't exist on the retry). """ self.options.retryCount = 1 - workdir = self._createTempDir(purpose='nonLocalDir') + workdir = self._createTempDir(purpose="nonLocalDir") nonLocalFile1 = os.path.join(workdir, str(uuid4())) nonLocalFile2 = os.path.join(workdir, str(uuid4())) - open(nonLocalFile1, 'w').close() - open(nonLocalFile2, 'w').close() + open(nonLocalFile1, "w").close() + open(nonLocalFile2, "w").close() assert os.path.exists(nonLocalFile1) assert os.path.exists(nonLocalFile2) - A = Job.wrapJobFn(_deferredFunctionRunsWithFailuresFn, - files=(nonLocalFile1, nonLocalFile2)) + A = Job.wrapJobFn( + _deferredFunctionRunsWithFailuresFn, files=(nonLocalFile1, nonLocalFile2) + ) Job.Runner.startToil(A, self.options) assert not os.path.exists(nonLocalFile1) assert not os.path.exists(nonLocalFile2) @@ -145,11 +147,11 @@ def testNewJobsCanHandleOtherJobDeaths(self): # There can be no retries self.options.retryCount = 0 - workdir = self._createTempDir(purpose='nonLocalDir') + workdir = self._createTempDir(purpose="nonLocalDir") nonLocalFile1 = os.path.join(workdir, str(uuid4())) nonLocalFile2 = os.path.join(workdir, str(uuid4())) - open(nonLocalFile1, 'w').close() - open(nonLocalFile2, 'w').close() + open(nonLocalFile1, "w").close() + open(nonLocalFile2, "w").close() assert os.path.exists(nonLocalFile1) assert os.path.exists(nonLocalFile2) files = [nonLocalFile1, nonLocalFile2] @@ -157,8 +159,12 @@ def testNewJobsCanHandleOtherJobDeaths(self): # A and B here must run in parallel for this to work A = Job.wrapJobFn(_testNewJobsCanHandleOtherJobDeaths_A, files=files, cores=1) B = Job.wrapJobFn(_testNewJobsCanHandleOtherJobDeaths_B, files=files, cores=1) - C = Job.wrapJobFn(_testNewJobsCanHandleOtherJobDeaths_C, files=files, - expectedResult=False, cores=1) + C = Job.wrapJobFn( + _testNewJobsCanHandleOtherJobDeaths_C, + files=files, + expectedResult=False, + cores=1, + ) root.addChild(A) root.addChild(B) B.addChild(C) @@ -179,21 +185,22 @@ def testBatchSystemCleanupCanHandleWorkerDeaths(self): # There can be no retries self.options.retryCount = 0 - workdir = self._createTempDir(purpose='nonLocalDir') + workdir = self._createTempDir(purpose="nonLocalDir") nonLocalFile1 = os.path.join(workdir, str(uuid4())) nonLocalFile2 = os.path.join(workdir, str(uuid4())) # The first file has to be non zero or meseeks will go into an infinite sleep - file1 = open(nonLocalFile1, 'w') - file1.write('test') + file1 = open(nonLocalFile1, "w") + file1.write("test") file1.close() - open(nonLocalFile2, 'w').close() + open(nonLocalFile2, "w").close() assert os.path.exists(nonLocalFile1) assert os.path.exists(nonLocalFile2) # We only use the "A" job here, and we fill in the first file, so all # it will do is defer deleting the second file, delete the first file, # and die. - A = Job.wrapJobFn(_testNewJobsCanHandleOtherJobDeaths_A, - files=(nonLocalFile1, nonLocalFile2)) + A = Job.wrapJobFn( + _testNewJobsCanHandleOtherJobDeaths_A, files=(nonLocalFile1, nonLocalFile2) + ) try: Job.Runner.startToil(A, self.options) except FailedJobsException: @@ -201,6 +208,7 @@ def testBatchSystemCleanupCanHandleWorkerDeaths(self): assert not os.path.exists(nonLocalFile1) assert not os.path.exists(nonLocalFile2) + def _writeNonLocalFilesMethod(job, files): """ Write some data to 2 files. Pass them to a registered deferred method. @@ -209,11 +217,12 @@ def _writeNonLocalFilesMethod(job, files): :return: None """ for nlf in files: - with open(nlf, 'wb') as nonLocalFileHandle: + with open(nlf, "wb") as nonLocalFileHandle: nonLocalFileHandle.write(os.urandom(1 * 1024 * 1024)) job.defer(_deleteMethods._deleteFileMethod, files[0], nlf=files[1]) return None + def _writeNonLocalFilesClassMethod(job, files): """ Write some data to 2 files. Pass them to a registered deferred class method. @@ -222,11 +231,12 @@ def _writeNonLocalFilesClassMethod(job, files): :return: None """ for nlf in files: - with open(nlf, 'wb') as nonLocalFileHandle: + with open(nlf, "wb") as nonLocalFileHandle: nonLocalFileHandle.write(os.urandom(1 * 1024 * 1024)) job.defer(_deleteMethods._deleteFileClassMethod, files[0], nlf=files[1]) return None + def _writeNonLocalFilesLambda(job, files): """ Write some data to 2 files. Pass them to a registered deferred Lambda. @@ -236,11 +246,12 @@ def _writeNonLocalFilesLambda(job, files): """ lmd = lambda x, nlf: [os.remove(x), os.remove(nlf)] for nlf in files: - with open(nlf, 'wb') as nonLocalFileHandle: + with open(nlf, "wb") as nonLocalFileHandle: nonLocalFileHandle.write(os.urandom(1 * 1024 * 1024)) job.defer(lmd, files[0], nlf=files[1]) return None + def _deferredFunctionRunsWithFailuresFn(job, files): """ Refer testDeferredFunctionRunsWithFailures @@ -255,6 +266,7 @@ def _deferredFunctionRunsWithFailuresFn(job, files): assert os.path.exists(files[1]) job.defer(_deleteFile, files[1]) + def _deleteFile(nonLocalFile, nlf=None): """ Delete nonLocalFile and nlf @@ -270,6 +282,7 @@ def _deleteFile(nonLocalFile, nlf=None): os.remove(nlf) logger.debug("Successfully removed file: %s", nlf) + def _testNewJobsCanHandleOtherJobDeaths_A(job, files): """ Defer deletion of files[1], then wait for _testNewJobsCanHandleOtherJobDeaths_B to @@ -281,7 +294,7 @@ def _testNewJobsCanHandleOtherJobDeaths_A(job, files): # Write the pid to files[1] such that we can be sure that this process has died before # we spawn the next job that will do the cleanup. - with open(files[1], 'w') as fileHandle: + with open(files[1], "w") as fileHandle: fileHandle.write(str(os.getpid())) job.defer(_deleteFile, files[1]) logger.info("Deferred delete of %s", files[1]) @@ -290,10 +303,11 @@ def _testNewJobsCanHandleOtherJobDeaths_A(job, files): os.remove(files[0]) os.kill(os.getpid(), signal.SIGKILL) + def _testNewJobsCanHandleOtherJobDeaths_B(job, files): # Write something to files[0] such that we can be sure that this process has started # before _testNewJobsCanHandleOtherJobDeaths_A kills itself. - with open(files[0], 'w') as fileHandle: + with open(files[0], "w") as fileHandle: fileHandle.write(str(os.getpid())) while os.path.exists(files[0]): time.sleep(0.5) diff --git a/src/toil/test/src/dockerCheckTest.py b/src/toil/test/src/dockerCheckTest.py index 998dff57b2..35b628db12 100644 --- a/src/toil/test/src/dockerCheckTest.py +++ b/src/toil/test/src/dockerCheckTest.py @@ -14,7 +14,8 @@ import unittest from docker.errors import ImageNotFound -from toil import checkDockerImageExists, parseDockerAppliance, retry +from toil import checkDockerImageExists, parseDockerAppliance +from toil.lib.retry import retry from toil.test import ToilTest, needs_docker @@ -22,81 +23,87 @@ class DockerCheckTest(ToilTest): """Tests checking whether a docker image exists or not.""" - @unittest.skip('Consumes unauthenticated Docker Hub pulls if run') + @unittest.skip("Consumes unauthenticated Docker Hub pulls if run") def testOfficialUbuntuRepo(self): """Image exists. This should pass.""" - ubuntu_repo = 'ubuntu:latest' + ubuntu_repo = "ubuntu:latest" assert checkDockerImageExists(ubuntu_repo) - @unittest.skip('Consumes unauthenticated Docker Hub pulls if run') + @unittest.skip("Consumes unauthenticated Docker Hub pulls if run") def testBroadDockerRepo(self): """Image exists. This should pass.""" - broad_repo = 'broadinstitute/genomes-in-the-cloud:2.0.0' + broad_repo = "broadinstitute/genomes-in-the-cloud:2.0.0" assert checkDockerImageExists(broad_repo) - @unittest.skip('Consumes unauthenticated Docker Hub pulls if run') + @unittest.skip("Consumes unauthenticated Docker Hub pulls if run") def testBroadDockerRepoBadTag(self): """Bad tag. This should raise.""" - broad_repo = 'broadinstitute/genomes-in-the-cloud:-----' + broad_repo = "broadinstitute/genomes-in-the-cloud:-----" with self.assertRaises(ImageNotFound): checkDockerImageExists(broad_repo) - @unittest.skip('Consumes unauthenticated Docker Hub pulls if run') + @unittest.skip("Consumes unauthenticated Docker Hub pulls if run") def testNonexistentRepo(self): """Bad image. This should raise.""" - nonexistent_repo = '------:-----' + nonexistent_repo = "------:-----" with self.assertRaises(ImageNotFound): checkDockerImageExists(nonexistent_repo) def testToilQuayRepo(self): """Image exists. Should pass.""" - toil_repo = 'quay.io/ucsc_cgl/toil:latest' + toil_repo = "quay.io/ucsc_cgl/toil:latest" assert checkDockerImageExists(toil_repo) def testBadQuayRepoNTag(self): """Bad repo and tag. This should raise.""" - nonexistent_quay_repo = 'quay.io/--------:---' + nonexistent_quay_repo = "quay.io/--------:---" with self.assertRaises(ImageNotFound): checkDockerImageExists(nonexistent_quay_repo) def testBadQuayRepo(self): """Bad repo. This should raise.""" - nonexistent_quay_repo = 'quay.io/--------:latest' + nonexistent_quay_repo = "quay.io/--------:latest" with self.assertRaises(ImageNotFound): checkDockerImageExists(nonexistent_quay_repo) def testBadQuayTag(self): """Bad tag. This should raise.""" - nonexistent_quay_repo = 'quay.io/ucsc_cgl/toil:---' + nonexistent_quay_repo = "quay.io/ucsc_cgl/toil:---" with self.assertRaises(ImageNotFound): checkDockerImageExists(nonexistent_quay_repo) def testGoogleRepo(self): """Image exists. Should pass.""" - google_repo = 'gcr.io/google-containers/busybox:latest' + google_repo = "gcr.io/google-containers/busybox:latest" assert checkDockerImageExists(google_repo) - @retry(errors=[TimeoutError]) # see: https://github.com/DataBiosphere/toil/issues/4902 + @retry( + errors=[TimeoutError] + ) # see: https://github.com/DataBiosphere/toil/issues/4902 def testBadGoogleRepo(self): """Bad repo and tag. This should raise.""" - nonexistent_google_repo = 'gcr.io/google-containers/--------:---' + nonexistent_google_repo = "gcr.io/google-containers/--------:---" with self.assertRaises(ImageNotFound): checkDockerImageExists(nonexistent_google_repo) def testApplianceParser(self): """Test that a specified appliance is parsed correctly.""" - docker_list = ['ubuntu:latest', - 'ubuntu', - 'broadinstitute/genomes-in-the-cloud:2.0.0', - 'quay.io/ucsc_cgl/toil:latest', - 'gcr.io/google-containers/busybox:latest'] + docker_list = [ + "ubuntu:latest", + "ubuntu", + "broadinstitute/genomes-in-the-cloud:2.0.0", + "quay.io/ucsc_cgl/toil:latest", + "gcr.io/google-containers/busybox:latest", + ] parsings = [] for image in docker_list: registryName, imageName, tag = parseDockerAppliance(image) parsings.append([registryName, imageName, tag]) - expected_parsings = [['docker.io', 'ubuntu', 'latest'], - ['docker.io', 'ubuntu', 'latest'], - ['docker.io', 'broadinstitute/genomes-in-the-cloud', '2.0.0'], - ['quay.io', 'ucsc_cgl/toil', 'latest'], - ['gcr.io', 'google-containers/busybox', 'latest']] + expected_parsings = [ + ["docker.io", "ubuntu", "latest"], + ["docker.io", "ubuntu", "latest"], + ["docker.io", "broadinstitute/genomes-in-the-cloud", "2.0.0"], + ["quay.io", "ucsc_cgl/toil", "latest"], + ["gcr.io", "google-containers/busybox", "latest"], + ] assert parsings == expected_parsings diff --git a/src/toil/test/src/environmentTest.py b/src/toil/test/src/environmentTest.py index 9eb9f40f3d..6b6d78ccc2 100644 --- a/src/toil/test/src/environmentTest.py +++ b/src/toil/test/src/environmentTest.py @@ -13,20 +13,19 @@ # limitations under the License. import logging import os -import sys import time - from argparse import Namespace from threading import Thread from typing import Optional from toil.common import Toil from toil.job import Job -from toil.test import ToilTest, slow from toil.jobStores.abstractJobStore import NoSuchFileException +from toil.test import ToilTest logger = logging.getLogger(__name__) + class EnvironmentTest(ToilTest): """ Test to make sure that Toil's environment variable save and restore system @@ -36,44 +35,50 @@ class EnvironmentTest(ToilTest): should be sent through based on that, not base don the leader's current environment when the job is launched. """ - + def test_environment(self): options = Job.Runner.getDefaultOptions(self._getTestJobStorePath()) options.logLevel = "DEBUG" options.retryCount = 0 main(options) - + + def signal_leader(job): """ Make a file in the file store that the leader can see. """ - with job.fileStore.jobStore.write_shared_file_stream("jobstarted.txt", encoding="utf-8") as stream: + with job.fileStore.jobStore.write_shared_file_stream( + "jobstarted.txt", encoding="utf-8" + ) as stream: stream.write("Job has run") - + + def check_environment(job, try_name: str): """ Fail if the test environment is wrong. """ - + job.fileStore.log_to_leader(f"Try {try_name} checking environment") value = os.environ["MAGIC_ENV_VAR_123"] job.fileStore.log_to_leader(f"Try {try_name} got: {value}") if value != "Value1": raise RuntimeError("Environment variable is wrong!") + def wait_a_bit(job): """ Toil job that waits. """ time.sleep(10) + def check_environment_repeatedly(job): """ Toil job that checks the environment, waits, and checks it again, as separate invocations. """ - + signal = job.addChildJobFn(signal_leader) check1 = signal.addFollowOnJobFn(check_environment, "try1") waiter = check1.addFollowOnJobFn(wait_a_bit) @@ -81,6 +86,7 @@ def check_environment_repeatedly(job): # Add another one to make sure we don't chain check3 = waiter.addFollowOnJobFn(check_environment, "try3") + def main(options: Optional[Namespace] = None): """ Run the actual workflow with the given options. @@ -108,7 +114,9 @@ def change_environment_later(): # Wait for the workflow to say it ran something time.sleep(5) try: - with jobStore.read_shared_file_stream("jobstarted.txt", encoding="utf-8") as stream: + with jobStore.read_shared_file_stream( + "jobstarted.txt", encoding="utf-8" + ) as stream: logger.info("Got signal from job: %s", stream.read().strip()) break except NoSuchFileException: @@ -116,10 +124,12 @@ def change_environment_later(): # Change the environment variable logger.info("Changing environment variable") os.environ["MAGIC_ENV_VAR_123"] = "Value2" + changer_thread = Thread(target=change_environment_later) changer_thread.start() toil.start(Job.wrapJobFn(check_environment_repeatedly)) + if __name__ == "__main__": main() diff --git a/src/toil/test/src/fileStoreTest.py b/src/toil/test/src/fileStoreTest.py index 1e225616b9..bbcb049eaf 100644 --- a/src/toil/test/src/fileStoreTest.py +++ b/src/toil/test/src/fileStoreTest.py @@ -32,16 +32,20 @@ from toil.common import Toil from toil.exceptions import FailedJobsException from toil.fileStores import FileID -from toil.fileStores.cachingFileStore import (CacheUnbalancedError, - IllegalDeletionCacheError) +from toil.fileStores.cachingFileStore import ( + CacheUnbalancedError, + IllegalDeletionCacheError, +) from toil.job import Job from toil.jobStores.abstractJobStore import NoSuchFileException from toil.realtimeLogger import RealtimeLogger -from toil.test import (ToilTest, - needs_aws_ec2, - needs_google_project, - needs_google_storage, - slow) +from toil.test import ( + ToilTest, + needs_aws_ec2, + needs_google_project, + needs_google_storage, + slow, +) # Some tests take too long on the AWS jobstore and are unquitable for CI. They can be # be run during manual tests by setting this to False. @@ -55,41 +59,43 @@ class hidden: Hiding the abstract test classes from the Unittest loader so it can be inherited in different test suites for the different job stores. """ + class AbstractFileStoreTest(ToilTest, metaclass=ABCMeta): """ An abstract base class for testing the various general functions described in :class:toil.fileStores.abstractFileStore.AbstractFileStore """ + # This is overwritten in the inheriting classs jobStoreType = None def _getTestJobStore(self): - if self.jobStoreType == 'file': + if self.jobStoreType == "file": return self._getTestJobStorePath() - elif self.jobStoreType == 'aws': - return f'aws:{self.awsRegion()}:cache-tests-{str(uuid4())}' - elif self.jobStoreType == 'google': - projectID = os.getenv('TOIL_GOOGLE_PROJECTID') - return f'google:{projectID}:cache-tests-{str(uuid4())}' + elif self.jobStoreType == "aws": + return f"aws:{self.awsRegion()}:cache-tests-{str(uuid4())}" + elif self.jobStoreType == "google": + projectID = os.getenv("TOIL_GOOGLE_PROJECTID") + return f"google:{projectID}:cache-tests-{str(uuid4())}" else: - raise RuntimeError('Illegal job store type.') + raise RuntimeError("Illegal job store type.") def setUp(self): super().setUp() self.work_dir = self._createTempDir() self.options = Job.Runner.getDefaultOptions(self._getTestJobStore()) - self.options.logLevel = 'DEBUG' + self.options.logLevel = "DEBUG" self.options.realTimeLogging = True self.options.workDir = self.work_dir - self.options.clean = 'always' - self.options.logFile = os.path.join(self.work_dir, 'logFile') + self.options.clean = "always" + self.options.logFile = os.path.join(self.work_dir, "logFile") self.tmp_dir = self._createTempDir() def create_file(self, content, executable=False): - file_path = f'{self.tmp_dir}/{uuid4()}' + file_path = f"{self.tmp_dir}/{uuid4()}" - with open(file_path, 'w') as f: + with open(file_path, "w") as f: f.write(content) if executable: @@ -132,6 +138,7 @@ class WatchingHandler(logging.Handler): A logging handler that watches for a certain substring and trips a flag if it appears. """ + def __init__(self, match: str): super().__init__() self.match = match @@ -145,8 +152,7 @@ def emit(self, record): logging.getLogger().addHandler(handler) - F = Job.wrapJobFn(self._accessAndFail, - disk='100M') + F = Job.wrapJobFn(self._accessAndFail, disk="100M") try: Job.Runner.startToil(F, self.options) except FailedJobsException: @@ -155,19 +161,20 @@ def emit(self, record): logging.getLogger().removeHandler(handler) - assert handler.seen, "Downloaded file name not found in logs of failing Toil run" + assert ( + handler.seen + ), "Downloaded file name not found in logs of failing Toil run" @staticmethod def _accessAndFail(job): with job.fileStore.writeGlobalFileStream() as (writable, file_id): - writable.write(b'Cats') - localPath = os.path.join(job.fileStore.getLocalTempDir(), 'cats.txt') + writable.write(b"Cats") + localPath = os.path.join(job.fileStore.getLocalTempDir(), "cats.txt") job.fileStore.readGlobalFile(file_id, localPath) with job.fileStore.readGlobalFileStream(file_id) as readable: pass raise RuntimeError("I do not like this file") - # Test filestore operations. This is a slightly less intense version of the cache specific # test `testReturnFileSizes` @slow @@ -176,10 +183,13 @@ def testFileStoreOperations(self): Write a couple of files to the jobstore. Delete a couple of them. Read back written and locally deleted files. """ - workdir = self._createTempDir(purpose='nonLocalDir') - F = Job.wrapJobFn(self._testFileStoreOperations, - nonLocalDir=workdir, - numIters=30, disk='2G') + workdir = self._createTempDir(purpose="nonLocalDir") + F = Job.wrapJobFn( + self._testFileStoreOperations, + nonLocalDir=workdir, + numIters=30, + disk="2G", + ) Job.Runner.startToil(F, self.options) @staticmethod @@ -193,14 +203,15 @@ def _testFileStoreOperations(job, nonLocalDir, numIters=100): # Add one file for the sake of having something in the job store writeFileSize = random.randint(0, 30) cls = hidden.AbstractNonCachingFileStoreTest - fsId, _ = cls._writeFileToJobStore(job, isLocalFile=True, nonLocalDir=nonLocalDir, - fileMB=writeFileSize) + fsId, _ = cls._writeFileToJobStore( + job, isLocalFile=True, nonLocalDir=nonLocalDir, fileMB=writeFileSize + ) # Fill in the size of the local file we just made writtenFiles[fsId] = writeFileSize # Remember it actually should be local localFileIDs.add(fsId) - logger.info('Now have local file: %s', fsId) + logger.info("Now have local file: %s", fsId) i = 0 while i <= numIters: @@ -208,13 +219,21 @@ def _testFileStoreOperations(job, nonLocalDir, numIters=100): if randVal < 0.33: # Write writeFileSize = random.randint(0, 30) isLocalFile = True if random.random() <= 0.5 else False - fsID, _ = cls._writeFileToJobStore(job, isLocalFile=isLocalFile, - nonLocalDir=nonLocalDir, - fileMB=writeFileSize) + fsID, _ = cls._writeFileToJobStore( + job, + isLocalFile=isLocalFile, + nonLocalDir=nonLocalDir, + fileMB=writeFileSize, + ) writtenFiles[fsID] = writeFileSize if isLocalFile: localFileIDs.add(fsID) - logger.info('Wrote %s file of size %d MB: %s', 'local' if isLocalFile else 'non-local', writeFileSize, fsID) + logger.info( + "Wrote %s file of size %d MB: %s", + "local" if isLocalFile else "non-local", + writeFileSize, + fsID, + ) else: if len(writtenFiles) == 0: continue @@ -224,10 +243,19 @@ def _testFileStoreOperations(job, nonLocalDir, numIters=100): if randVal < 0.66: # Read mutable = True if random.random() <= 0.5 else False cache = True if random.random() <= 0.5 else False - job.fileStore.readGlobalFile(fsID, '/'.join([work_dir, str(uuid4())]), - cache=cache, mutable=mutable) + job.fileStore.readGlobalFile( + fsID, + "/".join([work_dir, str(uuid4())]), + cache=cache, + mutable=mutable, + ) localFileIDs.add(fsID) - logger.info('Read %s %s local copy of: %s', 'mutable' if mutable else 'immutable', 'cached' if cache else 'uncached', fsID) + logger.info( + "Read %s %s local copy of: %s", + "mutable" if mutable else "immutable", + "cached" if cache else "uncached", + fsID, + ) else: # Delete if rdelRandVal <= 0.5: # Local Delete if fsID not in localFileIDs: @@ -240,18 +268,23 @@ def _testFileStoreOperations(job, nonLocalDir, numIters=100): # ENOENT. If it doesn't something is # broken. raise - logger.info('Correctly fail to local-delete non-local file: %s', fsID) + logger.info( + "Correctly fail to local-delete non-local file: %s", + fsID, + ) else: - assert False, f"Was able to delete non-local file {fsID}" + assert ( + False + ), f"Was able to delete non-local file {fsID}" else: - logger.info('Delete local file: %s', fsID) + logger.info("Delete local file: %s", fsID) job.fileStore.deleteLocalFile(fsID) else: # Global Delete job.fileStore.deleteGlobalFile(fsID) writtenFiles.pop(fsID) if fsID in localFileIDs: localFileIDs.remove(fsID) - logger.info('No longer have file: %s', fsID) + logger.info("No longer have file: %s", fsID) i += 1 def testWriteReadGlobalFilePermissions(self): @@ -262,18 +295,23 @@ def testWriteReadGlobalFilePermissions(self): """ for executable in True, False: for caching in True, False: - with self.subTest(f'Testing readwrite file permissions\n' - f'[executable: {executable}]\n' - f'[caching: {caching}]\n'): + with self.subTest( + f"Testing readwrite file permissions\n" + f"[executable: {executable}]\n" + f"[caching: {caching}]\n" + ): self.options.caching = caching - read_write_job = Job.wrapJobFn(self._testWriteReadGlobalFilePermissions, executable=executable) + read_write_job = Job.wrapJobFn( + self._testWriteReadGlobalFilePermissions, + executable=executable, + ) Job.Runner.startToil(read_write_job, self.options) @staticmethod def _testWriteReadGlobalFilePermissions(job, executable): srcFile = job.fileStore.getLocalTempFile() - with open(srcFile, 'w') as f: - f.write('Hello') + with open(srcFile, "w") as f: + f.write("Hello") if executable: os.chmod(srcFile, os.stat(srcFile).st_mode | stat.S_IXUSR) @@ -285,10 +323,14 @@ def _testWriteReadGlobalFilePermissions(job, executable): for mutable in True, False: for symlink in True, False: dstFile = job.fileStore.getLocalTempFileName() - job.fileStore.readGlobalFile(fileID, userPath=dstFile, mutable=mutable, symlink=symlink) + job.fileStore.readGlobalFile( + fileID, userPath=dstFile, mutable=mutable, symlink=symlink + ) # Current file owner execute permissions currentPermissions = os.stat(dstFile).st_mode & stat.S_IXUSR - assert initialPermissions == currentPermissions, f'{initialPermissions} != {currentPermissions}' + assert ( + initialPermissions == currentPermissions + ), f"{initialPermissions} != {currentPermissions}" def testWriteExportFileCompatibility(self): """ @@ -296,20 +338,24 @@ def testWriteExportFileCompatibility(self): when they are exported from the leader. """ for executable in True, False: - export_file_job = Job.wrapJobFn(self._testWriteExportFileCompatibility, executable=executable) + export_file_job = Job.wrapJobFn( + self._testWriteExportFileCompatibility, executable=executable + ) with Toil(self.options) as toil: initialPermissions, fileID = toil.start(export_file_job) dstFile = os.path.join(self._createTempDir(), str(uuid4())) - toil.exportFile(fileID, 'file://' + dstFile) + toil.exportFile(fileID, "file://" + dstFile) currentPermissions = os.stat(dstFile).st_mode & stat.S_IXUSR - assert initialPermissions == currentPermissions, f'{initialPermissions} != {currentPermissions}' + assert ( + initialPermissions == currentPermissions + ), f"{initialPermissions} != {currentPermissions}" @staticmethod def _testWriteExportFileCompatibility(job, executable): srcFile = job.fileStore.getLocalTempFile() - with open(srcFile, 'w') as f: - f.write('Hello') + with open(srcFile, "w") as f: + f.write("Hello") if executable: os.chmod(srcFile, os.stat(srcFile).st_mode | stat.S_IXUSR) initialPermissions = os.stat(srcFile).st_mode & stat.S_IXUSR @@ -323,22 +369,30 @@ def testImportReadFileCompatibility(self): """ with Toil(self.options) as toil: for executable in True, False: - file_path = self.create_file(content='Hello', executable=executable) + file_path = self.create_file(content="Hello", executable=executable) initial_permissions = os.stat(file_path).st_mode & stat.S_IXUSR - file_id = toil.importFile(f'file://{file_path}') + file_id = toil.importFile(f"file://{file_path}") for mutable in True, False: for symlink in True, False: - with self.subTest(f'Now testing readGlobalFileWith: mutable={mutable} symlink={symlink}'): - A = Job.wrapJobFn(self._testImportReadFileCompatibility, - fileID=file_id, - initialPermissions=initial_permissions, - mutable=mutable, - symlink=symlink) + with self.subTest( + f"Now testing readGlobalFileWith: mutable={mutable} symlink={symlink}" + ): + A = Job.wrapJobFn( + self._testImportReadFileCompatibility, + fileID=file_id, + initialPermissions=initial_permissions, + mutable=mutable, + symlink=symlink, + ) toil.start(A) @staticmethod - def _testImportReadFileCompatibility(job, fileID, initialPermissions, mutable, symlink): - dstFile = job.fileStore.readGlobalFile(fileID, mutable=mutable, symlink=symlink) + def _testImportReadFileCompatibility( + job, fileID, initialPermissions, mutable, symlink + ): + dstFile = job.fileStore.readGlobalFile( + fileID, mutable=mutable, symlink=symlink + ) currentPermissions = os.stat(dstFile).st_mode & stat.S_IXUSR assert initialPermissions == currentPermissions @@ -353,11 +407,16 @@ def testReadWriteFileStreamTextMode(self): @staticmethod def _testReadWriteFileStreamTextMode(job): - with job.fileStore.writeGlobalFileStream(encoding='utf-8') as (stream, fileID): - stream.write('foo') + with job.fileStore.writeGlobalFileStream(encoding="utf-8") as ( + stream, + fileID, + ): + stream.write("foo") job.fileStore.readGlobalFileStream(fileID) - with job.fileStore.readGlobalFileStream(fileID, encoding='utf-8') as stream2: - assert 'foo' == stream2.read() + with job.fileStore.readGlobalFileStream( + fileID, encoding="utf-8" + ) as stream2: + assert "foo" == stream2.read() @staticmethod def _writeFileToJobStore(job, isLocalFile, nonLocalDir=None, fileMB=1): @@ -374,7 +433,7 @@ def _writeFileToJobStore(job, isLocalFile, nonLocalDir=None, fileMB=1): else: assert nonLocalDir is not None work_dir = nonLocalDir - with open(os.path.join(work_dir, str(uuid4())), 'wb') as testFile: + with open(os.path.join(work_dir, str(uuid4())), "wb") as testFile: testFile.write(os.urandom(fileMB * 1024 * 1024)) return job.fileStore.writeGlobalFile(testFile.name), testFile @@ -409,7 +468,7 @@ def testExtremeCacheSetup(self): the chain. This tests whether the cache is created properly even when the job crashes randomly. """ - if testingIsAutomatic and self.jobStoreType != 'file': + if testingIsAutomatic and self.jobStoreType != "file": self.skipTest("To save time") self.options.retryCount = 10 self.options.badWorker = 0.25 @@ -436,7 +495,7 @@ def testCacheEvictionPartialEvict(self): # Explicitly set clean to always so even the failed cases get cleaned (This will # overwrite the value set in setUp if it is ever changed in the future) - self.options.clean = 'always' + self.options.clean = "always" self._testCacheEviction(file1MB=20, file2MB=30, diskRequestMB=10) @@ -452,7 +511,7 @@ def testCacheEvictionTotalEvict(self): # Explicitly set clean to always so even the failed cases get cleaned (This will # overwrite the value set in setUp if it is ever changed in the future) - self.options.clean = 'always' + self.options.clean = "always" self._testCacheEviction(file1MB=20, file2MB=30, diskRequestMB=30) @@ -468,7 +527,7 @@ def testCacheEvictionFailCase(self): # Explicitly set clean to always so even the failed cases get cleaned (This will # overwrite the value set in setUp if it is ever changed in the future) - self.options.clean = 'always' + self.options.clean = "always" self._testCacheEviction(file1MB=20, file2MB=30, diskRequestMB=60) @@ -476,7 +535,7 @@ def _testValidityOfCacheEvictTest(self): # If the job store and cache are on the same file system, file # sizes are accounted for by the job store and are not reflected in # the cache hence this test is redundant (caching will be free). - if not self.options.jobStore.startswith(('aws', 'google')): + if not self.options.jobStore.startswith(("aws", "google")): workDirDev = os.stat(self.options.workDir).st_dev if self.options.jobStore.startswith("file:"): # Before #4538, options.jobStore would have the raw path while the Config object would prepend the @@ -484,11 +543,15 @@ def _testValidityOfCacheEvictTest(self): # The options namespace and the Config object now have the exact same behavior # which means parse_jobstore will be called with argparse rather than with the config object # so remove the prepended file: scheme - jobStoreDev = os.stat(os.path.dirname(self.options.jobStore[5:])).st_dev + jobStoreDev = os.stat( + os.path.dirname(self.options.jobStore[5:]) + ).st_dev else: jobStoreDev = os.stat(os.path.dirname(self.options.jobStore)).st_dev if workDirDev == jobStoreDev: - self.skipTest('Job store and working directory are on the same filesystem.') + self.skipTest( + "Job store and working directory are on the same filesystem." + ) def _testCacheEviction(self, file1MB, file2MB, diskRequestMB): """ @@ -501,23 +564,35 @@ def _testCacheEviction(self, file1MB, file2MB, diskRequestMB): self.options.retryCount = 0 if diskRequestMB > 50: # This can be non int as it will never reach _probeJobReqs - expectedResult = 'Fail' + expectedResult = "Fail" else: expectedResult = 50 - file1MB if diskRequestMB <= file1MB else 0 try: - A = Job.wrapJobFn(self._writeFileToJobStoreWithAsserts, isLocalFile=True, - fileMB=file1MB) + A = Job.wrapJobFn( + self._writeFileToJobStoreWithAsserts, + isLocalFile=True, + fileMB=file1MB, + ) # Sleep for 1 second after writing the first file so that their ctimes are # guaranteed to be distinct for the purpose of this test. B = Job.wrapJobFn(self._sleepy, timeToSleep=1) - C = Job.wrapJobFn(self._writeFileToJobStoreWithAsserts, isLocalFile=True, - fileMB=file2MB) - D = Job.wrapJobFn(self._adjustCacheLimit, newTotalMB=50, disk='0Mi') - E = Job.wrapJobFn(self._uselessFunc, disk=''.join([str(diskRequestMB), 'Mi'])) + C = Job.wrapJobFn( + self._writeFileToJobStoreWithAsserts, + isLocalFile=True, + fileMB=file2MB, + ) + D = Job.wrapJobFn(self._adjustCacheLimit, newTotalMB=50, disk="0Mi") + E = Job.wrapJobFn( + self._uselessFunc, disk="".join([str(diskRequestMB), "Mi"]) + ) # Set it to > 2GB such that the cleanup jobs don't die in the non-fail cases - F = Job.wrapJobFn(self._adjustCacheLimit, newTotalMB=5000, disk='10Mi') - G = Job.wrapJobFn(self._probeJobReqs, sigmaJob=100, cached=expectedResult, - disk='100Mi') + F = Job.wrapJobFn(self._adjustCacheLimit, newTotalMB=5000, disk="10Mi") + G = Job.wrapJobFn( + self._probeJobReqs, + sigmaJob=100, + cached=expectedResult, + disk="100Mi", + ) A.addChild(B) B.addChild(C) C.addChild(D) @@ -529,12 +604,16 @@ def _testCacheEviction(self, file1MB, file2MB, diskRequestMB): with open(self.options.logFile) as f: logContents = f.read() if CacheUnbalancedError.message in logContents: - self.assertEqual(expectedResult, 'Fail') + self.assertEqual(expectedResult, "Fail") else: - self.fail('Toil did not raise the expected CacheUnbalancedError but failed for some other reason') + self.fail( + "Toil did not raise the expected CacheUnbalancedError but failed for some other reason" + ) @staticmethod - def _writeFileToJobStoreWithAsserts(job, isLocalFile, nonLocalDir=None, fileMB=1, expectAsyncUpload=True): + def _writeFileToJobStoreWithAsserts( + job, isLocalFile, nonLocalDir=None, fileMB=1, expectAsyncUpload=True + ): """ This function creates a file and writes it to the jobstore. @@ -548,7 +627,9 @@ def _writeFileToJobStoreWithAsserts(job, isLocalFile, nonLocalDir=None, fileMB=1 the job store later(T) or immediately(F) """ cls = hidden.AbstractNonCachingFileStoreTest - fsID, testFile = cls._writeFileToJobStore(job, isLocalFile, nonLocalDir, fileMB) + fsID, testFile = cls._writeFileToJobStore( + job, isLocalFile, nonLocalDir, fileMB + ) actual = os.stat(testFile.name).st_nlink # If the caching is free, the job store must have hard links to @@ -566,13 +647,19 @@ def _writeFileToJobStoreWithAsserts(job, isLocalFile, nonLocalDir=None, fileMB=1 # We also expect a link in the job store expected += 1 - assert actual == expected, 'Should have %d links. Got %d.' % (expected, actual) + assert actual == expected, "Should have %d links. Got %d." % ( + expected, + actual, + ) - logger.info('Uploaded %s with %d links', fsID, actual) + logger.info("Uploaded %s with %d links", fsID, actual) if not isLocalFile: # Make sure it isn't cached if we don't want it to be - assert not job.fileStore.fileIsCached(fsID), "File uploaded from non-local-temp directory %s should not be cached" % nonLocalDir + assert not job.fileStore.fileIsCached(fsID), ( + "File uploaded from non-local-temp directory %s should not be cached" + % nonLocalDir + ) return fsID @@ -609,30 +696,36 @@ def _probeJobReqs(job, total=None, cached=None, sigmaJob=None): :param int sigmaJob: Expected sum of job requirements in MB. """ - RealtimeLogger.info('Probing job requirements') + RealtimeLogger.info("Probing job requirements") valueDict = locals() - assert (total or cached or sigmaJob) + assert total or cached or sigmaJob # Work out which function to call for which value - toCall = {'total': job.fileStore.getCacheLimit, - 'cached': job.fileStore.getCacheUsed, - 'sigmaJob': job.fileStore.getCacheExtraJobSpace} + toCall = { + "total": job.fileStore.getCacheLimit, + "cached": job.fileStore.getCacheUsed, + "sigmaJob": job.fileStore.getCacheExtraJobSpace, + } - for value in ('total', 'cached', 'sigmaJob'): + for value in ("total", "cached", "sigmaJob"): # If the value wasn't provided, it is None and should be ignored if valueDict[value] is None: continue - RealtimeLogger.info('Probing cache state: %s', value) + RealtimeLogger.info("Probing cache state: %s", value) expectedBytes = valueDict[value] * 1024 * 1024 cacheInfoBytes = toCall[value]() - RealtimeLogger.info('Got %d for %s; expected %d', cacheInfoBytes, value, expectedBytes) + RealtimeLogger.info( + "Got %d for %s; expected %d", cacheInfoBytes, value, expectedBytes + ) - assert cacheInfoBytes == expectedBytes, 'Testing %s: Expected ' % value + \ - f'{expectedBytes} but got {cacheInfoBytes}.' + assert cacheInfoBytes == expectedBytes, ( + "Testing %s: Expected " % value + + f"{expectedBytes} but got {cacheInfoBytes}." + ) @slow def testAsyncWriteWithCaching(self): @@ -650,12 +743,14 @@ def testAsyncWriteWithCaching(self): print("Testing") logger.debug("Testing testing 123") self.options.retryCount = 0 - self.options.logLevel = 'DEBUG' - A = Job.wrapJobFn(self._adjustCacheLimit, newTotalMB=1024, disk='1G') - B = Job.wrapJobFn(self._doubleWriteFileToJobStore, fileMB=850, disk='900M') - C = Job.wrapJobFn(self._readFromJobStoreWithoutAssertions, fsID=B.rv(), disk='1G') + self.options.logLevel = "DEBUG" + A = Job.wrapJobFn(self._adjustCacheLimit, newTotalMB=1024, disk="1G") + B = Job.wrapJobFn(self._doubleWriteFileToJobStore, fileMB=850, disk="900M") + C = Job.wrapJobFn( + self._readFromJobStoreWithoutAssertions, fsID=B.rv(), disk="1G" + ) # Set it to > 2GB such that the cleanup jobs don't die. - D = Job.wrapJobFn(self._adjustCacheLimit, newTotalMB=5000, disk='1G') + D = Job.wrapJobFn(self._adjustCacheLimit, newTotalMB=5000, disk="1G") A.addChild(B) B.addChild(C) C.addChild(D) @@ -671,20 +766,22 @@ def _doubleWriteFileToJobStore(job, fileMB): :param fileMB: File Size :return: Job store file ID for second written file """ - job.fileStore.log_to_leader('Double writing a file into job store') + job.fileStore.log_to_leader("Double writing a file into job store") work_dir = job.fileStore.getLocalTempDir() - with open(os.path.join(work_dir, str(uuid4())), 'wb') as testFile: + with open(os.path.join(work_dir, str(uuid4())), "wb") as testFile: testFile.write(os.urandom(fileMB * 1024 * 1024)) - job.fileStore.log_to_leader('Writing copy 1 and discarding ID') + job.fileStore.log_to_leader("Writing copy 1 and discarding ID") job.fileStore.writeGlobalFile(testFile.name) - job.fileStore.log_to_leader('Writing copy 2 and saving ID') + job.fileStore.log_to_leader("Writing copy 2 and saving ID") fsID = job.fileStore.writeGlobalFile(testFile.name) - job.fileStore.log_to_leader(f'Copy 2 ID: {fsID}') + job.fileStore.log_to_leader(f"Copy 2 ID: {fsID}") - hidden.AbstractCachingFileStoreTest._readFromJobStoreWithoutAssertions(job, fsID) + hidden.AbstractCachingFileStoreTest._readFromJobStoreWithoutAssertions( + job, fsID + ) - job.fileStore.log_to_leader('Writing copy 3 and returning ID') + job.fileStore.log_to_leader("Writing copy 3 and returning ID") return job.fileStore.writeGlobalFile(testFile.name) @staticmethod @@ -696,7 +793,7 @@ def _readFromJobStoreWithoutAssertions(job, fsID): :param fsID: Job store file ID for the read file :return: None """ - job.fileStore.log_to_leader('Reading the written file') + job.fileStore.log_to_leader("Reading the written file") job.fileStore.readGlobalFile(fsID) # writeGlobalFile tests @@ -706,9 +803,12 @@ def testWriteNonLocalFileToJobStore(self): Write a file not in localTempDir to the job store. Such a file should not be cached. Ensure the file is not cached. """ - workdir = self._createTempDir(purpose='nonLocalDir') - A = Job.wrapJobFn(self._writeFileToJobStoreWithAsserts, isLocalFile=False, - nonLocalDir=workdir) + workdir = self._createTempDir(purpose="nonLocalDir") + A = Job.wrapJobFn( + self._writeFileToJobStoreWithAsserts, + isLocalFile=False, + nonLocalDir=workdir, + ) Job.Runner.startToil(A, self.options) def testWriteLocalFileToJobStore(self): @@ -741,11 +841,18 @@ def _testCacheMissFunction(self, cacheReadFile): :param cacheReadFile: Does the read file need to be cached(T) or not(F) """ - workdir = self._createTempDir(purpose='nonLocalDir') - A = Job.wrapJobFn(self._writeFileToJobStoreWithAsserts, isLocalFile=False, - nonLocalDir=workdir) - B = Job.wrapJobFn(self._readFromJobStore, isCachedFile=False, - cacheReadFile=cacheReadFile, fsID=A.rv()) + workdir = self._createTempDir(purpose="nonLocalDir") + A = Job.wrapJobFn( + self._writeFileToJobStoreWithAsserts, + isLocalFile=False, + nonLocalDir=workdir, + ) + B = Job.wrapJobFn( + self._readFromJobStore, + isCachedFile=False, + cacheReadFile=cacheReadFile, + fsID=A.rv(), + ) A.addChild(B) Job.Runner.startToil(A, self.options) @@ -768,25 +875,38 @@ def _readFromJobStore(job, isCachedFile, cacheReadFile, fsID, isTest=True): work_dir = job.fileStore.getLocalTempDir() wantHardLink = False if isCachedFile: - outfile = job.fileStore.readGlobalFile(fsID, '/'.join([work_dir, 'temp']), - mutable=False) + outfile = job.fileStore.readGlobalFile( + fsID, "/".join([work_dir, "temp"]), mutable=False + ) wantHardLink = True else: if cacheReadFile: - outfile = job.fileStore.readGlobalFile(fsID, '/'.join([work_dir, 'temp']), - cache=True, mutable=False) + outfile = job.fileStore.readGlobalFile( + fsID, "/".join([work_dir, "temp"]), cache=True, mutable=False + ) wantHardLink = True else: - assert not job.fileStore.fileIsCached(fsID), "File mistakenly cached before read" - outfile = job.fileStore.readGlobalFile(fsID, '/'.join([work_dir, 'temp']), - cache=False, mutable=False) - assert not job.fileStore.fileIsCached(fsID), "File mistakenly cached after read" + assert not job.fileStore.fileIsCached( + fsID + ), "File mistakenly cached before read" + outfile = job.fileStore.readGlobalFile( + fsID, "/".join([work_dir, "temp"]), cache=False, mutable=False + ) + assert not job.fileStore.fileIsCached( + fsID + ), "File mistakenly cached after read" wantHardLink = False if isTest: actual = os.stat(outfile).st_nlink if wantHardLink: - assert actual > 1, 'Should have multiple links for file that was %s and %s. Got %i.' % ('cached' if isCachedFile else 'not cached', - 'saved' if cacheReadFile else 'not saved', actual) + assert actual > 1, ( + "Should have multiple links for file that was %s and %s. Got %i." + % ( + "cached" if isCachedFile else "not cached", + "saved" if cacheReadFile else "not saved", + actual, + ) + ) # We need to accept harf links even if we don't want them, # because we may get them straight from the FileJobStore since # we asked for immutable reads. @@ -800,8 +920,12 @@ def testReadCachHitFileFromJobStore(self): of links on the file are appropriate. """ A = Job.wrapJobFn(self._writeFileToJobStoreWithAsserts, isLocalFile=True) - B = Job.wrapJobFn(self._readFromJobStore, isCachedFile=True, cacheReadFile=None, - fsID=A.rv()) + B = Job.wrapJobFn( + self._readFromJobStore, + isCachedFile=True, + cacheReadFile=None, + fsID=A.rv(), + ) A.addChild(B) Job.Runner.startToil(A, self.options) @@ -831,20 +955,29 @@ def _testMultipleJobsReadGlobalFileFunction(self, cacheHit): :param bool cacheHit: Is the test for the CacheHit case(T) or cacheMiss case(F) """ - dirPurpose = 'tempWriteDir' if cacheHit else 'nonLocalDir' + dirPurpose = "tempWriteDir" if cacheHit else "nonLocalDir" workdir = self._createTempDir(purpose=dirPurpose) - file_name = os.path.join(workdir, 'test') - with open(file_name, 'w') as x: + file_name = os.path.join(workdir, "test") + with open(file_name, "w") as x: x.write(str(0)) - A = Job.wrapJobFn(self._writeFileToJobStoreWithAsserts, isLocalFile=cacheHit, - nonLocalDir=workdir, - fileMB=256) - B = Job.wrapJobFn(self._probeJobReqs, sigmaJob=100, disk='100Mi') + A = Job.wrapJobFn( + self._writeFileToJobStoreWithAsserts, + isLocalFile=cacheHit, + nonLocalDir=workdir, + fileMB=256, + ) + B = Job.wrapJobFn(self._probeJobReqs, sigmaJob=100, disk="100Mi") jobs = {} for i in range(0, 10): - jobs[i] = Job.wrapJobFn(self._multipleFileReader, diskMB=1024, fsID=A.rv(), - maxWriteFile=os.path.abspath(file_name), disk='1Gi', - memory='10Mi', cores=1) + jobs[i] = Job.wrapJobFn( + self._multipleFileReader, + diskMB=1024, + fsID=A.rv(), + maxWriteFile=os.path.abspath(file_name), + disk="1Gi", + memory="10Mi", + cores=1, + ) A.addChild(jobs[i]) jobs[i].addChild(B) Job.Runner.startToil(A, self.options) @@ -869,8 +1002,9 @@ def _multipleFileReader(job, diskMB, fsID, maxWriteFile): file will be written """ work_dir = job.fileStore.getLocalTempDir() - outfile = job.fileStore.readGlobalFile(fsID, '/'.join([work_dir, 'temp']), cache=True, - mutable=False) + outfile = job.fileStore.readGlobalFile( + fsID, "/".join([work_dir, "temp"]), cache=True, mutable=False + ) diskBytes = diskMB * 1024 * 1024 fileStats = os.stat(outfile) fileSize = fileStats.st_size @@ -883,13 +1017,13 @@ def _multipleFileReader(job, diskMB, fsID, maxWriteFile): usedCache = job.fileStore.getCacheUsed() - logger.info('Extra job space: %s', str(extraJobSpace)) - logger.info('Current file readers: %s', str(currentReaders)) - logger.info('File size: %s', str(fileSize)) - logger.info('Job disk bytes: %s', str(diskBytes)) - logger.info('Used cache: %s', str(usedCache)) + logger.info("Extra job space: %s", str(extraJobSpace)) + logger.info("Current file readers: %s", str(currentReaders)) + logger.info("File size: %s", str(fileSize)) + logger.info("Job disk bytes: %s", str(diskBytes)) + logger.info("Used cache: %s", str(usedCache)) - with open(maxWriteFile, 'r+') as x: + with open(maxWriteFile, "r+") as x: # Advisory lock the file we are saving max readers to fcntl.lockf(x, fcntl.LOCK_EX) prev_max = int(x.read()) @@ -905,23 +1039,26 @@ def _multipleFileReader(job, diskMB, fsID, maxWriteFile): assert usedCache == fileSize # Make sure that there's no over-usage of job requirements - assert ((extraJobSpace + currentReaders * fileSize) % - diskBytes) == 0.0 + assert ((extraJobSpace + currentReaders * fileSize) % diskBytes) == 0.0 # Sleep so there's no race conditions where a job ends before another can get a hold of # the file time.sleep(3) @staticmethod def _writeExportGlobalFile(job): - fileName = os.path.join(job.fileStore.getLocalTempDir(), 'testfile') - with open(fileName, 'wb') as f: - f.write(os.urandom(1024 * 30000)) # 30 Mb - outputFile = os.path.join(job.fileStore.getLocalTempDir(), 'exportedFile') - job.fileStore.export_file(job.fileStore.writeGlobalFile(fileName), 'File://' + outputFile) + fileName = os.path.join(job.fileStore.getLocalTempDir(), "testfile") + with open(fileName, "wb") as f: + f.write(os.urandom(1024 * 30000)) # 30 Mb + outputFile = os.path.join(job.fileStore.getLocalTempDir(), "exportedFile") + job.fileStore.export_file( + job.fileStore.writeGlobalFile(fileName), "File://" + outputFile + ) if not filecmp.cmp(fileName, outputFile): - logger.warning('Source file: %s', str(os.stat(fileName))) - logger.warning('Destination file: %s', str(os.stat(outputFile))) - raise RuntimeError(f"File {fileName} did not properly get copied to {outputFile}") + logger.warning("Source file: %s", str(os.stat(fileName))) + logger.warning("Destination file: %s", str(os.stat(outputFile))) + raise RuntimeError( + f"File {fileName} did not properly get copied to {outputFile}" + ) @slow def testFileStoreExportFile(self): @@ -938,12 +1075,14 @@ def testReturnFileSizes(self): Read back written and locally deleted files. Ensure that after every step that the cache is in a valid state. """ - workdir = self._createTempDir(purpose='nonLocalDir') - F = Job.wrapJobFn(self._returnFileTestFn, - jobDisk=2 * 1024 * 1024 * 1024, - initialCachedSize=0, - nonLocalDir=workdir, - disk='2Gi') + workdir = self._createTempDir(purpose="nonLocalDir") + F = Job.wrapJobFn( + self._returnFileTestFn, + jobDisk=2 * 1024 * 1024 * 1024, + initialCachedSize=0, + nonLocalDir=workdir, + disk="2Gi", + ) Job.Runner.startToil(F, self.options) @slow @@ -956,16 +1095,21 @@ def testReturnFileSizesWithBadWorker(self): self.options.retryCount = 20 self.options.badWorker = 0.5 self.options.badWorkerFailInterval = 0.1 - workdir = self._createTempDir(purpose='nonLocalDir') - F = Job.wrapJobFn(self._returnFileTestFn, - jobDisk=2 * 1024 * 1024 * 1024, - initialCachedSize=0, - nonLocalDir=workdir, - numIters=30, disk='2Gi') + workdir = self._createTempDir(purpose="nonLocalDir") + F = Job.wrapJobFn( + self._returnFileTestFn, + jobDisk=2 * 1024 * 1024 * 1024, + initialCachedSize=0, + nonLocalDir=workdir, + numIters=30, + disk="2Gi", + ) Job.Runner.startToil(F, self.options) @staticmethod - def _returnFileTestFn(job, jobDisk, initialCachedSize, nonLocalDir, numIters=100): + def _returnFileTestFn( + job, jobDisk, initialCachedSize, nonLocalDir, numIters=100 + ): """ Aux function for jobCacheTest.testReturnFileSizes Conduct numIters operations and ensure the cache has the right amount of data in it at all times. @@ -977,7 +1121,7 @@ def _returnFileTestFn(job, jobDisk, initialCachedSize, nonLocalDir, numIters=100 :param float jobDisk: The value of disk passed to this job. """ cached = initialCachedSize - RealtimeLogger.info('Expecting %d bytes cached initially', cached) + RealtimeLogger.info("Expecting %d bytes cached initially", cached) work_dir = job.fileStore.getLocalTempDir() writtenFiles = {} # fsID: (size, isLocal) # fsid: local/mutable/immutable for all operations that should make local files as tracked by the FileStore @@ -988,45 +1132,71 @@ def _returnFileTestFn(job, jobDisk, initialCachedSize, nonLocalDir, numIters=100 # We keep jobDisk in sync with the amount of free space the job # still has that the file store doesn't know it has used. cls = hidden.AbstractCachingFileStoreTest - fsId = cls._writeFileToJobStoreWithAsserts(job, isLocalFile=True, fileMB=writeFileSize) + fsId = cls._writeFileToJobStoreWithAsserts( + job, isLocalFile=True, fileMB=writeFileSize + ) writtenFiles[fsId] = writeFileSize if job.fileStore.fileIsCached(list(writtenFiles.keys())[0]): cached += writeFileSize * 1024 * 1024 - RealtimeLogger.info('Expecting %d bytes cached because file of %d MB is cached', cached, writeFileSize) + RealtimeLogger.info( + "Expecting %d bytes cached because file of %d MB is cached", + cached, + writeFileSize, + ) else: - RealtimeLogger.info('Expecting %d bytes cached because file of %d MB is not cached', cached, writeFileSize) - localFileIDs[list(writtenFiles.keys())[0]].append('local') - RealtimeLogger.info('Checking for %d bytes cached', cached) + RealtimeLogger.info( + "Expecting %d bytes cached because file of %d MB is not cached", + cached, + writeFileSize, + ) + localFileIDs[list(writtenFiles.keys())[0]].append("local") + RealtimeLogger.info("Checking for %d bytes cached", cached) cls._requirementsConcur(job, jobDisk, cached) i = 0 while i <= numIters: randVal = random.random() if randVal < 0.33: # Write - RealtimeLogger.info('Writing a file') + RealtimeLogger.info("Writing a file") writeFileSize = random.randint(0, 30) if random.random() <= 0.5: # Write a local file - RealtimeLogger.info('Writing a local file of %d MB', writeFileSize) - fsID = cls._writeFileToJobStoreWithAsserts(job, isLocalFile=True, - fileMB=writeFileSize) - RealtimeLogger.info('Wrote local file: %s', fsID) + RealtimeLogger.info( + "Writing a local file of %d MB", writeFileSize + ) + fsID = cls._writeFileToJobStoreWithAsserts( + job, isLocalFile=True, fileMB=writeFileSize + ) + RealtimeLogger.info("Wrote local file: %s", fsID) writtenFiles[fsID] = writeFileSize - localFileIDs[fsID].append('local') + localFileIDs[fsID].append("local") jobDisk -= writeFileSize * 1024 * 1024 if job.fileStore.fileIsCached(fsID): cached += writeFileSize * 1024 * 1024 - RealtimeLogger.info('Expecting %d bytes cached because file of %d MB is cached', cached, writeFileSize) + RealtimeLogger.info( + "Expecting %d bytes cached because file of %d MB is cached", + cached, + writeFileSize, + ) else: - RealtimeLogger.info('Expecting %d bytes cached because file of %d MB is not cached', cached, writeFileSize) + RealtimeLogger.info( + "Expecting %d bytes cached because file of %d MB is not cached", + cached, + writeFileSize, + ) else: # Write a non-local file - RealtimeLogger.info('Writing a non-local file of %d MB', writeFileSize) - fsID = cls._writeFileToJobStoreWithAsserts(job, isLocalFile=False, - nonLocalDir=nonLocalDir, - fileMB=writeFileSize) - RealtimeLogger.info('Wrote non-local file: %s', fsID) + RealtimeLogger.info( + "Writing a non-local file of %d MB", writeFileSize + ) + fsID = cls._writeFileToJobStoreWithAsserts( + job, + isLocalFile=False, + nonLocalDir=nonLocalDir, + fileMB=writeFileSize, + ) + RealtimeLogger.info("Wrote non-local file: %s", fsID) writtenFiles[fsID] = writeFileSize # Don't record in localFileIDs because we're not local # No change to the job since there was no caching - RealtimeLogger.info('Checking for %d bytes cached', cached) + RealtimeLogger.info("Checking for %d bytes cached", cached) cls._requirementsConcur(job, jobDisk, cached) else: if len(writtenFiles) == 0: @@ -1036,61 +1206,101 @@ def _returnFileTestFn(job, jobDisk, initialCachedSize, nonLocalDir, numIters=100 rdelRandVal = random.random() fileWasCached = job.fileStore.fileIsCached(fsID) if randVal < 0.66: # Read - RealtimeLogger.info('Reading a file with size %d and previous cache status %s: %s', rdelFileSize, str(fileWasCached), fsID) + RealtimeLogger.info( + "Reading a file with size %d and previous cache status %s: %s", + rdelFileSize, + str(fileWasCached), + fsID, + ) if rdelRandVal <= 0.5: # Read as mutable, uncached - RealtimeLogger.info('Reading as mutable and uncached; should still have %d bytes cached', cached) - job.fileStore.readGlobalFile(fsID, '/'.join([work_dir, str(uuid4())]), - mutable=True, cache=False) - localFileIDs[fsID].append('mutable') + RealtimeLogger.info( + "Reading as mutable and uncached; should still have %d bytes cached", + cached, + ) + job.fileStore.readGlobalFile( + fsID, + "/".join([work_dir, str(uuid4())]), + mutable=True, + cache=False, + ) + localFileIDs[fsID].append("mutable") # No change because the file wasn't cached else: # Read as immutable - RealtimeLogger.info('Reading as immutable and cacheable') - job.fileStore.readGlobalFile(fsID, '/'.join([work_dir, str(uuid4())]), - mutable=False, cache=True) - localFileIDs[fsID].append('immutable') + RealtimeLogger.info("Reading as immutable and cacheable") + job.fileStore.readGlobalFile( + fsID, + "/".join([work_dir, str(uuid4())]), + mutable=False, + cache=True, + ) + localFileIDs[fsID].append("immutable") jobDisk -= rdelFileSize * 1024 * 1024 if not fileWasCached: if job.fileStore.fileIsCached(fsID): - RealtimeLogger.info('File was not cached before and is now. Should have %d bytes cached', cached) + RealtimeLogger.info( + "File was not cached before and is now. Should have %d bytes cached", + cached, + ) cached += rdelFileSize * 1024 * 1024 else: - RealtimeLogger.info('File was not cached before and still is not now. ' - 'Should still have %d bytes cached', cached) + RealtimeLogger.info( + "File was not cached before and still is not now. " + "Should still have %d bytes cached", + cached, + ) else: - RealtimeLogger.info('File was cached before. Should still have %d bytes cached', cached) + RealtimeLogger.info( + "File was cached before. Should still have %d bytes cached", + cached, + ) cls._requirementsConcur(job, jobDisk, cached) else: # Delete if rdelRandVal <= 0.5: # Local Delete if fsID not in list(localFileIDs.keys()): continue - RealtimeLogger.info('Deleting a file locally with history %s: %s', localFileIDs[fsID], fsID) + RealtimeLogger.info( + "Deleting a file locally with history %s: %s", + localFileIDs[fsID], + fsID, + ) job.fileStore.deleteLocalFile(fsID) else: # Global Delete - RealtimeLogger.info('Deleting a file globally: %s', fsID) + RealtimeLogger.info("Deleting a file globally: %s", fsID) job.fileStore.deleteGlobalFile(fsID) try: job.fileStore.readGlobalFile(fsID) except FileNotFoundError as err: pass except: - raise RuntimeError('Got wrong error type for read of deleted file') + raise RuntimeError( + "Got wrong error type for read of deleted file" + ) else: - raise RuntimeError('Able to read deleted file') + raise RuntimeError("Able to read deleted file") writtenFiles.pop(fsID) if fsID in list(localFileIDs.keys()): for lFID in localFileIDs[fsID]: - if lFID != 'mutable': + if lFID != "mutable": jobDisk += rdelFileSize * 1024 * 1024 localFileIDs.pop(fsID) if fileWasCached: if not job.fileStore.fileIsCached(fsID): cached -= rdelFileSize * 1024 * 1024 - RealtimeLogger.info('File was cached before and is not now. Should have %d bytes cached', cached) + RealtimeLogger.info( + "File was cached before and is not now. Should have %d bytes cached", + cached, + ) else: - RealtimeLogger.info('File was cached before and still is cached now. ' - 'Should still have %d bytes cached', cached) + RealtimeLogger.info( + "File was cached before and still is cached now. " + "Should still have %d bytes cached", + cached, + ) else: - RealtimeLogger.info('File was not cached before deletion. Should still have %d bytes cached', cached) + RealtimeLogger.info( + "File was not cached before deletion. Should still have %d bytes cached", + cached, + ) cls._requirementsConcur(job, jobDisk, cached) i += 1 return jobDisk, cached @@ -1105,15 +1315,32 @@ def _requirementsConcur(job, jobDisk, cached): used = job.fileStore.getCacheUsed() if not job.fileStore.cachingIsFree(): - RealtimeLogger.info('Caching is not free; %d bytes are used and %d bytes are expected', used, cached) - assert used == cached, 'Cache should have %d bytes used, but actually has %d bytes used' % (cached, used) + RealtimeLogger.info( + "Caching is not free; %d bytes are used and %d bytes are expected", + used, + cached, + ) + assert used == cached, ( + "Cache should have %d bytes used, but actually has %d bytes used" + % (cached, used) + ) else: - RealtimeLogger.info('Caching is free; %d bytes are used and %d bytes would be expected if caching were not free', used, cached) - assert used == 0, 'Cache should have nothing in it, but actually has %d bytes used' % used + RealtimeLogger.info( + "Caching is free; %d bytes are used and %d bytes would be expected if caching were not free", + used, + cached, + ) + assert used == 0, ( + "Cache should have nothing in it, but actually has %d bytes used" + % used + ) jobUnused = job.fileStore.getCacheUnusedJobRequirement() - assert jobUnused == jobDisk, 'Job should have %d bytes of disk for non-FileStore use but the FileStore reports %d' % (jobDisk, jobUnused) + assert jobUnused == jobDisk, ( + "Job should have %d bytes of disk for non-FileStore use but the FileStore reports %d" + % (jobDisk, jobUnused) + ) # Testing the resumability of a failed worker @slow @@ -1122,13 +1349,16 @@ def testControlledFailedWorkerRetry(self): Conduct a couple of job store operations. Then die. Ensure that the restarted job is tracking values in the cache state file appropriately. """ - workdir = self._createTempDir(purpose='nonLocalDir') + workdir = self._createTempDir(purpose="nonLocalDir") self.options.retryCount = 1 jobDiskBytes = 2 * 1024 * 1024 * 1024 - F = Job.wrapJobFn(self._controlledFailTestFn, jobDisk=jobDiskBytes, - testDir=workdir, - disk=jobDiskBytes) - G = Job.wrapJobFn(self._probeJobReqs, sigmaJob=100, disk='100Mi') + F = Job.wrapJobFn( + self._controlledFailTestFn, + jobDisk=jobDiskBytes, + testDir=workdir, + disk=jobDiskBytes, + ) + G = Job.wrapJobFn(self._probeJobReqs, sigmaJob=100, disk="100Mi") F.addChild(G) Job.Runner.startToil(F, self.options) @@ -1144,22 +1374,35 @@ def _controlledFailTestFn(job, jobDisk, testDir): """ # Make sure we actually have the disk size we are supposed to - job.fileStore.log_to_leader('Job is running with %d bytes of disk, %d requested' % (job.disk, jobDisk)) - assert job.disk == jobDisk, 'Job was scheduled with %d bytes but requested %d' % (job.disk, jobDisk) + job.fileStore.log_to_leader( + "Job is running with %d bytes of disk, %d requested" + % (job.disk, jobDisk) + ) + assert ( + job.disk == jobDisk + ), "Job was scheduled with %d bytes but requested %d" % (job.disk, jobDisk) cls = hidden.AbstractCachingFileStoreTest - if os.path.exists(os.path.join(testDir, 'testfile.test')): - with open(os.path.join(testDir, 'testfile.test'), 'rb') as fH: - cached = unpack('d', fH.read())[0] - RealtimeLogger.info('Loaded expected cache size of %d from testfile.test', cached) + if os.path.exists(os.path.join(testDir, "testfile.test")): + with open(os.path.join(testDir, "testfile.test"), "rb") as fH: + cached = unpack("d", fH.read())[0] + RealtimeLogger.info( + "Loaded expected cache size of %d from testfile.test", cached + ) cls._requirementsConcur(job, jobDisk, cached) cls._returnFileTestFn(job, jobDisk, cached, testDir, 20) else: - RealtimeLogger.info('Expecting cache size of 0 because testfile.test is absent') - modifiedJobReqs, cached = cls._returnFileTestFn(job, jobDisk, 0, testDir, 20) - with open(os.path.join(testDir, 'testfile.test'), 'wb') as fH: - fH.write(pack('d', cached)) - RealtimeLogger.info('Wrote cache size of %d to testfile.test', cached) + RealtimeLogger.info( + "Expecting cache size of 0 because testfile.test is absent" + ) + modifiedJobReqs, cached = cls._returnFileTestFn( + job, jobDisk, 0, testDir, 20 + ) + with open(os.path.join(testDir, "testfile.test"), "wb") as fH: + fH.write(pack("d", cached)) + RealtimeLogger.info( + "Wrote cache size of %d to testfile.test", cached + ) os.kill(os.getpid(), signal.SIGKILL) @slow @@ -1178,9 +1421,15 @@ def testRemoveLocalImmutablyReadFile(self): def _deleteLocallyReadFilesFn(self, readAsMutable): self.options.retryCount = 0 - A = Job.wrapJobFn(self._writeFileToJobStoreWithAsserts, isLocalFile=True, memory='10M') - B = Job.wrapJobFn(self._removeReadFileFn, A.rv(), readAsMutable=readAsMutable, - memory='20M') + A = Job.wrapJobFn( + self._writeFileToJobStoreWithAsserts, isLocalFile=True, memory="10M" + ) + B = Job.wrapJobFn( + self._removeReadFileFn, + A.rv(), + readAsMutable=readAsMutable, + memory="20M", + ) A.addChild(B) Job.Runner.startToil(A, self.options) @@ -1198,9 +1447,10 @@ def _removeReadFileFn(job, fileToDelete, readAsMutable): # Are we processing the read file or the written file? processsingReadFile = True # Read in the file - outfile = job.fileStore.readGlobalFile(fileToDelete, os.path.join(work_dir, 'temp'), - mutable=readAsMutable) - tempfile = os.path.join(work_dir, 'tmp.tmp') + outfile = job.fileStore.readGlobalFile( + fileToDelete, os.path.join(work_dir, "temp"), mutable=readAsMutable + ) + tempfile = os.path.join(work_dir, "tmp.tmp") # The first time we run this loop, processsingReadFile is True and fileToDelete is the # file read from the job store. The second time, processsingReadFile is False and # fileToDelete is one that was just written in to the job store. Ensure the correct @@ -1210,7 +1460,9 @@ def _removeReadFileFn(job, fileToDelete, readAsMutable): try: job.fileStore.deleteLocalFile(fileToDelete) except IllegalDeletionCacheError: - job.fileStore.log_to_leader('Detected a deleted file %s.' % fileToDelete) + job.fileStore.log_to_leader( + "Detected a deleted file %s." % fileToDelete + ) os.rename(tempfile, outfile) else: # If we are processing the write test, or if we are testing the immutably read @@ -1219,7 +1471,7 @@ def _removeReadFileFn(job, fileToDelete, readAsMutable): if processsingReadFile: processsingReadFile = False # Write a file - with open(os.path.join(work_dir, str(uuid4())), 'wb') as testFile: + with open(os.path.join(work_dir, str(uuid4())), "wb") as testFile: testFile.write(os.urandom(1 * 1024 * 1024)) fileToDelete = job.fileStore.writeGlobalFile(testFile.name) outfile = testFile.name @@ -1231,7 +1483,7 @@ def testDeleteLocalFile(self): Test the deletion capabilities of deleteLocalFile """ self.options.retryCount = 0 - workdir = self._createTempDir(purpose='nonLocalDir') + workdir = self._createTempDir(purpose="nonLocalDir") A = Job.wrapJobFn(self._deleteLocalFileFn, nonLocalDir=workdir) Job.Runner.startToil(A, self.options) @@ -1243,11 +1495,11 @@ def _deleteLocalFileFn(job, nonLocalDir): """ work_dir = job.fileStore.getLocalTempDir() # Write local file - with open(os.path.join(work_dir, str(uuid4())), 'wb') as localFile: + with open(os.path.join(work_dir, str(uuid4())), "wb") as localFile: localFile.write(os.urandom(1 * 1024 * 1024)) localFsID = job.fileStore.writeGlobalFile(localFile.name) # write Non-Local File - with open(os.path.join(nonLocalDir, str(uuid4())), 'wb') as nonLocalFile: + with open(os.path.join(nonLocalDir, str(uuid4())), "wb") as nonLocalFile: nonLocalFile.write(os.urandom(1 * 1024 * 1024)) nonLocalFsID = job.fileStore.writeGlobalFile(nonLocalFile.name) # Delete fsid of local file. The file should be deleted @@ -1277,7 +1529,7 @@ def _deleteLocalFileFn(job, nonLocalDir): assert not os.path.exists(readBackFile2) # Try to get a non-FileID that doesn't exist. try: - job.fileStore.readGlobalFile('bogus') + job.fileStore.readGlobalFile("bogus") except NoSuchFileException: # TODO: We would like to require TypeError, but for Cactus # support we have to accept non-FileIDs. @@ -1286,7 +1538,7 @@ def _deleteLocalFileFn(job, nonLocalDir): raise RuntimeError("Managed to get a file from a non-FileID") # Try to get a FileID for something that doesn't exist try: - job.fileStore.readGlobalFile(FileID('bogus', 4096)) + job.fileStore.readGlobalFile(FileID("bogus", 4096)) except NoSuchFileException: pass else: @@ -1314,7 +1566,7 @@ def _createUncachedFileStream(job): Create and return a FileID for a non-cached file written via a stream. """ - messageBytes = b'This is a test file\n' + messageBytes = b"This is a test file\n" with job.fileStore.jobStore.write_file_stream() as (out, idString): # Write directly to the job store so the caching file store doesn't even see it. @@ -1327,7 +1579,9 @@ def _createUncachedFileStream(job): return fileID @staticmethod - def _readFileWithDelay(job, fileID, cores=0.1, memory=50 * 1024 * 1024, disk=50 * 1024 * 1024): + def _readFileWithDelay( + job, fileID, cores=0.1, memory=50 * 1024 * 1024, disk=50 * 1024 * 1024 + ): """ Read a file from the CachingFileStore with a delay imposed on the download. Should create contention. @@ -1340,43 +1594,47 @@ def _readFileWithDelay(job, fileID, cores=0.1, memory=50 * 1024 * 1024, disk=50 job.fileStore.forceDownloadDelay = 120 readStart = datetime.datetime.now() - logger.debug('Begin read at %s', str(readStart)) + logger.debug("Begin read at %s", str(readStart)) localPath = job.fileStore.readGlobalFile(fileID, cache=True, mutable=True) readEnd = datetime.datetime.now() - logger.debug('End read at %s: took %f seconds', str(readEnd), (readEnd - readStart).total_seconds()) - - with open(localPath, 'rb') as fh: - text = fh.read().decode('utf-8').strip() - logger.debug('Got file contents: %s', text) + logger.debug( + "End read at %s: took %f seconds", + str(readEnd), + (readEnd - readStart).total_seconds(), + ) + with open(localPath, "rb") as fh: + text = fh.read().decode("utf-8").strip() + logger.debug("Got file contents: %s", text) class NonCachingFileStoreTestWithFileJobStore(hidden.AbstractNonCachingFileStoreTest): - jobStoreType = 'file' + jobStoreType = "file" + @pytest.mark.timeout(1000) class CachingFileStoreTestWithFileJobStore(hidden.AbstractCachingFileStoreTest): - jobStoreType = 'file' + jobStoreType = "file" @needs_aws_ec2 class NonCachingFileStoreTestWithAwsJobStore(hidden.AbstractNonCachingFileStoreTest): - jobStoreType = 'aws' + jobStoreType = "aws" @slow @needs_aws_ec2 @pytest.mark.timeout(1000) class CachingFileStoreTestWithAwsJobStore(hidden.AbstractCachingFileStoreTest): - jobStoreType = 'aws' + jobStoreType = "aws" @needs_google_project @needs_google_storage class NonCachingFileStoreTestWithGoogleJobStore(hidden.AbstractNonCachingFileStoreTest): - jobStoreType = 'google' + jobStoreType = "google" @slow @@ -1384,7 +1642,7 @@ class NonCachingFileStoreTestWithGoogleJobStore(hidden.AbstractNonCachingFileSto @needs_google_storage @pytest.mark.timeout(1000) class CachingFileStoreTestWithGoogleJobStore(hidden.AbstractCachingFileStoreTest): - jobStoreType = 'google' + jobStoreType = "google" def _exportStaticMethodAsGlobalFunctions(cls): @@ -1393,10 +1651,12 @@ def _exportStaticMethodAsGlobalFunctions(cls): the convention that the first argument of a job function is named 'job'. """ for name, kind, clazz, value in inspect.classify_class_attrs(cls): - if kind == 'static method' and name != '__new__': # __new__ became static in 3.7 + if ( + kind == "static method" and name != "__new__" + ): # __new__ became static in 3.7 method = value.__func__ args = inspect.getfullargspec(method).args - if args and args[0] == 'job': + if args and args[0] == "job": globals()[name] = method diff --git a/src/toil/test/src/helloWorldTest.py b/src/toil/test/src/helloWorldTest.py index 27cfd96e45..acaedd766a 100644 --- a/src/toil/test/src/helloWorldTest.py +++ b/src/toil/test/src/helloWorldTest.py @@ -22,27 +22,30 @@ def testHelloWorld(self): options.logLevel = "INFO" Job.Runner.startToil(HelloWorld(), options) + class HelloWorld(Job): def __init__(self): - Job.__init__(self, memory=100000, cores=1, disk="3G") + Job.__init__(self, memory=100000, cores=1, disk="3G") def run(self, fileStore): fileID = self.addChildJobFn(childFn, cores=1, memory="1M", disk="3G").rv() self.addFollowOn(FollowOn(fileID)) + def childFn(job): with job.fileStore.writeGlobalFileStream() as (fH, fileID): fH.write(b"Hello, World!") return fileID + class FollowOn(Job): - def __init__(self,fileId): + def __init__(self, fileId): Job.__init__(self) - self.fileId=fileId + self.fileId = fileId def run(self, fileStore): tempDir = fileStore.getLocalTempDir() - tempFilePath = "/".join([tempDir,"LocalCopy"]) + tempFilePath = "/".join([tempDir, "LocalCopy"]) with fileStore.readGlobalFileStream(self.fileId) as globalFile: with open(tempFilePath, "wb") as localFile: localFile.write(globalFile.read()) diff --git a/src/toil/test/src/importExportFileTest.py b/src/toil/test/src/importExportFileTest.py index 47ef39207e..fd698be902 100644 --- a/src/toil/test/src/importExportFileTest.py +++ b/src/toil/test/src/importExportFileTest.py @@ -28,14 +28,16 @@ class ImportExportFileTest(ToilTest): def setUp(self): super().setUp() self.tmp_dir = self._createTempDir() - self.output_file_path = f'{self.tmp_dir}/out' - self.message_portion_1 = 'What do you get when you cross a seal and a polar bear?' - self.message_portion_2 = ' A polar bear.' + self.output_file_path = f"{self.tmp_dir}/out" + self.message_portion_1 = ( + "What do you get when you cross a seal and a polar bear?" + ) + self.message_portion_2 = " A polar bear." def create_file(self, content, executable=False): - file_path = f'{self.tmp_dir}/{uuid.uuid4()}' + file_path = f"{self.tmp_dir}/{uuid.uuid4()}" - with open(file_path, 'w') as f: + with open(file_path, "w") as f: f.write(content) if executable: @@ -48,26 +50,44 @@ def _import_export_workflow(self, options, fail): with Toil(options) as toil: if not options.restart: msg_portion_file_path = self.create_file(content=self.message_portion_1) - msg_portion_file_id = toil.importFile(f'file://{msg_portion_file_path}') + msg_portion_file_id = toil.importFile(f"file://{msg_portion_file_path}") self.assertIsInstance(msg_portion_file_id, FileID) - self.assertEqual(os.stat(msg_portion_file_path).st_size, msg_portion_file_id.size) + self.assertEqual( + os.stat(msg_portion_file_path).st_size, msg_portion_file_id.size + ) file_that_can_trigger_failure_when_job_starts = self.create_file( - content='Time to freak out!' if fail else 'Keep calm and carry on.') - self.trigger_file_id = toil.importFile(f'file://{file_that_can_trigger_failure_when_job_starts}') + content="Time to freak out!" if fail else "Keep calm and carry on." + ) + self.trigger_file_id = toil.importFile( + f"file://{file_that_can_trigger_failure_when_job_starts}" + ) workflow_final_output_file_id = toil.start( - RestartingJob(msg_portion_file_id, self.trigger_file_id, self.message_portion_2)) + RestartingJob( + msg_portion_file_id, + self.trigger_file_id, + self.message_portion_2, + ) + ) else: # TODO: We're hackily updating this file without using the # correct FileStore interface. User code should not do this! with toil._jobStore.update_file_stream(self.trigger_file_id) as f: - f.write(('Time to freak out!' if fail else 'Keep calm and carry on.').encode('utf-8')) + f.write( + ( + "Time to freak out!" if fail else "Keep calm and carry on." + ).encode("utf-8") + ) workflow_final_output_file_id = toil.restart() - toil.exportFile(workflow_final_output_file_id, f'file://{self.output_file_path}') + toil.exportFile( + workflow_final_output_file_id, f"file://{self.output_file_path}" + ) with open(self.output_file_path) as f: - self.assertEqual(f.read(), f'{self.message_portion_1}{self.message_portion_2}') + self.assertEqual( + f.read(), f"{self.message_portion_1}{self.message_portion_2}" + ) def _run_import_export_workflow(self, restart): options = Job.Runner.getDefaultOptions(self._getTestJobStorePath()) @@ -100,17 +120,21 @@ def test_basic_import_export(self): with Toil(options) as toil: # TODO: test this with non-local (AWS, Google) # Note: this is somewhat done in src/toil/test/src/fileStoreTest.py - with self.subTest('Testing permissions are preserved for local importFile/exportFile'): + with self.subTest( + "Testing permissions are preserved for local importFile/exportFile" + ): for executable in True, False: - file_path = self.create_file(content='Hello', executable=executable) + file_path = self.create_file(content="Hello", executable=executable) initial_permissions = os.stat(file_path).st_mode & stat.S_IXUSR - file_id = toil.importFile(f'file://{file_path}') - toil.exportFile(file_id, f'file://{self.output_file_path}') - current_permissions = os.stat(self.output_file_path).st_mode & stat.S_IXUSR + file_id = toil.importFile(f"file://{file_path}") + toil.exportFile(file_id, f"file://{self.output_file_path}") + current_permissions = ( + os.stat(self.output_file_path).st_mode & stat.S_IXUSR + ) assert initial_permissions == current_permissions - with self.subTest('Testing relative paths without the file:// schema.'): - relative_path_data = 'Everything is relative.' + with self.subTest("Testing relative paths without the file:// schema."): + relative_path_data = "Everything is relative." file_path = self.create_file(content=relative_path_data) file_id = toil.importFile(os.path.relpath(file_path)) @@ -118,31 +142,37 @@ def test_basic_import_export(self): with open(self.output_file_path) as f: self.assertEqual(f.read(), relative_path_data) - with self.subTest('Test local importFile accepts a shared_file_name.'): + with self.subTest("Test local importFile accepts a shared_file_name."): # TODO: whyyyy do we allow this? shared file names are not unique and can overwrite each other # ...not only that... we can't use exportFile on them afterwards!? - file_path = self.create_file(content='why') - shared_file_name = 'users_should_probably_not_be_allowed_to_make_shared_files.bad' - toil.importFile(f'file://{file_path}', sharedFileName=shared_file_name) - with toil._jobStore.read_shared_file_stream(shared_file_name, encoding='utf-8') as f: - self.assertEqual(f.read(), 'why') + file_path = self.create_file(content="why") + shared_file_name = ( + "users_should_probably_not_be_allowed_to_make_shared_files.bad" + ) + toil.importFile(f"file://{file_path}", sharedFileName=shared_file_name) + with toil._jobStore.read_shared_file_stream( + shared_file_name, encoding="utf-8" + ) as f: + self.assertEqual(f.read(), "why") class RestartingJob(Job): def __init__(self, msg_portion_file_id, trigger_file_id, message_portion_2): - Job.__init__(self, memory=100000, cores=1, disk="1M") + Job.__init__(self, memory=100000, cores=1, disk="1M") self.msg_portion_file_id = msg_portion_file_id self.trigger_file_id = trigger_file_id self.message_portion_2 = message_portion_2 def run(self, file_store): with file_store.readGlobalFileStream(self.trigger_file_id) as readable: - if readable.read() == b'Time to freak out!': - raise RuntimeError('D:') + if readable.read() == b"Time to freak out!": + raise RuntimeError("D:") with file_store.writeGlobalFileStream() as (writable, output_file_id): - with file_store.readGlobalFileStream(self.msg_portion_file_id, encoding='utf-8') as readable: + with file_store.readGlobalFileStream( + self.msg_portion_file_id, encoding="utf-8" + ) as readable: # combine readable.read() (the original message 1) with message 2 # this will be the final output of the workflow - writable.write(f'{readable.read()}{self.message_portion_2}'.encode()) + writable.write(f"{readable.read()}{self.message_portion_2}".encode()) return output_file_id diff --git a/src/toil/test/src/jobDescriptionTest.py b/src/toil/test/src/jobDescriptionTest.py index afac977af1..660b4dad10 100644 --- a/src/toil/test/src/jobDescriptionTest.py +++ b/src/toil/test/src/jobDescriptionTest.py @@ -31,7 +31,7 @@ def setUp(self): Job.Runner.addToilOptions(parser) options = parser.parse_args(args=[self.jobStorePath]) self.toil = Toil(options) - self.assertEqual( self.toil, self.toil.__enter__() ) + self.assertEqual(self.toil, self.toil.__enter__()) def tearDown(self): self.toil.__exit__(None, None, None) @@ -44,14 +44,21 @@ def testJobDescription(self): Tests the public interface of a JobDescription. """ - memory = 2^32 - disk = 2^32 + memory = 2 ^ 32 + disk = 2 ^ 32 cores = "1" preemptible = 1 - j = JobDescription(requirements={"memory": memory, "cores": cores, "disk": disk, "preemptible": preemptible}, - jobName='testJobGraph', unitName='noName') - + j = JobDescription( + requirements={ + "memory": memory, + "cores": cores, + "disk": disk, + "preemptible": preemptible, + }, + jobName="testJobGraph", + unitName="noName", + ) # Without a body, and with nothing to run, nextSuccessors will be None self.assertEqual(j.has_body(), False) @@ -61,7 +68,7 @@ def testJobDescription(self): j.attach_body("fake", ModuleDescriptor.forModule("toil")) self.assertEqual(j.has_body(), True) - #Check attributes + # Check attributes self.assertEqual(j.memory, memory) self.assertEqual(j.disk, disk) self.assertEqual(j.cores, int(cores)) @@ -75,18 +82,26 @@ def testJobDescription(self): self.assertEqual(j.predecessorsFinished, set()) self.assertEqual(j.logJobStoreFileID, None) - #Check equals function (should be based on object identity and not contents) - j2 = JobDescription(requirements={"memory": memory, "cores": cores, "disk": disk, "preemptible": preemptible}, - jobName='testJobGraph', unitName='noName') + # Check equals function (should be based on object identity and not contents) + j2 = JobDescription( + requirements={ + "memory": memory, + "cores": cores, + "disk": disk, + "preemptible": preemptible, + }, + jobName="testJobGraph", + unitName="noName", + ) j2.attach_body("fake", ModuleDescriptor.forModule("toil")) self.assertNotEqual(j, j2) ###TODO test other functionality def testJobDescriptionSequencing(self): - j = JobDescription(requirements={}, jobName='unimportant') + j = JobDescription(requirements={}, jobName="unimportant") - j.addChild('child') - j.addFollowOn('followOn') + j.addChild("child") + j.addFollowOn("followOn") # With a body, nothing should be ready to run j.attach_body("fake", ModuleDescriptor.forModule("toil")) @@ -94,13 +109,13 @@ def testJobDescriptionSequencing(self): # With body cleared, child should be ready to run j.detach_body() - self.assertEqual(list(j.nextSuccessors()), ['child']) + self.assertEqual(list(j.nextSuccessors()), ["child"]) # Without the child, the follow-on should be ready to run - j.filterSuccessors(lambda jID: jID != 'child') - self.assertEqual(list(j.nextSuccessors()), ['followOn']) + j.filterSuccessors(lambda jID: jID != "child") + self.assertEqual(list(j.nextSuccessors()), ["followOn"]) # Without the follow-on, we should return None, to be distinct from an # empty list. Nothing left to do! - j.filterSuccessors(lambda jID: jID != 'followOn') + j.filterSuccessors(lambda jID: jID != "followOn") self.assertEqual(j.nextSuccessors(), None) diff --git a/src/toil/test/src/jobEncapsulationTest.py b/src/toil/test/src/jobEncapsulationTest.py index 32cd8e29d1..e0eecc3a50 100644 --- a/src/toil/test/src/jobEncapsulationTest.py +++ b/src/toil/test/src/jobEncapsulationTest.py @@ -20,6 +20,7 @@ class JobEncapsulationTest(ToilTest): """Tests testing the EncapsulationJob class.""" + def testEncapsulation(self): """ Tests the Job.encapsulation method, which uses the EncapsulationJob @@ -61,6 +62,7 @@ def testAddChildEncapsulate(self): def noOp(): pass + def encapsulatedJobFn(job, string, outFile): a = job.addChildFn(fn1Test, string, outFile, name="inner-a") b = a.addFollowOnFn(fn1Test, a.rv(), outFile, name="inner-b") diff --git a/src/toil/test/src/jobFileStoreTest.py b/src/toil/test/src/jobFileStoreTest.py index 206025c4b9..4c299014f3 100644 --- a/src/toil/test/src/jobFileStoreTest.py +++ b/src/toil/test/src/jobFileStoreTest.py @@ -23,7 +23,7 @@ logger = logging.getLogger(__name__) -PREFIX_LENGTH=200 +PREFIX_LENGTH = 200 # TODO: This test is ancient and while similar tests exist in `fileStoreTest.py`, none of them look @@ -32,6 +32,7 @@ class JobFileStoreTest(ToilTest): """ Tests testing the methods defined in :class:toil.fileStores.abstractFileStore.AbstractFileStore. """ + def testCachingFileStore(self): options = Job.Runner.getDefaultOptions(self._getTestJobStorePath()) with Toil(options) as workflow: @@ -43,38 +44,42 @@ def testNonCachingFileStore(self): with Toil(options) as workflow: workflow.start(Job.wrapJobFn(simpleFileStoreJob)) - def _testJobFileStore(self, retryCount=0, badWorker=0.0, stringNo=1, stringLength=1000000, - testNo=2): + def _testJobFileStore( + self, retryCount=0, badWorker=0.0, stringNo=1, stringLength=1000000, testNo=2 + ): """ Creates a chain of jobs, each reading and writing files using the toil.fileStores.abstractFileStore.AbstractFileStore interface. Verifies the files written are always what we expect. """ for test in range(testNo): - #Make a list of random strings, each of 100k chars and hash the first 200 - #base prefix to the string + # Make a list of random strings, each of 100k chars and hash the first 200 + # base prefix to the string def randomString(): chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" s = "".join([random.choice(chars) for i in range(stringLength)]) return s[:PREFIX_LENGTH], s - #Total length is 2 million characters (20 strings of length 100K each) + + # Total length is 2 million characters (20 strings of length 100K each) testStrings = dict([randomString() for i in range(stringNo)]) options = Job.Runner.getDefaultOptions(self._getTestJobStorePath()) options.logLevel = "DEBUG" - options.retryCount=retryCount - options.badWorker=badWorker + options.retryCount = retryCount + options.badWorker = badWorker options.badWorkerFailInterval = 1.0 chainLength = 10 # Run the workflow, the return value being the number of failed jobs - Job.Runner.startToil(Job.wrapJobFn(fileTestJob, [], - testStrings, chainLength), - options) + Job.Runner.startToil( + Job.wrapJobFn(fileTestJob, [], testStrings, chainLength), options + ) def testJobFileStore(self): """ Tests case that about half the files are cached """ - self._testJobFileStore(retryCount=0, badWorker=0.0, stringNo=5, stringLength=1000000) + self._testJobFileStore( + retryCount=0, badWorker=0.0, stringNo=5, stringLength=1000000 + ) @slow def testJobFileStoreWithBadWorker(self): @@ -82,79 +87,98 @@ def testJobFileStoreWithBadWorker(self): Tests case that about half the files are cached and the worker is randomly failing. """ - self._testJobFileStore(retryCount=100, badWorker=0.5, stringNo=5, stringLength=1000000) + self._testJobFileStore( + retryCount=100, badWorker=0.5, stringNo=5, stringLength=1000000 + ) def fileTestJob(job, inputFileStoreIDs, testStrings, chainLength): """ Test job exercises toil.fileStores.abstractFileStore.AbstractFileStore functions """ - outputFileStoreIds = [] #Strings passed to the next job in the chain + outputFileStoreIds = [] # Strings passed to the next job in the chain - #Load the input jobStoreFileIDs and check that they map to the - #same set of random input strings, exercising the different functions in the fileStore interface + # Load the input jobStoreFileIDs and check that they map to the + # same set of random input strings, exercising the different functions in the fileStore interface for fileStoreID in inputFileStoreIDs: if random.random() > 0.5: - #Read the file for the fileStoreID, randomly picking a way to invoke readGlobalFile + # Read the file for the fileStoreID, randomly picking a way to invoke readGlobalFile if random.random() > 0.5: - local_path = job.fileStore.getLocalTempFileName() if random.random() > 0.5 else None + local_path = ( + job.fileStore.getLocalTempFileName() + if random.random() > 0.5 + else None + ) cache = random.random() > 0.5 - tempFile = job.fileStore.readGlobalFile(fileStoreID, - local_path, - cache=cache) + tempFile = job.fileStore.readGlobalFile( + fileStoreID, local_path, cache=cache + ) with open(tempFile) as fH: string = fH.readline() - logging.info("Downloaded %s to local path %s with cache %s and got %s with %d letters", - fileStoreID, local_path, cache, tempFile, len(string)) + logging.info( + "Downloaded %s to local path %s with cache %s and got %s with %d letters", + fileStoreID, + local_path, + cache, + tempFile, + len(string), + ) else: - #Check the local file is as we expect - with job.fileStore.readGlobalFileStream(fileStoreID, 'utf-8') as fH: + # Check the local file is as we expect + with job.fileStore.readGlobalFileStream(fileStoreID, "utf-8") as fH: string = fH.readline() logging.info("Streamed %s and got %d letters", fileStoreID, len(string)) - #Check the string we get back is what we expect - assert string[:PREFIX_LENGTH] in testStrings, f"Could not find string: {string[:PREFIX_LENGTH]}" - assert testStrings[string[:PREFIX_LENGTH]] == string, f"Mismatch in string: {string[:PREFIX_LENGTH]}" - - #This allows the file to be passed to the next job + # Check the string we get back is what we expect + assert ( + string[:PREFIX_LENGTH] in testStrings + ), f"Could not find string: {string[:PREFIX_LENGTH]}" + assert ( + testStrings[string[:PREFIX_LENGTH]] == string + ), f"Mismatch in string: {string[:PREFIX_LENGTH]}" + + # This allows the file to be passed to the next job outputFileStoreIds.append(fileStoreID) else: - #This tests deletion + # This tests deletion logging.info("Deleted %s", fileStoreID) job.fileStore.deleteGlobalFile(fileStoreID) - #Fill out the output strings until we have the same number as the input strings - #exercising different ways of writing files to the file store + # Fill out the output strings until we have the same number as the input strings + # exercising different ways of writing files to the file store while len(outputFileStoreIds) < len(testStrings): - #Pick a string and write it into a file + # Pick a string and write it into a file testString = random.choice(list(testStrings.values())) if random.random() > 0.5: - #Make a local copy of the file - tempFile = job.fileStore.getLocalTempFile() if random.random() > 0.5 \ - else os.path.join(job.fileStore.getLocalTempDir(), "temp.txt") - with open(tempFile, 'w') as fH: + # Make a local copy of the file + tempFile = ( + job.fileStore.getLocalTempFile() + if random.random() > 0.5 + else os.path.join(job.fileStore.getLocalTempDir(), "temp.txt") + ) + with open(tempFile, "w") as fH: fH.write(testString) - #Write a local copy of the file using the local file + # Write a local copy of the file using the local file fileStoreID = job.fileStore.writeGlobalFile(tempFile) # Make sure it returned a valid and correct FileID with the right size assert isinstance(fileStoreID, FileID) - assert fileStoreID.size == len(testString.encode('utf-8')) + assert fileStoreID.size == len(testString.encode("utf-8")) outputFileStoreIds.append(fileStoreID) else: - #Use the writeGlobalFileStream method to write the file + # Use the writeGlobalFileStream method to write the file with job.fileStore.writeGlobalFileStream() as (fH, fileStoreID): - fH.write(testString.encode('utf-8')) + fH.write(testString.encode("utf-8")) outputFileStoreIds.append(fileStoreID) - #Make sure it returned a valid and correct FileID with the right size + # Make sure it returned a valid and correct FileID with the right size assert isinstance(fileStoreID, FileID) - assert fileStoreID.size == len(testString.encode('utf-8')) + assert fileStoreID.size == len(testString.encode("utf-8")) if chainLength > 0: - #Make a child that will read these files and check it gets the same results - job.addChildJobFn(fileTestJob, outputFileStoreIds, testStrings, chainLength-1) + # Make a child that will read these files and check it gets the same results + job.addChildJobFn(fileTestJob, outputFileStoreIds, testStrings, chainLength - 1) fileStoreString = "Testing writeGlobalFile" @@ -163,13 +187,13 @@ def fileTestJob(job, inputFileStoreIDs, testStrings, chainLength): def simpleFileStoreJob(job): localFilePath = os.path.join(job.fileStore.getLocalTempDir(), "parentTemp.txt") - with open(localFilePath, 'w') as f: + with open(localFilePath, "w") as f: f.write(fileStoreString) testID1 = job.fileStore.writeGlobalFile(localFilePath) testID2 = None with job.fileStore.writeGlobalFileStream() as (f, fileID): - f.write(streamingFileStoreString.encode('utf-8')) + f.write(streamingFileStoreString.encode("utf-8")) testID2 = fileID job.addChildJobFn(fileStoreChild, testID1, testID2) @@ -177,12 +201,12 @@ def simpleFileStoreJob(job): def fileStoreChild(job, testID1, testID2): with job.fileStore.readGlobalFileStream(testID1) as f: - assert(f.read().decode('utf-8') == fileStoreString) + assert f.read().decode("utf-8") == fileStoreString localFilePath = os.path.join(job.fileStore.getLocalTempDir(), "childTemp.txt") job.fileStore.readGlobalFile(testID2, localFilePath) with open(localFilePath) as f: - assert(f.read() == streamingFileStoreString) + assert f.read() == streamingFileStoreString job.fileStore.deleteLocalFile(testID2) try: diff --git a/src/toil/test/src/jobServiceTest.py b/src/toil/test/src/jobServiceTest.py index cf3a285acd..732b591444 100644 --- a/src/toil/test/src/jobServiceTest.py +++ b/src/toil/test/src/jobServiceTest.py @@ -46,10 +46,10 @@ def testServiceSerialization(self): """ job = Job() service = ToySerializableService("woot") - startValue = job.addService(service) # Add a first service to job - subService = ToySerializableService(startValue) # Now create a child of + startValue = job.addService(service) # Add a first service to job + subService = ToySerializableService(startValue) # Now create a child of # that service that takes the start value promise from the parent service - job.addService(subService, parentService=service) # This should work if + job.addService(subService, parentService=service) # This should work if # serialization on services is working correctly. self.runToil(job) @@ -60,11 +60,13 @@ def testService(self, checkpoint=False): Tests the creation of a Job.Service with random failures of the worker. """ for test in range(2): - outFile = get_temp_file(rootDir=self._createTempDir()) # Temporary file + outFile = get_temp_file(rootDir=self._createTempDir()) # Temporary file messageInt = random.randint(1, sys.maxsize) try: # Wire up the services/jobs - t = Job.wrapJobFn(serviceTest, outFile, messageInt, checkpoint=checkpoint) + t = Job.wrapJobFn( + serviceTest, outFile, messageInt, checkpoint=checkpoint + ) # Run the workflow repeatedly until success self.runToil(t) @@ -75,24 +77,30 @@ def testService(self, checkpoint=False): os.remove(outFile) @slow - @skipIf(SingleMachineBatchSystem.numCores < 4, 'Need at least four cores to run this test') + @skipIf( + SingleMachineBatchSystem.numCores < 4, + "Need at least four cores to run this test", + ) def testServiceDeadlock(self): """ Creates a job with more services than maxServices, checks that deadlock is detected. """ outFile = get_temp_file(rootDir=self._createTempDir()) try: + def makeWorkflow(): job = Job() r1 = job.addService(ToySerializableService("woot1")) r2 = job.addService(ToySerializableService("woot2")) r3 = job.addService(ToySerializableService("woot3")) - job.addChildFn(fnTest, [ r1, r2, r3 ], outFile) + job.addChildFn(fnTest, [r1, r2, r3], outFile) return job # This should fail as too few services available try: - self.runToil(makeWorkflow(), badWorker=0.0, maxServiceJobs=2, deadlockWait=5) + self.runToil( + makeWorkflow(), badWorker=0.0, maxServiceJobs=2, deadlockWait=5 + ) except DeadlockException: print("Got expected deadlock exception") else: @@ -113,7 +121,10 @@ def testServiceWithCheckpoints(self): self.testService(checkpoint=True) @slow - @skipIf(SingleMachineBatchSystem.numCores < 4, 'Need at least four cores to run this test') + @skipIf( + SingleMachineBatchSystem.numCores < 4, + "Need at least four cores to run this test", + ) def testServiceRecursive(self, checkpoint=True): """ Tests the creation of a Job.Service, creating a chain of services and accessing jobs. @@ -122,10 +133,12 @@ def testServiceRecursive(self, checkpoint=True): for test in range(1): # Temporary file outFile = get_temp_file(rootDir=self._createTempDir()) - messages = [ random.randint(1, sys.maxsize) for i in range(3) ] + messages = [random.randint(1, sys.maxsize) for i in range(3)] try: # Wire up the services/jobs - t = Job.wrapJobFn(serviceTestRecursive, outFile, messages, checkpoint=checkpoint) + t = Job.wrapJobFn( + serviceTestRecursive, outFile, messages, checkpoint=checkpoint + ) # Run the workflow repeatedly until success self.runToil(t) @@ -136,7 +149,10 @@ def testServiceRecursive(self, checkpoint=True): os.remove(outFile) @slow - @skipIf(SingleMachineBatchSystem.numCores < 4, 'Need at least four cores to run this test') + @skipIf( + SingleMachineBatchSystem.numCores < 4, + "Need at least four cores to run this test", + ) @retry_flaky_test(prepare=[ToilTest.tearDown, ToilTest.setUp]) @pytest.mark.timeout(1200) def testServiceParallelRecursive(self, checkpoint=True): @@ -147,21 +163,38 @@ def testServiceParallelRecursive(self, checkpoint=True): for test in range(1): # Temporary file outFiles = [get_temp_file(rootDir=self._createTempDir()) for j in range(2)] - messageBundles = [ [ random.randint(1, sys.maxsize) for i in range(3) ] for j in range(2) ] + messageBundles = [ + [random.randint(1, sys.maxsize) for i in range(3)] for j in range(2) + ] try: # Wire up the services/jobs - t = Job.wrapJobFn(serviceTestParallelRecursive, outFiles, messageBundles, checkpoint=True) + t = Job.wrapJobFn( + serviceTestParallelRecursive, + outFiles, + messageBundles, + checkpoint=True, + ) # Run the workflow repeatedly until success self.runToil(t, retryCount=2) # Check output - for (messages, outFile) in zip(messageBundles, outFiles): - self.assertEqual(list(map(int, open(outFile).readlines())), messages) + for messages, outFile in zip(messageBundles, outFiles): + self.assertEqual( + list(map(int, open(outFile).readlines())), messages + ) finally: list(map(os.remove, outFiles)) - def runToil(self, rootJob, retryCount=1, badWorker=0.5, badWorkedFailInterval=0.1, maxServiceJobs=sys.maxsize, deadlockWait=60): + def runToil( + self, + rootJob, + retryCount=1, + badWorker=0.5, + badWorkedFailInterval=0.1, + maxServiceJobs=sys.maxsize, + deadlockWait=60, + ): # Create the runner for the workflow. options = Job.Runner.getDefaultOptions(self._getTestJobStorePath()) options.logLevel = "DEBUG" @@ -171,7 +204,7 @@ def runToil(self, rootJob, retryCount=1, badWorker=0.5, badWorkedFailInterval=0. options.badWorkerFailInterval = badWorkedFailInterval options.servicePollingInterval = 1 options.maxServiceJobs = maxServiceJobs - options.deadlockWait=deadlockWait + options.deadlockWait = deadlockWait # Run the workflow totalTrys = 0 @@ -181,17 +214,33 @@ def runToil(self, rootJob, retryCount=1, badWorker=0.5, badWorkedFailInterval=0. break except FailedJobsException as e: i = e.numberOfFailedJobs - if totalTrys > 50: #p(fail after this many restarts) = 0.5**32 - self.fail() #Exceeded a reasonable number of restarts + if totalTrys > 50: # p(fail after this many restarts) = 0.5**32 + self.fail() # Exceeded a reasonable number of restarts totalTrys += 1 options.restart = True + class PerfectServiceTest(JobServiceTest): - def runToil(self, rootJob, retryCount=1, badWorker=0, badWorkedFailInterval=1000, maxServiceJobs=sys.maxsize, deadlockWait=60): + def runToil( + self, + rootJob, + retryCount=1, + badWorker=0, + badWorkedFailInterval=1000, + maxServiceJobs=sys.maxsize, + deadlockWait=60, + ): """ Let us run all the tests in the other service test class, but without worker failures. """ - super().runToil(rootJob, retryCount, badWorker, badWorkedFailInterval, maxServiceJobs, deadlockWait) + super().runToil( + rootJob, + retryCount, + badWorker, + badWorkedFailInterval, + maxServiceJobs, + deadlockWait, + ) def serviceTest(job, outFile, messageInt): @@ -199,51 +248,73 @@ def serviceTest(job, outFile, messageInt): Creates one service and one accessing job, which communicate with two files to establish that both run concurrently. """ - #Clean out out-file - open(outFile, 'w').close() - randInt = random.randint(1, sys.maxsize) # We create a random number that is added to messageInt and subtracted by the serviceAccessor, to prove that + # Clean out out-file + open(outFile, "w").close() + randInt = random.randint( + 1, sys.maxsize + ) # We create a random number that is added to messageInt and subtracted by the serviceAccessor, to prove that # when service test is checkpointed and restarted there is never a connection made between an earlier service and later serviceAccessor, or vice versa. - job.addChildJobFn(serviceAccessor, job.addService(ToyService(messageInt + randInt)), outFile, randInt) + job.addChildJobFn( + serviceAccessor, + job.addService(ToyService(messageInt + randInt)), + outFile, + randInt, + ) + def serviceTestRecursive(job, outFile, messages): """ Creates a chain of services and accessing jobs, each paired together. """ if len(messages) > 0: - #Clean out out-file - open(outFile, 'w').close() + # Clean out out-file + open(outFile, "w").close() randInt = random.randint(1, sys.maxsize) service = ToyService(messages[0] + randInt) - child = job.addChildJobFn(serviceAccessor, job.addService(service), outFile, randInt) + child = job.addChildJobFn( + serviceAccessor, job.addService(service), outFile, randInt + ) for i in range(1, len(messages)): randInt = random.randint(1, sys.maxsize) service2 = ToyService(messages[i] + randInt, cores=0.1) - child = child.addChildJobFn(serviceAccessor, - job.addService(service2, parentService=service), - outFile, randInt, cores=0.1) + child = child.addChildJobFn( + serviceAccessor, + job.addService(service2, parentService=service), + outFile, + randInt, + cores=0.1, + ) service = service2 + def serviceTestParallelRecursive(job, outFiles, messageBundles): """ Creates multiple chains of services and accessing jobs. """ for messages, outFile in zip(messageBundles, outFiles): - #Clean out out-file - open(outFile, 'w').close() + # Clean out out-file + open(outFile, "w").close() if len(messages) > 0: randInt = random.randint(1, sys.maxsize) service = ToyService(messages[0] + randInt) - child = job.addChildJobFn(serviceAccessor, job.addService(service), outFile, randInt) + child = job.addChildJobFn( + serviceAccessor, job.addService(service), outFile, randInt + ) for i in range(1, len(messages)): randInt = random.randint(1, sys.maxsize) service2 = ToyService(messages[i] + randInt, cores=0.1) - child = child.addChildJobFn(serviceAccessor, - job.addService(service2, parentService=service), - outFile, randInt, cores=0.1) + child = child.addChildJobFn( + serviceAccessor, + job.addService(service2, parentService=service), + outFile, + randInt, + cores=0.1, + ) service = service2 + class ToyService(Job.Service): def __init__(self, messageInt, *args, **kwargs): """ @@ -264,10 +335,17 @@ def start(self, job): # So we don't associate these files with this job. inJobStoreID = job.fileStore.jobStore.get_empty_file_store_id() outJobStoreID = job.fileStore.jobStore.get_empty_file_store_id() - self.serviceThread = Thread(target=self.serviceWorker, - args=(job.fileStore.jobStore, self.terminate, self.error, - inJobStoreID, outJobStoreID, - self.messageInt)) + self.serviceThread = Thread( + target=self.serviceWorker, + args=( + job.fileStore.jobStore, + self.terminate, + self.error, + inJobStoreID, + outJobStoreID, + self.messageInt, + ), + ) self.serviceThread.start() return (inJobStoreID, outJobStoreID) @@ -281,22 +359,27 @@ def check(self): return True @staticmethod - def serviceWorker(jobStore, terminate, error, inJobStoreID, outJobStoreID, messageInt): + def serviceWorker( + jobStore, terminate, error, inJobStoreID, outJobStoreID, messageInt + ): try: while True: - if terminate.isSet(): # Quit if we've got the terminate signal + if terminate.isSet(): # Quit if we've got the terminate signal logger.debug("Service worker being told to quit") return - time.sleep(0.2) # Sleep to avoid thrashing + time.sleep(0.2) # Sleep to avoid thrashing # Try reading a line from the input file try: with jobStore.read_file_stream(inJobStoreID) as f: - f = codecs.getreader('utf-8')(f) + f = codecs.getreader("utf-8")(f) line = f.readline() except: - logger.debug("Something went wrong reading a line: %s", traceback.format_exc()) + logger.debug( + "Something went wrong reading a line: %s", + traceback.format_exc(), + ) raise if len(line.strip()) == 0: @@ -307,7 +390,11 @@ def serviceWorker(jobStore, terminate, error, inJobStoreID, outJobStoreID, messa try: inputInt = int(line) except ValueError: - logger.debug("Tried casting input line '%s' to integer but got error: %s", line, traceback.format_exc()) + logger.debug( + "Tried casting input line '%s' to integer but got error: %s", + line, + traceback.format_exc(), + ) continue # Write out the resulting read integer and the message @@ -318,6 +405,7 @@ def serviceWorker(jobStore, terminate, error, inJobStoreID, outJobStoreID, messa error.set() raise + def serviceAccessor(job, communicationFiles, outFile, randInt): """ Writes a random integer iinto the inJobStoreFileID file, then tries 10 times reading @@ -331,15 +419,15 @@ def serviceAccessor(job, communicationFiles, outFile, randInt): # Write the integer into the file logger.debug("Writing key to inJobStoreFileID") with job.fileStore.jobStore.update_file_stream(inJobStoreFileID) as fH: - fH.write(("%s\n" % key).encode('utf-8')) + fH.write(("%s\n" % key).encode("utf-8")) logger.debug("Trying to read key and message from outJobStoreFileID") - for i in range(10): # Try 10 times over - time.sleep(0.2) #Avoid thrashing + for i in range(10): # Try 10 times over + time.sleep(0.2) # Avoid thrashing # Try reading an integer from the input file and writing out the message with job.fileStore.jobStore.read_file_stream(outJobStoreFileID) as fH: - fH = codecs.getreader('utf-8')(fH) + fH = codecs.getreader("utf-8")(fH) line = fH.readline() tokens = line.split() @@ -349,12 +437,15 @@ def serviceAccessor(job, communicationFiles, outFile, randInt): key2, message = tokens if int(key2) == key: - logger.debug(f"Matched key's: {key}, writing message: {int(message) - randInt} with randInt: {randInt}") - with open(outFile, 'a') as fH: + logger.debug( + f"Matched key's: {key}, writing message: {int(message) - randInt} with randInt: {randInt}" + ) + with open(outFile, "a") as fH: fH.write("%s\n" % (int(message) - randInt)) return - assert 0 # Job failed to get info from the service + assert 0 # Job failed to get info from the service + class ToySerializableService(Job.Service): def __init__(self, messageInt, *args, **kwargs): @@ -373,9 +464,10 @@ def stop(self, job): def check(self): return True + def fnTest(strings, outputFile): """ Function concatenates the strings together and writes them to the output file """ - with open(outputFile, 'w') as fH: + with open(outputFile, "w") as fH: fH.write(" ".join(strings)) diff --git a/src/toil/test/src/jobTest.py b/src/toil/test/src/jobTest.py index d35f512b0f..2e0e36ffb5 100644 --- a/src/toil/test/src/jobTest.py +++ b/src/toil/test/src/jobTest.py @@ -29,6 +29,7 @@ class JobTest(ToilTest): """Tests the job class.""" + @classmethod def setUpClass(cls): super().setUpClass() @@ -121,9 +122,9 @@ def testStatic2(self): @slow def testTrivialDAGConsistency(self): - options = Job.Runner.getDefaultOptions(self._createTempDir() + '/jobStore') - options.clean = 'always' - options.logLevel = 'debug' + options = Job.Runner.getDefaultOptions(self._createTempDir() + "/jobStore") + options.clean = "always" + options.logLevel = "debug" i = Job.wrapJobFn(trivialParent) with Toil(options) as toil: try: @@ -136,9 +137,9 @@ def testTrivialDAGConsistency(self): @pytest.mark.timeout(300) def testDAGConsistency(self): - options = Job.Runner.getDefaultOptions(self._createTempDir() + '/jobStore') - options.clean = 'always' - options.logLevel = 'debug' + options = Job.Runner.getDefaultOptions(self._createTempDir() + "/jobStore") + options.clean = "always" + options.logLevel = "debug" i = Job.wrapJobFn(parent) with Toil(options) as toil: try: @@ -155,9 +156,9 @@ def testSiblingDAGConsistency(self): Slightly more complex case. The stranded job's predecessors are siblings instead of parent/child. """ - options = Job.Runner.getDefaultOptions(self._createTempDir() + '/jobStore') - options.clean = 'always' - options.logLevel = 'debug' + options = Job.Runner.getDefaultOptions(self._createTempDir() + "/jobStore") + options.clean = "always" + options.logLevel = "debug" i = Job.wrapJobFn(diamond) with Toil(options) as toil: try: @@ -196,8 +197,12 @@ def testDeadlockDetection(self): # Test making multiple roots childEdges2 = childEdges.copy() - childEdges2.add((nodeNumber, 1)) # This creates an extra root at "nodeNumber" - rootJob2 = self.makeJobGraph(nodeNumber + 1, childEdges2, followOnEdges, None, False) + childEdges2.add( + (nodeNumber, 1) + ) # This creates an extra root at "nodeNumber" + rootJob2 = self.makeJobGraph( + nodeNumber + 1, childEdges2, followOnEdges, None, False + ) try: rootJob2.checkJobGraphConnected() self.assertTrue(False) # Multiple roots were not detected @@ -209,8 +214,9 @@ def checkChildEdgeCycleDetection(fNode, tNode): adjacencyList[fNode].add(tNode) self.assertTrue(not self.isAcyclic(adjacencyList)) try: - self.makeJobGraph(nodeNumber, childEdges, - followOnEdges, None).checkJobGraphAcylic() + self.makeJobGraph( + nodeNumber, childEdges, followOnEdges, None + ).checkJobGraphAcylic() self.assertTrue(False) # A cycle was not detected except JobGraphDeadlockException: pass # This is the expected behaviour @@ -218,22 +224,25 @@ def checkChildEdgeCycleDetection(fNode, tNode): childEdges.remove((fNode, tNode)) adjacencyList[fNode].remove(tNode) # Check is now acyclic again - self.makeJobGraph(nodeNumber, childEdges, - followOnEdges, None, False).checkJobGraphAcylic() + self.makeJobGraph( + nodeNumber, childEdges, followOnEdges, None, False + ).checkJobGraphAcylic() def checkFollowOnEdgeCycleDetection(fNode, tNode): followOnEdges.add((fNode, tNode)) # Create a cycle try: - self.makeJobGraph(nodeNumber, childEdges, - followOnEdges, None, False).checkJobGraphAcylic() + self.makeJobGraph( + nodeNumber, childEdges, followOnEdges, None, False + ).checkJobGraphAcylic() # self.assertTrue(False) #The cycle was not detected except JobGraphDeadlockException: pass # This is the expected behaviour # Remove the edges followOnEdges.remove((fNode, tNode)) # Check is now acyclic again - self.makeJobGraph(nodeNumber, childEdges, - followOnEdges, None, False).checkJobGraphAcylic() + self.makeJobGraph( + nodeNumber, childEdges, followOnEdges, None, False + ).checkJobGraphAcylic() # Now try adding edges that create a cycle @@ -257,9 +266,16 @@ def checkFollowOnEdgeCycleDetection(fNode, tNode): # Try adding a follow on edge between two nodes with shared descendants fNode, tNode = self.getRandomEdge(nodeNumber) - if (len(self.reachable(tNode, adjacencyList) - .intersection(self.reachable(fNode, adjacencyList))) > 0 - and (fNode, tNode) not in childEdges and (fNode, tNode) not in followOnEdges): + if ( + len( + self.reachable(tNode, adjacencyList).intersection( + self.reachable(fNode, adjacencyList) + ) + ) + > 0 + and (fNode, tNode) not in childEdges + and (fNode, tNode) not in followOnEdges + ): checkFollowOnEdgeCycleDetection(fNode, tNode) @slow @@ -279,7 +295,9 @@ def testNewCheckpointIsLeafVertexNonRootCase(self): def createWorkflow(): rootJob = Job.wrapJobFn(simpleJobFn, "Parent") - childCheckpointJob = rootJob.addChildJobFn(simpleJobFn, "Child", checkpoint=True) + childCheckpointJob = rootJob.addChildJobFn( + simpleJobFn, "Child", checkpoint=True + ) return rootJob, childCheckpointJob self.runNewCheckpointIsLeafVertexTest(createWorkflow) @@ -315,35 +333,45 @@ def runNewCheckpointIsLeafVertexTest(self, createWorkflowFn): """ - logger.debug('Test checkpoint job that is a leaf vertex') - self.runCheckpointVertexTest(*createWorkflowFn(), - expectedException=None) - - logger.debug('Test checkpoint job that is not a leaf vertex due to the presence of a service') - self.runCheckpointVertexTest(*createWorkflowFn(), - checkpointJobService=TrivialService("LeafTestService"), - expectedException=JobGraphDeadlockException) - - logger.debug('Test checkpoint job that is not a leaf vertex due to the presence of a child job') - self.runCheckpointVertexTest(*createWorkflowFn(), - checkpointJobChild=Job.wrapJobFn( - simpleJobFn, "LeafTestChild"), - expectedException=JobGraphDeadlockException) - - logger.debug('Test checkpoint job that is not a leaf vertex due to the presence of a follow-on job') - self.runCheckpointVertexTest(*createWorkflowFn(), - checkpointJobFollowOn=Job.wrapJobFn( - simpleJobFn, - "LeafTestFollowOn"), - expectedException=JobGraphDeadlockException) - - def runCheckpointVertexTest(self, - workflowRootJob, - checkpointJob, - checkpointJobService=None, - checkpointJobChild=None, - checkpointJobFollowOn=None, - expectedException=None): + logger.debug("Test checkpoint job that is a leaf vertex") + self.runCheckpointVertexTest(*createWorkflowFn(), expectedException=None) + + logger.debug( + "Test checkpoint job that is not a leaf vertex due to the presence of a service" + ) + self.runCheckpointVertexTest( + *createWorkflowFn(), + checkpointJobService=TrivialService("LeafTestService"), + expectedException=JobGraphDeadlockException + ) + + logger.debug( + "Test checkpoint job that is not a leaf vertex due to the presence of a child job" + ) + self.runCheckpointVertexTest( + *createWorkflowFn(), + checkpointJobChild=Job.wrapJobFn(simpleJobFn, "LeafTestChild"), + expectedException=JobGraphDeadlockException + ) + + logger.debug( + "Test checkpoint job that is not a leaf vertex due to the presence of a follow-on job" + ) + self.runCheckpointVertexTest( + *createWorkflowFn(), + checkpointJobFollowOn=Job.wrapJobFn(simpleJobFn, "LeafTestFollowOn"), + expectedException=JobGraphDeadlockException + ) + + def runCheckpointVertexTest( + self, + workflowRootJob, + checkpointJob, + checkpointJobService=None, + checkpointJobChild=None, + checkpointJobFollowOn=None, + expectedException=None, + ): """ Modifies the checkpoint job according to the given parameters then runs the workflow, checking for the expected exception, if any. @@ -380,7 +408,7 @@ def testEvaluatingRandomDAG(self): jobStore = self._getTestJobStorePath() for test in range(5): # Temporary file - tempDir = self._createTempDir(purpose='tempDir') + tempDir = self._createTempDir(purpose="tempDir") # Make a random DAG for the set of child edges nodeNumber = random.choice(range(2, 8)) childEdges = self.makeRandomDAG(nodeNumber) @@ -421,8 +449,8 @@ def testEvaluatingRandomDAG(self): numberOfFailedJobs = 0 except FailedJobsException as e: numberOfFailedJobs = e.numberOfFailedJobs - if totalTrys > 32: #p(fail after this many restarts) ~= 0.5**32 - self.fail() #Exceeded a reasonable number of restarts + if totalTrys > 32: # p(fail after this many restarts) ~= 0.5**32 + self.fail() # Exceeded a reasonable number of restarts totalTrys += 1 # For each job check it created a valid output file and add the ordering @@ -447,7 +475,7 @@ def testEvaluatingRandomDAG(self): def getRandomEdge(nodeNumber): assert nodeNumber > 1 fNode = random.choice(range(nodeNumber - 1)) - return fNode, random.choice(range(fNode+1, nodeNumber)) + return fNode, random.choice(range(fNode + 1, nodeNumber)) @staticmethod def makeRandomDAG(nodeNumber): @@ -457,7 +485,9 @@ def makeRandomDAG(nodeNumber): referring to nodes and the edge is from a to b. """ # Pick number of total edges to create - edgeNumber = random.choice(range(nodeNumber - 1, 1 + (nodeNumber * (nodeNumber - 1) // 2))) + edgeNumber = random.choice( + range(nodeNumber - 1, 1 + (nodeNumber * (nodeNumber - 1) // 2)) + ) # Make a spanning tree of edges so that nodes are connected edges = {(random.choice(range(i)), i) for i in range(1, nodeNumber)} # Add extra random edges until there are edgeNumber edges @@ -499,8 +529,10 @@ def addRandomFollowOnEdges(self, childAdjacencyList): """ def makeAugmentedAdjacencyList(): - augmentedAdjacencyList = [childAdjacencyList[i].union(followOnAdjacencyList[i]) - for i in range(len(childAdjacencyList))] + augmentedAdjacencyList = [ + childAdjacencyList[i].union(followOnAdjacencyList[i]) + for i in range(len(childAdjacencyList)) + ] def addImpliedEdges(node, followOnEdges): # Let node2 be a child of node or a successor of a child of node. @@ -550,7 +582,9 @@ def f(node2): return followOnEdges - def makeJobGraph(self, nodeNumber, childEdges, followOnEdges, outPath, addServices=True): + def makeJobGraph( + self, nodeNumber, childEdges, followOnEdges, outPath, addServices=True + ): """ Converts a DAG into a job graph. childEdges and followOnEdges are the lists of child and followOn edges. @@ -560,9 +594,15 @@ def makeJobGraph(self, nodeNumber, childEdges, followOnEdges, outPath, addServic def makeJob(string): promises = [] - job = Job.wrapFn(fn2Test, promises, string, - None if outPath is None else os.path.join(outPath, string), - cores=0.1, memory="0.5G", disk="0.1G") + job = Job.wrapFn( + fn2Test, + promises, + string, + None if outPath is None else os.path.join(outPath, string), + cores=0.1, + memory="0.5G", + disk="0.1G", + ) jobsToPromisesMap[job] = promises return job @@ -581,7 +621,16 @@ def makeJob(string): predecessors[jobs[tNode]].append(jobs[fNode]) # Map of jobs to return values - jobsToRvs = {job: job.addService(TrivialService(job.rv(), cores=0.1, memory="0.5G", disk="0.1G")) if addServices else job.rv() for job in jobs} + jobsToRvs = { + job: ( + job.addService( + TrivialService(job.rv(), cores=0.1, memory="0.5G", disk="0.1G") + ) + if addServices + else job.rv() + ) + for job in jobs + } def getRandomPredecessor(job): predecessor = random.choice(list(predecessors[job])) @@ -622,9 +671,11 @@ def cyclic(fNode, visited, stack): return False return True + def simpleJobFn(job, value): job.fileStore.log_to_leader(value) + def fn1Test(string, outputFile): """ Function appends the next character after the last character in the given @@ -633,7 +684,7 @@ def fn1Test(string, outputFile): """ rV = string + chr(ord(string[-1]) + 1) - with open(outputFile, 'w') as fH: + with open(outputFile, "w") as fH: fH.write(rV) return rV @@ -643,7 +694,7 @@ def fn2Test(pStrings, s, outputFile): Function concatenates the strings in pStrings and s, in that order, and writes the result to the output file. Returns s. """ - with open(outputFile, 'w') as fH: + with open(outputFile, "w") as fH: fH.write(" ".join(pStrings) + " " + s) return s @@ -687,13 +738,12 @@ def child(job): def errorChild(job): - raise RuntimeError('Child failure') + raise RuntimeError("Child failure") class TrivialService(Job.Service): def __init__(self, message, *args, **kwargs): - """ Service that does nothing, used to check for deadlocks - """ + """Service that does nothing, used to check for deadlocks""" Job.Service.__init__(self, *args, **kwargs) self.message = message @@ -707,5 +757,5 @@ def check(self): pass -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/src/toil/test/src/miscTests.py b/src/toil/test/src/miscTests.py index cd4b0e8e4c..308658c09f 100644 --- a/src/toil/test/src/miscTests.py +++ b/src/toil/test/src/miscTests.py @@ -20,10 +20,7 @@ from toil.common import getNodeID from toil.lib.exceptions import panic, raise_ -from toil.lib.io import (AtomicFileCreate, - atomic_install, - atomic_tmp_file, - mkdtemp) +from toil.lib.io import AtomicFileCreate, atomic_install, atomic_tmp_file, mkdtemp from toil.lib.misc import CalledProcessErrorStderr, call_command from toil.test import ToilTest, slow @@ -36,6 +33,7 @@ class MiscTests(ToilTest): This class contains miscellaneous tests that don't have enough content to be their own test file, and that don't logically fit in with any of the other test suites. """ + def setUp(self): super().setUp() self.testDir = self._createTempDir() @@ -49,14 +47,14 @@ def testIDStability(self): @slow def testGetSizeOfDirectoryWorks(self): - '''A test to make sure toil.common.getDirSizeRecursively does not + """A test to make sure toil.common.getDirSizeRecursively does not underestimate the amount of disk space needed. Disk space allocation varies from system to system. The computed value should always be equal to or slightly greater than the creation value. This test generates a number of random directories and randomly sized files to test this using getDirSizeRecursively. - ''' + """ from toil.common import getDirSizeRecursively # a list of the directories used in the test @@ -64,17 +62,17 @@ def testGetSizeOfDirectoryWorks(self): # A dict of {FILENAME: FILESIZE} for all files used in the test files = {} # Create a random directory structure - for i in range(0,10): - directories.append(mkdtemp(dir=random.choice(directories), prefix='test')) + for i in range(0, 10): + directories.append(mkdtemp(dir=random.choice(directories), prefix="test")) # Create 50 random file entries in different locations in the directories. 75% of the time # these are fresh files of size [1, 10] MB and 25% of the time they are hard links to old # files. while len(files) <= 50: fileName = os.path.join(random.choice(directories), self._getRandomName()) - if random.randint(0,100) < 75: + if random.randint(0, 100) < 75: # Create a fresh file in the range of 1-10 MB fileSize = int(round(random.random(), 2) * 10 * 1024 * 1024) - with open(fileName, 'wb') as fileHandle: + with open(fileName, "wb") as fileHandle: fileHandle.write(os.urandom(fileSize)) files[fileName] = fileSize else: @@ -83,7 +81,7 @@ def testGetSizeOfDirectoryWorks(self): continue linkSrc = random.choice(list(files.keys())) os.link(linkSrc, fileName) - files[fileName] = 'Link to %s' % linkSrc + files[fileName] = "Link to %s" % linkSrc computedDirectorySize = getDirSizeRecursively(self.testDir) totalExpectedSize = sum(x for x in list(files.values()) if isinstance(x, int)) @@ -101,7 +99,7 @@ def _get_test_out_file(self, tail): def _write_test_file(self, outf_tmp): with open(outf_tmp, "w") as fh: - fh.write(self.id() + '\n') + fh.write(self.id() + "\n") def test_atomic_install(self): outf = self._get_test_out_file(".foo.gz") @@ -111,7 +109,7 @@ def test_atomic_install(self): self.assertTrue(os.path.exists(outf)) def test_atomic_install_dev(self): - devn = '/dev/null' + devn = "/dev/null" tmp = atomic_tmp_file(devn) self.assertEqual(tmp, devn) atomic_install(tmp, devn) @@ -138,10 +136,13 @@ def test_call_command_ok(self): self.assertTrue(isinstance(o, str), str(type(o))) def test_call_command_err(self): - with self.assertRaisesRegex(CalledProcessErrorStderr, - "^Command '\\['cat', '/dev/Frankenheimer']' exit status 1: cat: /dev/Frankenheimer: No such file or directory\n$"): + with self.assertRaisesRegex( + CalledProcessErrorStderr, + "^Command '\\['cat', '/dev/Frankenheimer']' exit status 1: cat: /dev/Frankenheimer: No such file or directory\n$", + ): call_command(["cat", "/dev/Frankenheimer"]) + class TestPanic(ToilTest): def test_panic_by_hand(self): try: @@ -192,7 +193,7 @@ def try_and_panic_with_secondary(self): self.line_of_primary_exc = inspect.currentframe().f_lineno + 1 raise ValueError("primary") except: - with panic( log ): + with panic(log): raise RuntimeError("secondary") def try_and_nested_panic_with_secondary(self): @@ -200,8 +201,8 @@ def try_and_nested_panic_with_secondary(self): self.line_of_primary_exc = inspect.currentframe().f_lineno + 1 raise ValueError("primary") except: - with panic( log ): - with panic( log ): + with panic(log): + with panic(log): raise RuntimeError("secondary") def __assert_raised_exception_is_primary(self): diff --git a/src/toil/test/src/promisedRequirementTest.py b/src/toil/test/src/promisedRequirementTest.py index ad2eab6a1d..9806f52e8a 100644 --- a/src/toil/test/src/promisedRequirementTest.py +++ b/src/toil/test/src/promisedRequirementTest.py @@ -32,46 +32,77 @@ class hidden: http://stackoverflow.com/questions/1323455/python-unit-test-with-base-and-sub-class#answer-25695512 """ - class AbstractPromisedRequirementsTest(batchSystemTest.hidden.AbstractBatchSystemJobTest): + class AbstractPromisedRequirementsTest( + batchSystemTest.hidden.AbstractBatchSystemJobTest + ): """An abstract base class for testing Toil workflows with promised requirements.""" + @slow def testConcurrencyDynamic(self): """ Asserts that promised core resources are allocated properly using a dynamic Toil workflow """ for coresPerJob in self.allocatedCores: - log.debug('Testing %d cores per job with CPU count %d', coresPerJob, self.cpuCount) - tempDir = self._createTempDir('testFiles') + log.debug( + "Testing %d cores per job with CPU count %d", + coresPerJob, + self.cpuCount, + ) + tempDir = self._createTempDir("testFiles") counterPath = self.getCounterPath(tempDir) - root = Job.wrapJobFn(maxConcurrency, self.cpuCount, counterPath, coresPerJob, - cores=1, memory='1M', disk='1M') + root = Job.wrapJobFn( + maxConcurrency, + self.cpuCount, + counterPath, + coresPerJob, + cores=1, + memory="1M", + disk="1M", + ) values = Job.Runner.startToil(root, self.getOptions(tempDir)) maxValue = max(values) self.assertLessEqual(maxValue, self.cpuCount // coresPerJob) @slow - @retry_flaky_test(prepare=[batchSystemTest.hidden.AbstractBatchSystemJobTest.tearDown, - batchSystemTest.hidden.AbstractBatchSystemJobTest.setUp]) + @retry_flaky_test( + prepare=[ + batchSystemTest.hidden.AbstractBatchSystemJobTest.tearDown, + batchSystemTest.hidden.AbstractBatchSystemJobTest.setUp, + ] + ) def testConcurrencyStatic(self): """ Asserts that promised core resources are allocated properly using a static DAG """ for coresPerJob in self.allocatedCores: - log.debug('Testing %d cores per job with CPU count %d', coresPerJob, self.cpuCount) - tempDir = self._createTempDir('testFiles') + log.debug( + "Testing %d cores per job with CPU count %d", + coresPerJob, + self.cpuCount, + ) + tempDir = self._createTempDir("testFiles") counterPath = self.getCounterPath(tempDir) root = Job() - one = Job.wrapFn(getOne, cores=0.1, memory='32M', disk='1M') - thirtyTwoMb = Job.wrapFn(getThirtyTwoMb, cores=0.1, memory='32M', disk='1M') + one = Job.wrapFn(getOne, cores=0.1, memory="32M", disk="1M") + thirtyTwoMb = Job.wrapFn( + getThirtyTwoMb, cores=0.1, memory="32M", disk="1M" + ) root.addChild(one) root.addChild(thirtyTwoMb) for _ in range(self.cpuCount): - root.addFollowOn(Job.wrapFn(batchSystemTest.measureConcurrency, counterPath, - cores=PromisedRequirement(lambda x: x * coresPerJob, one.rv()), - memory=PromisedRequirement(thirtyTwoMb.rv()), - disk='1M')) + root.addFollowOn( + Job.wrapFn( + batchSystemTest.measureConcurrency, + counterPath, + cores=PromisedRequirement( + lambda x: x * coresPerJob, one.rv() + ), + memory=PromisedRequirement(thirtyTwoMb.rv()), + disk="1M", + ) + ) Job.Runner.startToil(root, self.getOptions(tempDir)) _, maxValue = batchSystemTest.getCounters(counterPath) self.assertLessEqual(maxValue, self.cpuCount // coresPerJob) @@ -90,7 +121,7 @@ def getCounterPath(self, tempDir): :param str tempDir: path to test directory :return: path to counter file """ - counterPath = os.path.join(tempDir, 'counter') + counterPath = os.path.join(tempDir, "counter") batchSystemTest.resetCounters(counterPath) minValue, maxValue = batchSystemTest.getCounters(counterPath) assert (minValue, maxValue) == (0, 0) @@ -108,13 +139,19 @@ def testPromisesWithJobStoreFileObjects(self, caching=True): file2 = 512 F1 = Job.wrapJobFn(_writer, file1) F2 = Job.wrapJobFn(_writer, file2) - G = Job.wrapJobFn(_follower, file1 + file2, - disk=PromisedRequirement(lambda x, y: x.size + y.size, - F1.rv(), F2.rv())) + G = Job.wrapJobFn( + _follower, + file1 + file2, + disk=PromisedRequirement( + lambda x, y: x.size + y.size, F1.rv(), F2.rv() + ), + ) F1.addChild(F2) F2.addChild(G) - Job.Runner.startToil(F1, self.getOptions(self._createTempDir('testFiles'), caching=caching)) + Job.Runner.startToil( + F1, self.getOptions(self._createTempDir("testFiles"), caching=caching) + ) def testPromisesWithNonCachingFileStore(self): self.testPromisesWithJobStoreFileObjects(caching=False) @@ -124,14 +161,18 @@ def testPromiseRequirementRaceStatic(self): """ Checks for a race condition when using promised requirements and child job functions. """ - A = Job.wrapJobFn(logDiskUsage, 'A', sleep=5, disk=PromisedRequirement(1024)) - B = Job.wrapJobFn(logDiskUsage, 'B', disk=PromisedRequirement(lambda x: x + 1024, A.rv())) + A = Job.wrapJobFn( + logDiskUsage, "A", sleep=5, disk=PromisedRequirement(1024) + ) + B = Job.wrapJobFn( + logDiskUsage, "B", disk=PromisedRequirement(lambda x: x + 1024, A.rv()) + ) A.addChild(B) - Job.Runner.startToil(A, self.getOptions(self._createTempDir('testFiles'))) + Job.Runner.startToil(A, self.getOptions(self._createTempDir("testFiles"))) def _writer(job, fileSize): - ''' + """ Write a local file and return the FileID obtained from running writeGlobalFile on it. @@ -139,8 +180,8 @@ def _writer(job, fileSize): :param int fileSize: Size of the file in bytes :returns: the result of writeGlobalFile on a locally created file :rtype: job.FileID - ''' - with open(job.fileStore.getLocalTempFileName(), 'wb') as fH: + """ + with open(job.fileStore.getLocalTempFileName(), "wb") as fH: fH.write(os.urandom(fileSize)) return job.fileStore.writeGlobalFile(fH.name) @@ -166,15 +207,18 @@ def maxConcurrency(job, cpuCount, filename, coresPerJob): :param int coresPerJob: number of cores assigned to each job :return int max concurrency value: """ - one = job.addChildFn(getOne, cores=0.1, memory='32M', disk='1M') - thirtyTwoMb = job.addChildFn(getThirtyTwoMb, cores=0.1, memory='32M', disk='1M') + one = job.addChildFn(getOne, cores=0.1, memory="32M", disk="1M") + thirtyTwoMb = job.addChildFn(getThirtyTwoMb, cores=0.1, memory="32M", disk="1M") values = [] for _ in range(cpuCount): - value = job.addFollowOnFn(batchSystemTest.measureConcurrency, filename, - cores=PromisedRequirement(lambda x: x * coresPerJob, one.rv()), - memory=PromisedRequirement(thirtyTwoMb.rv()), - disk='1M').rv() + value = job.addFollowOnFn( + batchSystemTest.measureConcurrency, + filename, + cores=PromisedRequirement(lambda x: x * coresPerJob, one.rv()), + memory=PromisedRequirement(thirtyTwoMb.rv()), + disk="1M", + ).rv() values.append(value) return values @@ -184,7 +228,7 @@ def getOne(): def getThirtyTwoMb(): - return '32M' + return "32M" def logDiskUsage(job, funcName, sleep=0): @@ -194,7 +238,7 @@ def logDiskUsage(job, funcName, sleep=0): :return: job function's disk usage """ diskUsage = job.disk - job.fileStore.log_to_leader(f'{funcName}: {diskUsage}') + job.fileStore.log_to_leader(f"{funcName}: {diskUsage}") time.sleep(sleep) return diskUsage @@ -212,14 +256,16 @@ def tearDown(self): @needs_mesos -class MesosPromisedRequirementsTest(hidden.AbstractPromisedRequirementsTest, MesosTestSupport): +class MesosPromisedRequirementsTest( + hidden.AbstractPromisedRequirementsTest, MesosTestSupport +): """ Tests against the Mesos batch system """ def getOptions(self, tempDir, caching=True): options = super().getOptions(tempDir, caching=caching) - options.mesos_endpoint = 'localhost:5050' + options.mesos_endpoint = "localhost:5050" return options def getBatchSystemName(self): diff --git a/src/toil/test/src/promisesTest.py b/src/toil/test/src/promisesTest.py index 6baeb02aa1..54af16654c 100644 --- a/src/toil/test/src/promisesTest.py +++ b/src/toil/test/src/promisesTest.py @@ -23,7 +23,7 @@ def test(self): """ for _ in range(2): options = Job.Runner.getDefaultOptions(self._getTestJobStorePath()) - options.logLevel = 'INFO' + options.logLevel = "INFO" root = Job.wrapJobFn(parent) Job.Runner.startToil(root, options) @@ -40,7 +40,7 @@ class ChainedIndexedPromisesTest(ToilTest): # https://github.com/BD2KGenomics/toil/issues/1021 def test(self): options = Job.Runner.getDefaultOptions(self._getTestJobStorePath()) - options.logLevel = 'INFO' + options.logLevel = "INFO" root = Job.wrapJobFn(a) self.assertEqual(Job.Runner.startToil(root, options), 42) @@ -62,17 +62,18 @@ class PathIndexingPromiseTest(ToilTest): Test support for indexing promises of arbitrarily nested data structures of lists, dicts and tuples, or any other object supporting the __getitem__() protocol. """ + def test(self): options = Job.Runner.getDefaultOptions(self._getTestJobStorePath()) - options.logLevel = 'INFO' + options.logLevel = "INFO" root = Job.wrapJobFn(d) - self.assertEqual(Job.Runner.startToil(root, options), ('b', 43, 3)) + self.assertEqual(Job.Runner.startToil(root, options), ("b", 43, 3)) def d(job): child = job.addChild(job.wrapFn(e)) - return child.rv('a'), child.rv(42), child.rv('c', 2) + return child.rv("a"), child.rv(42), child.rv("c", 2) def e(): - return {'a': 'b', 42: 43, 'c': [1, 2, 3]} + return {"a": "b", 42: 43, "c": [1, 2, 3]} diff --git a/src/toil/test/src/realtimeLoggerTest.py b/src/toil/test/src/realtimeLoggerTest.py index dcdade7c41..07414b1f89 100644 --- a/src/toil/test/src/realtimeLoggerTest.py +++ b/src/toil/test/src/realtimeLoggerTest.py @@ -23,7 +23,7 @@ class RealtimeLoggerTest(ToilTest): def testRealtimeLogger(self): options = Job.Runner.getDefaultOptions(self._getTestJobStorePath()) options.realTimeLogging = True - options.logLevel = 'INFO' + options.logLevel = "INFO" detector = MessageDetector() @@ -53,16 +53,16 @@ def __init__(self): super().__init__() def emit(self, record): - if record.msg == 'This should be logged at info level': + if record.msg == "This should be logged at info level": self.detected = True - if record.msg == 'This should be logged at debug level': + if record.msg == "This should be logged at debug level": self.overLogged = True class LogTest(Job): def __init__(self): - Job.__init__(self, memory=100000, cores=1, disk='3G') + Job.__init__(self, memory=100000, cores=1, disk="3G") def run(self, fileStore): - RealtimeLogger.info('This should be logged at info level') - RealtimeLogger.debug('This should be logged at debug level') + RealtimeLogger.info("This should be logged at info level") + RealtimeLogger.debug("This should be logged at debug level") diff --git a/src/toil/test/src/regularLogTest.py b/src/toil/test/src/regularLogTest.py index 25872c0227..3832c94f6b 100644 --- a/src/toil/test/src/regularLogTest.py +++ b/src/toil/test/src/regularLogTest.py @@ -28,10 +28,14 @@ class RegularLogTest(ToilTest): def setUp(self) -> None: super().setUp() - self.tempDir = self._createTempDir(purpose='tempDir') + self.tempDir = self._createTempDir(purpose="tempDir") def _getFiles(self, dir): - return [os.path.join(dir, f) for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f))] + return [ + os.path.join(dir, f) + for f in os.listdir(dir) + if os.path.isfile(os.path.join(dir, f)) + ] def _assertFileTypeExists(self, dir, extension, encoding=None): # an encoding of None implies no compression @@ -45,56 +49,86 @@ def _assertFileTypeExists(self, dir, extension, encoding=None): if encoding is not None: for log in onlyLogs: with open(log, "rb") as f: - logger.info("Checking for encoding %s on file %s", str(encoding), log) + logger.info( + "Checking for encoding %s on file %s", str(encoding), log + ) if encoding == "gzip": # Check for gzip magic header '\x1f\x8b' - assert f.read().startswith(b'\x1f\x8b') + assert f.read().startswith(b"\x1f\x8b") else: mime = mimetypes.guess_type(log) self.assertEqual(mime[1], encoding) @slow def testLogToMaster(self): - toilOutput = subprocess.check_output([sys.executable, - '-m', helloWorld.__name__, - './toilTest', - '--clean=always', - '--logLevel=info'], stderr=subprocess.STDOUT) - assert helloWorld.childMessage in toilOutput.decode('utf-8') + toilOutput = subprocess.check_output( + [ + sys.executable, + "-m", + helloWorld.__name__, + "./toilTest", + "--clean=always", + "--logLevel=info", + ], + stderr=subprocess.STDOUT, + ) + assert helloWorld.childMessage in toilOutput.decode("utf-8") def testWriteLogs(self): - subprocess.check_call([sys.executable, - '-m', helloWorld.__name__, - './toilTest', - '--clean=always', - '--logLevel=debug', - '--writeLogs=%s' % self.tempDir]) - self._assertFileTypeExists(self.tempDir, '.log') + subprocess.check_call( + [ + sys.executable, + "-m", + helloWorld.__name__, + "./toilTest", + "--clean=always", + "--logLevel=debug", + "--writeLogs=%s" % self.tempDir, + ] + ) + self._assertFileTypeExists(self.tempDir, ".log") @slow def testWriteGzipLogs(self): - subprocess.check_call([sys.executable, - '-m', helloWorld.__name__, - './toilTest', - '--clean=always', - '--logLevel=debug', - '--writeLogsGzip=%s' % self.tempDir]) - self._assertFileTypeExists(self.tempDir, '.log.gz', 'gzip') + subprocess.check_call( + [ + sys.executable, + "-m", + helloWorld.__name__, + "./toilTest", + "--clean=always", + "--logLevel=debug", + "--writeLogsGzip=%s" % self.tempDir, + ] + ) + self._assertFileTypeExists(self.tempDir, ".log.gz", "gzip") @slow def testMultipleLogToMaster(self): - toilOutput = subprocess.check_output([sys.executable, - '-m', helloWorld.__name__, - './toilTest', - '--clean=always', - '--logLevel=info'], stderr=subprocess.STDOUT) - assert helloWorld.parentMessage in toilOutput.decode('utf-8') + toilOutput = subprocess.check_output( + [ + sys.executable, + "-m", + helloWorld.__name__, + "./toilTest", + "--clean=always", + "--logLevel=info", + ], + stderr=subprocess.STDOUT, + ) + assert helloWorld.parentMessage in toilOutput.decode("utf-8") def testRegularLog(self): - toilOutput = subprocess.check_output([sys.executable, - '-m', helloWorld.__name__, - './toilTest', - '--clean=always', - '--batchSystem=single_machine', - '--logLevel=debug'], stderr=subprocess.STDOUT) - assert "single machine batch system" in toilOutput.decode('utf-8') + toilOutput = subprocess.check_output( + [ + sys.executable, + "-m", + helloWorld.__name__, + "./toilTest", + "--clean=always", + "--batchSystem=single_machine", + "--logLevel=debug", + ], + stderr=subprocess.STDOUT, + ) + assert "single machine batch system" in toilOutput.decode("utf-8") diff --git a/src/toil/test/src/resourceTest.py b/src/toil/test/src/resourceTest.py index e8293a7044..d2cc555d15 100644 --- a/src/toil/test/src/resourceTest.py +++ b/src/toil/test/src/resourceTest.py @@ -30,7 +30,7 @@ @contextmanager -def tempFileContaining(content, suffix=''): +def tempFileContaining(content, suffix=""): """ Write a file with the given contents, and keep it on disk as long as the context is active. :param str content: The contents of the file. @@ -38,7 +38,7 @@ def tempFileContaining(content, suffix=''): """ fd, path = tempfile.mkstemp(suffix=suffix) try: - encoded = content.encode('utf-8') + encoded = content.encode("utf-8") assert os.write(fd, encoded) == len(encoded) except: os.close(fd) @@ -52,41 +52,55 @@ def tempFileContaining(content, suffix=''): class ResourceTest(ToilTest): """Test module descriptors and resources derived from them.""" + def testStandAlone(self): - self._testExternal(moduleName='userScript', pyFiles=('userScript.py', 'helper.py')) + self._testExternal( + moduleName="userScript", pyFiles=("userScript.py", "helper.py") + ) def testPackage(self): - self._testExternal(moduleName='foo.userScript', pyFiles=('foo/__init__.py', - 'foo/userScript.py', - 'foo/bar/__init__.py', - 'foo/bar/helper.py')) + self._testExternal( + moduleName="foo.userScript", + pyFiles=( + "foo/__init__.py", + "foo/userScript.py", + "foo/bar/__init__.py", + "foo/bar/helper.py", + ), + ) def testVirtualEnv(self): - self._testExternal(moduleName='foo.userScript', - virtualenv=True, - pyFiles=('foo/__init__.py', - 'foo/userScript.py', - 'foo/bar/__init__.py', - 'foo/bar/helper.py', - 'de/pen/dency.py', - 'de/__init__.py', - 'de/pen/__init__.py')) + self._testExternal( + moduleName="foo.userScript", + virtualenv=True, + pyFiles=( + "foo/__init__.py", + "foo/userScript.py", + "foo/bar/__init__.py", + "foo/bar/helper.py", + "de/pen/dency.py", + "de/__init__.py", + "de/pen/__init__.py", + ), + ) def testStandAloneInPackage(self): - self.assertRaises(ResourceException, - self._testExternal, - moduleName='userScript', - pyFiles=('__init__.py', 'userScript.py', 'helper.py')) + self.assertRaises( + ResourceException, + self._testExternal, + moduleName="userScript", + pyFiles=("__init__.py", "userScript.py", "helper.py"), + ) def _testExternal(self, moduleName, pyFiles, virtualenv=False): dirPath = self._createTempDir() if virtualenv: self.assertTrue(inVirtualEnv()) # --never-download prevents silent upgrades to pip, wheel and setuptools - subprocess.check_call(['virtualenv', '--never-download', '--python', exactPython, dirPath]) - sitePackages = os.path.join(dirPath, 'lib', - exactPython, - 'site-packages') + subprocess.check_call( + ["virtualenv", "--never-download", "--python", exactPython, dirPath] + ) + sitePackages = os.path.join(dirPath, "lib", exactPython, "site-packages") # tuple assignment is necessary to make this line immediately precede the try: oldPrefix, sys.prefix, dirPath = sys.prefix, dirPath, sitePackages else: @@ -95,21 +109,23 @@ def _testExternal(self, moduleName, pyFiles, virtualenv=False): for relPath in pyFiles: path = os.path.join(dirPath, relPath) os.makedirs(os.path.dirname(path), exist_ok=True) - with open(path, 'w') as f: - f.write('pass\n') + with open(path, "w") as f: + f.write("pass\n") sys.path.append(dirPath) try: userScript = importlib.import_module(moduleName) try: - self._test(userScript.__name__, - expectedContents=pyFiles, - allowExtraContents=True) + self._test( + userScript.__name__, + expectedContents=pyFiles, + allowExtraContents=True, + ) finally: del userScript while moduleName: del sys.modules[moduleName] self.assertFalse(moduleName in sys.modules) - moduleName = '.'.join(moduleName.split('.')[:-1]) + moduleName = ".".join(moduleName.split(".")[:-1]) finally: sys.path.remove(dirPath) @@ -120,17 +136,22 @@ def _testExternal(self, moduleName, pyFiles, virtualenv=False): def testBuiltIn(self): # Create a ModuleDescriptor for the module containing ModuleDescriptor, i.e. toil.resource module_name = ModuleDescriptor.__module__ - self.assertEqual(module_name, 'toil.resource') + self.assertEqual(module_name, "toil.resource") self._test(module_name, shouldBelongToToil=True) - def _test(self, module_name, - shouldBelongToToil=False, expectedContents=None, allowExtraContents=True): + def _test( + self, + module_name, + shouldBelongToToil=False, + expectedContents=None, + allowExtraContents=True, + ): module = ModuleDescriptor.forModule(module_name) # Assert basic attributes and properties self.assertEqual(module.belongsToToil, shouldBelongToToil) self.assertEqual(module.name, module_name) if shouldBelongToToil: - self.assertTrue(module.dirPath.endswith('/src')) + self.assertTrue(module.dirPath.endswith("/src")) # Before the module is saved as a resource, localize() and globalize() are identity # methods. This should log.warnings. @@ -139,27 +160,32 @@ def _test(self, module_name, # Create a mock job store ... jobStore = MagicMock() # ... to generate a fake URL for the resource ... - url = 'file://foo.zip' + url = "file://foo.zip" jobStore.getSharedPublicUrl.return_value = url # ... and save the resource to it. resource = module.saveAsResourceTo(jobStore) # Ensure that the URL generation method is actually called, ... - jobStore.getSharedPublicUrl.assert_called_once_with(sharedFileName=resource.pathHash) + jobStore.getSharedPublicUrl.assert_called_once_with( + sharedFileName=resource.pathHash + ) # ... and that ensure that write_shared_file_stream is called. - jobStore.write_shared_file_stream.assert_called_once_with(shared_file_name=resource.pathHash, - encrypted=False) + jobStore.write_shared_file_stream.assert_called_once_with( + shared_file_name=resource.pathHash, encrypted=False + ) # Now it gets a bit complicated: Ensure that the context manager returned by the # jobStore's write_shared_file_stream() method is entered and that the file handle yielded # by the context manager is written to once with the zipped source tree from which # 'toil.resource' was originally imported. Keep the zipped tree around such that we can # mock the download later. - file_handle = jobStore.write_shared_file_stream.return_value.__enter__.return_value + file_handle = ( + jobStore.write_shared_file_stream.return_value.__enter__.return_value + ) # The first 0 index selects the first call of write(), the second 0 selects positional # instead of keyword arguments, and the third 0 selects the first positional, i.e. the # contents. This is a bit brittle since it assumes that all the data is written in a # single call to write(). If more calls are made we can easily concatenate them. zipFile = file_handle.write.call_args_list[0][0][0] - self.assertTrue(zipFile.startswith(b'PK')) # the magic header for ZIP files + self.assertTrue(zipFile.startswith(b"PK")) # the magic header for ZIP files # Check contents if requested if expectedContents is not None: @@ -186,7 +212,7 @@ def _test(self, module_name, # urlopen() that yields the zipped tree ... mock_urlopen = MagicMock() mock_urlopen.return_value.read.return_value = zipFile - with patch('toil.resource.urlopen', mock_urlopen): + with patch("toil.resource.urlopen", mock_urlopen): # ... and use it to download and unpack the resource localModule = module.localize() # The name should be equal between original and localized resource ... @@ -217,22 +243,27 @@ def script(): def fn(): pass - if __name__ == '__main__': + if __name__ == "__main__": parser = ArgumentParser() Job.Runner.addToilOptions(parser) options = parser.parse_args() - job = Job.wrapFn(fn, memory='10M', cores=0.1, disk='10M') + job = Job.wrapFn(fn, memory="10M", cores=0.1, disk="10M") with Toil(options) as toil: toil.start(job) - scriptBody = dedent('\n'.join(getsource(script).split('\n')[1:])) - shebang = '#! %s\n' % sys.executable + scriptBody = dedent("\n".join(getsource(script).split("\n")[1:])) + shebang = "#! %s\n" % sys.executable with tempFileContaining(shebang + scriptBody) as scriptPath: - self.assertFalse(scriptPath.endswith(('.py', '.pyc'))) + self.assertFalse(scriptPath.endswith((".py", ".pyc"))) os.chmod(scriptPath, 0o755) - jobStorePath = scriptPath + '.jobStore' - process = subprocess.Popen([scriptPath, jobStorePath], stderr=subprocess.PIPE) + jobStorePath = scriptPath + ".jobStore" + process = subprocess.Popen( + [scriptPath, jobStorePath], stderr=subprocess.PIPE + ) stdout, stderr = process.communicate() - self.assertTrue('The name of a user script/module must end in .py or .pyc.' in stderr.decode('utf-8')) + self.assertTrue( + "The name of a user script/module must end in .py or .pyc." + in stderr.decode("utf-8") + ) self.assertNotEqual(0, process.returncode) self.assertFalse(os.path.exists(jobStorePath)) diff --git a/src/toil/test/src/restartDAGTest.py b/src/toil/test/src/restartDAGTest.py index 934ace38c9..09d195e41d 100644 --- a/src/toil/test/src/restartDAGTest.py +++ b/src/toil/test/src/restartDAGTest.py @@ -31,9 +31,10 @@ class RestartDAGTest(ToilTest): Tests that restarted job DAGs don't run children of jobs that failed in the first run till the parent completes successfully in the restart. """ + def setUp(self): super().setUp() - self.tempDir = self._createTempDir(purpose='tempDir') + self.tempDir = self._createTempDir(purpose="tempDir") self.testJobStore = self._getTestJobStorePath() def tearDown(self): @@ -42,11 +43,11 @@ def tearDown(self): @slow def testRestartedWorkflowSchedulesCorrectJobsOnFailedParent(self): - self._testRestartedWorkflowSchedulesCorrectJobs('raise') + self._testRestartedWorkflowSchedulesCorrectJobs("raise") @slow def testRestartedWorkflowSchedulesCorrectJobsOnKilledParent(self): - self._testRestartedWorkflowSchedulesCorrectJobs('kill') + self._testRestartedWorkflowSchedulesCorrectJobs("kill") def _testRestartedWorkflowSchedulesCorrectJobs(self, failType): """ @@ -63,12 +64,12 @@ def _testRestartedWorkflowSchedulesCorrectJobs(self, failType): """ # Specify options options = Job.Runner.getDefaultOptions(self.testJobStore) - options.logLevel = 'DEBUG' + options.logLevel = "DEBUG" options.retryCount = 0 options.clean = "never" - parentFile = os.path.join(self.tempDir, 'parent') - childFile = os.path.join(self.tempDir, 'child') + parentFile = os.path.join(self.tempDir, "parent") + childFile = os.path.join(self.tempDir, "child") # Make the first job root = Job.wrapJobFn(passingFn) @@ -87,11 +88,11 @@ def _testRestartedWorkflowSchedulesCorrectJobs(self, failType): assert not os.path.exists(childFile) # Run the test - for runMode in 'start', 'restart': + for runMode in "start", "restart": self.errorRaised = None try: with Toil(options) as toil: - if runMode == 'start': + if runMode == "start": toil.start(root) else: toil.restart() @@ -103,19 +104,27 @@ def _testRestartedWorkflowSchedulesCorrectJobs(self, failType): # it together in this finally clause. if self.errorRaised is not None: if not os.path.exists(parentFile): - failReasons.append('The failing parent file did not exist on toil "%s".' - % runMode) + failReasons.append( + 'The failing parent file did not exist on toil "%s".' + % runMode + ) if os.path.exists(childFile): - failReasons.append('The child file existed. i.e. the child was run on ' - 'toil "%s".' % runMode) + failReasons.append( + "The child file existed. i.e. the child was run on " + 'toil "%s".' % runMode + ) if isinstance(self.errorRaised, FailedJobsException): if self.errorRaised.numberOfFailedJobs != 3: - failReasons.append('FailedJobsException was raised on toil "%s" but ' - 'the number of failed jobs (%s) was not 3.' - % (runMode, self.errorRaised.numberOfFailedJobs)) + failReasons.append( + 'FailedJobsException was raised on toil "%s" but ' + "the number of failed jobs (%s) was not 3." + % (runMode, self.errorRaised.numberOfFailedJobs) + ) elif isinstance(self.errorRaised, AssertionError): - failReasons.append('Toil raised an AssertionError instead of a ' - 'FailedJobsException on toil "%s".' % runMode) + failReasons.append( + "Toil raised an AssertionError instead of a " + 'FailedJobsException on toil "%s".' % runMode + ) else: failReasons.append("Toil raised error: %s" % self.errorRaised) self.errorRaised = None @@ -123,8 +132,12 @@ def _testRestartedWorkflowSchedulesCorrectJobs(self, failType): else: self.fail('No errors were raised on toil "%s".' % runMode) if failReasons: - self.fail('Test failed for ({}) reasons:\n\t{}'.format(len(failReasons), - '\n\t'.join(failReasons))) + self.fail( + "Test failed for ({}) reasons:\n\t{}".format( + len(failReasons), "\n\t".join(failReasons) + ) + ) + def passingFn(job, fileName=None): """ @@ -135,7 +148,8 @@ def passingFn(job, fileName=None): """ if fileName is not None: # Emulates system touch. - open(fileName, 'w').close() + open(fileName, "w").close() + def failingFn(job, failType, fileName): """ @@ -145,11 +159,11 @@ def failingFn(job, failType, fileName): :param str failType: 'raise' or 'kill :param str fileName: The name of a file that must be created. """ - assert failType in ('raise', 'kill') + assert failType in ("raise", "kill") # Use that function to avoid code redundancy passingFn(job, fileName) - if failType == 'raise': + if failType == "raise": assert False else: os.kill(os.getpid(), signal.SIGKILL) diff --git a/src/toil/test/src/resumabilityTest.py b/src/toil/test/src/resumabilityTest.py index 7fdade743f..7d9828d74b 100644 --- a/src/toil/test/src/resumabilityTest.py +++ b/src/toil/test/src/resumabilityTest.py @@ -24,6 +24,7 @@ class ResumabilityTest(ToilTest): """ https://github.com/BD2KGenomics/toil/issues/808 """ + @slow def test(self): """ @@ -70,7 +71,7 @@ def test_chaining(self): # This one is intended to fail. Job.Runner.startToil(root, options) - with open(options.logFile, 'r') as f: + with open(options.logFile) as f: log_content = f.read() # Make sure we actually did do chaining assert "Chaining from" in log_content @@ -81,6 +82,7 @@ def test_chaining(self): options.restart = True Job.Runner.startToil(root, options) + def parent(job): """ Set up a bunch of dummy child jobs, and a bad job that needs to be @@ -90,18 +92,21 @@ def parent(job): job.addChildJobFn(goodChild) job.addFollowOnJobFn(badChild) + def chaining_parent(job): """ Set up a failing job to chain to. """ job.addFollowOnJobFn(badChild) + def goodChild(job): """ Does nothing. """ return + def badChild(job): """ Fails the first time it's run, succeeds the second time. @@ -110,6 +115,8 @@ def badChild(job): with job.fileStore.jobStore.read_shared_file_stream("alreadyRun") as fileHandle: fileHandle.read() except NoSuchFileException as ex: - with job.fileStore.jobStore.write_shared_file_stream("alreadyRun", encrypted=False) as fileHandle: + with job.fileStore.jobStore.write_shared_file_stream( + "alreadyRun", encrypted=False + ) as fileHandle: fileHandle.write(b"failed once\n") raise RuntimeError(f"this is an expected error: {str(ex)}") diff --git a/src/toil/test/src/retainTempDirTest.py b/src/toil/test/src/retainTempDirTest.py index 6bfe6c52b6..2736ef7ae6 100644 --- a/src/toil/test/src/retainTempDirTest.py +++ b/src/toil/test/src/retainTempDirTest.py @@ -23,6 +23,7 @@ class CleanWorkDirTest(ToilTest): """ Tests testing :class:toil.fileStores.abstractFileStore.AbstractFileStore """ + def setUp(self): super().setUp() self.testDir = self._createTempDir() @@ -33,33 +34,61 @@ def tearDown(self): def testNever(self): retainedTempData = self._runAndReturnWorkDir("never", job=tempFileTestJob) - self.assertNotEqual(retainedTempData, [], "The worker's temporary workspace was deleted despite " - "cleanWorkDir being set to 'never'") + self.assertNotEqual( + retainedTempData, + [], + "The worker's temporary workspace was deleted despite " + "cleanWorkDir being set to 'never'", + ) def testAlways(self): retainedTempData = self._runAndReturnWorkDir("always", job=tempFileTestJob) - self.assertEqual(retainedTempData, [], "The worker's temporary workspace was not deleted despite " - "cleanWorkDir being set to 'always'") + self.assertEqual( + retainedTempData, + [], + "The worker's temporary workspace was not deleted despite " + "cleanWorkDir being set to 'always'", + ) def testOnErrorWithError(self): - retainedTempData = self._runAndReturnWorkDir("onError", job=tempFileTestErrorJob, expectError=True) - self.assertEqual(retainedTempData, [], "The worker's temporary workspace was not deleted despite " - "an error occurring and cleanWorkDir being set to 'onError'") + retainedTempData = self._runAndReturnWorkDir( + "onError", job=tempFileTestErrorJob, expectError=True + ) + self.assertEqual( + retainedTempData, + [], + "The worker's temporary workspace was not deleted despite " + "an error occurring and cleanWorkDir being set to 'onError'", + ) def testOnErrorWithNoError(self): retainedTempData = self._runAndReturnWorkDir("onError", job=tempFileTestJob) - self.assertNotEqual(retainedTempData, [], "The worker's temporary workspace was deleted despite " - "no error occurring and cleanWorkDir being set to 'onError'") + self.assertNotEqual( + retainedTempData, + [], + "The worker's temporary workspace was deleted despite " + "no error occurring and cleanWorkDir being set to 'onError'", + ) def testOnSuccessWithError(self): - retainedTempData = self._runAndReturnWorkDir("onSuccess", job=tempFileTestErrorJob, expectError=True) - self.assertNotEqual(retainedTempData, [], "The worker's temporary workspace was deleted despite " - "an error occurring and cleanWorkDir being set to 'onSuccesss'") + retainedTempData = self._runAndReturnWorkDir( + "onSuccess", job=tempFileTestErrorJob, expectError=True + ) + self.assertNotEqual( + retainedTempData, + [], + "The worker's temporary workspace was deleted despite " + "an error occurring and cleanWorkDir being set to 'onSuccesss'", + ) def testOnSuccessWithSuccess(self): retainedTempData = self._runAndReturnWorkDir("onSuccess", job=tempFileTestJob) - self.assertEqual(retainedTempData, [], "The worker's temporary workspace was not deleted despite " - "a successful job execution and cleanWorkDir being set to 'onSuccesss'") + self.assertEqual( + retainedTempData, + [], + "The worker's temporary workspace was not deleted despite " + "a successful job execution and cleanWorkDir being set to 'onSuccesss'", + ) def _runAndReturnWorkDir(self, cleanWorkDir, job, expectError=False): """ @@ -89,10 +118,12 @@ def _launchError(self, A, options): else: self.fail("Toil run succeeded unexpectedly") + def tempFileTestJob(job): with open(job.fileStore.getLocalTempFile(), "w") as f: f.write("test file retention") + def tempFileTestErrorJob(job): with open(job.fileStore.getLocalTempFile(), "w") as f: f.write("test file retention") diff --git a/src/toil/test/src/systemTest.py b/src/toil/test/src/systemTest.py index 3576165700..b83a00c098 100644 --- a/src/toil/test/src/systemTest.py +++ b/src/toil/test/src/systemTest.py @@ -10,17 +10,21 @@ class SystemTest(ToilTest): """Test various assumptions about the operating system's behavior.""" + def testAtomicityOfNonEmptyDirectoryRenames(self): for _ in range(100): - parent = self._createTempDir(purpose='parent') - child = os.path.join(parent, 'child') + parent = self._createTempDir(purpose="parent") + child = os.path.join(parent, "child") # Use processes (as opposed to threads) to prevent GIL from ordering things artificially pool = multiprocessing.Pool(processes=cpu_count()) try: numTasks = cpu_count() * 10 grandChildIds = pool.map_async( - func=partial(_testAtomicityOfNonEmptyDirectoryRenamesTask, parent, child), - iterable=list(range(numTasks))) + func=partial( + _testAtomicityOfNonEmptyDirectoryRenamesTask, parent, child + ), + iterable=list(range(numTasks)), + ) grandChildIds = grandChildIds.get() finally: pool.close() @@ -31,15 +35,15 @@ def testAtomicityOfNonEmptyDirectoryRenames(self): self.assertEqual(len(grandChildIds), 1) # Assert that the winner's grandChild wasn't silently overwritten by a looser expectedGrandChildId = grandChildIds[0] - actualGrandChild = os.path.join(child, 'grandChild') + actualGrandChild = os.path.join(child, "grandChild") actualGrandChildId = os.stat(actualGrandChild).st_ino self.assertEqual(actualGrandChildId, expectedGrandChildId) def _testAtomicityOfNonEmptyDirectoryRenamesTask(parent, child, _): - tmpChildDir = mkdtemp(dir=parent, prefix='child', suffix='.tmp') - grandChild = os.path.join(tmpChildDir, 'grandChild') - open(grandChild, 'w').close() + tmpChildDir = mkdtemp(dir=parent, prefix="child", suffix=".tmp") + grandChild = os.path.join(tmpChildDir, "grandChild") + open(grandChild, "w").close() grandChildId = os.stat(grandChild).st_ino try: os.rename(tmpChildDir, child) diff --git a/src/toil/test/src/threadingTest.py b/src/toil/test/src/threadingTest.py index bb0f0de7ad..9a31977fe7 100644 --- a/src/toil/test/src/threadingTest.py +++ b/src/toil/test/src/threadingTest.py @@ -6,28 +6,29 @@ import traceback from functools import partial -from toil.lib.threading import (LastProcessStandingArena, - cpu_count, - global_mutex) +from toil.lib.threading import LastProcessStandingArena, cpu_count, global_mutex from toil.test import ToilTest log = logging.getLogger(__name__) + class ThreadingTest(ToilTest): """Test Toil threading/synchronization tools.""" + def testGlobalMutexOrdering(self): for it in range(10): - log.info('Iteration %d', it) + log.info("Iteration %d", it) scope = self._createTempDir() - mutex = 'mutex' + mutex = "mutex" # Use processes (as opposed to threads) to prevent GIL from ordering things artificially pool = multiprocessing.Pool(processes=cpu_count()) try: numTasks = 100 results = pool.map_async( func=partial(_testGlobalMutexOrderingTask, scope, mutex), - iterable=list(range(numTasks))) + iterable=list(range(numTasks)), + ) results = results.get() finally: pool.close() @@ -40,17 +41,18 @@ def testGlobalMutexOrdering(self): def testLastProcessStanding(self): for it in range(10): - log.info('Iteration %d', it) + log.info("Iteration %d", it) scope = self._createTempDir() - arena_name = 'thunderdome' + arena_name = "thunderdome" # Use processes (as opposed to threads) to prevent GIL from ordering things artificially pool = multiprocessing.Pool(processes=cpu_count()) try: numTasks = 100 results = pool.map_async( func=partial(_testLastProcessStandingTask, scope, arena_name), - iterable=list(range(numTasks))) + iterable=list(range(numTasks)), + ) results = results.get() finally: pool.close() @@ -61,19 +63,24 @@ def testLastProcessStanding(self): # Make sure all workers say they succeeded self.assertEqual(item, True) for filename in os.listdir(scope): - assert not filename.startswith('precious'), f"File {filename} still exists" + assert not filename.startswith( + "precious" + ), f"File {filename} still exists" + def _testGlobalMutexOrderingTask(scope, mutex, number): try: # We will all fight over the potato - potato = os.path.join(scope, 'potato') + potato = os.path.join(scope, "potato") with global_mutex(scope, mutex): - log.info('PID %d = num %d running', os.getpid(), number) - assert not os.path.exists(potato), "We see someone else holding the potato file" + log.info("PID %d = num %d running", os.getpid(), number) + assert not os.path.exists( + potato + ), "We see someone else holding the potato file" # Put our name there - with open(potato, 'w') as out_stream: + with open(potato, "w") as out_stream: out_stream.write(str(number)) # Wait @@ -82,51 +89,63 @@ def _testGlobalMutexOrderingTask(scope, mutex, number): # Make sure our name is still there with open(potato) as in_stream: seen = in_stream.read().rstrip() - assert seen == str(number), f"We are {number} but {seen} stole our potato!" + assert seen == str( + number + ), f"We are {number} but {seen} stole our potato!" os.unlink(potato) assert not os.path.exists(potato), "We left the potato behind" - log.info('PID %d = num %d dropped potato', os.getpid(), number) + log.info("PID %d = num %d dropped potato", os.getpid(), number) return True except: traceback.print_exc() return False + def _testLastProcessStandingTask(scope, arena_name, number): try: arena = LastProcessStandingArena(scope, arena_name) arena.enter() - log.info('PID %d = num %d entered arena', os.getpid(), number) + log.info("PID %d = num %d entered arena", os.getpid(), number) try: # We all make files - my_precious = os.path.join(scope, 'precious' + str(number)) + my_precious = os.path.join(scope, "precious" + str(number)) # Put our name there - with open(my_precious, 'w') as out_stream: + with open(my_precious, "w") as out_stream: out_stream.write(str(number)) # Wait time.sleep(random.random() * 0.01) # Make sure our file is still there unmodified - assert os.path.exists(my_precious), f"Precious file {my_precious} has been stolen!" + assert os.path.exists( + my_precious + ), f"Precious file {my_precious} has been stolen!" with open(my_precious) as in_stream: seen = in_stream.read().rstrip() - assert seen == str(number), f"We are {number} but saw {seen} in our precious file!" + assert seen == str( + number + ), f"We are {number} but saw {seen} in our precious file!" finally: was_last = False for _ in arena.leave(): was_last = True - log.info('PID %d = num %d is last standing', os.getpid(), number) + log.info("PID %d = num %d is last standing", os.getpid(), number) # Clean up all the files for filename in os.listdir(scope): - if filename.startswith('precious'): - log.info('PID %d = num %d cleaning up %s', os.getpid(), number, filename) + if filename.startswith("precious"): + log.info( + "PID %d = num %d cleaning up %s", + os.getpid(), + number, + filename, + ) os.unlink(os.path.join(scope, filename)) - log.info('PID %d = num %d left arena', os.getpid(), number) + log.info("PID %d = num %d left arena", os.getpid(), number) return True except: diff --git a/src/toil/test/src/toilContextManagerTest.py b/src/toil/test/src/toilContextManagerTest.py index 4691303b19..dfe24eb6ac 100644 --- a/src/toil/test/src/toilContextManagerTest.py +++ b/src/toil/test/src/toilContextManagerTest.py @@ -29,13 +29,13 @@ def tearDown(self): def testContextManger(self): options = Job.Runner.getDefaultOptions(self._getTestJobStorePath()) - options.logLevel = 'INFO' + options.logLevel = "INFO" with Toil(options) as toil: toil.start(HelloWorld()) def testNoContextManger(self): options = Job.Runner.getDefaultOptions(self._getTestJobStorePath()) - options.logLevel = 'INFO' + options.logLevel = "INFO" toil = Toil(options) self.assertRaises(ToilContextManagerException, toil.start, HelloWorld()) @@ -45,7 +45,9 @@ def testExportAfterFailedExport(self): with Toil(options) as toil: _ = toil.start(HelloWorld()) # oh no, an error! :( - raise RuntimeError("we died after workflow completion but before our export finished") + raise RuntimeError( + "we died after workflow completion but before our export finished" + ) except RuntimeError: pass @@ -54,17 +56,18 @@ def testExportAfterFailedExport(self): fileID = toil.restart() print(fileID) # Hopefully the error didn't cause us to lose all our work! - toil.exportFile(fileID, 'file://' + self.exportPath) + toil.exportFile(fileID, "file://" + self.exportPath) with open(self.exportPath) as f: # The file should have all our content self.assertEqual(f.read(), "Hello, World!") + class HelloWorld(Job): def __init__(self): - Job.__init__(self, memory=100000, disk='1M') + Job.__init__(self, memory=100000, disk="1M") def run(self, fileStore): - fileID = self.addChildJobFn(childFn, memory='1M', disk='1M').rv() + fileID = self.addChildJobFn(childFn, memory="1M", disk="1M").rv() return self.addFollowOn(FollowOn(fileID)).rv() @@ -81,7 +84,7 @@ def __init__(self, fileId): def run(self, fileStore): tempDir = fileStore.getLocalTempDir() - tempFilePath = "/".join([tempDir, 'LocalCopy']) + tempFilePath = "/".join([tempDir, "LocalCopy"]) with fileStore.readGlobalFileStream(self.fileId) as globalFile: with open(tempFilePath, "wb") as localFile: localFile.write(globalFile.read()) diff --git a/src/toil/test/src/userDefinedJobArgTypeTest.py b/src/toil/test/src/userDefinedJobArgTypeTest.py index 9c1183a72c..a939a2e0d6 100644 --- a/src/toil/test/src/userDefinedJobArgTypeTest.py +++ b/src/toil/test/src/userDefinedJobArgTypeTest.py @@ -52,9 +52,11 @@ def testJobClassFromMain(self): self._testFromMain() def _testFromMain(self): - testMethodName = self.id().split('.')[-1] - self.assertTrue(testMethodName.endswith('FromMain')) - subprocess.check_call([sys.executable, '-m', self.__module__, testMethodName[:-8]]) + testMethodName = self.id().split(".")[-1] + self.assertTrue(testMethodName.endswith("FromMain")) + subprocess.check_call( + [sys.executable, "-m", self.__module__, testMethodName[:-8]] + ) class JobClass(Job): @@ -66,8 +68,9 @@ def __init__(self, level, foo): def run(self, fileStore): self.foo.assertIsCopy() if self.level < 2: - self.addChildJobFn(jobFunction, self.level + 1, Foo(), cores=1, memory="1M", - disk="300M") + self.addChildJobFn( + jobFunction, self.level + 1, Foo(), cores=1, memory="1M", disk="300M" + ) def jobFunction(job, level, foo): diff --git a/src/toil/test/src/workerTest.py b/src/toil/test/src/workerTest.py index 6df4f447b2..543ded531f 100644 --- a/src/toil/test/src/workerTest.py +++ b/src/toil/test/src/workerTest.py @@ -12,46 +12,55 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import Optional + from toil.common import Config from toil.job import CheckpointJobDescription, JobDescription from toil.jobStores.fileJobStore import FileJobStore from toil.test import ToilTest from toil.worker import nextChainable -from typing import Optional - class WorkerTests(ToilTest): """Test miscellaneous units of the worker.""" + def setUp(self): super().setUp() path = self._getTestJobStorePath() self.jobStore = FileJobStore(path) self.config = Config() - self.config.jobStore = 'file:%s' % path + self.config.jobStore = "file:%s" % path self.jobStore.initialize(self.config) self.jobNumber = 0 def testNextChainable(self): """Make sure chainable/non-chainable jobs are identified correctly.""" - def createTestJobDesc(memory, cores, disk, preemptible: bool = True, checkpoint: bool = False, local: Optional[bool] = None): + + def createTestJobDesc( + memory, + cores, + disk, + preemptible: bool = True, + checkpoint: bool = False, + local: Optional[bool] = None, + ): """ Create a JobDescription with no command (representing a Job that has already run) and return the JobDescription. """ - name = 'job%d' % self.jobNumber + name = "job%d" % self.jobNumber self.jobNumber += 1 descClass = CheckpointJobDescription if checkpoint else JobDescription jobDesc = descClass( requirements={ - 'memory': memory, - 'cores': cores, - 'disk': disk, - 'preemptible': preemptible - }, - jobName=name, - local=local + "memory": memory, + "cores": cores, + "disk": disk, + "preemptible": preemptible, + }, + jobName=name, + local=local, ) # Assign an ID @@ -60,7 +69,7 @@ def createTestJobDesc(memory, cores, disk, preemptible: bool = True, checkpoint: # Save and return the JobDescription return self.jobStore.create_job(jobDesc) - for successorType in ['addChild', 'addFollowOn']: + for successorType in ["addChild", "addFollowOn"]: # Try with the branch point at both child and follow-on stages # Identical non-checkpoint jobs should be chainable. @@ -96,14 +105,22 @@ def createTestJobDesc(memory, cores, disk, preemptible: bool = True, checkpoint: self.assertEqual(nextChainable(jobDesc1, self.jobStore, self.config), None) # If there is an increase in resource requirements we should get nothing to chain. - base_reqs = {'memory': 1, 'cores': 2, 'disk': 3, 'preemptible': True, 'checkpoint': False} - for increased_attribute in ('memory', 'cores', 'disk'): + base_reqs = { + "memory": 1, + "cores": 2, + "disk": 3, + "preemptible": True, + "checkpoint": False, + } + for increased_attribute in ("memory", "cores", "disk"): reqs = dict(base_reqs) jobDesc1 = createTestJobDesc(**reqs) reqs[increased_attribute] += 1 jobDesc2 = createTestJobDesc(**reqs) getattr(jobDesc1, successorType)(jobDesc2.jobStoreID) - self.assertEqual(nextChainable(jobDesc1, self.jobStore, self.config), None) + self.assertEqual( + nextChainable(jobDesc1, self.jobStore, self.config), None + ) # A change in preemptability from True to False should be disallowed. jobDesc1 = createTestJobDesc(1, 2, 3, preemptible=True) diff --git a/src/toil/test/utils/ABCWorkflowDebug/debugWorkflow.py b/src/toil/test/utils/ABCWorkflowDebug/debugWorkflow.py index 518eeaaf67..e32f59aeab 100644 --- a/src/toil/test/utils/ABCWorkflowDebug/debugWorkflow.py +++ b/src/toil/test/utils/ABCWorkflowDebug/debugWorkflow.py @@ -10,102 +10,119 @@ logger = logging.getLogger(__name__) -''' +""" This workflow's purpose is to create files and jobs for viewing using stats, status, and printDot() in toilDebugTest.py. It's intended for future use in a debugging tutorial containing a broken job. It is also a minor integration test. -''' +""" + def initialize_jobs(job): - ''' + """ Stub function used to start a toil workflow since toil workflows can only start with one job (but afterwards can run many in parallel). - ''' - job.fileStore.log_to_leader('''initialize_jobs''') + """ + job.fileStore.log_to_leader("""initialize_jobs""") + def writeA(job, mkFile): - '''Runs a program, and writes a string 'A' into A.txt using mkFile.py.''' - job.fileStore.log_to_leader('''writeA''') + """Runs a program, and writes a string 'A' into A.txt using mkFile.py.""" + job.fileStore.log_to_leader("""writeA""") # temp folder for the run tempDir = job.fileStore.getLocalTempDir() # import files - mkFile_fs = job.fileStore.readGlobalFile(mkFile[0], userPath=os.path.join(tempDir, mkFile[1])) + mkFile_fs = job.fileStore.readGlobalFile( + mkFile[0], userPath=os.path.join(tempDir, mkFile[1]) + ) # make a file (A.txt) and writes a string 'A' into it using 'mkFile.py' - content = 'A' - cmd = python + ' ' + mkFile_fs + ' ' + 'A.txt' + ' ' + content + content = "A" + cmd = python + " " + mkFile_fs + " " + "A.txt" + " " + content this_process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) this_process.wait() # get the output file and return it as a tuple of location + name - output_filename = 'A.txt' + output_filename = "A.txt" output_file = job.fileStore.writeGlobalFile(output_filename) A1 = (output_file, output_filename) rvDict = {"A1": A1} return rvDict + def writeB(job, mkFile, B_file): - ''' + """ Runs a program, extracts a string 'B' from an existing file, B_file.txt, and writes it into B.txt using mkFile.py. - ''' - job.fileStore.log_to_leader('''writeB''') + """ + job.fileStore.log_to_leader("""writeB""") # temp folder for the run tempDir = job.fileStore.getLocalTempDir() # import files - mkFile_fs = job.fileStore.readGlobalFile(mkFile[0], userPath=os.path.join(tempDir, mkFile[1])) - B_file_fs = job.fileStore.readGlobalFile(B_file[0], userPath=os.path.join(tempDir, B_file[1])) + mkFile_fs = job.fileStore.readGlobalFile( + mkFile[0], userPath=os.path.join(tempDir, mkFile[1]) + ) + B_file_fs = job.fileStore.readGlobalFile( + B_file[0], userPath=os.path.join(tempDir, B_file[1]) + ) # make a file (B.txt) and write the contents of 'B_file.txt' into it using 'mkFile.py' with open(B_file_fs) as f: - file_contents = '' + file_contents = "" for line in f: file_contents = file_contents + line - cmd = python + ' ' + mkFile_fs + ' ' + 'B.txt' + ' ' + file_contents + cmd = python + " " + mkFile_fs + " " + "B.txt" + " " + file_contents this_process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) this_process.wait() # get the output file and return it as a tuple of location + name - output_filename = 'B.txt' + output_filename = "B.txt" output_file = job.fileStore.writeGlobalFile(output_filename) B1 = (output_file, output_filename) rvDict = {"B1": B1} return rvDict + def writeC(job): - '''Creates/writes a file, C.txt, containing the string 'C'.''' - job.fileStore.log_to_leader('''writeC''') + """Creates/writes a file, C.txt, containing the string 'C'.""" + job.fileStore.log_to_leader("""writeC""") # temp folder for the run tempDir = job.fileStore.getLocalTempDir() # get the output file and return it as a tuple of location + name - output_filename = os.path.join(tempDir, 'C.txt') - with open(output_filename, 'w') as f: - f.write('C') + output_filename = os.path.join(tempDir, "C.txt") + with open(output_filename, "w") as f: + f.write("C") output_file = job.fileStore.writeGlobalFile(output_filename) - C1 = (output_file, 'C.txt') + C1 = (output_file, "C.txt") rvDict = {"C1": C1} return rvDict + def writeABC(job, A_dict, B_dict, C_dict, filepath): - '''Takes 3 files (specified as dictionaries) and writes their contents to ABC.txt.''' - job.fileStore.log_to_leader('''writeABC''') + """Takes 3 files (specified as dictionaries) and writes their contents to ABC.txt.""" + job.fileStore.log_to_leader("""writeABC""") # temp folder for the run tempDir = job.fileStore.getLocalTempDir() # import files - A_fs = job.fileStore.readGlobalFile(A_dict['A1'][0], userPath=os.path.join(tempDir, A_dict['A1'][1])) - B_fs = job.fileStore.readGlobalFile(B_dict['B1'][0], userPath=os.path.join(tempDir, B_dict['B1'][1])) - C_fs = job.fileStore.readGlobalFile(C_dict['C1'][0], userPath=os.path.join(tempDir, C_dict['C1'][1])) - - file_contents = '' + A_fs = job.fileStore.readGlobalFile( + A_dict["A1"][0], userPath=os.path.join(tempDir, A_dict["A1"][1]) + ) + B_fs = job.fileStore.readGlobalFile( + B_dict["B1"][0], userPath=os.path.join(tempDir, B_dict["B1"][1]) + ) + C_fs = job.fileStore.readGlobalFile( + C_dict["C1"][0], userPath=os.path.join(tempDir, C_dict["C1"][1]) + ) + + file_contents = "" with open(A_fs) as f: for line in f: file_contents = file_contents + line @@ -118,25 +135,27 @@ def writeABC(job, A_dict, B_dict, C_dict, filepath): for line in f: file_contents = file_contents + line - with open(os.path.join(tempDir, 'ABC.txt'), 'w') as f: + with open(os.path.join(tempDir, "ABC.txt"), "w") as f: f.write(file_contents) # get the output file and return it as a tuple of location + name - output_filename = os.path.join(tempDir, 'ABC.txt') + output_filename = os.path.join(tempDir, "ABC.txt") output_file = job.fileStore.writeGlobalFile(output_filename) job.fileStore.export_file(output_file, "file://" + filepath) def finalize_jobs(job, num): - '''Does nothing but should be recorded in stats, status, and printDot().''' - job.fileStore.log_to_leader('''finalize_jobs''') + """Does nothing but should be recorded in stats, status, and printDot().""" + job.fileStore.log_to_leader("""finalize_jobs""") + def broken_job(job, num): - '''A job that will always fail. To be used for a tutorial.''' - job.fileStore.log_to_leader('''broken_job''') + """A job that will always fail. To be used for a tutorial.""" + job.fileStore.log_to_leader("""broken_job""") file = toil.importFile(None) -if __name__=="__main__": + +if __name__ == "__main__": jobStorePath = sys.argv[1] if len(sys.argv) > 1 else mkdtemp("debugWorkflow") options = Job.Runner.getDefaultOptions(jobStorePath) options.clean = "never" @@ -144,11 +163,17 @@ def broken_job(job, num): options.logLevel = "INFO" with Toil(options) as toil: - B_file0 = toil.importFile("file://" + os.path.abspath("src/toil/test/utils/ABCWorkflowDebug/B_file.txt")) + B_file0 = toil.importFile( + "file://" + + os.path.abspath("src/toil/test/utils/ABCWorkflowDebug/B_file.txt") + ) B_file0_preserveThisFilename = "B_file.txt" B_file = (B_file0, B_file0_preserveThisFilename) - file_maker0 = toil.importFile("file://" + os.path.abspath("src/toil/test/utils/ABCWorkflowDebug/mkFile.py")) + file_maker0 = toil.importFile( + "file://" + + os.path.abspath("src/toil/test/utils/ABCWorkflowDebug/mkFile.py") + ) file_maker0_preserveThisFilename = "mkFile.py" file_maker = (file_maker0, file_maker0_preserveThisFilename) diff --git a/src/toil/test/utils/ABCWorkflowDebug/mkFile.py b/src/toil/test/utils/ABCWorkflowDebug/mkFile.py index bf07f272d9..102d6455e8 100644 --- a/src/toil/test/utils/ABCWorkflowDebug/mkFile.py +++ b/src/toil/test/utils/ABCWorkflowDebug/mkFile.py @@ -2,14 +2,15 @@ def main(): - parser = ArgumentParser(description='Creates a file and writes into it.') - parser.add_argument('file_name', help='File name to be written to.') - parser.add_argument('contents', help='A string to be written into the file.') + parser = ArgumentParser(description="Creates a file and writes into it.") + parser.add_argument("file_name", help="File name to be written to.") + parser.add_argument("contents", help="A string to be written into the file.") args, unknown_args = parser.parse_known_args() with open(args.file_name, "w") as f: f.write(args.contents) -if __name__=="__main__": + +if __name__ == "__main__": main() diff --git a/src/toil/test/utils/toilDebugTest.py b/src/toil/test/utils/toilDebugTest.py index 093cb15f50..010cff6d79 100644 --- a/src/toil/test/utils/toilDebugTest.py +++ b/src/toil/test/utils/toilDebugTest.py @@ -16,12 +16,8 @@ import subprocess import tempfile -import pytest - -from toil.test import ToilTest - from toil.lib.resources import glob -from toil.test import slow, needs_wdl +from toil.test import ToilTest, needs_wdl, slow from toil.version import python logger = logging.getLogger(__name__) @@ -121,6 +117,7 @@ def testFetchJobStoreFiles() -> None: for symlink in (True, False): fetchFiles(symLink=symlink, jobStoreDir=job_store_dir, outputDir=output_dir) + class DebugJobTest(ToilTest): """ Test the toil debug-job command. @@ -137,23 +134,28 @@ def _get_job_store_and_job_id(self): logger.info("Running workflow that always fails") try: # Run an always-failing workflow - subprocess.check_call([ - python, - os.path.abspath("src/toil/test/docs/scripts/example_alwaysfail.py"), - "--retryCount=0", - "--logCritical", - "--disableProgress", - job_store - ], stderr=subprocess.DEVNULL) + subprocess.check_call( + [ + python, + os.path.abspath("src/toil/test/docs/scripts/example_alwaysfail.py"), + "--retryCount=0", + "--logCritical", + "--disableProgress", + job_store, + ], + stderr=subprocess.DEVNULL, + ) raise RuntimeError("Failing workflow succeeded!") except subprocess.CalledProcessError: # Should fail to run logger.info("Task failed successfully") - pass - + # Get the job ID. # TODO: This assumes a lot about the FileJobStore. Use the MessageBus instead? - job_id = "kind-explode/" + os.listdir(os.path.join(job_store, "jobs/kind-explode"))[0] + job_id = ( + "kind-explode/" + + os.listdir(os.path.join(job_store, "jobs/kind-explode"))[0] + ) return job_store, job_id @@ -161,7 +163,7 @@ def _get_wdl_job_store_and_job_name(self): """ Get a job store and the name of a failed job in it that actually wanted to use some files. """ - + # First make a job store. job_store = os.path.join(self._createTempDir(), "tree") @@ -170,12 +172,14 @@ def _get_wdl_job_store_and_job_name(self): wf_result = subprocess.run( [ "toil-wdl-runner", - os.path.abspath("src/toil/test/docs/scripts/example_alwaysfail_with_files.wdl"), + os.path.abspath( + "src/toil/test/docs/scripts/example_alwaysfail_with_files.wdl" + ), "--retryCount=0", "--logDebug", "--disableProgress", "--jobStore", - job_store + job_store, ], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, @@ -191,7 +195,9 @@ def _get_wdl_job_store_and_job_name(self): # Make sure that the job store we created actually has its job store # root job ID file. If it doesn't, we failed during workflow setup and # not because of a real failing job. - assert os.path.exists(os.path.join(job_store, "files/shared/rootJobStoreID")), "Failed workflow still needs a root job" + assert os.path.exists( + os.path.join(job_store, "files/shared/rootJobStoreID") + ), "Failed workflow still needs a root job" # Get a job name for a job that fails job_name = "WDLTaskJob" @@ -208,18 +214,14 @@ def test_run_job(self): logger.info("Trying to rerun job %s", job_id) # Rerun the job, which should fail again - output = subprocess.check_output([ - "toil", - "debug-job", - "--logDebug", - job_store, - job_id - ], stderr=subprocess.STDOUT) + output = subprocess.check_output( + ["toil", "debug-job", "--logDebug", job_store, job_id], + stderr=subprocess.STDOUT, + ) # Even if the job fails, the attempt to run it will succeed. - log = output.decode('utf-8') + log = output.decode("utf-8") assert "Boom!" in log, f"Did not find the expected exception message in: {log}" - def test_print_job_info(self): """ Make sure that we can use --printJobInfo to get information on a job from a job store. @@ -230,14 +232,9 @@ def test_print_job_info(self): logger.info("Trying to print job info for job %s", job_id) # Print the job info and make sure that doesn't crash. - subprocess.check_call([ - "toil", - "debug-job", - "--logDebug", - job_store, - "--printJobInfo", - job_id - ]) + subprocess.check_call( + ["toil", "debug-job", "--logDebug", job_store, "--printJobInfo", job_id] + ) @needs_wdl def test_retrieve_task_directory(self): @@ -252,18 +249,23 @@ def test_retrieve_task_directory(self): dest_dir = os.path.join(self._createTempDir(), "dump") # Print the job info and make sure that doesn't crash. - subprocess.check_call([ - "toil", - "debug-job", - "--logDebug", - job_store, - job_name, - "--retrieveTaskDirectory", - dest_dir - ]) - - first_file = os.path.join(dest_dir, "inside/mnt/miniwdl_task_container/work/_miniwdl_inputs/0/test.txt") - assert os.path.exists(first_file), "Input file not found in fake container environment" - self.assertEqual(open(first_file).read(), "These are the contents\n") - + subprocess.check_call( + [ + "toil", + "debug-job", + "--logDebug", + job_store, + job_name, + "--retrieveTaskDirectory", + dest_dir, + ] + ) + first_file = os.path.join( + dest_dir, + "inside/mnt/miniwdl_task_container/work/_miniwdl_inputs/0/test.txt", + ) + assert os.path.exists( + first_file + ), "Input file not found in fake container environment" + self.assertEqual(open(first_file).read(), "These are the contents\n") diff --git a/src/toil/test/utils/toilKillTest.py b/src/toil/test/utils/toilKillTest.py index 67fd69ea9b..1f8ef4b36a 100644 --- a/src/toil/test/utils/toilKillTest.py +++ b/src/toil/test/utils/toilKillTest.py @@ -21,8 +21,7 @@ import unittest from toil.common import Toil -from toil.jobStores.abstractJobStore import (NoSuchFileException, - NoSuchJobStoreException) +from toil.jobStores.abstractJobStore import NoSuchFileException, NoSuchJobStoreException from toil.jobStores.utils import generate_locator from toil.test import ToilTest, needs_aws_s3, needs_cwl @@ -61,7 +60,7 @@ def test_cwl_toil_kill(self): kill_cmd = ["toil", "kill", self.job_store] # run the sleep workflow - logger.info('Running workflow: %s', ' '.join(run_cmd)) + logger.info("Running workflow: %s", " ".join(run_cmd)) cwl_process = subprocess.Popen(run_cmd) # wait until workflow starts running @@ -75,9 +74,9 @@ def test_cwl_toil_kill(self): # kill flag exists to be deleted to kill the leader break else: - logger.info('Waiting for kill flag...') + logger.info("Waiting for kill flag...") except (NoSuchJobStoreException, NoSuchFileException): - logger.info('Waiting for job store to be openable...') + logger.info("Waiting for job store to be openable...") time.sleep(2) # run toil kill diff --git a/src/toil/test/utils/utilsTest.py b/src/toil/test/utils/utilsTest.py index b0c529af88..ab831a72be 100644 --- a/src/toil/test/utils/utilsTest.py +++ b/src/toil/test/utils/utilsTest.py @@ -23,7 +23,7 @@ import pytest -pkg_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) # noqa +pkg_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) # noqa sys.path.insert(0, pkg_root) # noqa import toil @@ -31,14 +31,16 @@ from toil.common import Config, Toil from toil.job import Job from toil.lib.bioio import system -from toil.test import (ToilTest, - get_temp_file, - integrative, - needs_aws_ec2, - needs_cwl, - needs_docker, - needs_rsync3, - slow) +from toil.test import ( + ToilTest, + get_temp_file, + integrative, + needs_aws_ec2, + needs_cwl, + needs_docker, + needs_rsync3, + slow, +) from toil.test.sort.sortTest import makeFileToSort from toil.utils.toilStats import get_stats, process_data from toil.utils.toilStatus import ToilStatus @@ -58,7 +60,7 @@ def setUp(self): self.tempDir = self._createTempDir() self.tempFile = get_temp_file(rootDir=self.tempDir) self.outputFile = get_temp_file(rootDir=self.tempDir) - self.outputFile = 'someSortedStuff.txt' + self.outputFile = "someSortedStuff.txt" self.toilDir = os.path.join(self.tempDir, "jobstore") self.assertFalse(os.path.exists(self.toilDir)) self.lines = 1000 @@ -72,19 +74,19 @@ def setUp(self): self.sort_workflow_cmd = [ python, - '-m', - 'toil.test.sort.sort', - f'file:{self.toilDir}', - f'--fileToSort={self.tempFile}', - f'--outputFile={self.outputFile}', - '--clean=never', + "-m", + "toil.test.sort.sort", + f"file:{self.toilDir}", + f"--fileToSort={self.tempFile}", + f"--outputFile={self.outputFile}", + "--clean=never", ] self.restart_sort_workflow_cmd = [ python, - '-m', - 'toil.test.sort.restart_sort', - f'file:{self.toilDir}' + "-m", + "toil.test.sort.restart_sort", + f"file:{self.toilDir}", ] def tearDown(self): @@ -93,7 +95,11 @@ def tearDown(self): if os.path.exists(self.toilDir): shutil.rmtree(self.toilDir) - for f in [self.tempFile, self.outputFile, os.path.join(self.tempDir, "output.txt")]: + for f in [ + self.tempFile, + self.outputFile, + os.path.join(self.tempDir, "output.txt"), + ]: if os.path.exists(f): os.remove(f) @@ -101,26 +107,26 @@ def tearDown(self): @property def toilMain(self): - return resolveEntryPoint('toil') + return resolveEntryPoint("toil") @property def cleanCommand(self): - return [self.toilMain, 'clean', self.toilDir] + return [self.toilMain, "clean", self.toilDir] @property def statsCommand(self): - return [self.toilMain, 'stats', self.toilDir, '--pretty'] + return [self.toilMain, "stats", self.toilDir, "--pretty"] def statusCommand(self, failIfNotComplete=False): - commandTokens = [self.toilMain, 'status', self.toilDir] + commandTokens = [self.toilMain, "status", self.toilDir] if failIfNotComplete: - commandTokens.append('--failIfNotComplete') + commandTokens.append("--failIfNotComplete") return commandTokens def test_config_functionality(self): """Ensure that creating and reading back the config file works""" config_file = os.path.abspath("config.yaml") - config_command = [self.toilMain, 'config', config_file] + config_command = [self.toilMain, "config", config_file] # make sure the command `toil config file_path` works try: subprocess.check_call(config_command) @@ -131,7 +137,7 @@ def test_config_functionality(self): # make sure that toil can read from the generated config file try: parser.parse_args(["random_jobstore", "--config", config_file]) - with open(config_file, mode="r") as cm: + with open(config_file) as cm: payload = cm.read() expected = "workDir batchSystem symlinkImports defaultMemory retryCount" assert all( @@ -143,7 +149,6 @@ def test_config_functionality(self): finally: os.remove(config_file) - @needs_rsync3 @pytest.mark.timeout(1200) @needs_aws_ec2 @@ -164,30 +169,63 @@ def testAWSProvisionerUtils(self): :return: """ # TODO: Run these for the other clouds. - clusterName = f'cluster-utils-test{uuid.uuid4()}' - keyName = os.getenv('TOIL_AWS_KEYNAME').strip() or 'id_rsa' - expected_owner = os.getenv('TOIL_OWNER_TAG') or keyName + clusterName = f"cluster-utils-test{uuid.uuid4()}" + keyName = os.getenv("TOIL_AWS_KEYNAME").strip() or "id_rsa" + expected_owner = os.getenv("TOIL_OWNER_TAG") or keyName try: from toil.provisioners.aws.awsProvisioner import AWSProvisioner + aws_provisioner = AWSProvisioner.__module__ logger.debug(f"Found AWSProvisioner: {aws_provisioner}.") # launch master with an assortment of custom tags - system([self.toilMain, 'launch-cluster', '--clusterType', 'mesos', - '-t', 'key1=value1', '-t', 'key2=value2', '--tag', 'key3=value3', - '--leaderNodeType=t2.medium', '--keyPairName=' + keyName, clusterName, - '--provisioner=aws', '--zone=us-west-2a', '--logLevel=DEBUG']) - - cluster = toil.provisioners.cluster_factory(provisioner='aws', zone='us-west-2a', clusterName=clusterName) + system( + [ + self.toilMain, + "launch-cluster", + "--clusterType", + "mesos", + "-t", + "key1=value1", + "-t", + "key2=value2", + "--tag", + "key3=value3", + "--leaderNodeType=t2.medium", + "--keyPairName=" + keyName, + clusterName, + "--provisioner=aws", + "--zone=us-west-2a", + "--logLevel=DEBUG", + ] + ) + + cluster = toil.provisioners.cluster_factory( + provisioner="aws", zone="us-west-2a", clusterName=clusterName + ) leader = cluster.getLeader() # check that the leader carries the appropriate tags - tags = {'key1': 'value1', 'key2': 'value2', 'key3': 'value3', 'Name': clusterName, 'Owner': expected_owner} + tags = { + "key1": "value1", + "key2": "value2", + "key3": "value3", + "Name": clusterName, + "Owner": expected_owner, + } for key in tags: self.assertEqual(leader.tags.get(key), tags[key]) finally: - system([self.toilMain, 'destroy-cluster', '--zone=us-west-2a', '--provisioner=aws', clusterName]) + system( + [ + self.toilMain, + "destroy-cluster", + "--zone=us-west-2a", + "--provisioner=aws", + clusterName, + ] + ) @slow def testUtilsSort(self): @@ -196,19 +234,27 @@ def testUtilsSort(self): sort example with the --restart flag. """ # Get the sort command to run - toilCommand = [sys.executable, - '-m', toil.test.sort.sort.__name__, - self.toilDir, - '--logLevel=DEBUG', - '--fileToSort', self.tempFile, - '--outputFile', self.outputFile, - '--N', str(self.N), - '--stats', - '--retryCount=2', - '--badWorker=0.5', - '--badWorkerFailInterval=0.05'] + toilCommand = [ + sys.executable, + "-m", + toil.test.sort.sort.__name__, + self.toilDir, + "--logLevel=DEBUG", + "--fileToSort", + self.tempFile, + "--outputFile", + self.outputFile, + "--N", + str(self.N), + "--stats", + "--retryCount=2", + "--badWorker=0.5", + "--badWorkerFailInterval=0.05", + ] # Try restarting it to check that a JobStoreException is thrown - self.assertRaises(subprocess.CalledProcessError, system, toilCommand + ['--restart']) + self.assertRaises( + subprocess.CalledProcessError, system, toilCommand + ["--restart"] + ) # Check that trying to run it in restart mode does not create the jobStore self.assertFalse(os.path.exists(self.toilDir)) @@ -217,9 +263,15 @@ def testUtilsSort(self): try: system(toilCommand) finished = True - except subprocess.CalledProcessError: # This happens when the script fails due to having unfinished jobs + except ( + subprocess.CalledProcessError + ): # This happens when the script fails due to having unfinished jobs system(self.statusCommand()) - self.assertRaises(subprocess.CalledProcessError, system, self.statusCommand(failIfNotComplete=True)) + self.assertRaises( + subprocess.CalledProcessError, + system, + self.statusCommand(failIfNotComplete=True), + ) finished = False self.assertTrue(os.path.exists(self.toilDir)) @@ -230,11 +282,17 @@ def testUtilsSort(self): totalTrys = 1 while not finished: try: - system(toilCommand + ['--restart']) + system(toilCommand + ["--restart"]) finished = True - except subprocess.CalledProcessError: # This happens when the script fails due to having unfinished jobs + except ( + subprocess.CalledProcessError + ): # This happens when the script fails due to having unfinished jobs system(self.statusCommand()) - self.assertRaises(subprocess.CalledProcessError, system, self.statusCommand(failIfNotComplete=True)) + self.assertRaises( + subprocess.CalledProcessError, + system, + self.statusCommand(failIfNotComplete=True), + ) if totalTrys > 16: self.fail() # Exceeded a reasonable number of restarts totalTrys += 1 @@ -262,17 +320,23 @@ def testUtilsStatsSort(self): Tests the stats commands on a complete run of the stats test. """ # Get the sort command to run - toilCommand = [sys.executable, - '-m', toil.test.sort.sort.__name__, - self.toilDir, - '--logLevel=DEBUG', - '--fileToSort', self.tempFile, - '--outputFile', self.outputFile, - '--N', str(self.N), - '--stats', - '--retryCount=99', - '--badWorker=0.5', - '--badWorkerFailInterval=0.01'] + toilCommand = [ + sys.executable, + "-m", + toil.test.sort.sort.__name__, + self.toilDir, + "--logLevel=DEBUG", + "--fileToSort", + self.tempFile, + "--outputFile", + self.outputFile, + "--N", + str(self.N), + "--stats", + "--retryCount=99", + "--badWorker=0.5", + "--badWorkerFailInterval=0.01", + ] # Run the script for the first time system(toilCommand) @@ -291,8 +355,8 @@ def testUtilsStatsSort(self): def testUnicodeSupport(self): options = Job.Runner.getDefaultOptions(self._getTestJobStorePath()) - options.clean = 'always' - options.logLevel = 'debug' + options.clean = "always" + options.logLevel = "debug" Job.Runner.startToil(Job.wrapFn(printUnicodeCharacter), options) @slow @@ -301,7 +365,7 @@ def testMultipleJobsPerWorkerStats(self): Tests case where multiple jobs are run on 1 worker to ensure that all jobs report back their data """ options = Job.Runner.getDefaultOptions(self._getTestJobStorePath()) - options.clean = 'never' + options.clean = "never" options.stats = True Job.Runner.startToil(RunTwoJobsPerWorker(), options) config = Config() @@ -309,7 +373,10 @@ def testMultipleJobsPerWorkerStats(self): jobStore = Toil.resumeJobStore(config.jobStore) stats = get_stats(jobStore) collatedStats = process_data(jobStore.config, stats) - self.assertTrue(len(collatedStats.job_types) == 2, "Some jobs are not represented in the stats.") + self.assertTrue( + len(collatedStats.job_types) == 2, + "Some jobs are not represented in the stats.", + ) def check_status(self, status, status_fn, seconds=20): i = 0.0 @@ -318,19 +385,23 @@ def check_status(self, status, status_fn, seconds=20): i += 0.5 if i > seconds: s = status_fn(self.toilDir) - self.assertEqual(s, status, f'Waited {seconds} seconds without status reaching {status}; stuck at {s}') + self.assertEqual( + s, + status, + f"Waited {seconds} seconds without status reaching {status}; stuck at {s}", + ) def testGetPIDStatus(self): """Test that ToilStatus.getPIDStatus() behaves as expected.""" wf = subprocess.Popen(self.sort_workflow_cmd) - self.check_status('RUNNING', status_fn=ToilStatus.getPIDStatus, seconds=60) + self.check_status("RUNNING", status_fn=ToilStatus.getPIDStatus, seconds=60) wf.wait() - self.check_status('COMPLETED', status_fn=ToilStatus.getPIDStatus, seconds=60) + self.check_status("COMPLETED", status_fn=ToilStatus.getPIDStatus, seconds=60) # TODO: we need to reach into the FileJobStore's files and delete this # shared file. We assume we know its internal layout. - os.remove(os.path.join(self.toilDir, 'files/shared/pid.log')) - self.check_status('QUEUED', status_fn=ToilStatus.getPIDStatus, seconds=60) + os.remove(os.path.join(self.toilDir, "files/shared/pid.log")) + self.check_status("QUEUED", status_fn=ToilStatus.getPIDStatus, seconds=60) def testGetStatusFailedToilWF(self): """ @@ -339,41 +410,67 @@ def testGetStatusFailedToilWF(self): opportunity to test the 'RUNNING' functionality of getStatus(). """ # --badWorker is set to force failure. - wf = subprocess.Popen(self.sort_workflow_cmd + ['--badWorker=1']) - self.check_status('RUNNING', status_fn=ToilStatus.getStatus, seconds=60) + wf = subprocess.Popen(self.sort_workflow_cmd + ["--badWorker=1"]) + self.check_status("RUNNING", status_fn=ToilStatus.getStatus, seconds=60) wf.wait() - self.check_status('ERROR', status_fn=ToilStatus.getStatus, seconds=60) + self.check_status("ERROR", status_fn=ToilStatus.getStatus, seconds=60) @needs_cwl @needs_docker def testGetStatusFailedCWLWF(self): """Test that ToilStatus.getStatus() behaves as expected with a failing CWL workflow.""" # --badWorker is set to force failure. - cmd = ['toil-cwl-runner', '--jobStore', self.toilDir, '--clean=never', '--badWorker=1', - 'src/toil/test/cwl/sorttool.cwl', '--reverse', '--input', 'src/toil/test/cwl/whale.txt', f'--outdir={self.tempDir}'] + cmd = [ + "toil-cwl-runner", + "--jobStore", + self.toilDir, + "--clean=never", + "--badWorker=1", + "src/toil/test/cwl/sorttool.cwl", + "--reverse", + "--input", + "src/toil/test/cwl/whale.txt", + f"--outdir={self.tempDir}", + ] wf = subprocess.Popen(cmd) - self.check_status('RUNNING', status_fn=ToilStatus.getStatus, seconds=60) + self.check_status("RUNNING", status_fn=ToilStatus.getStatus, seconds=60) wf.wait() - self.check_status('ERROR', status_fn=ToilStatus.getStatus, seconds=60) + self.check_status("ERROR", status_fn=ToilStatus.getStatus, seconds=60) @needs_cwl @needs_docker def testGetStatusSuccessfulCWLWF(self): """Test that ToilStatus.getStatus() behaves as expected with a successful CWL workflow.""" - cmd = ['toil-cwl-runner', '--jobStore', self.toilDir, '--clean=never', - 'src/toil/test/cwl/sorttool.cwl', '--reverse', '--input', 'src/toil/test/cwl/whale.txt', f'--outdir={self.tempDir}'] + cmd = [ + "toil-cwl-runner", + "--jobStore", + self.toilDir, + "--clean=never", + "src/toil/test/cwl/sorttool.cwl", + "--reverse", + "--input", + "src/toil/test/cwl/whale.txt", + f"--outdir={self.tempDir}", + ] wf = subprocess.Popen(cmd) - self.check_status('RUNNING', status_fn=ToilStatus.getStatus, seconds=60) + self.check_status("RUNNING", status_fn=ToilStatus.getStatus, seconds=60) wf.wait() - self.check_status('COMPLETED', status_fn=ToilStatus.getStatus, seconds=60) + self.check_status("COMPLETED", status_fn=ToilStatus.getStatus, seconds=60) @needs_cwl - @patch('builtins.print') + @patch("builtins.print") def testPrintJobLog(self, mock_print): """Test that ToilStatus.printJobLog() reads the log from a failed command without error.""" # Run a workflow that will always fail - cmd = ['toil-cwl-runner', '--jobStore', self.toilDir, '--clean=never', - 'src/toil/test/cwl/alwaysfails.cwl', '--message', 'Testing'] + cmd = [ + "toil-cwl-runner", + "--jobStore", + self.toilDir, + "--clean=never", + "src/toil/test/cwl/alwaysfails.cwl", + "--message", + "Testing", + ] wf = subprocess.Popen(cmd) wf.wait() # print log and check output @@ -382,7 +479,7 @@ def testPrintJobLog(self, mock_print): # Make sure it printed some kind of complaint about the missing command. args, kwargs = mock_print.call_args - self.assertIn('invalidcommand', args[0]) + self.assertIn("invalidcommand", args[0]) @pytest.mark.timeout(1200) def testRestartAttribute(self): @@ -392,17 +489,27 @@ def testRestartAttribute(self): In this case, the job store should not be destroyed until restart() is called. """ # Run a workflow that will always fail - cmd = self.restart_sort_workflow_cmd + ['--badWorker=1', '--logDebug'] + cmd = self.restart_sort_workflow_cmd + ["--badWorker=1", "--logDebug"] subprocess.run(cmd) - restart_cmd = self.restart_sort_workflow_cmd + ['--badWorker=0', '--logDebug', '--restart'] + restart_cmd = self.restart_sort_workflow_cmd + [ + "--badWorker=0", + "--logDebug", + "--restart", + ] subprocess.run(restart_cmd) # Check the job store exists after restart attempt self.assertTrue(os.path.exists(self.toilDir)) - successful_cmd = [python, '-m', 'toil.test.sort.sort', '--logDebug', 'file:' + self.toilDir, - '--restart'] + successful_cmd = [ + python, + "-m", + "toil.test.sort.sort", + "--logDebug", + "file:" + self.toilDir, + "--restart", + ] subprocess.run(successful_cmd) # Check the job store is destroyed after calling restart() @@ -413,13 +520,14 @@ def printUnicodeCharacter(): # We want to get a unicode character to stdout but we can't print it directly because of # Python encoding issues. To work around this we print in a separate Python process. See # http://stackoverflow.com/questions/492483/setting-the-correct-encoding-when-piping-stdout-in-python - subprocess.check_call([sys.executable, '-c', "print('\\xc3\\xbc')"]) + subprocess.check_call([sys.executable, "-c", "print('\\xc3\\xbc')"]) class RunTwoJobsPerWorker(Job): """ Runs child job with same resources as self in an attempt to chain the jobs on the same worker """ + def __init__(self): Job.__init__(self) diff --git a/src/toil/test/wdl/wdltoil_test.py b/src/toil/test/wdl/wdltoil_test.py index 0942a5499a..591cd45975 100644 --- a/src/toil/test/wdl/wdltoil_test.py +++ b/src/toil/test/wdl/wdltoil_test.py @@ -1,43 +1,46 @@ import json +import logging import os import re import shutil import string import subprocess import unittest -from uuid import uuid4 -from typing import Optional, Union - +from typing import Any, Optional, Union from unittest.mock import patch -from typing import Any, Dict, List, Set +from uuid import uuid4 -import logging -import pytest +import WDL.Error +import WDL.Expr from toil.fileStores import FileID -from toil.provisioners import cluster_factory -from toil.test import (ToilTest, - needs_docker, - needs_docker_cuda, - needs_google_storage, - needs_singularity_or_docker, - needs_wdl, - slow, integrative) +from toil.test import ( + ToilTest, + needs_docker, + needs_docker_cuda, + needs_google_storage, + needs_singularity_or_docker, + needs_wdl, + slow, +) from toil.version import exactPython -from toil.wdl.wdltoil import WDLSectionJob, WDLWorkflowGraph, remove_common_leading_whitespace, parse_disks - -import WDL.Expr -import WDL.Error +from toil.wdl.wdltoil import ( + WDLSectionJob, + WDLWorkflowGraph, + parse_disks, + remove_common_leading_whitespace, +) logger = logging.getLogger(__name__) + @needs_wdl class BaseWDLTest(ToilTest): """Base test class for WDL tests.""" def setUp(self) -> None: """Runs anew before each test to create farm fresh temp dirs.""" - self.output_dir = os.path.join('/tmp/', 'toil-wdl-test-' + str(uuid4())) + self.output_dir = os.path.join("/tmp/", "toil-wdl-test-" + str(uuid4())) os.makedirs(self.output_dir) def tearDown(self) -> None: @@ -50,16 +53,18 @@ def tearDown(self) -> None: # These tests are known to require things not implemented by # Toil and will not be run in CI. WDL_CONFORMANCE_TESTS_UNSUPPORTED_BY_TOIL = [ - 16, # Basic object test (deprecated and removed in 1.1); MiniWDL and toil-wdl-runner do not support Objects, so this will fail if ran by them - 21, # Parser: expression placeholders in strings in conditional expressions in 1.0, Cromwell style; Fails with MiniWDL and toil-wdl-runner - 64, # Legacy test for as_map_as_input; It looks like MiniWDL does not have the function as_map() - 77, # Test that array cannot coerce to a string. WDL 1.1 does not allow compound types to coerce into a string. This should return a TypeError. + 16, # Basic object test (deprecated and removed in 1.1); MiniWDL and toil-wdl-runner do not support Objects, so this will fail if ran by them + 21, # Parser: expression placeholders in strings in conditional expressions in 1.0, Cromwell style; Fails with MiniWDL and toil-wdl-runner + 64, # Legacy test for as_map_as_input; It looks like MiniWDL does not have the function as_map() + 77, # Test that array cannot coerce to a string. WDL 1.1 does not allow compound types to coerce into a string. This should return a TypeError. ] + class WDLConformanceTests(BaseWDLTest): """ WDL conformance tests for Toil. """ + wdl_dir = "wdl-conformance-tests" @classmethod @@ -85,8 +90,14 @@ def check(self, p: subprocess.CompletedProcess) -> None: """ if p.returncode != 0: - logger.error("Failed process standard output: %s", p.stdout.decode('utf-8', errors='replace')) - logger.error("Failed process standard error: %s", p.stderr.decode('utf-8', errors='replace')) + logger.error( + "Failed process standard output: %s", + p.stdout.decode("utf-8", errors="replace"), + ) + logger.error( + "Failed process standard error: %s", + p.stderr.decode("utf-8", errors="replace"), + ) p.check_returncode() @@ -96,7 +107,9 @@ def test_conformance_tests_v10(self): command = self.base_command + ["-v", "1.0"] if WDL_CONFORMANCE_TESTS_UNSUPPORTED_BY_TOIL: command.append("--exclude-numbers") - command.append(",".join([str(t) for t in WDL_CONFORMANCE_TESTS_UNSUPPORTED_BY_TOIL])) + command.append( + ",".join([str(t) for t in WDL_CONFORMANCE_TESTS_UNSUPPORTED_BY_TOIL]) + ) p = subprocess.run(command, capture_output=True) self.check(p) @@ -107,7 +120,9 @@ def test_conformance_tests_v11(self): command = self.base_command + ["-v", "1.1"] if WDL_CONFORMANCE_TESTS_UNSUPPORTED_BY_TOIL: command.append("--exclude-numbers") - command.append(",".join([str(t) for t in WDL_CONFORMANCE_TESTS_UNSUPPORTED_BY_TOIL])) + command.append( + ",".join([str(t) for t in WDL_CONFORMANCE_TESTS_UNSUPPORTED_BY_TOIL]) + ) p = subprocess.run(command, capture_output=True) self.check(p) @@ -115,7 +130,18 @@ def test_conformance_tests_v11(self): @slow def test_conformance_tests_integration(self): ids_to_run = "encode,tut01,tut02,tut03,tut04" - p = subprocess.run(self.base_command + ["-v", "1.0", "--conformance-file", "integration.yaml", "--id", ids_to_run], capture_output=True) + p = subprocess.run( + self.base_command + + [ + "-v", + "1.0", + "--conformance-file", + "integration.yaml", + "--id", + ids_to_run, + ], + capture_output=True, + ) self.check(p) @@ -132,7 +158,7 @@ class WDLTests(BaseWDLTest): @classmethod def setUpClass(cls) -> None: """Runs once for all tests.""" - cls.base_command = [exactPython, '-m', 'toil.wdl.wdltoil'] + cls.base_command = [exactPython, "-m", "toil.wdl.wdltoil"] # We inherit a testMD5sum but it is going to need Singularity or Docker # now. And also needs to have a WDL 1.0+ WDL file. So we replace it. @@ -140,110 +166,154 @@ def setUpClass(cls) -> None: def test_MD5sum(self): """Test if Toil produces the same outputs as known good outputs for WDL's GATK tutorial #1.""" - wdl = os.path.abspath('src/toil/test/wdl/md5sum/md5sum.1.0.wdl') - json_file = os.path.abspath('src/toil/test/wdl/md5sum/md5sum.json') + wdl = os.path.abspath("src/toil/test/wdl/md5sum/md5sum.1.0.wdl") + json_file = os.path.abspath("src/toil/test/wdl/md5sum/md5sum.json") result_json = subprocess.check_output( - self.base_command + [wdl, json_file, '-o', self.output_dir, '--logDebug', '--retryCount=0']) + self.base_command + + [wdl, json_file, "-o", self.output_dir, "--logDebug", "--retryCount=0"] + ) result = json.loads(result_json) - assert 'ga4ghMd5.value' in result - assert isinstance(result['ga4ghMd5.value'], str) - assert os.path.exists(result['ga4ghMd5.value']) - assert os.path.basename(result['ga4ghMd5.value']) == 'md5sum.txt' + assert "ga4ghMd5.value" in result + assert isinstance(result["ga4ghMd5.value"], str) + assert os.path.exists(result["ga4ghMd5.value"]) + assert os.path.basename(result["ga4ghMd5.value"]) == "md5sum.txt" def test_url_to_file(self): """ Test if web URL strings can be coerced to usable Files. """ - wdl = os.path.abspath('src/toil/test/wdl/testfiles/url_to_file.wdl') + wdl = os.path.abspath("src/toil/test/wdl/testfiles/url_to_file.wdl") result_json = subprocess.check_output( - self.base_command + [wdl, '-o', self.output_dir, '--logInfo', '--retryCount=0']) + self.base_command + + [wdl, "-o", self.output_dir, "--logInfo", "--retryCount=0"] + ) result = json.loads(result_json) - assert 'url_to_file.first_line' in result - assert isinstance(result['url_to_file.first_line'], str) - self.assertEqual(result['url_to_file.first_line'], 'chr1\t248387328') + assert "url_to_file.first_line" in result + assert isinstance(result["url_to_file.first_line"], str) + self.assertEqual(result["url_to_file.first_line"], "chr1\t248387328") @needs_docker def test_wait(self): """ Test if Bash "wait" works in WDL scripts. """ - wdl = os.path.abspath('src/toil/test/wdl/testfiles/wait.wdl') + wdl = os.path.abspath("src/toil/test/wdl/testfiles/wait.wdl") result_json = subprocess.check_output( - self.base_command + [wdl, '-o', self.output_dir, '--logInfo', '--retryCount=0', '--wdlContainer=docker']) + self.base_command + + [ + wdl, + "-o", + self.output_dir, + "--logInfo", + "--retryCount=0", + "--wdlContainer=docker", + ] + ) result = json.loads(result_json) - assert 'wait.result' in result - assert isinstance(result['wait.result'], str) - self.assertEqual(result['wait.result'], 'waited') + assert "wait.result" in result + assert isinstance(result["wait.result"], str) + self.assertEqual(result["wait.result"], "waited") @needs_singularity_or_docker def test_all_call_outputs(self): """ Test if Toil can collect all call outputs from a workflow that doesn't expose them. """ - wdl = os.path.abspath('src/toil/test/wdl/testfiles/not_enough_outputs.wdl') + wdl = os.path.abspath("src/toil/test/wdl/testfiles/not_enough_outputs.wdl") # With no flag we don't include the call outputs result_json = subprocess.check_output( - self.base_command + [wdl, '-o', self.output_dir, '--logInfo', '--retryCount=0']) + self.base_command + + [wdl, "-o", self.output_dir, "--logInfo", "--retryCount=0"] + ) result = json.loads(result_json) - assert 'wf.only_result' in result - assert 'wf.do_math.square' not in result - assert 'wf.do_math.cube' not in result - assert 'wf.should_never_output' not in result + assert "wf.only_result" in result + assert "wf.do_math.square" not in result + assert "wf.do_math.cube" not in result + assert "wf.should_never_output" not in result # With flag off we don't include the call outputs result_json = subprocess.check_output( - self.base_command + [wdl, '-o', self.output_dir, '--logInfo', '--retryCount=0', '--allCallOutputs=false']) + self.base_command + + [ + wdl, + "-o", + self.output_dir, + "--logInfo", + "--retryCount=0", + "--allCallOutputs=false", + ] + ) result = json.loads(result_json) - assert 'wf.only_result' in result - assert 'wf.do_math.square' not in result - assert 'wf.do_math.cube' not in result - assert 'wf.should_never_output' not in result + assert "wf.only_result" in result + assert "wf.do_math.square" not in result + assert "wf.do_math.cube" not in result + assert "wf.should_never_output" not in result # With flag on we do include the call outputs result_json = subprocess.check_output( - self.base_command + [wdl, '-o', self.output_dir, '--logInfo', '--retryCount=0', '--allCallOutputs=on']) + self.base_command + + [ + wdl, + "-o", + self.output_dir, + "--logInfo", + "--retryCount=0", + "--allCallOutputs=on", + ] + ) result = json.loads(result_json) - assert 'wf.only_result' in result - assert 'wf.do_math.square' in result - assert 'wf.do_math.cube' in result - assert 'wf.should_never_output' not in result + assert "wf.only_result" in result + assert "wf.do_math.square" in result + assert "wf.do_math.cube" in result + assert "wf.should_never_output" not in result @needs_singularity_or_docker def test_croo_detection(self): """ Test if Toil can detect and do something sensible with Cromwell Output Organizer workflows. """ - wdl = os.path.abspath('src/toil/test/wdl/testfiles/croo.wdl') + wdl = os.path.abspath("src/toil/test/wdl/testfiles/croo.wdl") # With no flag we should include all task outputs result_json = subprocess.check_output( - self.base_command + [wdl, '-o', self.output_dir, '--logInfo', '--retryCount=0']) + self.base_command + + [wdl, "-o", self.output_dir, "--logInfo", "--retryCount=0"] + ) result = json.loads(result_json) - assert 'wf.only_result' in result - assert 'wf.do_math.square' in result - assert 'wf.do_math.cube' in result - assert 'wf.should_never_output' not in result + assert "wf.only_result" in result + assert "wf.do_math.square" in result + assert "wf.do_math.cube" in result + assert "wf.should_never_output" not in result # With flag off we obey the WDL spec even if we're suspicious result_json = subprocess.check_output( - self.base_command + [wdl, '-o', self.output_dir, '--logInfo', '--retryCount=0', '--allCallOutputs=off']) + self.base_command + + [ + wdl, + "-o", + self.output_dir, + "--logInfo", + "--retryCount=0", + "--allCallOutputs=off", + ] + ) result = json.loads(result_json) - assert 'wf.only_result' in result - assert 'wf.do_math.square' not in result - assert 'wf.do_math.cube' not in result - assert 'wf.should_never_output' not in result + assert "wf.only_result" in result + assert "wf.do_math.square" not in result + assert "wf.do_math.cube" not in result + assert "wf.should_never_output" not in result @needs_singularity_or_docker def test_caching(self): @@ -305,7 +375,7 @@ def test_url_to_optional_file(self): """ Test if missing and error-producing URLs are handled correctly for optional File? values. """ - wdl = os.path.abspath('src/toil/test/wdl/testfiles/url_to_optional_file.wdl') + wdl = os.path.abspath("src/toil/test/wdl/testfiles/url_to_optional_file.wdl") def run_for_code(code: int) -> dict: """ @@ -316,20 +386,29 @@ def run_for_code(code: int) -> dict: logger.info("Test optional file with HTTP code %s", code) json_value = '{"url_to_optional_file.http_code": %d}' % code result_json = subprocess.check_output( - self.base_command + [wdl, json_value, '-o', self.output_dir, '--logInfo', '--retryCount=0']) + self.base_command + + [ + wdl, + json_value, + "-o", + self.output_dir, + "--logInfo", + "--retryCount=0", + ] + ) result = json.loads(result_json) return result # Check files that exist result = run_for_code(200) - assert 'url_to_optional_file.out_file' in result - self.assertNotEqual(result['url_to_optional_file.out_file'], None) + assert "url_to_optional_file.out_file" in result + self.assertNotEqual(result["url_to_optional_file.out_file"], None) for code in (404, 410): # Check files that definitely don't result = run_for_code(code) - assert 'url_to_optional_file.out_file' in result - self.assertEqual(result['url_to_optional_file.out_file'], None) + assert "url_to_optional_file.out_file" in result + self.assertEqual(result["url_to_optional_file.out_file"], None) for code in (402, 418, 500, 502): # Check that cases where the server refuses to say if the file @@ -337,50 +416,72 @@ def run_for_code(code: int) -> dict: with self.assertRaises(subprocess.CalledProcessError): run_for_code(code) - - def test_missing_output_directory(self): """ Test if Toil can run a WDL workflow into a new directory. """ - wdl = os.path.abspath('src/toil/test/wdl/md5sum/md5sum.1.0.wdl') - json_file = os.path.abspath('src/toil/test/wdl/md5sum/md5sum.json') - subprocess.check_call(self.base_command + [wdl, json_file, '-o', os.path.join(self.output_dir, "does", "not", "exist"), '--logDebug', '--retryCount=0']) + wdl = os.path.abspath("src/toil/test/wdl/md5sum/md5sum.1.0.wdl") + json_file = os.path.abspath("src/toil/test/wdl/md5sum/md5sum.json") + subprocess.check_call( + self.base_command + + [ + wdl, + json_file, + "-o", + os.path.join(self.output_dir, "does", "not", "exist"), + "--logDebug", + "--retryCount=0", + ] + ) @needs_singularity_or_docker - def test_miniwdl_self_test(self, extra_args: Optional[List[str]] = None) -> None: + def test_miniwdl_self_test(self, extra_args: Optional[list[str]] = None) -> None: """Test if the MiniWDL self test runs and produces the expected output.""" - wdl_file = os.path.abspath('src/toil/test/wdl/miniwdl_self_test/self_test.wdl') - json_file = os.path.abspath('src/toil/test/wdl/miniwdl_self_test/inputs.json') + wdl_file = os.path.abspath("src/toil/test/wdl/miniwdl_self_test/self_test.wdl") + json_file = os.path.abspath("src/toil/test/wdl/miniwdl_self_test/inputs.json") result_json = subprocess.check_output( - self.base_command + [wdl_file, json_file, '--logDebug', '-o', self.output_dir, '--outputDialect', - 'miniwdl'] + (extra_args or [])) + self.base_command + + [ + wdl_file, + json_file, + "--logDebug", + "-o", + self.output_dir, + "--outputDialect", + "miniwdl", + ] + + (extra_args or []) + ) result = json.loads(result_json) # Expect MiniWDL-style output with a designated "dir" - assert 'dir' in result - assert isinstance(result['dir'], str) - out_dir = result['dir'] + assert "dir" in result + assert isinstance(result["dir"], str) + out_dir = result["dir"] - assert 'outputs' in result - assert isinstance(result['outputs'], dict) - outputs = result['outputs'] + assert "outputs" in result + assert isinstance(result["outputs"], dict) + outputs = result["outputs"] - assert 'hello_caller.message_files' in outputs - assert isinstance(outputs['hello_caller.message_files'], list) - assert len(outputs['hello_caller.message_files']) == 2 - for item in outputs['hello_caller.message_files']: + assert "hello_caller.message_files" in outputs + assert isinstance(outputs["hello_caller.message_files"], list) + assert len(outputs["hello_caller.message_files"]) == 2 + for item in outputs["hello_caller.message_files"]: # All the files should be strings in the "out" directory assert isinstance(item, str), "File output must be a string" - assert item.startswith(out_dir), "File output must be in the output directory" + assert item.startswith( + out_dir + ), "File output must be in the output directory" # Look at the filename within that directory - name_in_out_dir = item[len(out_dir):] + name_in_out_dir = item[len(out_dir) :] # Ity should contain the job name of "hello", so they are human-readable. - assert "hello" in name_in_out_dir, f"File output {name_in_out_dir} should have the originating task name in it" + assert ( + "hello" in name_in_out_dir + ), f"File output {name_in_out_dir} should have the originating task name in it" # And it should not contain non-human-readable content. # @@ -388,10 +489,15 @@ def test_miniwdl_self_test(self, extra_args: Optional[List[str]] = None) -> None # don't try and get around this by just rolling other random # strings; we want these outputs to be human-readable!!! digit_count = len([c for c in name_in_out_dir if c in string.digits]) - assert digit_count < 3, f"File output {name_in_out_dir} has {digit_count} digits, which is too many to be plausibly human-readable" + assert ( + digit_count < 3 + ), f"File output {name_in_out_dir} has {digit_count} digits, which is too many to be plausibly human-readable" - assert 'hello_caller.messages' in outputs - assert outputs['hello_caller.messages'] == ["Hello, Alyssa P. Hacker!", "Hello, Ben Bitdiddle!"] + assert "hello_caller.messages" in outputs + assert outputs["hello_caller.messages"] == [ + "Hello, Alyssa P. Hacker!", + "Hello, Ben Bitdiddle!", + ] @needs_singularity_or_docker def test_miniwdl_self_test_by_reference(self) -> None: @@ -407,42 +513,47 @@ def test_giraffe_deepvariant(self): # TODO: enable test if nvidia-container-runtime and Singularity are installed but Docker isn't. json_dir = self._createTempDir() - base_uri = 'https://raw.githubusercontent.com/vgteam/vg_wdl/65dd739aae765f5c4dedd14f2e42d5a263f9267a' + base_uri = "https://raw.githubusercontent.com/vgteam/vg_wdl/65dd739aae765f5c4dedd14f2e42d5a263f9267a" wdl_file = f"{base_uri}/workflows/giraffe_and_deepvariant.wdl" - json_file = os.path.abspath(os.path.join(json_dir, 'inputs.json')) - with open(json_file, 'w') as fp: + json_file = os.path.abspath(os.path.join(json_dir, "inputs.json")) + with open(json_file, "w") as fp: # Write some inputs. We need to override the example inputs to use a GPU container, but that means we need absolute input URLs. - json.dump(fp, { - "GiraffeDeepVariant.INPUT_READ_FILE_1": f"{base_uri}/tests/small_sim_graph/reads_1.fastq.gz", - "GiraffeDeepVariant.INPUT_READ_FILE_2": f"{base_uri}/tests/small_sim_graph/reads_2.fastq.gz", - "GiraffeDeepVariant.XG_FILE": f"{base_uri}/tests/small_sim_graph/graph.xg", - "GiraffeDeepVariant.SAMPLE_NAME": "s0", - "GiraffeDeepVariant.GBWT_FILE": f"{base_uri}/tests/small_sim_graph/graph.gbwt", - "GiraffeDeepVariant.GGBWT_FILE": f"{base_uri}/tests/small_sim_graph/graph.gg", - "GiraffeDeepVariant.MIN_FILE": f"{base_uri}/tests/small_sim_graph/graph.min", - "GiraffeDeepVariant.DIST_FILE": f"{base_uri}/tests/small_sim_graph/graph.dist", - "GiraffeDeepVariant.OUTPUT_GAF": True, - "GiraffeDeepVariant.runDeepVariantCallVariants.in_dv_gpu_container": "google/deepvariant:1.3.0-gpu" - }) + json.dump( + fp, + { + "GiraffeDeepVariant.INPUT_READ_FILE_1": f"{base_uri}/tests/small_sim_graph/reads_1.fastq.gz", + "GiraffeDeepVariant.INPUT_READ_FILE_2": f"{base_uri}/tests/small_sim_graph/reads_2.fastq.gz", + "GiraffeDeepVariant.XG_FILE": f"{base_uri}/tests/small_sim_graph/graph.xg", + "GiraffeDeepVariant.SAMPLE_NAME": "s0", + "GiraffeDeepVariant.GBWT_FILE": f"{base_uri}/tests/small_sim_graph/graph.gbwt", + "GiraffeDeepVariant.GGBWT_FILE": f"{base_uri}/tests/small_sim_graph/graph.gg", + "GiraffeDeepVariant.MIN_FILE": f"{base_uri}/tests/small_sim_graph/graph.min", + "GiraffeDeepVariant.DIST_FILE": f"{base_uri}/tests/small_sim_graph/graph.dist", + "GiraffeDeepVariant.OUTPUT_GAF": True, + "GiraffeDeepVariant.runDeepVariantCallVariants.in_dv_gpu_container": "google/deepvariant:1.3.0-gpu", + }, + ) result_json = subprocess.check_output( - self.base_command + [wdl_file, json_file, '-o', self.output_dir, '--outputDialect', 'miniwdl']) + self.base_command + + [wdl_file, json_file, "-o", self.output_dir, "--outputDialect", "miniwdl"] + ) result = json.loads(result_json) # Expect MiniWDL-style output with a designated "dir" - assert 'dir' in result - assert isinstance(result['dir'], str) - out_dir = result['dir'] + assert "dir" in result + assert isinstance(result["dir"], str) + out_dir = result["dir"] - assert 'outputs' in result - assert isinstance(result['outputs'], dict) - outputs = result['outputs'] + assert "outputs" in result + assert isinstance(result["outputs"], dict) + outputs = result["outputs"] # Expect a VCF file to have been written - assert 'GiraffeDeepVariant.output_vcf' in outputs - assert isinstance(outputs['GiraffeDeepVariant.output_vcf'], str) - assert os.path.exists(outputs['GiraffeDeepVariant.output_vcf']) + assert "GiraffeDeepVariant.output_vcf" in outputs + assert isinstance(outputs["GiraffeDeepVariant.output_vcf"], str) + assert os.path.exists(outputs["GiraffeDeepVariant.output_vcf"]) @slow @needs_singularity_or_docker @@ -453,53 +564,56 @@ def test_giraffe(self): # TODO: Skip if node lacks enough memory. json_dir = self._createTempDir() - base_uri = 'https://raw.githubusercontent.com/vgteam/vg_wdl/65dd739aae765f5c4dedd14f2e42d5a263f9267a' + base_uri = "https://raw.githubusercontent.com/vgteam/vg_wdl/65dd739aae765f5c4dedd14f2e42d5a263f9267a" wdl_file = f"{base_uri}/workflows/giraffe.wdl" json_file = f"{base_uri}/params/giraffe.json" result_json = subprocess.check_output( - self.base_command + [ + self.base_command + + [ wdl_file, json_file, - '-o', + "-o", self.output_dir, - '--outputDialect', - 'miniwdl', - '--scale', - '0.1', - '--logDebug', + "--outputDialect", + "miniwdl", + "--scale", + "0.1", + "--logDebug", ] ) result = json.loads(result_json) # Expect MiniWDL-style output with a designated "dir" - assert 'dir' in result - assert isinstance(result['dir'], str) - out_dir = result['dir'] + assert "dir" in result + assert isinstance(result["dir"], str) + out_dir = result["dir"] - assert 'outputs' in result - assert isinstance(result['outputs'], dict) - outputs = result['outputs'] + assert "outputs" in result + assert isinstance(result["outputs"], dict) + outputs = result["outputs"] # Expect a BAM file to have been written - assert 'Giraffe.output_bam' in outputs - assert isinstance(outputs['Giraffe.output_bam'], str) - assert os.path.exists(outputs['Giraffe.output_bam']) + assert "Giraffe.output_bam" in outputs + assert isinstance(outputs["Giraffe.output_bam"], str) + assert os.path.exists(outputs["Giraffe.output_bam"]) @needs_singularity_or_docker @needs_google_storage def test_gs_uri(self): """Test if Toil can access Google Storage URIs.""" - wdl = os.path.abspath('src/toil/test/wdl/md5sum/md5sum.1.0.wdl') - json_file = os.path.abspath('src/toil/test/wdl/md5sum/md5sum-gs.json') + wdl = os.path.abspath("src/toil/test/wdl/md5sum/md5sum.1.0.wdl") + json_file = os.path.abspath("src/toil/test/wdl/md5sum/md5sum-gs.json") - result_json = subprocess.check_output(self.base_command + [wdl, json_file, '-o', self.output_dir, '--logDebug']) + result_json = subprocess.check_output( + self.base_command + [wdl, json_file, "-o", self.output_dir, "--logDebug"] + ) result = json.loads(result_json) - assert 'ga4ghMd5.value' in result - assert isinstance(result['ga4ghMd5.value'], str) - assert os.path.exists(result['ga4ghMd5.value']) - assert os.path.basename(result['ga4ghMd5.value']) == 'md5sum.txt' + assert "ga4ghMd5.value" in result + assert isinstance(result["ga4ghMd5.value"], str) + assert os.path.exists(result["ga4ghMd5.value"]) + assert os.path.basename(result["ga4ghMd5.value"]) == "md5sum.txt" class WDLToilBenchTests(ToilTest): @@ -514,9 +628,9 @@ def test_coalesce(self): # Set up data structures for our fake workflow graph to pull from. # This has all decl-type nodes - all_decls: Set[str] = set() + all_decls: set[str] = set() # And this has all transitive dependencies for all nodes. - all_deps: Dict[str, Set[str]] = {} + all_deps: dict[str, set[str]] = {} def mock_is_decl(self: Any, node_id: str) -> bool: """ @@ -524,7 +638,7 @@ def mock_is_decl(self: Any, node_id: str) -> bool: """ return node_id in all_decls - def mock_get_transitive_dependencies(self: Any, node_id: str) -> Set[str]: + def mock_get_transitive_dependencies(self: Any, node_id: str) -> set[str]: """ Replacement function to get all the transitive dependencies of a node. """ @@ -536,17 +650,20 @@ def mock_get_transitive_dependencies(self: Any, node_id: str) -> Set[str]: # # If that changes, the test will need to change! Maybe then it will be # worth extracting a base type for this interface. - with patch.object(WDLWorkflowGraph, 'is_decl', mock_is_decl): - with patch.object(WDLWorkflowGraph, 'get_transitive_dependencies', mock_get_transitive_dependencies): + with patch.object(WDLWorkflowGraph, "is_decl", mock_is_decl): + with patch.object( + WDLWorkflowGraph, + "get_transitive_dependencies", + mock_get_transitive_dependencies, + ): with self.subTest(msg="Two unrelated decls can coalesce"): # Set up two unrelated decls all_decls = {"decl1", "decl2"} - all_deps = { - "decl1": set(), - "decl2": set() - } + all_deps = {"decl1": set(), "decl2": set()} - result = WDLSectionJob.coalesce_nodes(["decl1", "decl2"], WDLWorkflowGraph([])) + result = WDLSectionJob.coalesce_nodes( + ["decl1", "decl2"], WDLWorkflowGraph([]) + ) # Make sure they coalesced assert len(result) == 1 @@ -555,61 +672,59 @@ def mock_get_transitive_dependencies(self: Any, node_id: str) -> Set[str]: with self.subTest(msg="A decl will not coalesce with a non-decl"): all_decls = {"decl"} - all_deps = { - "decl": set(), - "nondecl": set() - } + all_deps = {"decl": set(), "nondecl": set()} - result = WDLSectionJob.coalesce_nodes(["decl", "nondecl"], WDLWorkflowGraph([])) + result = WDLSectionJob.coalesce_nodes( + ["decl", "nondecl"], WDLWorkflowGraph([]) + ) assert len(result) == 2 assert len(result[0]) == 1 assert len(result[1]) == 1 - with self.subTest(msg="Two adjacent decls with a common dependency can coalesce"): + with self.subTest( + msg="Two adjacent decls with a common dependency can coalesce" + ): all_decls = {"decl1", "decl2"} - all_deps = { - "decl1": {"base"}, - "decl2": {"base"}, - "base": set() - } + all_deps = {"decl1": {"base"}, "decl2": {"base"}, "base": set()} - result = WDLSectionJob.coalesce_nodes(["base", "decl1", "decl2"], WDLWorkflowGraph([])) + result = WDLSectionJob.coalesce_nodes( + ["base", "decl1", "decl2"], WDLWorkflowGraph([]) + ) assert len(result) == 2 assert "base" in result[0] assert "decl1" in result[1] assert "decl2" in result[1] - with self.subTest(msg="Two adjacent decls with different dependencies will not coalesce"): + with self.subTest( + msg="Two adjacent decls with different dependencies will not coalesce" + ): all_decls = {"decl1", "decl2"} - all_deps = { - "decl1": {"base"}, - "decl2": set(), - "base": set() - } + all_deps = {"decl1": {"base"}, "decl2": set(), "base": set()} - result = WDLSectionJob.coalesce_nodes(["base", "decl1", "decl2"], WDLWorkflowGraph([])) + result = WDLSectionJob.coalesce_nodes( + ["base", "decl1", "decl2"], WDLWorkflowGraph([]) + ) assert len(result) == 3 assert "base" in result[0] - with self.subTest(msg="Two adjacent decls with different successors will coalesce"): + with self.subTest( + msg="Two adjacent decls with different successors will coalesce" + ): all_decls = {"decl1", "decl2"} - all_deps = { - "decl1": set(), - "decl2": set(), - "successor": {"decl2"} - } + all_deps = {"decl1": set(), "decl2": set(), "successor": {"decl2"}} - result = WDLSectionJob.coalesce_nodes(["decl1", "decl2", "successor"], WDLWorkflowGraph([])) + result = WDLSectionJob.coalesce_nodes( + ["decl1", "decl2", "successor"], WDLWorkflowGraph([]) + ) assert len(result) == 2 assert "decl1" in result[0] assert "decl2" in result[0] assert "successor" in result[1] - def make_string_expr(self, to_parse: str) -> WDL.Expr.String: """ Parse pseudo-WDL for testing whitespace removal. @@ -617,7 +732,7 @@ def make_string_expr(self, to_parse: str) -> WDL.Expr.String: pos = WDL.Error.SourcePosition("nowhere", "nowhere", 0, 0, 0, 0) - parts: List[Union[str, WDL.Expr.Placeholder]] = re.split("(~{[^}]*})", to_parse) + parts: list[Union[str, WDL.Expr.Placeholder]] = re.split("(~{[^}]*})", to_parse) for i in range(1, len(parts), 2): parts[i] = WDL.Expr.Placeholder(pos, {}, WDL.Expr.Null(pos)) @@ -717,7 +832,9 @@ def test_remove_common_leading_whitespace(self): assert trimmed.parts[0] == "" # An empty expression works - expr = WDL.Expr.String(WDL.Error.SourcePosition("nowhere", "nowhere", 0, 0, 0, 0), []) + expr = WDL.Expr.String( + WDL.Error.SourcePosition("nowhere", "nowhere", 0, 0, 0, 0), [] + ) trimmed = remove_common_leading_whitespace(expr) assert len(trimmed.parts) == 0 @@ -741,22 +858,31 @@ def test_choose_human_readable_directory(self): Test to make sure that we pick sensible but non-colliding directories to put files in. """ - from toil.wdl.wdltoil import choose_human_readable_directory, DirectoryNamingStateDict + from toil.wdl.wdltoil import ( + DirectoryNamingStateDict, + choose_human_readable_directory, + ) state: DirectoryNamingStateDict = {} # The first time we should get apath with the task name and without the ID - first_chosen = choose_human_readable_directory("root", "taskname", "111-222-333", state) + first_chosen = choose_human_readable_directory( + "root", "taskname", "111-222-333", state + ) assert first_chosen.startswith("root") assert "taskname" in first_chosen assert "111-222-333" not in first_chosen # If we use the same ID we should get the same result - same_id = choose_human_readable_directory("root", "taskname", "111-222-333", state) + same_id = choose_human_readable_directory( + "root", "taskname", "111-222-333", state + ) self.assertEqual(same_id, first_chosen) # If we use a different ID we shoudl get a different result still obeying the constraints - diff_id = choose_human_readable_directory("root", "taskname", "222-333-444", state) + diff_id = choose_human_readable_directory( + "root", "taskname", "222-333-444", state + ) self.assertNotEqual(diff_id, first_chosen) assert diff_id.startswith("root") assert "taskname" in diff_id diff --git a/src/toil/test/wdl/wdltoil_test_kubernetes.py b/src/toil/test/wdl/wdltoil_test_kubernetes.py index 77a804f4ff..26d4962c17 100644 --- a/src/toil/test/wdl/wdltoil_test_kubernetes.py +++ b/src/toil/test/wdl/wdltoil_test_kubernetes.py @@ -1,13 +1,16 @@ import unittest - -from toil.test.provisioners.clusterTest import AbstractClusterTest from uuid import uuid4 import pytest from toil.provisioners import cluster_factory -from toil.test import (slow, integrative) -from toil.test.wdl.wdltoil_test import WDL_CONFORMANCE_TEST_REPO, WDL_CONFORMANCE_TEST_COMMIT +from toil.test import integrative, slow +from toil.test.provisioners.clusterTest import AbstractClusterTest +from toil.test.wdl.wdltoil_test import ( + WDL_CONFORMANCE_TEST_COMMIT, + WDL_CONFORMANCE_TEST_REPO, +) + @integrative @slow @@ -19,7 +22,7 @@ class WDLKubernetesClusterTest(AbstractClusterTest): def __init__(self, name): super().__init__(name) - self.clusterName = 'wdl-integration-test-' + str(uuid4()) + self.clusterName = "wdl-integration-test-" + str(uuid4()) # t2.medium is the minimum t2 instance that permits Kubernetes self.leaderNodeType = "t2.medium" self.instanceTypes = ["t2.medium"] @@ -27,13 +30,21 @@ def __init__(self, name): def setUp(self) -> None: super().setUp() - self.jobStore = f'aws:{self.awsRegion()}:wdl-test-{uuid4()}' + self.jobStore = f"aws:{self.awsRegion()}:wdl-test-{uuid4()}" def launchCluster(self) -> None: - self.createClusterUtil(args=['--leaderStorage', str(self.requestedLeaderStorage), - '--nodeTypes', ",".join(self.instanceTypes), - '-w', ",".join(self.numWorkers), - '--nodeStorage', str(self.requestedLeaderStorage)]) + self.createClusterUtil( + args=[ + "--leaderStorage", + str(self.requestedLeaderStorage), + "--nodeTypes", + ",".join(self.instanceTypes), + "-w", + ",".join(self.numWorkers), + "--nodeStorage", + str(self.requestedLeaderStorage), + ] + ) def test_wdl_kubernetes_cluster(self): """ @@ -54,22 +65,26 @@ def test_wdl_kubernetes_cluster(self): wdl_dir = "wdl_conformance_tests" # get the wdl-conformance-tests repo to get WDL tasks to run - self.sshUtil([ - "bash", - "-c", - f"git clone {WDL_CONFORMANCE_TEST_REPO} {wdl_dir} && cd {wdl_dir} && git checkout {WDL_CONFORMANCE_TEST_COMMIT}" - ]) + self.sshUtil( + [ + "bash", + "-c", + f"git clone {WDL_CONFORMANCE_TEST_REPO} {wdl_dir} && cd {wdl_dir} && git checkout {WDL_CONFORMANCE_TEST_COMMIT}", + ] + ) # run on kubernetes batchsystem - toil_options = ['--batchSystem=kubernetes', - f"--jobstore={self.jobStore}"] + toil_options = ["--batchSystem=kubernetes", f"--jobstore={self.jobStore}"] # run WDL workflow that will run singularity test_options = [f"tests/md5sum/md5sum.wdl", f"tests/md5sum/md5sum.json"] - self.sshUtil([ - "bash", - "-c", - f"cd {wdl_dir} && toil-wdl-runner {' '.join(test_options)} {' '.join(toil_options)}"]) + self.sshUtil( + [ + "bash", + "-c", + f"cd {wdl_dir} && toil-wdl-runner {' '.join(test_options)} {' '.join(toil_options)}", + ] + ) if __name__ == "__main__": diff --git a/src/toil/toilState.py b/src/toil/toilState.py index aaa8c45b77..642296a116 100644 --- a/src/toil/toilState.py +++ b/src/toil/toilState.py @@ -13,12 +13,11 @@ # limitations under the License. import logging import time -from typing import Dict, Optional, Set +from typing import Optional from toil.bus import JobUpdatedMessage, MessageBus from toil.job import CheckpointJobDescription, JobDescription -from toil.jobStores.abstractJobStore import (AbstractJobStore, - NoSuchJobException) +from toil.jobStores.abstractJobStore import AbstractJobStore, NoSuchJobException logger = logging.getLogger(__name__) @@ -63,47 +62,47 @@ def __init__( # This holds the one true copy of every JobDescription in the leader. # TODO: Do in-place update instead of assignment when we load so we # can't let any non-true copies escape. - self.__job_database: Dict[str, JobDescription] = {} + self.__job_database: dict[str, JobDescription] = {} # Maps from successor (child or follow-on) jobStoreID to predecessor jobStoreIDs - self.successor_to_predecessors: Dict[str, Set[str]] = {} + self.successor_to_predecessors: dict[str, set[str]] = {} # Hash of jobStoreIDs to counts of numbers of successors issued. # There are no entries for jobs without successors in this map. - self.successorCounts: Dict[str, int] = {} + self.successorCounts: dict[str, int] = {} # This is a hash of service jobs, referenced by jobStoreID, to their client's ID - self.service_to_client: Dict[str, str] = {} + self.service_to_client: dict[str, str] = {} # Holds, for each client job ID, the job IDs of its services that are # possibly currently issued. Includes every service host that has been # given to the service manager by the leader, and hasn't been seen by # the leader as stopped yet. - self.servicesIssued: Dict[str, Set[str]] = {} + self.servicesIssued: dict[str, set[str]] = {} # Holds the IDs of jobs that are currently issued to the batch system # and haven't come back yet. # TODO: a bit redundant with leader's issued_jobs_by_batch_system_id - self.jobs_issued: Set[str] = set() + self.jobs_issued: set[str] = set() # The set of totally failed jobs - this needs to be filtered at the # end to remove jobs that were removed by checkpoints - self.totalFailedJobs: Set[str] = set() + self.totalFailedJobs: set[str] = set() # Jobs (as jobStoreIDs) with successors that have totally failed - self.hasFailedSuccessors: Set[str] = set() + self.hasFailedSuccessors: set[str] = set() # The set of successors of failed jobs as a set of jobStoreIds - self.failedSuccessors: Set[str] = set() + self.failedSuccessors: set[str] = set() # Set of jobs that have multiple predecessors that have one or more predecessors # finished, but not all of them. - self.jobsToBeScheduledWithMultiplePredecessors: Set[str] = set() + self.jobsToBeScheduledWithMultiplePredecessors: set[str] = set() def load_workflow( self, rootJob: JobDescription, - jobCache: Optional[Dict[str, JobDescription]] = None + jobCache: Optional[dict[str, JobDescription]] = None, ) -> None: """ Load the workflow rooted at the given job. @@ -281,7 +280,11 @@ def successor_returned(self, predecessor_id: str) -> None: ) else: self.successorCounts[predecessor_id] -= 1 - logger.debug("Successors: one fewer for %s, now have %d", predecessor_id, self.successorCounts[predecessor_id]) + logger.debug( + "Successors: one fewer for %s, now have %d", + predecessor_id, + self.successorCounts[predecessor_id], + ) if self.successorCounts[predecessor_id] == 0: del self.successorCounts[predecessor_id] @@ -296,7 +299,6 @@ def count_pending_successors(self, predecessor_id: str) -> int: else: return self.successorCounts[predecessor_id] - def _buildToilState(self, jobDesc: JobDescription) -> None: """ Build the ToilState class from the subtree root JobDescription. @@ -330,7 +332,10 @@ def _buildToilState(self, jobDesc: JobDescription) -> None: # Set the job updated because we should be able to make progress on it. self.bus.publish(JobUpdatedMessage(str(jobDesc.jobStoreID), 0)) - if isinstance(jobDesc, CheckpointJobDescription) and jobDesc.checkpoint is not None: + if ( + isinstance(jobDesc, CheckpointJobDescription) + and jobDesc.checkpoint is not None + ): jobDesc.restore_checkpoint() else: # There exist successors @@ -345,7 +350,9 @@ def _buildToilState(self, jobDesc: JobDescription) -> None: jobDesc.nextSuccessors() or set() ) - def processSuccessorWithMultiplePredecessors(successor: JobDescription) -> None: + def processSuccessorWithMultiplePredecessors( + successor: JobDescription, + ) -> None: # If jobDesc is not reported as complete by the successor if jobDesc.jobStoreID not in successor.predecessorsFinished: @@ -354,11 +361,15 @@ def processSuccessorWithMultiplePredecessors(successor: JobDescription) -> None: # If the successor has no predecessors to finish if len(successor.predecessorsFinished) > successor.predecessorNumber: - raise RuntimeError("There are more finished predecessors than possible.") + raise RuntimeError( + "There are more finished predecessors than possible." + ) if len(successor.predecessorsFinished) == successor.predecessorNumber: # It is ready to be run, so remove it from the set of waiting jobs - self.jobsToBeScheduledWithMultiplePredecessors.remove(successorJobStoreID) + self.jobsToBeScheduledWithMultiplePredecessors.remove( + successorJobStoreID + ) # Recursively consider the successor self._buildToilState(successor) @@ -383,9 +394,16 @@ def processSuccessorWithMultiplePredecessors(successor: JobDescription) -> None: # We put the successor job in the set of waiting successor # jobs with multiple predecessors - if successorJobStoreID in self.jobsToBeScheduledWithMultiplePredecessors: - raise RuntimeError("Failed to schedule the successor job. The successor job is already scheduled.") - self.jobsToBeScheduledWithMultiplePredecessors.add(successorJobStoreID) + if ( + successorJobStoreID + in self.jobsToBeScheduledWithMultiplePredecessors + ): + raise RuntimeError( + "Failed to schedule the successor job. The successor job is already scheduled." + ) + self.jobsToBeScheduledWithMultiplePredecessors.add( + successorJobStoreID + ) # Process successor processSuccessorWithMultiplePredecessors(successor) @@ -399,14 +417,22 @@ def processSuccessorWithMultiplePredecessors(successor: JobDescription) -> None: # We've already seen the successor # Add the job as a predecessor - if jobDesc.jobStoreID in self.successor_to_predecessors[successorJobStoreID]: - raise RuntimeError("Failed to add the job as a predecessor. The job is already added as a predecessor.") + if ( + jobDesc.jobStoreID + in self.successor_to_predecessors[successorJobStoreID] + ): + raise RuntimeError( + "Failed to add the job as a predecessor. The job is already added as a predecessor." + ) self.successor_to_predecessors[successorJobStoreID].add( str(jobDesc.jobStoreID) ) # If the successor has multiple predecessors - if successorJobStoreID in self.jobsToBeScheduledWithMultiplePredecessors: + if ( + successorJobStoreID + in self.jobsToBeScheduledWithMultiplePredecessors + ): # Get the successor from cache successor = self.get_job(successorJobStoreID) diff --git a/src/toil/utils/toilConfig.py b/src/toil/utils/toilConfig.py index 1e78c1d24e..5ed402ab18 100644 --- a/src/toil/utils/toilConfig.py +++ b/src/toil/utils/toilConfig.py @@ -26,11 +26,20 @@ def main() -> None: parser = ArgParser() - parser.add_argument("output", default="config.yaml", help="Filepath to write the config file too. Default=%(" - "default)s") + parser.add_argument( + "output", + default="config.yaml", + help="Filepath to write the config file too. Default=%(" "default)s", + ) add_logging_options(parser) options = parser.parse_args() set_logging_from_options(options) - logger.debug("Attempting to write a default config file to %s.", os.path.abspath(options.output)) + logger.debug( + "Attempting to write a default config file to %s.", + os.path.abspath(options.output), + ) generate_config(os.path.abspath(options.output)) - logger.info("Successfully wrote a default config file to %s.", os.path.abspath(options.output)) + logger.info( + "Successfully wrote a default config file to %s.", + os.path.abspath(options.output), + ) diff --git a/src/toil/utils/toilDebugFile.py b/src/toil/utils/toilDebugFile.py index b8abcea4bc..f78b43e7d3 100644 --- a/src/toil/utils/toilDebugFile.py +++ b/src/toil/utils/toilDebugFile.py @@ -20,8 +20,8 @@ from toil.common import Config, Toil, parser_with_common_options from toil.jobStores.fileJobStore import FileJobStore -from toil.lib.resources import glob from toil.lib.conversions import strtobool +from toil.lib.resources import glob from toil.statsAndLogging import set_logging_from_options logger = logging.getLogger(__name__) @@ -44,17 +44,23 @@ def fetchJobStoreFiles(jobStore: FileJobStore, options: argparse.Namespace) -> N # globbing around inside it. Does this even work? for jobStoreFile in options.fetch: - jobStoreHits = glob(directoryname=options.jobStore, - glob_pattern=jobStoreFile) + jobStoreHits = glob(directoryname=options.jobStore, glob_pattern=jobStoreFile) for jobStoreFileID in jobStoreHits: - logger.debug(f"Copying job store file: {jobStoreFileID} to {options.localFilePath[0]}") - jobStore.read_file(jobStoreFileID, - os.path.join(options.localFilePath[0], - os.path.basename(jobStoreFileID)), - symlink=options.useSymlinks) - - -def printContentsOfJobStore(job_store: FileJobStore, job_id: Optional[str] = None) -> None: + logger.debug( + f"Copying job store file: {jobStoreFileID} to {options.localFilePath[0]}" + ) + jobStore.read_file( + jobStoreFileID, + os.path.join( + options.localFilePath[0], os.path.basename(jobStoreFileID) + ), + symlink=options.useSymlinks, + ) + + +def printContentsOfJobStore( + job_store: FileJobStore, job_id: Optional[str] = None +) -> None: """ Fetch a list of all files contained in the job store if nameOfJob is not declared, otherwise it only prints out the names of files for that specific @@ -90,22 +96,33 @@ def printContentsOfJobStore(job_store: FileJobStore, job_id: Optional[str] = Non def main() -> None: parser = parser_with_common_options(jobstore_option=True, prog="toil debug-file") - parser.add_argument("--localFilePath", - nargs=1, - help="Location to which to copy job store files.") - parser.add_argument("--fetch", - nargs="+", - help="List of job-store files to be copied locally." - "Use either explicit names (i.e. 'data.txt'), or " - "specify glob patterns (i.e. '*.txt')") - parser.add_argument("--listFilesInJobStore", type=strtobool, - help="Prints a list of the current files in the jobStore.") - parser.add_argument("--fetchEntireJobStore", type=strtobool, - help="Copy all job store files into a local directory.") - parser.add_argument("--useSymlinks", type=strtobool, - help="Creates symlink 'shortcuts' of files in the localFilePath" - " instead of hardlinking or copying, where possible. If this is" - " not possible, it will copy the files (shutil.copyfile()).") + parser.add_argument( + "--localFilePath", nargs=1, help="Location to which to copy job store files." + ) + parser.add_argument( + "--fetch", + nargs="+", + help="List of job-store files to be copied locally." + "Use either explicit names (i.e. 'data.txt'), or " + "specify glob patterns (i.e. '*.txt')", + ) + parser.add_argument( + "--listFilesInJobStore", + type=strtobool, + help="Prints a list of the current files in the jobStore.", + ) + parser.add_argument( + "--fetchEntireJobStore", + type=strtobool, + help="Copy all job store files into a local directory.", + ) + parser.add_argument( + "--useSymlinks", + type=strtobool, + help="Creates symlink 'shortcuts' of files in the localFilePath" + " instead of hardlinking or copying, where possible. If this is" + " not possible, it will copy the files (shutil.copyfile()).", + ) # Load the jobStore options = parser.parse_args() diff --git a/src/toil/utils/toilDebugJob.py b/src/toil/utils/toilDebugJob.py index c4ac2f139c..47f9956876 100644 --- a/src/toil/utils/toilDebugJob.py +++ b/src/toil/utils/toilDebugJob.py @@ -17,11 +17,10 @@ import os import pprint import sys - from pathlib import Path -from typing import Optional, List, Tuple +from typing import Optional -from toil.common import Config, Toil, parser_with_common_options +from toil.common import Toil, parser_with_common_options from toil.job import FilesDownloadedStoppingPointReached from toil.jobStores.fileJobStore import FileJobStore from toil.statsAndLogging import set_logging_from_options @@ -33,23 +32,38 @@ def main() -> None: - parser = parser_with_common_options(jobstore_option=True, prog="toil debug-job", default_log_level=logging.DEBUG) - parser.add_argument("job", type=str, - help="The job store id or job name of a job within the provided jobstore") - parser.add_argument("--printJobInfo", action="store_true", - help="Dump debugging info about the job instead of running it") - parser.add_argument("--retrieveTaskDirectory", dest="retrieve_task_directory", type=str, default=None, - help="Download CWL or WDL task inputs to the given directory and stop.") + parser = parser_with_common_options( + jobstore_option=True, prog="toil debug-job", default_log_level=logging.DEBUG + ) + parser.add_argument( + "job", + type=str, + help="The job store id or job name of a job within the provided jobstore", + ) + parser.add_argument( + "--printJobInfo", + action="store_true", + help="Dump debugging info about the job instead of running it", + ) + parser.add_argument( + "--retrieveTaskDirectory", + dest="retrieve_task_directory", + type=str, + default=None, + help="Download CWL or WDL task inputs to the given directory and stop.", + ) options = parser.parse_args() set_logging_from_options(options) - if options.retrieve_task_directory is not None and os.path.exists(options.retrieve_task_directory): + if options.retrieve_task_directory is not None and os.path.exists( + options.retrieve_task_directory + ): # The logic to duplicate container mounts depends on stuff not already existing. logger.error( "The directory %s given for --retrieveTaskDirectory already exists. " "Stopping to avoid clobbering existing files.", - options.retrieve_task_directory + options.retrieve_task_directory, ) sys.exit(1) @@ -83,26 +97,45 @@ def main() -> None: if len(hits) == 0: # No hits if suggestion is None: - logger.critical("No job found with ID or name \"%s\". No jobs are completely failed.", options.job) + logger.critical( + 'No job found with ID or name "%s". No jobs are completely failed.', + options.job, + ) else: - logger.critical("No job found with ID or name \"%s\". How about the failed job %s instead?", options.job, suggestion) + logger.critical( + 'No job found with ID or name "%s". How about the failed job %s instead?', + options.job, + suggestion, + ) sys.exit(1) elif len(hits) > 1: # Several hits, maybe only one has failed completely_failed_hits = [job for job in hits if job.remainingTryCount == 0] if len(completely_failed_hits) == 0: - logger.critical("Multiple jobs match \"%s\" but none are completely failed: %s", options.job, hits) + logger.critical( + 'Multiple jobs match "%s" but none are completely failed: %s', + options.job, + hits, + ) sys.exit(1) elif len(completely_failed_hits) > 0: - logger.critical("Multiple jobs matching \"%s\" are completely failed: %s", options.job, completely_failed_hits) + logger.critical( + 'Multiple jobs matching "%s" are completely failed: %s', + options.job, + completely_failed_hits, + ) sys.exit(1) else: # We found one completely failed job, they probably mean that one. - logger.info("There are %s jobs matching \"%s\"; assuming you mean the failed one: %s", options.job, completely_failed_hits[0]) + logger.info( + 'There are %s jobs matching "%s"; assuming you mean the failed one: %s', + options.job, + completely_failed_hits[0], + ) job_id = completely_failed_hits[0].jobStoreID else: # We found one job with this name, so they must mean that one - logger.info("Looked up job named \"%s\": %s", options.job, hits[0]) + logger.info('Looked up job named "%s": %s', options.job, hits[0]) job_id = hits[0].jobStoreID if options.printJobInfo: @@ -122,19 +155,29 @@ def main() -> None: local_worker_temp_dir = None if options.retrieve_task_directory is not None: # Pick a directory in it (which may be removed by the worker) as the worker's temp dir. - local_worker_temp_dir = os.path.join(options.retrieve_task_directory, "worker") + local_worker_temp_dir = os.path.join( + options.retrieve_task_directory, "worker" + ) # Make sure it exists os.makedirs(local_worker_temp_dir, exist_ok=True) # And tell the job to just download files debug_flags.add("download_only") # We might need to reconstruct a container environment. - host_and_job_paths: Optional[List[Tuple[str, str]]] = None + host_and_job_paths: Optional[list[tuple[str, str]]] = None # Track if the run succeeded without error run_succeeded = False logger.info(f"Running the following job locally: {job_id}") try: - workerScript(jobStore, config, job_id, job_id, redirect_output_to_log_file=False, local_worker_temp_dir=local_worker_temp_dir, debug_flags=debug_flags) + workerScript( + jobStore, + config, + job_id, + job_id, + redirect_output_to_log_file=False, + local_worker_temp_dir=local_worker_temp_dir, + debug_flags=debug_flags, + ) except FilesDownloadedStoppingPointReached as e: # We asked for the files to be downloaded and now they are. assert options.retrieve_task_directory is not None @@ -165,21 +208,37 @@ def main() -> None: for host_path, job_path in sorted_mounts: if not os.path.exists(host_path): - logger.error("Job intended to mount %s as %s but it does not exist!", host_path, job_path) + logger.error( + "Job intended to mount %s as %s but it does not exist!", + host_path, + job_path, + ) continue if not job_path.startswith("/"): - logger.error("Job intended to mount %s as %s but destination is a relative path!", host_path, job_path) + logger.error( + "Job intended to mount %s as %s but destination is a relative path!", + host_path, + job_path, + ) continue # Drop the slash because we are building a chroot-ish mini filesystem. job_relative_path = job_path[1:] if job_relative_path.startswith("/"): # We are having trouble understanding what the job # intended to do. Stop working on this mount. - logger.error("Job intended to mount %s as %s but destination starts with multiple slashes for some reason!", host_path, job_path) + logger.error( + "Job intended to mount %s as %s but destination starts with multiple slashes for some reason!", + host_path, + job_path, + ) continue fake_job_path = os.path.join(fake_job_root, job_relative_path) if os.path.exists(fake_job_path): - logger.error("Job intended to mount %s as %s but that location is already mounted!", host_path, job_path) + logger.error( + "Job intended to mount %s as %s but that location is already mounted!", + host_path, + job_path, + ) continue logger.info("Job mounted %s as %s", host_path, job_path) diff --git a/src/toil/utils/toilDestroyCluster.py b/src/toil/utils/toilDestroyCluster.py index 7239084842..948d3b2bb1 100644 --- a/src/toil/utils/toilDestroyCluster.py +++ b/src/toil/utils/toilDestroyCluster.py @@ -20,16 +20,21 @@ logger = logging.getLogger(__name__) + def main() -> None: - parser = parser_with_common_options(provisioner_options=True, jobstore_option=False, prog="toil destroy-cluster") + parser = parser_with_common_options( + provisioner_options=True, jobstore_option=False, prog="toil destroy-cluster" + ) options = parser.parse_args() set_logging_from_options(options) - logger.info('Destroying cluster %s', options.clusterName) + logger.info("Destroying cluster %s", options.clusterName) - cluster = cluster_factory(provisioner=options.provisioner, - clusterName=options.clusterName, - zone=options.zone) + cluster = cluster_factory( + provisioner=options.provisioner, + clusterName=options.clusterName, + zone=options.zone, + ) cluster.destroyCluster() - logger.info('Cluster %s is now gone.', options.clusterName) + logger.info("Cluster %s is now gone.", options.clusterName) diff --git a/src/toil/utils/toilKill.py b/src/toil/utils/toilKill.py index c83dff1c73..ad4f1b8123 100644 --- a/src/toil/utils/toilKill.py +++ b/src/toil/utils/toilKill.py @@ -26,8 +26,11 @@ def main() -> None: parser = parser_with_common_options(prog="toil kill") - parser.add_argument('--force', action='store_true', - help="Send SIGKILL to the leader process if local.") + parser.add_argument( + "--force", + action="store_true", + help="Send SIGKILL to the leader process if local.", + ) options = parser.parse_args() set_logging_from_options(options) config = Config() @@ -65,7 +68,9 @@ def main() -> None: os.kill(pid_to_kill, signal.SIGKILL if options.force else signal.SIGTERM) logger.info("Toil process %i successfully terminated.", pid_to_kill) except OSError: - logger.error("Could not signal process %i. Is it still running?", pid_to_kill) + logger.error( + "Could not signal process %i. Is it still running?", pid_to_kill + ) sys.exit(1) else: # Flip the flag inside the job store to signal kill diff --git a/src/toil/utils/toilLaunchCluster.py b/src/toil/utils/toilLaunchCluster.py index 328f76401d..f2972ef487 100644 --- a/src/toil/utils/toilLaunchCluster.py +++ b/src/toil/utils/toilLaunchCluster.py @@ -15,28 +15,27 @@ import logging import os -from typing import Dict, List, Tuple, Union +from typing import Union from toil import applianceSelf from toil.common import parser_with_common_options + try: from toil.lib.aws import build_tag_dict_from_env except ModuleNotFoundError: - build_tag_dict_from_env: Dict[str, str] = lambda _: {} # type: ignore[no-redef] + build_tag_dict_from_env: dict[str, str] = lambda _: {} # type: ignore[no-redef] from toil.lib.conversions import opt_strtobool -from toil.provisioners import (check_valid_node_types, - cluster_factory, - parse_node_types) +from toil.provisioners import check_valid_node_types, cluster_factory, parse_node_types from toil.statsAndLogging import set_logging_from_options logger = logging.getLogger(__name__) -def create_tags_dict(tags: List[str]) -> Dict[str, str]: +def create_tags_dict(tags: list[str]) -> dict[str, str]: tags_dict = dict() for tag in tags: try: - key, value = tag.split('=') + key, value = tag.split("=") except ValueError: logger.error("Tag specification '%s' must contain '='", tag) raise @@ -45,88 +44,160 @@ def create_tags_dict(tags: List[str]) -> Dict[str, str]: def main() -> None: - parser = parser_with_common_options(provisioner_options=True, jobstore_option=False, prog="toil launch-cluster") - parser.add_argument("-T", "--clusterType", dest="clusterType", - choices=['mesos', 'kubernetes'], - default=None, # TODO: change default to "kubernetes" when we are ready. - help="Cluster scheduler to use.") - parser.add_argument("--leaderNodeType", dest="leaderNodeType", required=True, - help="Non-preemptible node type to use for the cluster leader.") - parser.add_argument("--keyPairName", dest='keyPairName', - help="On AWS, the name of the AWS key pair to include on the instance." - " On Google/GCE, this is the ssh key pair.") - parser.add_argument("--owner", dest='owner', - help="The owner tag for all instances. If not given, the value in" - "TOIL_OWNER_TAG will be used, or else the value of --keyPairName.") - parser.add_argument("--boto", dest='botoPath', - help="The path to the boto credentials directory. This is transferred " - "to all nodes in order to access the AWS jobStore from non-AWS instances.") - parser.add_argument("-t", "--tag", metavar='NAME=VALUE', dest='tags', - default=[], action='append', - help="Tags are added to the AWS cluster for this node and all of its " - "children. Tags are of the form:\n" - " -t key1=value1 --tag key2=value2\n" - "Multiple tags are allowed and each tag needs its own flag. By " - "default the cluster is tagged with " - " {\n" - " \"Name\": clusterName,\n" - " \"Owner\": IAM username\n" - " }. ") - - parser.add_argument("--network", - help="GCE cloud network to use. default: 'default'") - parser.add_argument("--vpcSubnet", - help="VPC subnet ID to launch cluster leader in. Uses default subnet " - "if not specified. This subnet needs to have auto assign IPs turned on.") - parser.add_argument("--use_private_ip", dest="use_private_ip", action='store_true', default=False, - help="if specified, ignore the public ip of the nodes") - parser.add_argument("--nodeTypes", dest='nodeTypes', default=None, type=str, - help="Specifies a list of comma-separated node types, each of which is " - "composed of slash-separated instance types, and an optional spot " - "bid set off by a colon, making the node type preemptible. Instance " - "types may appear in multiple node types, and the same node type " - "may appear as both preemptible and non-preemptible.\n" - "Valid argument specifying two node types:\n" - "\tc5.4xlarge/c5a.4xlarge:0.42,t2.large\n" - "Node types:\n" - "\tc5.4xlarge/c5a.4xlarge:0.42 and t2.large\n" - "Instance types:\n" - "\tc5.4xlarge, c5a.4xlarge, and t2.large\n" - "Semantics:\n" - "\tBid $0.42/hour for either c5.4xlarge or c5a.4xlarge instances,\n" - "\ttreated interchangeably, while they are available at that price,\n" - "\tand buy t2.large instances at full price\n" - "Must also provide the --workers argument to specify how many " - "workers of each node type to create.") - parser.add_argument("-w", "--workers", dest='workers', default=None, type=str, - help="Comma-separated list of the ranges of numbers of workers of each " - "node type to launch, such as '0-2,5,1-3'. If a range is given, " - "workers will automatically be launched and terminated by the cluster " - "to auto-scale to the workload.") - parser.add_argument("--leaderStorage", dest='leaderStorage', type=int, default=50, - help="Specify the size (in gigabytes) of the root volume for the leader " - "instance. This is an EBS volume.") - parser.add_argument("--nodeStorage", dest='nodeStorage', type=int, default=50, - help="Specify the size (in gigabytes) of the root volume for any worker " - "instances created when using the -w flag. This is an EBS volume.") - parser.add_argument('--forceDockerAppliance', dest='forceDockerAppliance', action='store_true', - default=False, - help="Disables sanity checking the existence of the docker image specified " - "by TOIL_APPLIANCE_SELF, which Toil uses to provision mesos for " - "autoscaling.") - parser.add_argument('--awsEc2ProfileArn', dest='awsEc2ProfileArn', default=None, type=str, - help="If provided, the specified ARN is used as the instance profile for EC2 instances." - "Useful for setting custom IAM profiles. If not specified, a new IAM role is created " - "by default with sufficient access to perform basic cluster operations.") - parser.add_argument('--awsEc2ExtraSecurityGroupId', dest='awsEc2ExtraSecurityGroupIds', default=[], action='append', - help="Any additional security groups to attach to EC2 instances. Note that a security group " - "with its name equal to the cluster name will always be created, thus ensure that " - "the extra security groups do not have the same name as the cluster name.") - parser.add_argument("--allowFuse", type=opt_strtobool, default=True, - help="Enable both the leader and worker nodes to be able to run Singularity with FUSE. For " - "Kubernetes, this will make the leader privileged and ask workers to run as privileged. " - "(default: %(default)s)") - #TODO Set Aws Profile in CLI options + parser = parser_with_common_options( + provisioner_options=True, jobstore_option=False, prog="toil launch-cluster" + ) + parser.add_argument( + "-T", + "--clusterType", + dest="clusterType", + choices=["mesos", "kubernetes"], + default=None, # TODO: change default to "kubernetes" when we are ready. + help="Cluster scheduler to use.", + ) + parser.add_argument( + "--leaderNodeType", + dest="leaderNodeType", + required=True, + help="Non-preemptible node type to use for the cluster leader.", + ) + parser.add_argument( + "--keyPairName", + dest="keyPairName", + help="On AWS, the name of the AWS key pair to include on the instance." + " On Google/GCE, this is the ssh key pair.", + ) + parser.add_argument( + "--owner", + dest="owner", + help="The owner tag for all instances. If not given, the value in" + "TOIL_OWNER_TAG will be used, or else the value of --keyPairName.", + ) + parser.add_argument( + "--boto", + dest="botoPath", + help="The path to the boto credentials directory. This is transferred " + "to all nodes in order to access the AWS jobStore from non-AWS instances.", + ) + parser.add_argument( + "-t", + "--tag", + metavar="NAME=VALUE", + dest="tags", + default=[], + action="append", + help="Tags are added to the AWS cluster for this node and all of its " + "children. Tags are of the form:\n" + " -t key1=value1 --tag key2=value2\n" + "Multiple tags are allowed and each tag needs its own flag. By " + "default the cluster is tagged with " + " {\n" + ' "Name": clusterName,\n' + ' "Owner": IAM username\n' + " }. ", + ) + + parser.add_argument( + "--network", help="GCE cloud network to use. default: 'default'" + ) + parser.add_argument( + "--vpcSubnet", + help="VPC subnet ID to launch cluster leader in. Uses default subnet " + "if not specified. This subnet needs to have auto assign IPs turned on.", + ) + parser.add_argument( + "--use_private_ip", + dest="use_private_ip", + action="store_true", + default=False, + help="if specified, ignore the public ip of the nodes", + ) + parser.add_argument( + "--nodeTypes", + dest="nodeTypes", + default=None, + type=str, + help="Specifies a list of comma-separated node types, each of which is " + "composed of slash-separated instance types, and an optional spot " + "bid set off by a colon, making the node type preemptible. Instance " + "types may appear in multiple node types, and the same node type " + "may appear as both preemptible and non-preemptible.\n" + "Valid argument specifying two node types:\n" + "\tc5.4xlarge/c5a.4xlarge:0.42,t2.large\n" + "Node types:\n" + "\tc5.4xlarge/c5a.4xlarge:0.42 and t2.large\n" + "Instance types:\n" + "\tc5.4xlarge, c5a.4xlarge, and t2.large\n" + "Semantics:\n" + "\tBid $0.42/hour for either c5.4xlarge or c5a.4xlarge instances,\n" + "\ttreated interchangeably, while they are available at that price,\n" + "\tand buy t2.large instances at full price\n" + "Must also provide the --workers argument to specify how many " + "workers of each node type to create.", + ) + parser.add_argument( + "-w", + "--workers", + dest="workers", + default=None, + type=str, + help="Comma-separated list of the ranges of numbers of workers of each " + "node type to launch, such as '0-2,5,1-3'. If a range is given, " + "workers will automatically be launched and terminated by the cluster " + "to auto-scale to the workload.", + ) + parser.add_argument( + "--leaderStorage", + dest="leaderStorage", + type=int, + default=50, + help="Specify the size (in gigabytes) of the root volume for the leader " + "instance. This is an EBS volume.", + ) + parser.add_argument( + "--nodeStorage", + dest="nodeStorage", + type=int, + default=50, + help="Specify the size (in gigabytes) of the root volume for any worker " + "instances created when using the -w flag. This is an EBS volume.", + ) + parser.add_argument( + "--forceDockerAppliance", + dest="forceDockerAppliance", + action="store_true", + default=False, + help="Disables sanity checking the existence of the docker image specified " + "by TOIL_APPLIANCE_SELF, which Toil uses to provision mesos for " + "autoscaling.", + ) + parser.add_argument( + "--awsEc2ProfileArn", + dest="awsEc2ProfileArn", + default=None, + type=str, + help="If provided, the specified ARN is used as the instance profile for EC2 instances." + "Useful for setting custom IAM profiles. If not specified, a new IAM role is created " + "by default with sufficient access to perform basic cluster operations.", + ) + parser.add_argument( + "--awsEc2ExtraSecurityGroupId", + dest="awsEc2ExtraSecurityGroupIds", + default=[], + action="append", + help="Any additional security groups to attach to EC2 instances. Note that a security group " + "with its name equal to the cluster name will always be created, thus ensure that " + "the extra security groups do not have the same name as the cluster name.", + ) + parser.add_argument( + "--allowFuse", + type=opt_strtobool, + default=True, + help="Enable both the leader and worker nodes to be able to run Singularity with FUSE. For " + "Kubernetes, this will make the leader privileged and ask workers to run as privileged. " + "(default: %(default)s)", + ) + # TODO Set Aws Profile in CLI options options = parser.parse_args() set_logging_from_options(options) @@ -134,75 +205,94 @@ def main() -> None: # Get worker node types worker_node_types = parse_node_types(options.nodeTypes) - check_valid_node_types(options.provisioner, worker_node_types + [({options.leaderNodeType}, None)]) + check_valid_node_types( + options.provisioner, worker_node_types + [({options.leaderNodeType}, None)] + ) # Holds string ranges, like "5", or "3-10" - worker_node_ranges = options.workers.split(',') if options.workers else [] + worker_node_ranges = options.workers.split(",") if options.workers else [] # checks the validity of TOIL_APPLIANCE_SELF before proceeding applianceSelf(forceDockerAppliance=options.forceDockerAppliance) # This holds either ints to launch static nodes, or tuples of ints # specifying ranges to launch managed auto-scaling nodes, for each type. - nodeCounts: List[Union[int, Tuple[int, int]]] = [] + nodeCounts: list[Union[int, tuple[int, int]]] = [] - if ((worker_node_types != [] or worker_node_ranges != []) and not - (worker_node_types != [] and worker_node_ranges != [])): + if (worker_node_types != [] or worker_node_ranges != []) and not ( + worker_node_types != [] and worker_node_ranges != [] + ): raise RuntimeError("The --nodeTypes option requires --workers, and visa versa.") if worker_node_types and worker_node_ranges: - if not len(worker_node_types) == len(worker_node_ranges): - raise RuntimeError("List of worker count ranges must be the same length as the list of node types.") - - for spec in worker_node_ranges: - if '-' in spec: - # Provision via autoscaling - parts = spec.split('-') - if len(parts) != 2: - raise RuntimeError("Unacceptable range: " + spec) - nodeCounts.append((int(parts[0]), int(parts[1]))) - else: - # Provision fixed nodes - nodeCounts.append(int(spec)) - - owner = options.owner or os.getenv('TOIL_OWNER_TAG') or options.keyPairName or 'toil' + if not len(worker_node_types) == len(worker_node_ranges): + raise RuntimeError( + "List of worker count ranges must be the same length as the list of node types." + ) + + for spec in worker_node_ranges: + if "-" in spec: + # Provision via autoscaling + parts = spec.split("-") + if len(parts) != 2: + raise RuntimeError("Unacceptable range: " + spec) + nodeCounts.append((int(parts[0]), int(parts[1]))) + else: + # Provision fixed nodes + nodeCounts.append(int(spec)) + + owner = ( + options.owner or os.getenv("TOIL_OWNER_TAG") or options.keyPairName or "toil" + ) # Check to see if the user specified a zone. If not, see if one is stored in an environment variable. - options.zone = options.zone or os.environ.get(f'TOIL_{options.provisioner.upper()}_ZONE') + options.zone = options.zone or os.environ.get( + f"TOIL_{options.provisioner.upper()}_ZONE" + ) if not options.zone: - raise RuntimeError(f'Please provide a value for --zone or set a default in the ' - f'TOIL_{options.provisioner.upper()}_ZONE environment variable.') + raise RuntimeError( + f"Please provide a value for --zone or set a default in the " + f"TOIL_{options.provisioner.upper()}_ZONE environment variable." + ) if options.clusterType == "mesos": - logger.warning('You are using a Mesos cluster, which is no longer recommended as Toil is ' - 'transitioning to Kubernetes-based clusters. Consider switching to ' - '--clusterType=kubernetes instead.') + logger.warning( + "You are using a Mesos cluster, which is no longer recommended as Toil is " + "transitioning to Kubernetes-based clusters. Consider switching to " + "--clusterType=kubernetes instead." + ) if options.clusterType is None: - logger.warning('Argument --clusterType is not set... using "mesos". ' - 'In future versions of Toil, the default cluster scheduler will be ' - 'set to "kubernetes" if the cluster type is not specified.') + logger.warning( + 'Argument --clusterType is not set... using "mesos". ' + "In future versions of Toil, the default cluster scheduler will be " + 'set to "kubernetes" if the cluster type is not specified.' + ) options.clusterType = "mesos" - logger.info('Creating cluster %s...', options.clusterName) - - cluster = cluster_factory(provisioner=options.provisioner, - clusterName=options.clusterName, - clusterType=options.clusterType, - zone=options.zone, - nodeStorage=options.nodeStorage, - enable_fuse=options.allowFuse) - - cluster.launchCluster(leaderNodeType=options.leaderNodeType, - leaderStorage=options.leaderStorage, - owner=owner, - keyName=options.keyPairName, - botoPath=options.botoPath, - userTags=tags, - network=options.network, - vpcSubnet=options.vpcSubnet, - awsEc2ProfileArn=options.awsEc2ProfileArn, - awsEc2ExtraSecurityGroupIds=options.awsEc2ExtraSecurityGroupIds) + logger.info("Creating cluster %s...", options.clusterName) + + cluster = cluster_factory( + provisioner=options.provisioner, + clusterName=options.clusterName, + clusterType=options.clusterType, + zone=options.zone, + nodeStorage=options.nodeStorage, + enable_fuse=options.allowFuse, + ) + + cluster.launchCluster( + leaderNodeType=options.leaderNodeType, + leaderStorage=options.leaderStorage, + owner=owner, + keyName=options.keyPairName, + botoPath=options.botoPath, + userTags=tags, + network=options.network, + vpcSubnet=options.vpcSubnet, + awsEc2ProfileArn=options.awsEc2ProfileArn, + awsEc2ExtraSecurityGroupIds=options.awsEc2ExtraSecurityGroupIds, + ) for typeNum, spec in enumerate(nodeCounts): # For each batch of workers to make @@ -220,8 +310,12 @@ def main() -> None: cluster.addNodes(nodeTypes=wanted[0], numNodes=spec, preemptible=False) else: # We have a spot bid - cluster.addNodes(nodeTypes=wanted[0], numNodes=spec, preemptible=True, - spotBid=wanted[1]) + cluster.addNodes( + nodeTypes=wanted[0], + numNodes=spec, + preemptible=True, + spotBid=wanted[1], + ) elif isinstance(spec, tuple): # Make a range of auto-scaling nodes @@ -238,11 +332,20 @@ def main() -> None: if wanted[1] is None: # Make non-spot instances - cluster.addManagedNodes(nodeTypes=wanted[0], minNodes=min_count, maxNodes=max_count, - preemptible=False) + cluster.addManagedNodes( + nodeTypes=wanted[0], + minNodes=min_count, + maxNodes=max_count, + preemptible=False, + ) else: # Bid at the given price. - cluster.addManagedNodes(nodeTypes=wanted[0], minNodes=min_count, maxNodes=max_count, - preemptible=True, spotBid=wanted[1]) - - logger.info('Cluster created successfully.') + cluster.addManagedNodes( + nodeTypes=wanted[0], + minNodes=min_count, + maxNodes=max_count, + preemptible=True, + spotBid=wanted[1], + ) + + logger.info("Cluster created successfully.") diff --git a/src/toil/utils/toilMain.py b/src/toil/utils/toilMain.py index 9449b1571f..f47c9354ca 100755 --- a/src/toil/utils/toilMain.py +++ b/src/toil/utils/toilMain.py @@ -3,9 +3,8 @@ import sys import textwrap import types -from typing import Any, Dict - from importlib.metadata import version as metadata_version +from typing import Any from toil.version import version @@ -13,23 +12,25 @@ def main() -> None: modules = loadModules() - if len(sys.argv) < 2 or sys.argv[1] == '--help': + if len(sys.argv) < 2 or sys.argv[1] == "--help": printHelp(modules) sys.exit(0) cmd = sys.argv[1] - if cmd == '--version': + if cmd == "--version": printVersion() sys.exit(0) try: module = modules[cmd] except KeyError: - sys.stderr.write(f'Unknown option "{cmd}". Pass --help to display usage information.\n') + sys.stderr.write( + f'Unknown option "{cmd}". Pass --help to display usage information.\n' + ) sys.exit(1) del sys.argv[1] - get_or_die(module, 'main')() + get_or_die(module, "main")() def get_or_die(module: types.ModuleType, name: str) -> Any: @@ -39,12 +40,14 @@ def get_or_die(module: types.ModuleType, name: str) -> Any: if hasattr(module, name): return getattr(module, name) else: - sys.stderr.write(f'Internal Toil error!\nToil utility module ' - f'{module.__name__} is missing required attribute {name}\n') + sys.stderr.write( + f"Internal Toil error!\nToil utility module " + f"{module.__name__} is missing required attribute {name}\n" + ) sys.exit(1) -def loadModules() -> Dict[str, types.ModuleType]: +def loadModules() -> dict[str, types.ModuleType]: # noinspection PyUnresolvedReferences from toil.utils import toilClean # noqa from toil.utils import toilConfig # noqa @@ -59,13 +62,22 @@ def loadModules() -> Dict[str, types.ModuleType]: from toil.utils import toilStats # noqa from toil.utils import toilStatus # noqa - return {'-'.join([i.lower() for i in re.findall('[A-Z][^A-Z]*', name)]): module for name, module in locals().items()} + return { + "-".join([i.lower() for i in re.findall("[A-Z][^A-Z]*", name)]): module + for name, module in locals().items() + } -def printHelp(modules: Dict[str, types.ModuleType]) -> None: +def printHelp(modules: dict[str, types.ModuleType]) -> None: name = os.path.basename(sys.argv[0]) - descriptions = '\n '.join(f'{cmd} - {get_or_die(mod, "__doc__").strip()}' for cmd, mod in modules.items() if mod) - print(textwrap.dedent(f""" + descriptions = "\n ".join( + f'{cmd} - {get_or_die(mod, "__doc__").strip()}' + for cmd, mod in modules.items() + if mod + ) + print( + textwrap.dedent( + f""" Usage: {name} COMMAND ... {name} --help {name} COMMAND --help @@ -73,16 +85,20 @@ def printHelp(modules: Dict[str, types.ModuleType]) -> None: Where COMMAND is one of the following: {descriptions} - """[1:])) + """[ + 1: + ] + ) + ) def printVersion() -> None: try: - detected_version = metadata_version('toil') + detected_version = metadata_version("toil") if "a" in detected_version: # This probably means Toil is installed as development print(version) else: print(detected_version) except: - print(f'Version gathered from toil.version: {version}') + print(f"Version gathered from toil.version: {version}") diff --git a/src/toil/utils/toilRsyncCluster.py b/src/toil/utils/toilRsyncCluster.py index bb69a83141..c94c8c72f7 100644 --- a/src/toil/utils/toilRsyncCluster.py +++ b/src/toil/utils/toilRsyncCluster.py @@ -17,29 +17,42 @@ import sys from toil.common import parser_with_common_options -from toil.provisioners import cluster_factory, NoSuchClusterException +from toil.provisioners import NoSuchClusterException, cluster_factory from toil.statsAndLogging import set_logging_from_options logger = logging.getLogger(__name__) def main() -> None: - parser = parser_with_common_options(provisioner_options=True, jobstore_option=False, prog="toil rsync-cluster") - parser.add_argument("--insecure", dest='insecure', action='store_true', required=False, - help="Temporarily disable strict host key checking.") - parser.add_argument("args", nargs=argparse.REMAINDER, help="Arguments to pass to" - "`rsync`. Takes any arguments that rsync accepts. Specify the" - " remote with a colon. For example, to upload `example.py`," - " specify `toil rsync-cluster -p aws test-cluster example.py :`." - "\nOr, to download a file from the remote:, `toil rsync-cluster" - " -p aws test-cluster :example.py .`") + parser = parser_with_common_options( + provisioner_options=True, jobstore_option=False, prog="toil rsync-cluster" + ) + parser.add_argument( + "--insecure", + dest="insecure", + action="store_true", + required=False, + help="Temporarily disable strict host key checking.", + ) + parser.add_argument( + "args", + nargs=argparse.REMAINDER, + help="Arguments to pass to" + "`rsync`. Takes any arguments that rsync accepts. Specify the" + " remote with a colon. For example, to upload `example.py`," + " specify `toil rsync-cluster -p aws test-cluster example.py :`." + "\nOr, to download a file from the remote:, `toil rsync-cluster" + " -p aws test-cluster :example.py .`", + ) options = parser.parse_args() set_logging_from_options(options) - cluster = cluster_factory(provisioner=options.provisioner, - clusterName=options.clusterName, - zone=options.zone) + cluster = cluster_factory( + provisioner=options.provisioner, + clusterName=options.clusterName, + zone=options.zone, + ) try: cluster.getLeader().coreRsync(args=options.args, strict=not options.insecure) except NoSuchClusterException as e: logger.error(e) - sys.exit(1) \ No newline at end of file + sys.exit(1) diff --git a/src/toil/utils/toilSshCluster.py b/src/toil/utils/toilSshCluster.py index 7ff71f8aaa..465ee3a519 100644 --- a/src/toil/utils/toilSshCluster.py +++ b/src/toil/utils/toilSshCluster.py @@ -15,48 +15,71 @@ import argparse import logging import sys -from typing import List from toil.common import parser_with_common_options -from toil.provisioners import cluster_factory, NoSuchClusterException +from toil.provisioners import NoSuchClusterException, cluster_factory from toil.statsAndLogging import set_logging_from_options logger = logging.getLogger(__name__) def main() -> None: - parser = parser_with_common_options(provisioner_options=True, jobstore_option=False, prog="toil ssh-cluster") - parser.add_argument("--insecure", action='store_true', - help="Temporarily disable strict host key checking.") - parser.add_argument("--sshOption", dest='sshOptions', default=[], action='append', - help="Pass an additional option to the SSH command.") - parser.add_argument("--grafana_port", dest='grafana_port', default=3000, - help="Assign a local port to be used for the Grafana dashboard.") - parser.add_argument('args', nargs=argparse.REMAINDER) + parser = parser_with_common_options( + provisioner_options=True, jobstore_option=False, prog="toil ssh-cluster" + ) + parser.add_argument( + "--insecure", + action="store_true", + help="Temporarily disable strict host key checking.", + ) + parser.add_argument( + "--sshOption", + dest="sshOptions", + default=[], + action="append", + help="Pass an additional option to the SSH command.", + ) + parser.add_argument( + "--grafana_port", + dest="grafana_port", + default=3000, + help="Assign a local port to be used for the Grafana dashboard.", + ) + parser.add_argument("args", nargs=argparse.REMAINDER) options = parser.parse_args() set_logging_from_options(options) # Since we collect all the remaining arguments at the end for a command to # run, it's easy to lose options. - if len(options.args) > 0 and options.args[0].startswith('-'): - logger.warning('Argument \'%s\' interpreted as a command to run ' - 'despite looking like an option.', options.args[0]) + if len(options.args) > 0 and options.args[0].startswith("-"): + logger.warning( + "Argument '%s' interpreted as a command to run " + "despite looking like an option.", + options.args[0], + ) - cluster = cluster_factory(provisioner=options.provisioner, - clusterName=options.clusterName, - zone=options.zone) - command = options.args if options.args else ['bash'] - sshOptions: List[str] = options.sshOptions + cluster = cluster_factory( + provisioner=options.provisioner, + clusterName=options.clusterName, + zone=options.zone, + ) + command = options.args if options.args else ["bash"] + sshOptions: list[str] = options.sshOptions # Forward ports: # 3000 for Grafana dashboard # 9090 for Prometheus dashboard - sshOptions.extend(['-L', f'{options.grafana_port}:localhost:3000', - '-L', '9090:localhost:9090']) + sshOptions.extend( + ["-L", f"{options.grafana_port}:localhost:3000", "-L", "9090:localhost:9090"] + ) try: - cluster.getLeader().sshAppliance(*command, strict=not options.insecure, tty=sys.stdin.isatty(), - sshOptions=sshOptions) + cluster.getLeader().sshAppliance( + *command, + strict=not options.insecure, + tty=sys.stdin.isatty(), + sshOptions=sshOptions, + ) except NoSuchClusterException as e: logger.error(e) sys.exit(1) diff --git a/src/toil/utils/toilStats.py b/src/toil/utils/toilStats.py index 54eb263a45..1311744cd2 100644 --- a/src/toil/utils/toilStats.py +++ b/src/toil/utils/toilStats.py @@ -18,7 +18,7 @@ import sys from argparse import ArgumentParser, Namespace from functools import partial -from typing import Any, Callable, Dict, List, Optional, TextIO, Union +from typing import Any, Callable, Optional, TextIO, Union from toil.common import Config, Toil, parser_with_common_options from toil.job import Job @@ -37,7 +37,7 @@ "clock": "core-s", "wait": "core-s", "memory": "KiB", - "disk": "B" + "disk": "B", } # These are what we call them to the user TITLES = { @@ -45,7 +45,7 @@ "clock": "CPU Time", "wait": "CPU Wait", "memory": "Memory", - "disk": "Disk" + "disk": "Disk", } # Of those, these are in time @@ -65,6 +65,7 @@ "max": "max", } + class ColumnWidths: """ Convenience object that stores the width of columns for printing. Helps make things pretty. @@ -74,7 +75,7 @@ def __init__(self) -> None: self.categories = CATEGORIES self.fields_count = ["count", "min", "med", "ave", "max", "total"] self.fields = ["min", "med", "ave", "max", "total"] - self.data: Dict[str, int] = {} + self.data: dict[str, int] = {} for category in self.categories: for field in self.fields_count: self.set_width(category, field, 8) @@ -110,21 +111,27 @@ def pretty_space(k: float, field: Optional[int] = None, alone: bool = False) -> # If we don't have a header to say bytes, include the B. trailer = "B" if alone else "" if k < 1024: - return pad_str("%gKi%s" % (k, trailer), field) + return pad_str("{:g}Ki{}".format(k, trailer), field) if k < (1024 * 1024): - return pad_str("%.1fMi%s" % (k / 1024.0, trailer), field) + return pad_str("{:.1f}Mi{}".format(k / 1024.0, trailer), field) if k < (1024 * 1024 * 1024): - return pad_str("%.1fGi%s" % (k / 1024.0 / 1024.0, trailer), field) + return pad_str("{:.1f}Gi{}".format(k / 1024.0 / 1024.0, trailer), field) if k < (1024 * 1024 * 1024 * 1024): - return pad_str("%.1fTi%s" % (k / 1024.0 / 1024.0 / 1024.0, trailer), field) + return pad_str( + "{:.1f}Ti{}".format(k / 1024.0 / 1024.0 / 1024.0, trailer), field + ) if k < (1024 * 1024 * 1024 * 1024 * 1024): - return pad_str("%.1fPi%s" % (k / 1024.0 / 1024.0 / 1024.0 / 1024.0, trailer), field) + return pad_str( + "{:.1f}Pi{}".format(k / 1024.0 / 1024.0 / 1024.0 / 1024.0, trailer), field + ) # due to https://stackoverflow.com/questions/47149154 assert False -def pretty_time(t: float, field: Optional[int] = None, unit: str = "s", alone: bool = False) -> str: +def pretty_time( + t: float, field: Optional[int] = None, unit: str = "s", alone: bool = False +) -> str: """ Given input t as seconds, return a nicely formatted string. """ @@ -170,7 +177,10 @@ def pretty_time(t: float, field: Optional[int] = None, unit: str = "s", alone: b s = t % 60 wPlural = pluralDict[w > 1] dPlural = pluralDict[d > 1] - return pad_str("%dweek%s%dday%s%dh%dm%d%s" % (w, wPlural, d, dPlural, h, m, s, unit_str), field) + return pad_str( + "%dweek%s%dday%s%dh%dm%d%s" % (w, wPlural, d, dPlural, h, m, s, unit_str), field + ) + def report_unit(unit: str) -> str: """ @@ -180,7 +190,14 @@ def report_unit(unit: str) -> str: return "core·s" return unit -def report_time(t: float, options: Namespace, field: Optional[int] = None, unit: str = "s", alone: bool = False) -> str: + +def report_time( + t: float, + options: Namespace, + field: Optional[int] = None, + unit: str = "s", + alone: bool = False, +) -> str: """Given t seconds, report back the correct format as string.""" assert unit in ("s", "core-s") if options.pretty: @@ -189,11 +206,15 @@ def report_time(t: float, options: Namespace, field: Optional[int] = None, unit: if field is not None: assert field >= len(unit_text) return "%*.2f%s" % (field - len(unit_text), t, unit_text) - return "%.2f%s" % (t, unit_text) + return "{:.2f}{}".format(t, unit_text) def report_space( - k: float, options: Namespace, field: Optional[int] = None, unit: str = "KiB", alone: bool = False + k: float, + options: Namespace, + field: Optional[int] = None, + unit: str = "KiB", + alone: bool = False, ) -> str: """ Given k kibibytes, report back the correct format as string. @@ -216,10 +237,12 @@ def report_space( return "%d%s" % (int(k), trailer) -def report_number(n: Union[int, float, None], field: Optional[int] = None, nan_value: str = "NaN") -> str: +def report_number( + n: Union[int, float, None], field: Optional[int] = None, nan_value: str = "NaN" +) -> str: """ Given a number, report back the correct format as string. - + If it is a NaN or None, use nan_value to represent it instead. """ if n is None or math.isnan(n): @@ -229,7 +252,14 @@ def report_number(n: Union[int, float, None], field: Optional[int] = None, nan_v # leave room for . and the spacing to the previous field. return "%*.*g" % (field, field - 2, n) if field else "%g" % n -def report(v: float, category: str, options: Namespace, field: Optional[int] = None, alone=False) -> str: + +def report( + v: float, + category: str, + options: Namespace, + field: Optional[int] = None, + alone=False, +) -> str: """ Report a value of the given category formatted as a string. @@ -248,6 +278,7 @@ def report(v: float, category: str, options: Namespace, field: Optional[int] = N else: raise ValueError(f"Unimplemented unit {unit} for category {category}") + def sprint_tag( key: str, tag: Expando, @@ -365,15 +396,15 @@ def get(tree: Expando, name: str) -> float: return float("nan") -def sort_jobs(jobTypes: List[Any], options: Namespace) -> List[Any]: +def sort_jobs(jobTypes: list[Any], options: Namespace) -> list[Any]: """Return a jobTypes all sorted.""" sortField = LONG_FORMS[options.sortField] - if ( - options.sortCategory in CATEGORIES - ): + if options.sortCategory in CATEGORIES: return sorted( jobTypes, - key=lambda tag: getattr(tag, "%s_%s" % (sortField, options.sortCategory)), + key=lambda tag: getattr( + tag, "{}_{}".format(sortField, options.sortCategory) + ), reverse=options.sort == "decending", ) elif options.sortCategory == "alpha": @@ -397,7 +428,7 @@ def report_pretty_data( root: Expando, worker: Expando, job: Expando, - job_types: List[Any], + job_types: list[Any], options: Namespace, ) -> str: """Print the important bits out.""" @@ -426,7 +457,7 @@ def report_pretty_data( def compute_column_widths( - job_types: List[Any], worker: Expando, job: Expando, options: Namespace + job_types: list[Any], worker: Expando, job: Expando, options: Namespace ) -> ColumnWidths: """Return a ColumnWidths() object with the correct max widths.""" cw = ColumnWidths() @@ -451,12 +482,14 @@ def update_column_widths(tag: Expando, cw: ColumnWidths, options: Namespace) -> cw.set_width(category, field, len(s) + 1) -def build_element(element: Expando, items: List[Job], item_name: str, defaults: Dict[str, float]) -> Expando: +def build_element( + element: Expando, items: list[Job], item_name: str, defaults: dict[str, float] +) -> Expando: """Create an element for output.""" def assertNonnegative(i: float, name: str) -> float: if i < 0: - raise RuntimeError("Negative value %s reported for %s" % (i, name)) + raise RuntimeError("Negative value {} reported for {}".format(i, name)) else: return float(i) @@ -470,12 +503,17 @@ def assertNonnegative(i: float, name: str) -> float: if category in COMPUTED_CATEGORIES: continue category_key = category if category != "cores" else "requested_cores" - category_value = assertNonnegative(float(item.get(category_key, defaults[category])), category) + category_value = assertNonnegative( + float(item.get(category_key, defaults[category])), category + ) values.append(category_value) for index in range(0, len(item_values[CATEGORIES[0]])): # For each item, compute the computed categories - item_values["wait"].append(item_values["time"][index] * item_values["cores"][index] - item_values["clock"][index]) + item_values["wait"].append( + item_values["time"][index] * item_values["cores"][index] + - item_values["clock"][index] + ) for category, values in item_values.items(): values.sort() @@ -485,10 +523,7 @@ def assertNonnegative(i: float, name: str) -> float: for k, v in item_values.items(): v.append(0) - item_element = Expando( - total_number=float(len(items)), - name=item_name - ) + item_element = Expando(total_number=float(len(items)), name=item_name) for category, values in item_values.items(): item_element["total_" + category] = float(sum(values)) @@ -504,7 +539,7 @@ def assertNonnegative(i: float, name: str) -> float: def create_summary( element: Expando, - containingItems: List[Expando], + containingItems: list[Expando], containingItemName: str, count_contained: Callable[[Expando], int], ) -> None: @@ -599,7 +634,7 @@ def process_data(config: Config, stats: Expando) -> Expando: build_element(collatedStatsTag, jobs, "jobs", defaults), getattr(stats, "workers", []), "worker", - lambda worker: getattr(worker, "jobs_run", 0) + lambda worker: getattr(worker, "jobs_run", 0), ) # Get info for each job jobNames = set() @@ -684,7 +719,9 @@ def main() -> None: for c in options.categories.split(","): if c.strip().lower() not in CATEGORIES: - logger.critical("Cannot use category %s, options are: %s", c.strip().lower(), CATEGORIES) + logger.critical( + "Cannot use category %s, options are: %s", c.strip().lower(), CATEGORIES + ) sys.exit(1) options.categories = [x.strip().lower() for x in options.categories.split(",")] @@ -696,7 +733,9 @@ def main() -> None: except NoSuchJobStoreException: logger.critical("The job store %s does not exist", config.jobStore) sys.exit(1) - logger.info('Gathering stats from jobstore... depending on the number of jobs, this may take a while (e.g. 10 jobs ~= 3 seconds; 100,000 jobs ~= 3,000 seconds or 50 minutes).') + logger.info( + "Gathering stats from jobstore... depending on the number of jobs, this may take a while (e.g. 10 jobs ~= 3 seconds; 100,000 jobs ~= 3,000 seconds or 50 minutes)." + ) stats = get_stats(jobStore) collatedStatsTag = process_data(jobStore.config, stats) report_data(collatedStatsTag, options) diff --git a/src/toil/utils/toilStatus.py b/src/toil/utils/toilStatus.py index bbf6e7c82b..9fc4da97cc 100644 --- a/src/toil/utils/toilStatus.py +++ b/src/toil/utils/toilStatus.py @@ -15,13 +15,12 @@ import logging import os import sys -from typing import Any, Dict, List, Optional, Set +from typing import Any, Optional from toil.bus import replay_message_bus -from toil.common import Config, Toil, parser_with_common_options +from toil.common import Toil, parser_with_common_options from toil.job import JobDescription, JobException, ServiceJobDescription -from toil.jobStores.abstractJobStore import (NoSuchFileException, - NoSuchJobStoreException) +from toil.jobStores.abstractJobStore import NoSuchFileException, NoSuchJobStoreException from toil.statsAndLogging import StatsAndLogging, set_logging_from_options logger = logging.getLogger(__name__) @@ -30,33 +29,53 @@ class ToilStatus: """Tool for reporting on job status.""" - def __init__(self, jobStoreName: str, specifiedJobs: Optional[List[str]] = None): + def __init__(self, jobStoreName: str, specifiedJobs: Optional[list[str]] = None): self.jobStoreName = jobStoreName self.jobStore = Toil.resumeJobStore(jobStoreName) if specifiedJobs is None: rootJob = self.fetchRootJob() - logger.info('Traversing the job graph gathering jobs. This may take a couple of minutes.') + logger.info( + "Traversing the job graph gathering jobs. This may take a couple of minutes." + ) self.jobsToReport = self.traverseJobGraph(rootJob) else: self.jobsToReport = self.fetchUserJobs(specifiedJobs) self.message_bus_path = self.jobStore.config.write_messages + def print_dot_chart(self) -> None: """Print a dot output graph representing the workflow.""" print("digraph toil_graph {") print("# This graph was created from job-store: %s" % self.jobStoreName) # Make job IDs to node names map - jobsToNodeNames: Dict[str, str] = dict( - map(lambda job: (str(job.jobStoreID), str(job.jobStoreID).replace("_", "_u_").replace("/", "_s_").replace("-", "_d_")), self.jobsToReport) - ) + def id_to_name(job_id: str) -> str: + """ + Change a job ID into a GraphViz node name. + """ + replacements = [ + ("_", "_u_"), + ("/", "_s_"), + ("-", "_d_") + ] + result = job_id + for char, replacement in replacements: + result = result.replace(char, replacement) + return result + id_strings = [str(job.jobStoreID) for job in self.jobsToReport] + jobsToNodeNames = { + s: id_to_name(s) for s in id_strings + } # Print the nodes for job in set(self.jobsToReport): print( '{} [label="{} {}" color="{}"];'.format( - jobsToNodeNames[str(job.jobStoreID)], job.jobName, job.displayName, "black" if job.has_body() else "green" + jobsToNodeNames[str(job.jobStoreID)], + job.jobName, + job.displayName, + "black" if job.has_body() else "green" ) ) @@ -82,7 +101,11 @@ def printJobLog(self) -> None: with job.getLogFileHandle(self.jobStore) as fH: # TODO: This looks intended to be machine-readable, but the format is # unspecified and no escaping is done. But keep these tags around. - print(StatsAndLogging.formatLogStream(fH, stream_name=f"LOG_FILE_OF_JOB:{job} LOG:")) + print( + StatsAndLogging.formatLogStream( + fH, stream_name=f"LOG_FILE_OF_JOB:{job} LOG:" + ) + ) else: print(f"LOG_FILE_OF_JOB: {job} LOG: Job has no log file") @@ -94,22 +117,33 @@ def printJobChildren(self) -> None: children += "\t(CHILD_JOB:%s,PRECEDENCE:%i)" % (childJob, level) print(children) - def printAggregateJobStats(self, properties: List[Set[str]], childNumber: List[int]) -> None: + def printAggregateJobStats( + self, properties: list[set[str]], childNumber: list[int] + ) -> None: """ Prints each job's ID, log file, remaining tries, and other properties. :param properties: A set of string flag names for each job in self.jobsToReport. :param childNumber: A list of child counts for each job in self.jobsToReport. """ - for job, job_properties, job_child_number in zip(self.jobsToReport, properties, childNumber): + for job, job_properties, job_child_number in zip( + self.jobsToReport, properties, childNumber + ): def lf(x: str) -> str: return f"{x}:{str(x in job_properties)}" + # We use a sort of not-really-machine-readable key:value TSV format here. # But we only include important keys to help the humans, and flags # don't have a value, just a key. parts = [f"JOB:{job}"] - for flag in ["COMPLETELY_FAILED", "READY_TO_RUN", "IS_ZOMBIE", "HAS_SERVICES", "IS_SERVICE"]: + for flag in [ + "COMPLETELY_FAILED", + "READY_TO_RUN", + "IS_ZOMBIE", + "HAS_SERVICES", + "IS_SERVICE", + ]: if flag in job_properties: parts.append(flag) if job.logJobStoreFileID: @@ -121,7 +155,7 @@ def lf(x: str) -> str: print("\t".join(parts)) - def report_on_jobs(self) -> Dict[str, Any]: + def report_on_jobs(self) -> dict[str, Any]: """ Gathers information about jobs such as its child jobs and status. @@ -132,20 +166,20 @@ def report_on_jobs(self) -> Dict[str, Any]: hasChildren = [] readyToRun = [] zombies = [] - hasLogFile: List[JobDescription] = [] + hasLogFile: list[JobDescription] = [] hasServices = [] - services: List[ServiceJobDescription] = [] + services: list[ServiceJobDescription] = [] completely_failed = [] # These are stats for jobs in self.jobsToReport - child_number: List[int] = [] - properties: List[Set[str]] = [] + child_number: list[int] = [] + properties: list[set[str]] = [] # TODO: This mix of semantics is confusing and made per-job status be # wrong for multiple years because it was not understood. Redesign it! for job in self.jobsToReport: - job_properties: Set[str] = set() + job_properties: set[str] = set() if job.logJobStoreFileID is not None: hasLogFile.append(job) @@ -176,16 +210,16 @@ def report_on_jobs(self) -> Dict[str, Any]: jobStats = { # These are lists of the mathcing jobs - 'hasChildren': hasChildren, - 'readyToRun': readyToRun, - 'zombies': zombies, - 'hasServices': hasServices, - 'services': services, - 'hasLogFile': hasLogFile, - 'completelyFailed': completely_failed, + "hasChildren": hasChildren, + "readyToRun": readyToRun, + "zombies": zombies, + "hasServices": hasServices, + "services": services, + "hasLogFile": hasLogFile, + "completelyFailed": completely_failed, # These are stats for jobs in self.jobsToReport - 'properties': properties, - 'childNumber': child_number + "properties": properties, + "childNumber": child_number, } return jobStats @@ -202,21 +236,21 @@ def getPIDStatus(jobStoreName: str) -> str: try: jobstore = Toil.resumeJobStore(jobStoreName) except NoSuchJobStoreException: - return 'QUEUED' + return "QUEUED" except NoSuchFileException: - return 'QUEUED' + return "QUEUED" try: pid = jobstore.read_leader_pid() try: os.kill(pid, 0) # Does not kill process when 0 is passed. except OSError: # Process not found, must be done. - return 'COMPLETED' + return "COMPLETED" else: - return 'RUNNING' + return "RUNNING" except NoSuchFileException: pass - return 'QUEUED' + return "QUEUED" @staticmethod def getStatus(jobStoreName: str) -> str: @@ -235,22 +269,22 @@ def getStatus(jobStoreName: str) -> str: try: jobstore = Toil.resumeJobStore(jobStoreName) except NoSuchJobStoreException: - return 'QUEUED' + return "QUEUED" except NoSuchFileException: - return 'QUEUED' + return "QUEUED" try: - with jobstore.read_shared_file_stream('succeeded.log') as successful: + with jobstore.read_shared_file_stream("succeeded.log") as successful: pass - return 'COMPLETED' + return "COMPLETED" except NoSuchFileException: try: - with jobstore.read_shared_file_stream('failed.log') as failed: + with jobstore.read_shared_file_stream("failed.log") as failed: pass - return 'ERROR' + return "ERROR" except NoSuchFileException: pass - return 'RUNNING' + return "RUNNING" def print_running_jobs(self) -> None: """ @@ -264,10 +298,14 @@ def print_running_jobs(self) -> None: for job_status in all_job_statuses.values(): if job_status.is_running(): - status_line = [f"Job ID {job_status.job_store_id} with name {job_status.name} is running"] + status_line = [ + f"Job ID {job_status.job_store_id} with name {job_status.name} is running" + ] if job_status.batch_system != "": # batch system exists - status_line.append(f" on {job_status.batch_system} as ID {job_status.external_batch_id}") + status_line.append( + f" on {job_status.batch_system} as ID {job_status.external_batch_id}" + ) status_line.append(".") print("".join(status_line)) else: @@ -290,10 +328,12 @@ def fetchRootJob(self) -> JobDescription: return self.jobStore.load_root_job() except JobException as e: logger.info(e) - print('Root job is absent. The workflow has may have completed successfully.') + print( + "Root job is absent. The workflow has may have completed successfully." + ) raise - def fetchUserJobs(self, jobs: List[str]) -> List[JobDescription]: + def fetchUserJobs(self, jobs: list[str]) -> list[JobDescription]: """ Takes a user input array of jobs, verifies that they are in the jobStore and returns the array of jobsToReport. @@ -306,16 +346,16 @@ def fetchUserJobs(self, jobs: List[str]) -> List[JobDescription]: try: jobsToReport.append(self.jobStore.load_job(jobID)) except JobException: - print('The job %s could not be found.' % jobID, file=sys.stderr) + print("The job %s could not be found." % jobID, file=sys.stderr) raise return jobsToReport def traverseJobGraph( self, rootJob: JobDescription, - jobsToReport: Optional[List[JobDescription]] = None, - foundJobStoreIDs: Optional[Set[str]] = None, - ) -> List[JobDescription]: + jobsToReport: Optional[list[JobDescription]] = None, + foundJobStoreIDs: Optional[set[str]] = None, + ) -> list[JobDescription]: """ Find all current jobs in the jobStore and return them as an Array. @@ -338,15 +378,24 @@ def traverseJobGraph( jobsToReport.append(rootJob) # Traverse jobs in stack for successorJobStoreID in rootJob.allSuccessors(): - if successorJobStoreID not in foundJobStoreIDs and self.jobStore.job_exists(successorJobStoreID): - self.traverseJobGraph(self.jobStore.load_job(successorJobStoreID), jobsToReport, foundJobStoreIDs) + if ( + successorJobStoreID not in foundJobStoreIDs + and self.jobStore.job_exists(successorJobStoreID) + ): + self.traverseJobGraph( + self.jobStore.load_job(successorJobStoreID), + jobsToReport, + foundJobStoreIDs, + ) # Traverse service jobs for jobs in rootJob.services: for serviceJobStoreID in jobs: if self.jobStore.job_exists(serviceJobStoreID): if serviceJobStoreID in foundJobStoreIDs: - raise RuntimeError('Service job was unexpectedly found while traversing ') + raise RuntimeError( + "Service job was unexpectedly found while traversing " + ) foundJobStoreIDs.add(serviceJobStoreID) jobsToReport.append(self.jobStore.load_job(serviceJobStoreID)) @@ -356,40 +405,80 @@ def traverseJobGraph( def main() -> None: """Reports the state of a Toil workflow.""" parser = parser_with_common_options(prog="toil status") - parser.add_argument("--failIfNotComplete", action="store_true", - help="Return exit value of 1 if toil jobs not all completed. default=%(default)s", - default=False) - - parser.add_argument("--noAggStats", dest="stats", action="store_false", - help="Do not print overall, aggregate status of workflow.", - default=True) - - parser.add_argument("--dot", "--printDot", dest="print_dot", action="store_true", - help="Print dot formatted description of the graph. If using --jobs will " - "restrict to subgraph including only those jobs. default=%(default)s", - default=False) - - parser.add_argument("--jobs", nargs='+', - help="Restrict reporting to the following jobs (allows subsetting of the report).", - default=None) - - parser.add_argument("--perJob", "--printPerJobStats", dest="print_per_job_stats", action="store_true", - help="Print info about each job. default=%(default)s", - default=False) - - parser.add_argument("--logs", "--printLogs", dest="print_logs", action="store_true", - help="Print the log files of jobs (if they exist). default=%(default)s", - default=False) - - parser.add_argument("--children", "--printChildren", dest="print_children", action="store_true", - help="Print children of each job. default=%(default)s", - default=False) - - parser.add_argument("--status", "--printStatus", dest="print_status", action="store_true", - help="Determine which jobs are currently running and the associated batch system ID, if any") - - parser.add_argument("--failed", "--printFailed", dest="print_failed", action="store_true", - help="List jobs which seem to have failed to run") + parser.add_argument( + "--failIfNotComplete", + action="store_true", + help="Return exit value of 1 if toil jobs not all completed. default=%(default)s", + default=False, + ) + + parser.add_argument( + "--noAggStats", + dest="stats", + action="store_false", + help="Do not print overall, aggregate status of workflow.", + default=True, + ) + + parser.add_argument( + "--dot", + "--printDot", + dest="print_dot", + action="store_true", + help="Print dot formatted description of the graph. If using --jobs will " + "restrict to subgraph including only those jobs. default=%(default)s", + default=False, + ) + + parser.add_argument( + "--jobs", + nargs="+", + help="Restrict reporting to the following jobs (allows subsetting of the report).", + default=None, + ) + + parser.add_argument( + "--perJob", + "--printPerJobStats", + dest="print_per_job_stats", + action="store_true", + help="Print info about each job. default=%(default)s", + default=False, + ) + + parser.add_argument( + "--logs", + "--printLogs", + dest="print_logs", + action="store_true", + help="Print the log files of jobs (if they exist). default=%(default)s", + default=False, + ) + + parser.add_argument( + "--children", + "--printChildren", + dest="print_children", + action="store_true", + help="Print children of each job. default=%(default)s", + default=False, + ) + + parser.add_argument( + "--status", + "--printStatus", + dest="print_status", + action="store_true", + help="Determine which jobs are currently running and the associated batch system ID, if any", + ) + + parser.add_argument( + "--failed", + "--printFailed", + dest="print_failed", + action="store_true", + help="List jobs which seem to have failed to run", + ) options = parser.parse_args() set_logging_from_options(options) @@ -401,7 +490,7 @@ def main() -> None: try: status = ToilStatus(options.jobStore, options.jobs) except NoSuchJobStoreException: - print(f'The job store {options.jobStore} was not found.') + print(f"The job store {options.jobStore} was not found.") return except JobException: # Workflow likely complete, user informed in ToilStatus() return @@ -410,16 +499,16 @@ def main() -> None: # Info to be reported. # These are lists of matching jobs. - hasChildren = jobStats['hasChildren'] - readyToRun = jobStats['readyToRun'] - zombies = jobStats['zombies'] - hasServices = jobStats['hasServices'] - services = jobStats['services'] - hasLogFile = jobStats['hasLogFile'] - completely_failed = jobStats['completelyFailed'] + hasChildren = jobStats["hasChildren"] + readyToRun = jobStats["readyToRun"] + zombies = jobStats["zombies"] + hasServices = jobStats["hasServices"] + services = jobStats["services"] + hasLogFile = jobStats["hasLogFile"] + completely_failed = jobStats["completelyFailed"] # These are results for corresponding jobs in status.jobsToReport - properties = jobStats['properties'] - childNumber = jobStats['childNumber'] + properties = jobStats["properties"] + childNumber = jobStats["childNumber"] if options.print_per_job_stats: status.printAggregateJobStats(properties, childNumber) @@ -434,21 +523,30 @@ def main() -> None: for job in completely_failed: print(job) if options.stats: - print('Of the %i jobs considered, ' - 'there are ' - '%i completely failed jobs, ' - '%i jobs with children, ' - '%i jobs ready to run, ' - '%i zombie jobs, ' - '%i jobs with services, ' - '%i services, ' - 'and %i jobs with log files currently in %s.' % - (len(status.jobsToReport), len(completely_failed), len(hasChildren), - len(readyToRun), len(zombies), len(hasServices), len(services), - len(hasLogFile), status.jobStore)) + print( + "Of the %i jobs considered, " + "there are " + "%i completely failed jobs, " + "%i jobs with children, " + "%i jobs ready to run, " + "%i zombie jobs, " + "%i jobs with services, " + "%i services, " + "and %i jobs with log files currently in %s." + % ( + len(status.jobsToReport), + len(completely_failed), + len(hasChildren), + len(readyToRun), + len(zombies), + len(hasServices), + len(services), + len(hasLogFile), + status.jobStore, + ) + ) if options.print_status: status.print_running_jobs() if len(status.jobsToReport) > 0 and options.failIfNotComplete: # Upon workflow completion, all jobs will have been removed from job store exit(1) - diff --git a/src/toil/utils/toilUpdateEC2Instances.py b/src/toil/utils/toilUpdateEC2Instances.py index 264d5a0811..d338571a3f 100644 --- a/src/toil/utils/toilUpdateEC2Instances.py +++ b/src/toil/utils/toilUpdateEC2Instances.py @@ -31,7 +31,9 @@ def internet_connection() -> bool: def main() -> None: if not internet_connection(): - raise RuntimeError('No internet. Updating the EC2 Instance list requires internet.') + raise RuntimeError( + "No internet. Updating the EC2 Instance list requires internet." + ) updateStaticEC2Instances() diff --git a/src/toil/wdl/utils.py b/src/toil/wdl/utils.py index 0b5d56184e..08dfeebae4 100644 --- a/src/toil/wdl/utils.py +++ b/src/toil/wdl/utils.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Iterable +from collections.abc import Iterable def get_version(iterable: Iterable[str]) -> str: @@ -22,14 +22,14 @@ def get_version(iterable: Iterable[str]) -> str: :return: The WDL version used in the workflow. """ if isinstance(iterable, str): - iterable = iterable.split('\n') + iterable = iterable.split("\n") for line in iterable: line = line.strip() # check if the first non-empty, non-comment line is the version statement - if line and not line.startswith('#'): - if line.startswith('version '): + if line and not line.startswith("#"): + if line.startswith("version "): return line[8:].strip() break # only draft-2 doesn't contain the version declaration - return 'draft-2' + return "draft-2" diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index fe6addfcf4..bf63f95ccf 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -16,6 +16,7 @@ import asyncio import errno +import hashlib import io import json import logging @@ -30,71 +31,57 @@ import tempfile import textwrap import uuid -import hashlib +from collections.abc import Generator, Iterable, Iterator, Sequence from contextlib import ExitStack, contextmanager from graphlib import TopologicalSorter -from tempfile import mkstemp, gettempdir -from typing import ( - Any, - Callable, - Dict, - Generator, - IO, - Iterable, - Iterator, - List, - Optional, - Protocol, - Sequence, - Set, - Tuple, - Type, - TypeVar, - Union, - cast, - TypedDict -) -from typing_extensions import Buffer +from tempfile import mkstemp +from typing import IO, Any, Callable, Optional, Protocol, TypedDict, TypeVar, Union, cast if sys.version_info < (3, 11): from typing_extensions import NotRequired else: # NotRequired is recommended for TypedDicts over Optional but was introduced in Python 3.11 from typing import NotRequired -from mypy_extensions import Arg, DefaultArg + +from functools import partial from urllib.error import HTTPError from urllib.parse import quote, unquote, urljoin, urlsplit -from functools import partial import WDL.Error import WDL.runtime.config from configargparse import ArgParser, Namespace from WDL._util import byte_size_units, chmod_R_plus -from WDL.Tree import ReadSourceResult from WDL.CLI import print_error from WDL.runtime.backend.docker_swarm import SwarmContainer from WDL.runtime.backend.singularity import SingularityContainer -from WDL.runtime.task_container import TaskContainer from WDL.runtime.error import DownloadFailed +from WDL.runtime.task_container import TaskContainer +from WDL.Tree import ReadSourceResult from toil.batchSystems.abstractBatchSystem import InsufficientSystemResources from toil.common import Toil, addOptions from toil.exceptions import FailedJobsException from toil.fileStores import FileID from toil.fileStores.abstractFileStore import AbstractFileStore -from toil.job import (AcceleratorRequirement, - Job, - Promise, - Promised, - TemporaryID, - parse_accelerator, - unwrap, - unwrap_all, - ParseableIndivisibleResource) -from toil.jobStores.abstractJobStore import (AbstractJobStore, UnimplementedURLException, - InvalidImportExportUrlException, LocatorException) +from toil.job import ( + AcceleratorRequirement, + Job, + ParseableIndivisibleResource, + Promise, + Promised, + TemporaryID, + parse_accelerator, + unwrap, + unwrap_all, +) +from toil.jobStores.abstractJobStore import ( + AbstractJobStore, + InvalidImportExportUrlException, + LocatorException, + UnimplementedURLException, +) from toil.lib.accelerators import get_individual_local_accelerators -from toil.lib.conversions import convert_units, human2bytes, VALID_PREFIXES +from toil.lib.conversions import VALID_PREFIXES, convert_units, human2bytes from toil.lib.io import mkdtemp from toil.lib.memoize import memoize from toil.lib.misc import get_user_name @@ -169,17 +156,32 @@ def file_digest_fallback_impl(f: ReadableFileObj, alg_name: str) -> hashlib._Has # execution_dir: Directory to use as the working directory for workflow code. # container: The type of container to use when executing a WDL task. Carries through the value of the commandline --container option # all_call_outputs: whether a job should include all calls outputs -WDLContext = TypedDict('WDLContext', {"execution_dir": NotRequired[str], "container": NotRequired[str], - "task_path": str, "namespace": str, "all_call_outputs": bool}) +WDLContext = TypedDict( + "WDLContext", + { + "execution_dir": NotRequired[str], + "container": NotRequired[str], + "task_path": str, + "namespace": str, + "all_call_outputs": bool, + }, +) class InsufficientMountDiskSpace(Exception): - def __init__(self, mount_targets: List[str], desired_bytes: int, available_bytes: int) -> None: - super().__init__("Not enough available disk space for the target mount points %s. Needed %d bytes but there is only %d available." - % (", ".join(mount_targets), desired_bytes, available_bytes)) + def __init__( + self, mount_targets: list[str], desired_bytes: int, available_bytes: int + ) -> None: + super().__init__( + "Not enough available disk space for the target mount points %s. Needed %d bytes but there is only %d available." + % (", ".join(mount_targets), desired_bytes, available_bytes) + ) + @contextmanager -def wdl_error_reporter(task: str, exit: bool = False, log: Callable[[str], None] = logger.critical) -> Generator[None, None, None]: +def wdl_error_reporter( + task: str, exit: bool = False, log: Callable[[str], None] = logger.critical +) -> Generator[None]: """ Run code in a context where WDL errors will be reported with pretty formatting. """ @@ -199,7 +201,7 @@ def wdl_error_reporter(task: str, exit: bool = False, log: Callable[[str], None] InvalidImportExportUrlException, UnimplementedURLException, JobTooBigError, - InsufficientMountDiskSpace + InsufficientMountDiskSpace, ) as e: logger.exception(e) # Don't expose tracebacks to the user for exceptions that may be expected @@ -221,28 +223,44 @@ def wdl_error_reporter(task: str, exit: bool = False, log: Callable[[str], None] # Reraise the exception to stop raise -F = TypeVar('F', bound=Callable[..., Any]) -def report_wdl_errors(task: str, exit: bool = False, log: Callable[[str], None] = logger.critical) -> Callable[[F], F]: + +F = TypeVar("F", bound=Callable[..., Any]) + + +def report_wdl_errors( + task: str, exit: bool = False, log: Callable[[str], None] = logger.critical +) -> Callable[[F], F]: """ Create a decorator to report WDL errors with the given task message. Decorator can then be applied to a function, and if a WDL error happens it will say that it could not {task}. """ + def decorator(decoratee: F) -> F: """ Decorate a function with WDL error reporting. """ + def decorated(*args: Any, **kwargs: Any) -> Any: """ Run the decoratee and handle WDL errors. """ with wdl_error_reporter(task, exit=exit, log=log): return decoratee(*args, **kwargs) + return cast(F, decorated) + return decorator -def remove_common_leading_whitespace(expression: WDL.Expr.String, tolerate_blanks: bool = True, tolerate_dedents: bool = False, tolerate_all_whitespace: bool = True, debug: bool = False) -> WDL.Expr.String: + +def remove_common_leading_whitespace( + expression: WDL.Expr.String, + tolerate_blanks: bool = True, + tolerate_dedents: bool = False, + tolerate_all_whitespace: bool = True, + debug: bool = False, +) -> WDL.Expr.String: """ Remove "common leading whitespace" as defined in the WDL 1.1 spec. @@ -277,7 +295,7 @@ def remove_common_leading_whitespace(expression: WDL.Expr.String, tolerate_blank # We split the parts list into lines, which are also interleaved string # literals and placeholder expressions. - lines: List[List[Union[str, WDL.Expr.Placeholder]]] = [[]] + lines: list[list[str | WDL.Expr.Placeholder]] = [[]] for part in expression.parts: if isinstance(part, str): # It's a string. Split it into lines. @@ -297,7 +315,7 @@ def remove_common_leading_whitespace(expression: WDL.Expr.String, tolerate_blank # Then we compute the common amount of leading whitespace on all the lines, # looking at the first string literal. # This will be the longest common whitespace prefix, or None if not yet detected. - common_whitespace_prefix: Optional[str] = None + common_whitespace_prefix: str | None = None for line in lines: if len(line) == 0: # TODO: how should totally empty lines be handled? Not in the spec! @@ -319,20 +337,29 @@ def remove_common_leading_whitespace(expression: WDL.Expr.String, tolerate_blank # There's no leading whitespace here! common_whitespace_prefix = "" continue - if len(line) == 1 and tolerate_all_whitespace and all(x in (' ', '\t') for x in line[0]): + if ( + len(line) == 1 + and tolerate_all_whitespace + and all(x in (" ", "\t") for x in line[0]) + ): # All-whitespace lines shouldn't count continue # TODO: There are good algorithms for common prefixes. This is a bad one. # Find the number of leading whitespace characters line_whitespace_end = 0 - while line_whitespace_end < len(line[0]) and line[0][line_whitespace_end] in (' ', '\t'): + while line_whitespace_end < len(line[0]) and line[0][ + line_whitespace_end + ] in (" ", "\t"): line_whitespace_end += 1 # Find the string of leading whitespace characters line_whitespace_prefix = line[0][:line_whitespace_end] - if ' ' in line_whitespace_prefix and '\t' in line_whitespace_prefix: + if " " in line_whitespace_prefix and "\t" in line_whitespace_prefix: # Warn and don't change anything if spaces and tabs are mixed, per the spec. - logger.warning("Line in command at %s mixes leading spaces and tabs! Not removing leading whitespace!", expression.pos) + logger.warning( + "Line in command at %s mixes leading spaces and tabs! Not removing leading whitespace!", + expression.pos, + ) return expression if common_whitespace_prefix is None: @@ -345,7 +372,9 @@ def remove_common_leading_whitespace(expression: WDL.Expr.String, tolerate_blank # Hackily make os.path do it for us, # character-by-character. See # - common_whitespace_prefix = os.path.commonprefix([common_whitespace_prefix, line_whitespace_prefix]) + common_whitespace_prefix = os.path.commonprefix( + [common_whitespace_prefix, line_whitespace_prefix] + ) if common_whitespace_prefix is None: common_whitespace_prefix = "" @@ -371,12 +400,13 @@ def first_mismatch(prefix: str, value: str) -> int: ( ( cast( - List[Union[str, WDL.Expr.Placeholder]], - [line[0][first_mismatch(common_whitespace_prefix, line[0]):]] - ) + line[1:] + list[Union[str, WDL.Expr.Placeholder]], + [line[0][first_mismatch(common_whitespace_prefix, line[0]) :]], + ) + + line[1:] ) - if len(line) > 0 and isinstance(line[0], str) else - line + if len(line) > 0 and isinstance(line[0], str) + else line ) for line in lines ] @@ -385,7 +415,7 @@ def first_mismatch(prefix: str, value: str) -> int: # Then we reassemble the parts and make a new expression. # Build lists and turn the lists into strings later - new_parts: List[Union[List[str], WDL.Expr.Placeholder]] = [] + new_parts: list[list[str] | WDL.Expr.Placeholder] = [] for i, line in enumerate(stripped_lines): if i > 0: # This is a second line, so we need to tack on a newline. @@ -411,7 +441,9 @@ def first_mismatch(prefix: str, value: str) -> int: logger.debug("New Parts: %s", new_parts) # Now go back to the alternating strings and placeholders that MiniWDL wants - new_parts_merged: List[Union[str, WDL.Expr.Placeholder]] = [("".join(x) if isinstance(x, list) else x) for x in new_parts] + new_parts_merged: list[str | WDL.Expr.Placeholder] = [ + ("".join(x) if isinstance(x, list) else x) for x in new_parts + ] if debug: logger.debug("New Parts Merged: %s", new_parts_merged) @@ -423,7 +455,12 @@ def first_mismatch(prefix: str, value: str) -> int: return modified -def potential_absolute_uris(uri: str, path: List[str], importer: Optional[WDL.Tree.Document] = None, execution_dir: Optional[str] = None) -> Iterator[str]: +def potential_absolute_uris( + uri: str, + path: list[str], + importer: WDL.Tree.Document | None = None, + execution_dir: str | None = None, +) -> Iterator[str]: """ Get potential absolute URIs to check for an imported file. @@ -466,7 +503,7 @@ def potential_absolute_uris(uri: str, path: List[str], importer: Optional[WDL.Tr # Then the current directory. We need to make sure to include a filename component here or it will treat the current directory with no trailing / as a document and relative paths will look 1 level up. # When importing on a worker, the cwd will be a tmpdir and will result in FileNotFoundError after os.path.abspath, so override with the execution dir - full_path_list.append(Toil.normalize_uri(execution_dir or '.') + '/.') + full_path_list.append(Toil.normalize_uri(execution_dir or ".") + "/.") # Then the specified paths. # TODO: @@ -475,7 +512,7 @@ def potential_absolute_uris(uri: str, path: List[str], importer: Optional[WDL.Tr full_path_list += [Toil.normalize_uri(p) for p in path] # This holds all the URIs we tried and failed with. - failures: Set[str] = set() + failures: set[str] = set() for candidate_base in full_path_list: # Try fetching based off each base URI @@ -483,14 +520,19 @@ def potential_absolute_uris(uri: str, path: List[str], importer: Optional[WDL.Tr if candidate_uri in failures: # Already tried this one, maybe we have an absolute uri input. continue - logger.debug('Consider %s which is %s off of %s', candidate_uri, uri, candidate_base) + logger.debug( + "Consider %s which is %s off of %s", candidate_uri, uri, candidate_base + ) # Try it yield candidate_uri # If we come back it didn't work failures.add(candidate_uri) -async def toil_read_source(uri: str, path: List[str], importer: Optional[WDL.Tree.Document]) -> ReadSourceResult: + +async def toil_read_source( + uri: str, path: list[str], importer: WDL.Tree.Document | None +) -> ReadSourceResult: """ Implementation of a MiniWDL read_source function that can use any filename or URL supported by Toil. @@ -504,7 +546,7 @@ async def toil_read_source(uri: str, path: List[str], importer: Optional[WDL.Tre for candidate_uri in potential_absolute_uris(uri, path, importer): # For each place to try in order destination_buffer = io.BytesIO() - logger.debug('Fetching %s', candidate_uri) + logger.debug("Fetching %s", candidate_uri) tried.append(candidate_uri) try: # TODO: this is probably sync work that would be better as async work here @@ -513,14 +555,14 @@ async def toil_read_source(uri: str, path: List[str], importer: Optional[WDL.Tre # TODO: we need to assume any error is just a not-found, # because the exceptions thrown by read_from_url() # implementations are not specified. - logger.debug('Tried to fetch %s from %s but got %s', uri, candidate_uri, e) + logger.debug("Tried to fetch %s from %s but got %s", uri, candidate_uri, e) continue # If we get here, we got it probably. try: - string_data = destination_buffer.getvalue().decode('utf-8') + string_data = destination_buffer.getvalue().decode("utf-8") except UnicodeDecodeError: # But if it isn't actually unicode text, pretend it doesn't exist. - logger.warning('Data at %s is not text; skipping!', candidate_uri) + logger.warning("Data at %s is not text; skipping!", candidate_uri) continue # Return our result and its URI. TODO: Should we de-URI files? @@ -530,7 +572,7 @@ async def toil_read_source(uri: str, path: List[str], importer: Optional[WDL.Tre # does: # https://github.com/chanzuckerberg/miniwdl/blob/e3e8ef74e80fbe59f137b0ad40b354957915c345/WDL/Tree.py#L1493 # TODO: Make a more informative message? - logger.error('Could not find %s at any of: %s', uri, tried) + logger.error("Could not find %s at any of: %s", uri, tried) raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), uri) @@ -544,13 +586,19 @@ def virtualized_equal(value1: WDL.Value.Base, value2: WDL.Value.Base) -> bool: :param value2: WDL value :return: Whether the two values are equal with file virtualization accounted for """ + def f(file: WDL.Value.File) -> WDL.Value.File: return set_file_value(file, get_file_virtualized_value(file) or file.value) - return map_over_typed_files_in_value(value1, f) == map_over_typed_files_in_value(value2, f) + + return map_over_typed_files_in_value(value1, f) == map_over_typed_files_in_value( + value2, f + ) + # Bindings have a long type name WDLBindings = WDL.Env.Bindings[WDL.Value.Base] + def combine_bindings(all_bindings: Sequence[WDLBindings]) -> WDLBindings: """ Combine variable bindings from multiple predecessor tasks into one set for @@ -583,9 +631,14 @@ def combine_bindings(all_bindings: Sequence[WDLBindings]) -> WDLBindings: # This is a duplicate existing_value = merged[binding.name] if not virtualized_equal(existing_value, binding.value): - raise RuntimeError('Conflicting bindings for %s with values %s and %s', binding.name, existing_value, binding.value) + raise RuntimeError( + "Conflicting bindings for %s with values %s and %s", + binding.name, + existing_value, + binding.value, + ) else: - logger.debug('Drop duplicate binding for %s', binding.name) + logger.debug("Drop duplicate binding for %s", binding.name) else: merged = merged.bind(binding.name, binding.value, binding.info) @@ -593,7 +646,11 @@ def combine_bindings(all_bindings: Sequence[WDLBindings]) -> WDLBindings: # TODO: Develop a Protocol that can match the logging function type more closely -def log_bindings(log_function: Callable[..., None], message: str, all_bindings: Sequence[Promised[WDLBindings]]) -> None: +def log_bindings( + log_function: Callable[..., None], + message: str, + all_bindings: Sequence[Promised[WDLBindings]], +) -> None: """ Log bindings to the console, even if some are still promises. @@ -619,6 +676,7 @@ def log_bindings(log_function: Callable[..., None], message: str, all_bindings: elif isinstance(bindings, Promise): log_function("") + def get_supertype(types: Sequence[WDL.Type.Base]) -> WDL.Type.Base: """ Get the supertype that can hold values of all the given types. @@ -635,7 +693,9 @@ def get_supertype(types: Sequence[WDL.Type.Base]) -> WDL.Type.Base: optional = optional or typ.optional else: # We have conflicting types - raise RuntimeError(f"Cannot generate a supertype from conflicting types: {types}") + raise RuntimeError( + f"Cannot generate a supertype from conflicting types: {types}" + ) if supertype is None: return WDL.Type.Any(null=optional) # optional flag isn't used in Any return supertype.copy(optional=optional) @@ -650,10 +710,10 @@ def for_each_node(root: WDL.Tree.WorkflowNode) -> Iterator[WDL.Tree.WorkflowNode yield root for child_node in root.children: if isinstance(child_node, WDL.Tree.WorkflowNode): - for result in for_each_node(child_node): - yield result + yield from for_each_node(child_node) + -def recursive_dependencies(root: WDL.Tree.WorkflowNode) -> Set[str]: +def recursive_dependencies(root: WDL.Tree.WorkflowNode) -> set[str]: """ Get the combined workflow_node_dependencies of root and everything under it, which are not on anything in that subtree. @@ -663,9 +723,9 @@ def recursive_dependencies(root: WDL.Tree.WorkflowNode) -> Set[str]: """ # What are all dependencies? - needed: Set[str] = set() + needed: set[str] = set() # And what dependencies are provided internally? - provided: Set[str] = set() + provided: set[str] = set() for node in for_each_node(root): # Record everything each node needs @@ -677,7 +737,9 @@ def recursive_dependencies(root: WDL.Tree.WorkflowNode) -> Set[str]: return needed - provided -def parse_disks(spec: str, disks_spec: Union[List[WDL.Value.String], str]) -> Tuple[Optional[str], float, str]: +def parse_disks( + spec: str, disks_spec: list[WDL.Value.String] | str +) -> tuple[str | None, float, str]: """ Parse a WDL disk spec into a disk mount specification. :param spec: Disks spec to parse @@ -688,12 +750,14 @@ def parse_disks(spec: str, disks_spec: Union[List[WDL.Value.String], str]) -> Tu # are empty, and we want to allow people to use spaces after # their commas when separating the list, like in Cromwell's # examples, so we strip whitespace. - spec_parts = spec.strip().split(' ') + spec_parts = spec.strip().split(" ") # First check that this is a format we support. Both the WDL spec and Cromwell allow a max 3-piece specification # So if there are more than 3 pieces, raise an error if len(spec_parts) > 3: - raise RuntimeError(f"Could not parse disks = {disks_spec} because {spec_parts} contains more than 3 parts") + raise RuntimeError( + f"Could not parse disks = {disks_spec} because {spec_parts} contains more than 3 parts" + ) part_size = None # default to GiB as per spec part_suffix: str = "GiB" # The WDL spec's default is 1 GiB @@ -713,7 +777,9 @@ def parse_disks(spec: str, disks_spec: Union[List[WDL.Value.String], str]) -> Tu # can't imagine that ever being standardized; just leave it # alone so that the workflow doesn't rely on this weird and # likely-to-change Cromwell detail. - logger.warning('Not rounding LOCAL disk to the nearest 375 GB; workflow execution will differ from Cromwell!') + logger.warning( + "Not rounding LOCAL disk to the nearest 375 GB; workflow execution will differ from Cromwell!" + ) elif unit_spec in ("HDD", "SSD"): # For cromwell compatibility, assume this means GB in units # We don't actually differentiate between HDD and SSD @@ -732,7 +798,9 @@ def parse_disks(spec: str, disks_spec: Union[List[WDL.Value.String], str]) -> Tu if part_size is None: # Disk spec did not include a size - raise ValueError(f"Could not parse disks = {disks_spec} because {spec} does not specify a disk size") + raise ValueError( + f"Could not parse disks = {disks_spec} because {spec} does not specify a disk size" + ) return specified_mount_point, part_size, part_suffix @@ -744,10 +812,12 @@ def parse_disks(spec: str, disks_spec: Union[List[WDL.Value.String], str]) -> Tu # TODO: We need to also make sure files from the same source directory end up # in the same destination directory, when dealing with basename conflicts. -TOIL_URI_SCHEME = 'toilfile:' +TOIL_URI_SCHEME = "toilfile:" -def pack_toil_uri(file_id: FileID, task_path: str, dir_id: uuid.UUID, file_basename: str) -> str: +def pack_toil_uri( + file_id: FileID, task_path: str, dir_id: uuid.UUID, file_basename: str +) -> str: """ Encode a Toil file ID and metadata about who wrote it as a URI. @@ -757,27 +827,32 @@ def pack_toil_uri(file_id: FileID, task_path: str, dir_id: uuid.UUID, file_basen # We urlencode everything, including any slashes. We need to use a slash to # set off the actual filename, so the WDL standard library basename # function works correctly. - return TOIL_URI_SCHEME + "/".join([ - quote(file_id.pack(), safe=''), - quote(task_path, safe=''), - quote(str(dir_id)), - quote(file_basename, safe='') - ]) + return TOIL_URI_SCHEME + "/".join( + [ + quote(file_id.pack(), safe=""), + quote(task_path, safe=""), + quote(str(dir_id)), + quote(file_basename, safe=""), + ] + ) + -def unpack_toil_uri(toil_uri: str) -> Tuple[FileID, str, str, str]: +def unpack_toil_uri(toil_uri: str) -> tuple[FileID, str, str, str]: """ Unpack a URI made by make_toil_uri to retrieve the FileID and the basename (no path prefix) that the file is supposed to have. """ # Split out scheme and rest of URL - parts = toil_uri.split(':') + parts = toil_uri.split(":") if len(parts) != 2: raise ValueError(f"Wrong number of colons in URI: {toil_uri}") - if parts[0] + ':' != TOIL_URI_SCHEME: - raise ValueError(f"URI doesn't start with {TOIL_URI_SCHEME} and should: {toil_uri}") + if parts[0] + ":" != TOIL_URI_SCHEME: + raise ValueError( + f"URI doesn't start with {TOIL_URI_SCHEME} and should: {toil_uri}" + ) # Split encoded file ID from filename - parts = parts[1].split('/') + parts = parts[1].split("/") if len(parts) != 4: raise ValueError(f"Wrong number of path segments in URI: {toil_uri}") file_id = FileID.unpack(unquote(parts[0])) @@ -998,8 +1073,15 @@ def assign_shared_fs_path(file: WDL.Value.File) -> WDL.Value.File: return output_bindings -DirectoryNamingStateDict = Dict[str, Tuple[Dict[str, str], Set[str]]] -def choose_human_readable_directory(root_dir: str, source_task_path: str, parent_id: str, state: DirectoryNamingStateDict) -> str: +DirectoryNamingStateDict = dict[str, tuple[dict[str, str], set[str]]] + + +def choose_human_readable_directory( + root_dir: str, + source_task_path: str, + parent_id: str, + state: DirectoryNamingStateDict, +) -> str: """ Select a good directory to save files from a task and source directory in. @@ -1024,7 +1106,14 @@ def choose_human_readable_directory(root_dir: str, source_task_path: str, parent # # For each local directory, we need to know if we used it for a parent ID already (set). id_to_dir, used_dirs = state.setdefault(root_dir, ({}, set())) - logger.debug("Pick location for parent %s source %s root %s against id map %s and used set %s", parent_id, source_task_path, root_dir, id_to_dir, used_dirs) + logger.debug( + "Pick location for parent %s source %s root %s against id map %s and used set %s", + parent_id, + source_task_path, + root_dir, + id_to_dir, + used_dirs, + ) if parent_id not in id_to_dir: # Make a path for this parent named after this source task @@ -1050,8 +1139,13 @@ def choose_human_readable_directory(root_dir: str, source_task_path: str, parent return result -def evaluate_decls_to_bindings(decls: List[WDL.Tree.Decl], all_bindings: WDL.Env.Bindings[WDL.Value.Base], standard_library: ToilWDLStdLibBase, - include_previous: bool = False, drop_missing_files: bool = False) -> WDL.Env.Bindings[WDL.Value.Base]: +def evaluate_decls_to_bindings( + decls: list[WDL.Tree.Decl], + all_bindings: WDL.Env.Bindings[WDL.Value.Base], + standard_library: ToilWDLStdLibBase, + include_previous: bool = False, + drop_missing_files: bool = False, +) -> WDL.Env.Bindings[WDL.Value.Base]: """ Evaluate decls with a given bindings environment and standard library. Creates a new bindings object that only contains the bindings from the given decls. @@ -1067,21 +1161,30 @@ def evaluate_decls_to_bindings(decls: List[WDL.Tree.Decl], all_bindings: WDL.Env # all_bindings contains current bindings + previous all_bindings # bindings only contains the decl bindings themselves so that bindings from other sections prior aren't included bindings: WDL.Env.Bindings[WDL.Value.Base] = WDL.Env.Bindings() - drop_if_missing_with_workdir = partial(drop_if_missing, standard_library=standard_library) + drop_if_missing_with_workdir = partial( + drop_if_missing, standard_library=standard_library + ) for each_decl in decls: - output_value = evaluate_defaultable_decl(each_decl, all_bindings, standard_library) + output_value = evaluate_defaultable_decl( + each_decl, all_bindings, standard_library + ) if drop_missing_files: - dropped_output_value = map_over_typed_files_in_value(output_value, drop_if_missing_with_workdir) + dropped_output_value = map_over_typed_files_in_value( + output_value, drop_if_missing_with_workdir + ) # Typecheck that the new binding value with dropped files is valid for the declaration's type # If a dropped file exists where the type is not optional File?, raise FileNotFoundError # Ideally, map_over_typed_files_in_value should do this check, but that will require retooling the map functions # to carry through WDL types as well; currently miniwdl's WDL value has a type which we use, but that does not carry the optional flag through - ensure_null_files_are_nullable(dropped_output_value, output_value, each_decl.type) + ensure_null_files_are_nullable( + dropped_output_value, output_value, each_decl.type + ) output_value = dropped_output_value all_bindings = all_bindings.bind(each_decl.name, output_value) bindings = bindings.bind(each_decl.name, output_value) return all_bindings if include_previous else bindings + class NonDownloadingSize(WDL.StdLib._Size): """ WDL size() implementation that avoids downloading files. @@ -1093,13 +1196,21 @@ class NonDownloadingSize(WDL.StdLib._Size): using the FileID's stored size info. """ - def _call_eager(self, expr: "WDL.Expr.Apply", arguments: List[WDL.Value.Base]) -> WDL.Value.Base: + def _call_eager( + self, expr: WDL.Expr.Apply, arguments: list[WDL.Value.Base] + ) -> WDL.Value.Base: """ Replacement evaluation implementation that avoids downloads. """ # Get all the URIs of files that actually are set. - file_objects: List[WDL.Value.File] = [f for f in arguments[0].coerce(WDL.Type.Array(WDL.Type.File(optional=True))).value if not isinstance(f, WDL.Value.Null)] + file_objects: list[WDL.Value.File] = [ + f + for f in arguments[0] + .coerce(WDL.Type.Array(WDL.Type.File(optional=True))) + .value + if not isinstance(f, WDL.Value.Null) + ] total_size = 0.0 for file in file_objects: @@ -1137,15 +1248,15 @@ def _call_eager(self, expr: "WDL.Expr.Apply", arguments: List[WDL.Value.Base]) - # Return the result as a WDL float value return WDL.Value.Float(total_size) -STANDARD_SCHEMES = ['http:', 'https:', 's3:', 'gs:', 'ftp:'] +STANDARD_SCHEMES = ["http:", "https:", "s3:", "gs:", "ftp:"] REMOTE_SCHEMES = STANDARD_SCHEMES + [TOIL_URI_SCHEME] -ALL_SCHEMES = REMOTE_SCHEMES + ['file:'] +ALL_SCHEMES = REMOTE_SCHEMES + ["file:"] def is_toil_url(filename: str) -> bool: return is_url_with_scheme(filename, [TOIL_URI_SCHEME]) def is_file_url(filename: str) -> bool: - return is_url_with_scheme(filename, ['file:']) + return is_url_with_scheme(filename, ["file:"]) def is_standard_url(filename: str) -> bool: return is_url_with_scheme(filename, STANDARD_SCHEMES) @@ -1165,7 +1276,7 @@ def is_any_url(filename: str) -> bool: """ return is_url_with_scheme(filename, ALL_SCHEMES) -def is_url_with_scheme(filename: str, schemes: List[str]) -> bool: +def is_url_with_scheme(filename: str, schemes: list[str]) -> bool: """ Return True if filename is a URL with any of the given schemes and False otherwise. """ @@ -1175,8 +1286,14 @@ def is_url_with_scheme(filename: str, schemes: List[str]) -> bool: return True return False -def convert_remote_files(environment: WDLBindings, file_source: AbstractJobStore, task_path: str, search_paths: Optional[List[str]] = None, import_remote_files: bool = True, - execution_dir: Optional[str] = None) -> WDLBindings: +def convert_remote_files( + environment: WDLBindings, + file_source: AbstractJobStore, + task_path: str, + search_paths: list[str] | None = None, + import_remote_files: bool = True, + execution_dir: str | None = None, +) -> WDLBindings: """ Resolve relative-URI files in the given environment and import all files. @@ -1193,9 +1310,10 @@ def convert_remote_files(environment: WDLBindings, file_source: AbstractJobStore directories in this list. :param import_remote_files: If set, import files from remote locations. Else leave them as URI references. """ - path_to_id: Dict[str, uuid.UUID] = {} + path_to_id: dict[str, uuid.UUID] = {} + @memoize - def import_filename(filename: str) -> Tuple[Optional[str], Optional[str]]: + def import_filename(filename: str) -> tuple[str | None, str | None]: """ Given a relative URI that a file comes from, poll the possible absolute URIs that that could mean until one is found. If one is found, returns @@ -1210,7 +1328,11 @@ def import_filename(filename: str) -> Tuple[Optional[str], Optional[str]]: """ # Search through any input search paths passed in and download it if found tried = [] - for candidate_uri in potential_absolute_uris(filename, search_paths if search_paths is not None else [], execution_dir=execution_dir): + for candidate_uri in potential_absolute_uris( + filename, + search_paths if search_paths is not None else [], + execution_dir=execution_dir, + ): tried.append(candidate_uri) try: # Try polling existence first. @@ -1237,11 +1359,13 @@ def import_filename(filename: str) -> Tuple[Optional[str], Optional[str]]: except UnimplementedURLException as e: # We can't find anything that can even support this URL scheme. # Report to the user, they are probably missing an extra. - logger.critical('Error: ' + str(e)) + logger.critical("Error: " + str(e)) raise except HTTPError as e: # Something went wrong looking for it there. - logger.warning("Checked URL %s but got HTTP status %s", candidate_uri, e.code) + logger.warning( + "Checked URL %s but got HTTP status %s", candidate_uri, e.code + ) # Try the next location. continue except FileNotFoundError: @@ -1251,7 +1375,10 @@ def import_filename(filename: str) -> Tuple[Optional[str], Optional[str]]: except Exception: # Something went wrong besides the file not being found. Maybe # we have no auth. - logger.error("Something went wrong when testing for existence of %s", candidate_uri) + logger.error( + "Something went wrong when testing for existence of %s", + candidate_uri, + ) raise if imported is None: @@ -1266,7 +1393,9 @@ def import_filename(filename: str) -> Tuple[Optional[str], Optional[str]]: if file_basename == "": # We can't have files with no basename because we need to # download them at that basename later. - raise RuntimeError(f"File {candidate_uri} has no basename and so cannot be a WDL File") + raise RuntimeError( + f"File {candidate_uri} has no basename and so cannot be a WDL File" + ) # Was actually found if is_any_url(candidate_uri): @@ -1289,7 +1418,7 @@ def import_filename(filename: str) -> Tuple[Optional[str], Optional[str]]: toil_uri = pack_toil_uri(imported, task_path, dir_id, file_basename) - logger.info('Converting input file path %s to %s', filename, candidate_uri) + logger.info("Converting input file path %s to %s", filename, candidate_uri) return candidate_uri, toil_uri # Not found, return None @@ -1304,7 +1433,9 @@ def convert_file_to_uri(file: WDL.Value.File) -> WDL.Value.File: if candidate_uri is None and toil_uri is None: # If we get here we tried all the candidates - raise RuntimeError(f"Could not find {file.value} at any of: {list(potential_absolute_uris(file.value, search_paths if search_paths is not None else []))}") + raise RuntimeError( + f"Could not find {file.value} at any of: {list(potential_absolute_uris(file.value, search_paths if search_paths is not None else []))}" + ) elif candidate_uri is not None and toil_uri is None: # A candidate exists but importing is disabled because import_remote_files is false new_file = set_file_value(file, candidate_uri) @@ -1355,11 +1486,18 @@ def convert_file_to_uri(file: WDL.Value.File) -> WDL.Value.File: # TODO: De-overload the "virtualized"/"devirtualized" notion. + class ToilWDLStdLibBase(WDL.StdLib.Base): """ Standard library implementation for WDL as run on Toil. """ - def __init__(self, file_store: AbstractFileStore, wdl_options: WDLContext, share_files_with: Optional["ToilWDLStdLibBase"] = None): + + def __init__( + self, + file_store: AbstractFileStore, + wdl_options: WDLContext, + share_files_with: ToilWDLStdLibBase | None = None, + ): """ Set up the standard library. :param wdl_options: Options to pass into the standard library to use. @@ -1384,24 +1522,28 @@ def __init__(self, file_store: AbstractFileStore, wdl_options: WDLContext, share # We get fresh file download/upload state # Map forward from virtualized files to absolute devirtualized ones. - self._virtualized_to_devirtualized: Dict[str, str] = {} + self._virtualized_to_devirtualized: dict[str, str] = {} # Allow mapping back from absolute devirtualized files to virtualized # paths, to save re-uploads. - self._devirtualized_to_virtualized: Dict[str, str] = {} + self._devirtualized_to_virtualized: dict[str, str] = {} # State we need for choosing good names for devirtualized files self._devirtualization_state: DirectoryNamingStateDict = {} # UUID to differentiate which node files are virtualized from - self._parent_dir_to_ids: Dict[str, uuid.UUID] = dict() + self._parent_dir_to_ids: dict[str, uuid.UUID] = dict() else: # Share file download/upload state - self._virtualized_to_devirtualized = share_files_with._virtualized_to_devirtualized - self._devirtualized_to_virtualized = share_files_with._devirtualized_to_virtualized + self._virtualized_to_devirtualized = ( + share_files_with._virtualized_to_devirtualized + ) + self._devirtualized_to_virtualized = ( + share_files_with._devirtualized_to_virtualized + ) self._devirtualization_state = share_files_with._devirtualization_state self._parent_dir_to_ids = share_files_with._parent_dir_to_ids @property - def execution_dir(self) -> Optional[str]: - execution_dir: Optional[str] = self._wdl_options.get("execution_dir") + def execution_dir(self) -> str | None: + execution_dir: str | None = self._wdl_options.get("execution_dir") return execution_dir @property @@ -1409,15 +1551,16 @@ def task_path(self) -> str: task_path: str = self._wdl_options["task_path"] return task_path - def get_local_paths(self) -> List[str]: + def get_local_paths(self) -> list[str]: """ Get all the local paths of files devirtualized (or virtualized) through the stdlib. """ return list(self._virtualized_to_devirtualized.values()) - - def _read(self, parse: Callable[[str], WDL.Value.Base]) -> Callable[[WDL.Value.File], WDL.Value.Base]: + def _read( + self, parse: Callable[[str], WDL.Value.Base] + ) -> Callable[[WDL.Value.File], WDL.Value.Base]: # To only virtualize on task/function boundaries, rely on the _read function # as this is called before every WDL function that takes a file input # We want to virtualize before any function call so we can control the caching @@ -1442,7 +1585,9 @@ def _f( v: WDL.Value.Base, ) -> WDL.Value.File: os.makedirs(self._write_dir, exist_ok=True) - with tempfile.NamedTemporaryFile(dir=self._write_dir, delete=False) as outfile: + with tempfile.NamedTemporaryFile( + dir=self._write_dir, delete=False + ) as outfile: serialize(v, outfile) filename = outfile.name chmod_R_plus(filename, file_bits=0o660) @@ -1465,7 +1610,9 @@ def _devirtualize_file(self, file: WDL.Value.File) -> WDL.Value.File: logger.debug("File has no virtualized value so not changing value") return file - def _virtualize_file(self, file: WDL.Value.File, enforce_existence: bool = True) -> WDL.Value.File: + def _virtualize_file( + self, file: WDL.Value.File, enforce_existence: bool = True + ) -> WDL.Value.File: logger.debug("Virtualizing %s", file) # If enforce_existence is true, then if a file is detected as nonexistent, raise an error. Else, let it pass through if get_file_virtualized_value(file) is not None: @@ -1478,7 +1625,11 @@ def _virtualize_file(self, file: WDL.Value.File, enforce_existence: bool = True) if is_standard_url(file.value): file_uri = Toil.normalize_uri(file.value) else: - abs_filepath = os.path.join(self.execution_dir, file.value) if self.execution_dir is not None else os.path.abspath(file.value) + abs_filepath = ( + os.path.join(self.execution_dir, file.value) + if self.execution_dir is not None + else os.path.abspath(file.value) + ) file_uri = Toil.normalize_uri(abs_filepath) if not AbstractJobStore.url_exists(file_uri): @@ -1507,7 +1658,13 @@ def _devirtualize_filename(self, filename: str) -> str: return result @staticmethod - def _devirtualize_uri(filename: str, dest_dir: str, file_source: Union[AbstractFileStore, Toil], state: DirectoryNamingStateDict, export: Optional[bool] = None) -> str: + def _devirtualize_uri( + filename: str, + dest_dir: str, + file_source: AbstractFileStore | Toil, + state: DirectoryNamingStateDict, + export: Optional[bool] = None, + ) -> str: """ Given a filename, either return the devirtualized path or the filename itself if not a virtualized URI. @@ -1519,7 +1676,9 @@ def _devirtualize_uri(filename: str, dest_dir: str, file_source: Union[AbstractF file_id, task_path, parent_id, file_basename = unpack_toil_uri(filename) # Decide where it should be put. - dir_path = choose_human_readable_directory(dest_dir, task_path, parent_id, state) + dir_path = choose_human_readable_directory( + dest_dir, task_path, parent_id, state + ) else: # Parse the URL and extract the basename file_basename = os.path.basename(urlsplit(filename).path) @@ -1528,7 +1687,7 @@ def _devirtualize_uri(filename: str, dest_dir: str, file_source: Union[AbstractF # in, not relative to the thing. parent_url = urljoin(filename, ".") # Turn it into a string we can make a directory for - dir_path = os.path.join(dest_dir, quote(parent_url, safe='')) + dir_path = os.path.join(dest_dir, quote(parent_url, safe="")) if not os.path.exists(dir_path): # Make sure the chosen directory exists @@ -1548,13 +1707,17 @@ def _devirtualize_uri(filename: str, dest_dir: str, file_source: Union[AbstractF # . # We try to get away with symlinks and hope the task # container can mount the destination file. - result = file_source.readGlobalFile(file_id, dest_path, mutable=False, symlink=True) + result = file_source.readGlobalFile( + file_id, dest_path, mutable=False, symlink=True + ) else: - raise RuntimeError(f"Unsupported file source: {file_source}") + raise RuntimeError( + f"Unsupported file source: {file_source}" + ) else: # Download to a local file with the right name and execute bit. # Open it exclusively - with open(dest_path, 'xb') as dest_file: + with open(dest_path, "xb") as dest_file: # And save to it size, executable = AbstractJobStore.read_from_url(filename, dest_file) if executable: @@ -1568,12 +1731,12 @@ def _devirtualize_uri(filename: str, dest_dir: str, file_source: Union[AbstractF def devirtualize_to( filename: str, dest_dir: str, - file_source: Union[AbstractFileStore, Toil], + file_source: AbstractFileStore | Toil, state: DirectoryNamingStateDict, wdl_options: WDLContext, - devirtualized_to_virtualized: Optional[Dict[str, str]] = None, - virtualized_to_devirtualized: Optional[Dict[str, str]] = None, - export: Optional[bool] = None + devirtualized_to_virtualized: dict[str, str] | None = None, + virtualized_to_devirtualized: dict[str, str] | None = None, + export: bool | None = None, ) -> str: """ Download or export a WDL virtualized filename/URL to the given directory. @@ -1602,24 +1765,35 @@ def devirtualize_to( # os.mkdir fails saying the directory *being made* caused a # FileNotFoundError. So check the dest_dir before trying to make # directories under it. - raise RuntimeError(f"Cannot devirtualize {filename} into nonexistent directory {dest_dir}") + raise RuntimeError( + f"Cannot devirtualize {filename} into nonexistent directory {dest_dir}" + ) # TODO: Support people doing path operations (join, split, get parent directory) on the virtualized filenames. - if is_remote_url(filename): - if virtualized_to_devirtualized is not None and filename in virtualized_to_devirtualized: + if is_url(filename): + if ( + virtualized_to_devirtualized is not None + and filename in virtualized_to_devirtualized + ): # The virtualized file is in the cache, so grab the already devirtualized result result = virtualized_to_devirtualized[filename] - logger.debug("Found virtualized %s in cache with devirtualized path %s", filename, result) + logger.debug( + "Found virtualized %s in cache with devirtualized path %s", + filename, + result, + ) return result # Actually need to download/put in place/export - result = ToilWDLStdLibBase._devirtualize_uri(filename, dest_dir, file_source, state, export=export) + result = ToilWDLStdLibBase._devirtualize_uri( + filename, dest_dir, file_source, state, export=export + ) if devirtualized_to_virtualized is not None: # Store the back mapping devirtualized_to_virtualized[result] = filename if virtualized_to_devirtualized is not None: # And the other way virtualized_to_devirtualized[filename] = result - logger.debug('Devirtualized %s as openable file %s', filename, result) + logger.debug("Devirtualized %s as openable file %s", filename, result) else: # This is a local file or file URL if is_file_url(filename): @@ -1635,7 +1809,9 @@ def devirtualize_to( if not os.path.exists(result): # Catch if something made it through without going through the proper virtualization/devirtualization steps - raise RuntimeError(f"Virtualized file {filename} looks like a local file but isn't!") + raise RuntimeError( + f"Virtualized file {filename} looks like a local file but isn't!" + ) return result @@ -1649,7 +1825,7 @@ def _virtualize_filename(self, filename: str) -> str: if is_toil_url(filename): # Already virtual - logger.debug('Already virtual: %s', filename) + logger.debug("Already virtual: %s", filename) return filename elif is_standard_url(filename): # This is a URL (http, s3, etc) that we want to virtualize @@ -1657,16 +1833,24 @@ def _virtualize_filename(self, filename: str) -> str: if filename in self._devirtualized_to_virtualized: # Note: this is a little duplicative with the local file path branch, but the keys are different result = self._devirtualized_to_virtualized[filename] - logger.debug("Re-using virtualized WDL file %s for %s", result, filename) + logger.debug( + "Re-using virtualized WDL file %s for %s", result, filename + ) return result try: imported = self._file_store.import_file(filename) except FileNotFoundError: - logger.error("File at URL %s does not exist or is inaccessible." % filename) + logger.error( + "File at URL %s does not exist or is inaccessible." % filename + ) raise except HTTPError as e: # Something went wrong with the connection - logger.error("File %s could not be downloaded due to HTTP error %d", filename, e.code) + logger.error( + "File %s could not be downloaded due to HTTP error %d", + filename, + e.code, + ) raise if imported is None: # Satisfy mypy, this should never happen though as we don't pass a shared file name (which is the only way import_file returns None) @@ -1677,7 +1861,7 @@ def _virtualize_filename(self, filename: str) -> str: # Pack a UUID of the parent directory dir_id = self._parent_dir_to_ids.setdefault(parent_dir, uuid.uuid4()) result = pack_toil_uri(imported, self.task_path, dir_id, file_basename) - logger.debug('Virtualized %s as WDL file %s', filename, result) + logger.debug("Virtualized %s as WDL file %s", filename, result) # We can't put the Toil URI in the virtualized_to_devirtualized cache because it would point to the URL instead of a # local file on the machine, so only store the forward mapping self._devirtualized_to_virtualized[filename] = result @@ -1696,15 +1880,19 @@ def _virtualize_filename(self, filename: str) -> str: # This is a previously devirtualized thing so we can just use the # virtual version we remembered instead of reuploading it. result = self._devirtualized_to_virtualized[abs_filename] - logger.debug("Re-using virtualized WDL file %s for %s", result, filename) + logger.debug( + "Re-using virtualized WDL file %s for %s", result, filename + ) return result file_id = self._file_store.writeGlobalFile(abs_filename) file_dir = os.path.dirname(abs_filename) parent_id = self._parent_dir_to_ids.setdefault(file_dir, uuid.uuid4()) - result = pack_toil_uri(file_id, self.task_path, parent_id, os.path.basename(abs_filename)) - logger.debug('Virtualized %s as WDL file %s', filename, result) + result = pack_toil_uri( + file_id, self.task_path, parent_id, os.path.basename(abs_filename) + ) + logger.debug("Virtualized %s as WDL file %s", filename, result) # Remember the upload in case we share a cache self._devirtualized_to_virtualized[abs_filename] = result # And remember the local path in case we want a redownload @@ -1767,7 +1955,9 @@ def wrapper(v: WDL.Value.Base) -> WDL.Value.File: devirtualized_filename = self._devirtualize_filename(virtualized_file.value) # Hash the file to hex hex_digest = file_digest(open(devirtualized_filename, "rb"), "sha256").hexdigest() - file_input_bindings = WDL.Env.Bindings(WDL.Env.Binding("file_sha256", cast(WDL.Value.Base, WDL.Value.String(hex_digest)))) + file_input_bindings = WDL.Env.Bindings( + WDL.Env.Binding("file_sha256", cast(WDL.Value.Base, WDL.Value.String(hex_digest))) + ) # Make an environment of "file_sha256" to that as a WDL string, and # digest that, and make a write_ cache key. No need to transform to # shared FS paths sonce no paths are in it. @@ -1776,19 +1966,27 @@ def wrapper(v: WDL.Value.Base) -> WDL.Value.File: file_cache_key = "write_/" + input_digest # Construct a description of the types we expect to get from the # cache: just a File-type variable named "file" - expected_types = WDL.Env.Bindings(WDL.Env.Binding("file", cast(WDL.Type.Base, WDL.Type.File()))) + expected_types = WDL.Env.Bindings( + WDL.Env.Binding("file", cast(WDL.Type.Base, WDL.Type.File())) + ) # Query the cache assert self._miniwdl_cache is not None - file_output_bindings = self._miniwdl_cache.get(file_cache_key, file_input_bindings, expected_types) + file_output_bindings = self._miniwdl_cache.get( + file_cache_key, file_input_bindings, expected_types + ) if file_output_bindings: # File with this hash is cached. # Adjust virtualized_file to carry that path as its local-filesystem path. - virtualized_file = set_shared_fs_path(virtualized_file, file_output_bindings.resolve("file").value) + virtualized_file = set_shared_fs_path( + virtualized_file, file_output_bindings.resolve("file").value + ) elif self._miniwdl_cache._cfg["call_cache"].get_bool("put"): # Save our novel file to the cache. # Determine where we will save the file. - output_directory = os.path.join(self._miniwdl_cache._call_cache_dir, file_cache_key) + output_directory = os.path.join( + self._miniwdl_cache._call_cache_dir, file_cache_key + ) # This needs to exist before we can export to it os.makedirs(output_directory, exist_ok=True) @@ -1810,7 +2008,9 @@ def wrapper(v: WDL.Value.Base) -> WDL.Value.File: # Save the cache entry pointing to it self._miniwdl_cache.put( file_cache_key, - WDL.Env.Bindings(WDL.Env.Binding("file", WDL.Value.File(exported_path))) + WDL.Env.Bindings( + WDL.Env.Binding("file", WDL.Value.File(exported_path)) + ) ) # Apply the shared filesystem path to the virtualized file @@ -1830,7 +2030,12 @@ class ToilWDLStdLibTaskCommand(ToilWDLStdLibBase): are host-side paths. """ - def __init__(self, file_store: AbstractFileStore, container: TaskContainer, wdl_options: WDLContext): + def __init__( + self, + file_store: AbstractFileStore, + container: TaskContainer, + wdl_options: WDLContext, + ): """ Set up the standard library for the task command section. """ @@ -1848,10 +2053,12 @@ def __init__(self, file_store: AbstractFileStore, container: TaskContainer, wdl_ # MiniWDL created a file representing the in-container path, which does not exist on the host machine # In _write, we need virtualize to an in-container path from a host machine path because we mount the file through. The ideal spot for this virtualization # to happen is here before the path injection - def _read(self, parse: Callable[[str], WDL.Value.Base]) -> Callable[[WDL.Value.File], WDL.Value.Base]: + def _read( + self, parse: Callable[[str], WDL.Value.Base] + ) -> Callable[[WDL.Value.File], WDL.Value.Base]: # todo: figure out better way than reoverriding overridden function def _f(file: WDL.Value.File) -> WDL.Value.Base: - with open(self._devirtualize_filename(file.value), "r") as infile: + with open(self._devirtualize_filename(file.value)) as infile: return parse(infile.read()) return _f @@ -1863,7 +2070,9 @@ def _f( v: WDL.Value.Base, ) -> WDL.Value.File: os.makedirs(self._write_dir, exist_ok=True) - with tempfile.NamedTemporaryFile(dir=self._write_dir, delete=False) as outfile: + with tempfile.NamedTemporaryFile( + dir=self._write_dir, delete=False + ) as outfile: serialize(v, outfile) filename = outfile.name chmod_R_plus(filename, file_bits=0o660) @@ -1883,7 +2092,9 @@ def _devirtualize_filename(self, filename: str) -> str: # We shouldn't have to deal with URLs here; we want to have exactly # two nicely stacked/back-to-back layers of virtualization, joined # on the out-of-container paths. - raise RuntimeError(f"File {filename} is a URL but should already be an in-container-virtualized filename") + raise RuntimeError( + f"File {filename} is a URL but should already be an in-container-virtualized filename" + ) # If this is a local path it will be in the container. Make sure we # use the out-of-container equivalent. @@ -1891,9 +2102,11 @@ def _devirtualize_filename(self, filename: str) -> str: if result is None: # We really shouldn't have files in here that we didn't virtualize. - raise RuntimeError(f"File {filename} in container is not mounted from the host and can't be opened from the host") + raise RuntimeError( + f"File {filename} in container is not mounted from the host and can't be opened from the host" + ) - logger.debug('Devirtualized %s as out-of-container file %s', filename, result) + logger.debug("Devirtualized %s as out-of-container file %s", filename, result) return result @memoize @@ -1909,9 +2122,10 @@ def _virtualize_filename(self, filename: str) -> str: result = self.container.input_path_map[filename] - logger.debug('Virtualized %s as WDL file %s', filename, result) + logger.debug("Virtualized %s as WDL file %s", filename, result) return result + class ToilWDLStdLibTaskOutputs(ToilWDLStdLibBase, WDL.StdLib.TaskOutputs): """ Standard library implementation for WDL as run on Toil, with additional @@ -1923,9 +2137,9 @@ def __init__( file_store: AbstractFileStore, stdout_path: str, stderr_path: str, - file_to_mountpoint: Dict[str, str], + file_to_mountpoint: dict[str, str], wdl_options: WDLContext, - share_files_with: Optional["ToilWDLStdLibBase"] = None + share_files_with: ToilWDLStdLibBase | None = None, ): """ Set up the standard library for a task output section. Needs to know @@ -1968,7 +2182,9 @@ def __init__( setattr( self, "glob", - WDL.StdLib.StaticFunction("glob", [WDL.Type.String()], WDL.Type.Array(WDL.Type.File()), self._glob), + WDL.StdLib.StaticFunction( + "glob", [WDL.Type.String()], WDL.Type.Array(WDL.Type.File()), self._glob + ), ) def _stdout(self) -> WDL.Value.File: @@ -2015,7 +2231,7 @@ def _glob(self, pattern: WDL.Value.String) -> WDL.Value.Array: # So we send a little Bash script that can delimit the files with something, and assume the Bash really is a Bash. # This needs to run in the work directory that the container used, if any. - work_dir = '.' if not self.execution_dir else self.execution_dir + work_dir = "." if not self.execution_dir else self.execution_dir # TODO: get this to run in the right container if there is one # We would use compgen -G to resolve the glob but that doesn't output @@ -2030,20 +2246,29 @@ def _glob(self, pattern: WDL.Value.String) -> WDL.Value.Array: # unquoted variable, with IFS cleared, allows it to be globbed as a # single unit. Then we loop over the results and print them # newline-delimited. - lines = subprocess.run(['bash', '-c', ''.join([ - 'cd ', - shlex.quote(work_dir), - ' && (shopt -s nullglob; IFS=""; PATTERN=', - shlex.quote(pattern_string), - '; for RESULT in ${PATTERN} ; do echo "${RESULT}" ; done)' - ])], stdout=subprocess.PIPE).stdout.decode('utf-8') + lines = subprocess.run( + [ + "bash", + "-c", + "".join( + [ + "cd ", + shlex.quote(work_dir), + ' && (shopt -s nullglob; IFS=""; PATTERN=', + shlex.quote(pattern_string), + '; for RESULT in ${PATTERN} ; do echo "${RESULT}" ; done)', + ] + ), + ], + stdout=subprocess.PIPE, + ).stdout.decode("utf-8") # Get each name that is a file results = [] - for line in lines.split('\n'): + for line in lines.split("\n"): if not line: continue - if not line.startswith('/'): + if not line.startswith("/"): # Make sure to be working with absolute paths since the glob # might not share our current directory line = os.path.join(work_dir, line) @@ -2062,10 +2287,10 @@ def _devirtualize_filename(self, filename: str) -> str: Any WDL-side filenames which are relative will be relative to the current directory override, if set. """ - if not is_any_url(filename) and not filename.startswith('/'): + if not is_any_url(filename) and not filename.startswith("/"): # We are getting a bare relative path from the WDL side. # Find a real path to it relative to the current directory override. - work_dir = '.' if not self.execution_dir else self.execution_dir + work_dir = "." if not self.execution_dir else self.execution_dir filename = os.path.join(work_dir, filename) return super()._devirtualize_filename(filename) @@ -2080,10 +2305,10 @@ def _virtualize_filename(self, filename: str) -> str: filenames. """ - if not is_any_url(filename) and not filename.startswith('/'): + if not is_any_url(filename) and not filename.startswith("/"): # We are getting a bare relative path on the supposedly devirtualized side. # Find a real path to it relative to the current directory override. - work_dir = '.' if not self.execution_dir else self.execution_dir + work_dir = "." if not self.execution_dir else self.execution_dir filename = os.path.join(work_dir, filename) if filename in self._devirtualized_to_virtualized: @@ -2098,7 +2323,7 @@ def _virtualize_filename(self, filename: str) -> str: seen = {here} while os.path.islink(here): dest = os.readlink(here) - if not dest.startswith('/'): + if not dest.startswith("/"): # Make it absolute dest = os.path.join(os.path.dirname(here), dest) here = dest @@ -2109,21 +2334,38 @@ def _virtualize_filename(self, filename: str) -> str: # Check the virtualized filenames before following symlinks # all the way back to workflow inputs. result = self._devirtualized_to_virtualized[here] - logger.debug("Re-using virtualized filename %s for %s linked from %s", result, here, filename) + logger.debug( + "Re-using virtualized filename %s for %s linked from %s", + result, + here, + filename, + ) return result if here in seen: - raise RuntimeError(f"Symlink {filename} leads to symlink loop at {here}") + raise RuntimeError( + f"Symlink {filename} leads to symlink loop at {here}" + ) seen.add(here) if os.path.exists(here): logger.debug("Handling symlink %s ultimately to %s", filename, here) else: - logger.error("Handling broken symlink %s ultimately to %s", filename, here) + logger.error( + "Handling broken symlink %s ultimately to %s", filename, here + ) filename = here return super()._virtualize_filename(filename) -def evaluate_named_expression(context: Union[WDL.Error.SourceNode, WDL.Error.SourcePosition], name: str, expected_type: Optional[WDL.Type.Base], expression: Optional[WDL.Expr.Base], environment: WDLBindings, stdlib: WDL.StdLib.Base) -> WDL.Value.Base: + +def evaluate_named_expression( + context: WDL.Error.SourceNode | WDL.Error.SourcePosition, + name: str, + expected_type: WDL.Type.Base | None, + expression: WDL.Expr.Base | None, + environment: WDLBindings, + stdlib: WDL.StdLib.Base, +) -> WDL.Value.Base: """ Evaluate an expression when we know the name of it. """ @@ -2133,7 +2375,9 @@ def evaluate_named_expression(context: Union[WDL.Error.SourceNode, WDL.Error.Sou # We can just leave the value as null value: WDL.Value.Base = WDL.Value.Null() else: - raise WDL.Error.EvalError(context, "Cannot evaluate no expression for " + name) + raise WDL.Error.EvalError( + context, "Cannot evaluate no expression for " + name + ) else: logger.debug("Evaluate expression for %s: %s", name, expression) try: @@ -2146,7 +2390,9 @@ def evaluate_named_expression(context: Union[WDL.Error.SourceNode, WDL.Error.Sou logger.debug("Got value %s of type %s", value, value.type) except Exception: # If something goes wrong, dump. - logger.exception("Expression evaluation failed for %s: %s", name, expression) + logger.exception( + "Expression evaluation failed for %s: %s", name, expression + ) log_bindings(logger.error, "Expression was evaluated in:", [environment]) raise @@ -2156,14 +2402,26 @@ def evaluate_named_expression(context: Union[WDL.Error.SourceNode, WDL.Error.Sou return value -def evaluate_decl(node: WDL.Tree.Decl, environment: WDLBindings, stdlib: WDL.StdLib.Base) -> WDL.Value.Base: + +def evaluate_decl( + node: WDL.Tree.Decl, environment: WDLBindings, stdlib: WDL.StdLib.Base +) -> WDL.Value.Base: """ Evaluate the expression of a declaration node, or raise an error. """ - return evaluate_named_expression(node, node.name, node.type, node.expr, environment, stdlib) + return evaluate_named_expression( + node, node.name, node.type, node.expr, environment, stdlib + ) -def evaluate_call_inputs(context: Union[WDL.Error.SourceNode, WDL.Error.SourcePosition], expressions: Dict[str, WDL.Expr.Base], environment: WDLBindings, stdlib: WDL.StdLib.Base, inputs_dict: Optional[Dict[str, WDL.Type.Base]] = None) -> WDLBindings: + +def evaluate_call_inputs( + context: WDL.Error.SourceNode | WDL.Error.SourcePosition, + expressions: dict[str, WDL.Expr.Base], + environment: WDLBindings, + stdlib: WDL.StdLib.Base, + inputs_dict: dict[str, WDL.Type.Base] | None = None, +) -> WDLBindings: """ Evaluate a bunch of expressions with names, and make them into a fresh set of bindings. `inputs_dict` is a mapping of variable names to their expected type for the input decls in a task. @@ -2177,21 +2435,36 @@ def evaluate_call_inputs(context: Union[WDL.Error.SourceNode, WDL.Error.SourcePo # This is done to enable passing in a string into a task input of file type expected_type = inputs_dict.get(k, None) try: - new_bindings = new_bindings.bind(k, evaluate_named_expression(context, k, expected_type, v, environment, stdlib)) + new_bindings = new_bindings.bind( + k, + evaluate_named_expression( + context, k, expected_type, v, environment, stdlib + ), + ) except FileNotFoundError as e: # MiniWDL's type coercion will raise this when trying to make a File out of Null. - raise WDL.Error.EvalError(context, f"Cannot evaluate expression for {k} with value {v}") + raise WDL.Error.EvalError( + context, f"Cannot evaluate expression for {k} with value {v}" + ) return new_bindings -def evaluate_defaultable_decl(node: WDL.Tree.Decl, environment: WDLBindings, stdlib: WDL.StdLib.Base) -> WDL.Value.Base: + +def evaluate_defaultable_decl( + node: WDL.Tree.Decl, environment: WDLBindings, stdlib: WDL.StdLib.Base +) -> WDL.Value.Base: """ If the name of the declaration is already defined in the environment, return its value. Otherwise, return the evaluated expression. """ try: - if ((node.name in environment and not isinstance(environment[node.name], WDL.Value.Null)) - or (isinstance(environment.get(node.name), WDL.Value.Null) and node.type.optional)): - logger.debug('Name %s is already defined, not using default', node.name) + if ( + node.name in environment + and not isinstance(environment[node.name], WDL.Value.Null) + ) or ( + isinstance(environment.get(node.name), WDL.Value.Null) + and node.type.optional + ): + logger.debug("Name %s is already defined, not using default", node.name) if not isinstance(environment[node.name].type, type(node.type)): return environment[node.name].coerce(node.type) else: @@ -2199,8 +2472,11 @@ def evaluate_defaultable_decl(node: WDL.Tree.Decl, environment: WDLBindings, std else: if node.type is not None and not node.type.optional and node.expr is None: # We need a value for this but there isn't one. - raise WDL.Error.EvalError(node, f"Value for {node.name} was not provided and no default value is available") - logger.info('Defaulting %s to %s', node.name, node.expr) + raise WDL.Error.EvalError( + node, + f"Value for {node.name} was not provided and no default value is available", + ) + logger.info("Defaulting %s to %s", node.name, node.expr) return evaluate_decl(node, environment, stdlib) except Exception: # If something goes wrong, dump. @@ -2208,8 +2484,11 @@ def evaluate_defaultable_decl(node: WDL.Tree.Decl, environment: WDLBindings, std log_bindings(logger.error, "Statement was evaluated in:", [environment]) raise + # TODO: make these stdlib methods??? -def devirtualize_files(environment: WDLBindings, stdlib: ToilWDLStdLibBase) -> WDLBindings: +def devirtualize_files( + environment: WDLBindings, stdlib: ToilWDLStdLibBase +) -> WDLBindings: """ Make sure all the File values embedded in the given bindings point to files that are actually available to command line commands. @@ -2218,33 +2497,44 @@ def devirtualize_files(environment: WDLBindings, stdlib: ToilWDLStdLibBase) -> W logger.info("Devirtualizing files") return map_over_files_in_bindings(environment, stdlib._devirtualize_file) -def virtualize_files(environment: WDLBindings, stdlib: ToilWDLStdLibBase, enforce_existence: bool = True) -> WDLBindings: + +def virtualize_files( + environment: WDLBindings, stdlib: ToilWDLStdLibBase, enforce_existence: bool = True +) -> WDLBindings: """ Make sure all the File values embedded in the given bindings point to files that are usable from other machines. """ logger.info("Virtualizing files") - virtualize_func = partial(stdlib._virtualize_file, enforce_existence=enforce_existence) + virtualize_func = partial( + stdlib._virtualize_file, enforce_existence=enforce_existence + ) return map_over_files_in_bindings(environment, virtualize_func) + def add_paths(task_container: TaskContainer, host_paths: Iterable[str]) -> None: """ Based off of WDL.runtime.task_container.add_paths from miniwdl Maps the host path to the container paths """ # partition the files by host directory - host_paths_by_dir: Dict[str, Set[str]] = {} + host_paths_by_dir: dict[str, set[str]] = {} for host_path in host_paths: host_path_strip = host_path.rstrip("/") - if host_path not in task_container.input_path_map and host_path_strip not in task_container.input_path_map: + if ( + host_path not in task_container.input_path_map + and host_path_strip not in task_container.input_path_map + ): if not os.path.exists(host_path_strip): raise WDL.Error.InputError("input path not found: " + host_path) - host_paths_by_dir.setdefault(os.path.dirname(host_path_strip), set()).add(host_path) + host_paths_by_dir.setdefault(os.path.dirname(host_path_strip), set()).add( + host_path + ) # for each such partition of files # - if there are no basename collisions under input subdirectory 0, then mount them there. # - otherwise, mount them in a fresh subdirectory subd = 0 - id_to_subd: Dict[str, str] = {} + id_to_subd: dict[str, str] = {} for paths in host_paths_by_dir.values(): based = os.path.join(task_container.container_dir, "work/_miniwdl_inputs") for host_path in paths: @@ -2253,15 +2543,20 @@ def add_paths(task_container: TaskContainer, host_paths: Iterable[str]) -> None: id_to_subd[parent_id] = str(subd) subd += 1 host_path_subd = id_to_subd[parent_id] - container_path = os.path.join(based, host_path_subd, os.path.basename(host_path.rstrip("/"))) + container_path = os.path.join( + based, host_path_subd, os.path.basename(host_path.rstrip("/")) + ) if host_path.endswith("/"): container_path += "/" - assert container_path not in task_container.input_path_map_rev, f"{container_path}, {task_container.input_path_map_rev}" + assert ( + container_path not in task_container.input_path_map_rev + ), f"{container_path}, {task_container.input_path_map_rev}" task_container.input_path_map[host_path] = container_path task_container.input_path_map_rev[container_path] = host_path -def drop_if_missing(file: WDL.Value.File, standard_library: ToilWDLStdLibBase) -> Optional[WDL.Value.File]: - +def drop_if_missing( + file: WDL.Value.File, standard_library: ToilWDLStdLibBase +) -> WDL.Value.File | None: """ Return None if a file doesn't exist, or its path if it does. @@ -2275,30 +2570,52 @@ def drop_if_missing(file: WDL.Value.File, standard_library: ToilWDLStdLibBase) - if filename is not None and is_any_url(filename): try: - if filename.startswith(TOIL_URI_SCHEME) or AbstractJobStore.url_exists(filename): + if filename.startswith(TOIL_URI_SCHEME) or AbstractJobStore.url_exists( + filename + ): # We assume anything in the filestore actually exists. - devirtualized_filename = standard_library._devirtualize_filename(filename) + devirtualized_filename = standard_library._devirtualize_filename( + filename + ) file = set_file_value(file, devirtualized_filename) file = set_file_virtualized_value(file, filename) return file else: - logger.warning('File %s with type %s does not actually exist at its URI', filename, value_type) + logger.warning( + "File %s with type %s does not actually exist at its URI", + filename, + value_type, + ) return None except HTTPError as e: # The error doesn't always include the URL in its message. - logger.error("File %s could not be checked for existence due to HTTP error %d", filename, e.code) + logger.error( + "File %s could not be checked for existence due to HTTP error %d", + filename, + e.code, + ) raise else: # Get the absolute path, not resolving symlinks - effective_path = os.path.abspath(os.path.join(work_dir or os.getcwd(), filename)) + effective_path = os.path.abspath( + os.path.join(work_dir or os.getcwd(), filename) + ) if os.path.islink(effective_path) or os.path.exists(effective_path): # This is a broken symlink or a working symlink or a file. return file else: - logger.warning('File %s with type %s does not actually exist at %s', filename, value_type, effective_path) + logger.warning( + "File %s with type %s does not actually exist at %s", + filename, + value_type, + effective_path, + ) return None -def drop_missing_files(environment: WDLBindings, standard_library: ToilWDLStdLibBase) -> WDLBindings: + +def drop_missing_files( + environment: WDLBindings, standard_library: ToilWDLStdLibBase +) -> WDLBindings: """ Make sure all the File values embedded in the given bindings point to files that exist, or are null. @@ -2307,10 +2624,13 @@ def drop_missing_files(environment: WDLBindings, standard_library: ToilWDLStdLib """ # Determine where to evaluate relative paths relative to - drop_if_missing_with_workdir = partial(drop_if_missing, standard_library=standard_library) + drop_if_missing_with_workdir = partial( + drop_if_missing, standard_library=standard_library + ) return map_over_files_in_bindings(environment, drop_if_missing_with_workdir) -def get_file_paths_in_bindings(environment: WDLBindings) -> List[str]: + +def get_file_paths_in_bindings(environment: WDLBindings) -> list[str]: """ Get the paths of all files in the bindings. Doesn't guarantee that duplicates are removed. @@ -2321,17 +2641,22 @@ def get_file_paths_in_bindings(environment: WDLBindings) -> List[str]: paths = [] - def append_to_paths(file: WDL.Value.File) -> Optional[WDL.Value.File]: + def append_to_paths(file: WDL.Value.File) -> WDL.Value.File | None: # Append element and return the element. This is to avoid a logger warning inside map_over_typed_files_in_value() # But don't process nonexistent files if get_file_nonexistent(file) is False: path = file.value paths.append(path) return file + map_over_files_in_bindings(environment, append_to_paths) return paths -def map_over_files_in_bindings(environment: WDLBindings, transform: Callable[[WDL.Value.File], Optional[WDL.Value.File]]) -> WDLBindings: + +def map_over_files_in_bindings( + environment: WDLBindings, + transform: Callable[[WDL.Value.File], WDL.Value.File | None], +) -> WDLBindings: """ Run all File values embedded in the given bindings through the given transformation function. @@ -2344,7 +2669,10 @@ def map_over_files_in_bindings(environment: WDLBindings, transform: Callable[[WD return environment.map(lambda b: map_over_files_in_binding(b, transform)) -def map_over_files_in_binding(binding: WDL.Env.Binding[WDL.Value.Base], transform: Callable[[WDL.Value.File], Optional[WDL.Value.File]]) -> WDL.Env.Binding[WDL.Value.Base]: +def map_over_files_in_binding( + binding: WDL.Env.Binding[WDL.Value.Base], + transform: Callable[[WDL.Value.File], WDL.Value.File | None], +) -> WDL.Env.Binding[WDL.Value.Base]: """ Run all File values' types and values embedded in the given binding's value through the given transformation function. @@ -2352,7 +2680,12 @@ def map_over_files_in_binding(binding: WDL.Env.Binding[WDL.Value.Base], transfor The transformation function must not mutate the original File. """ - return WDL.Env.Binding(binding.name, map_over_typed_files_in_value(binding.value, transform), binding.info) + return WDL.Env.Binding( + binding.name, + map_over_typed_files_in_value(binding.value, transform), + binding.info, + ) + # TODO: We want to type this to say, for anything descended from a WDL type, we # return something descended from the same WDL type or a null. But I can't @@ -2361,7 +2694,9 @@ def map_over_files_in_binding(binding: WDL.Env.Binding[WDL.Value.Base], transfor # # For now we assume that any types extending the WDL value types will implement # compatible constructors. -def map_over_typed_files_in_value(value: WDL.Value.Base, transform: Callable[[WDL.Value.File], Optional[WDL.Value.File]]) -> WDL.Value.Base: +def map_over_typed_files_in_value( + value: WDL.Value.Base, transform: Callable[[WDL.Value.File], WDL.Value.File | None] +) -> WDL.Value.Base: """ Run all File values embedded in the given value through the given transformation function. @@ -2393,22 +2728,54 @@ def map_over_typed_files_in_value(value: WDL.Value.Base, transform: Callable[[WD return new_file elif isinstance(value, WDL.Value.Array): # This is an array, so recurse on the items - return WDL.Value.Array(value.type.item_type, [map_over_typed_files_in_value(v, transform) for v in value.value], value.expr) + return WDL.Value.Array( + value.type.item_type, + [map_over_typed_files_in_value(v, transform) for v in value.value], + value.expr, + ) elif isinstance(value, WDL.Value.Map): # This is a map, so recurse on the members of the items, which are tuples (but not wrapped as WDL Pair objects) # TODO: Can we avoid a cast in a comprehension if we get MyPy to know that each pair is always a 2-element tuple? - return WDL.Value.Map(value.type.item_type, [cast(Tuple[WDL.Value.Base, WDL.Value.Base], tuple((map_over_typed_files_in_value(v, transform) for v in pair))) for pair in value.value], value.expr) + return WDL.Value.Map( + value.type.item_type, + [ + cast( + tuple[WDL.Value.Base, WDL.Value.Base], + tuple(map_over_typed_files_in_value(v, transform) for v in pair), + ) + for pair in value.value + ], + value.expr, + ) elif isinstance(value, WDL.Value.Pair): # This is a pair, so recurse on the left and right items - return WDL.Value.Pair(value.type.left_type, value.type.right_type, cast(Tuple[WDL.Value.Base, WDL.Value.Base], tuple((map_over_typed_files_in_value(v, transform) for v in value.value))), value.expr) + return WDL.Value.Pair( + value.type.left_type, + value.type.right_type, + cast( + tuple[WDL.Value.Base, WDL.Value.Base], + tuple(map_over_typed_files_in_value(v, transform) for v in value.value), + ), + value.expr, + ) elif isinstance(value, WDL.Value.Struct): # This is a struct, so recurse on the values in the backing dict - return WDL.Value.Struct(cast(Union[WDL.Type.StructInstance, WDL.Type.Object], value.type), {k: map_over_typed_files_in_value(v, transform) for k, v in value.value.items()}, value.expr) + return WDL.Value.Struct( + cast(Union[WDL.Type.StructInstance, WDL.Type.Object], value.type), + { + k: map_over_typed_files_in_value(v, transform) + for k, v in value.value.items() + }, + value.expr, + ) else: # All other kinds of value can be passed through unmodified. return value -def ensure_null_files_are_nullable(value: WDL.Value.Base, original_value: WDL.Value.Base, expected_type: WDL.Type.Base) -> None: + +def ensure_null_files_are_nullable( + value: WDL.Value.Base, original_value: WDL.Value.Base, expected_type: WDL.Type.Base +) -> None: """ Run through all nested values embedded in the given value and check that the null values are valid. @@ -2425,29 +2792,44 @@ def ensure_null_files_are_nullable(value: WDL.Value.Base, original_value: WDL.Va """ if isinstance(value, WDL.Value.File): pass - elif isinstance(value, WDL.Value.Array) and isinstance(expected_type, WDL.Type.Array): + elif isinstance(value, WDL.Value.Array) and isinstance( + expected_type, WDL.Type.Array + ): for elem, orig_elem in zip(value.value, original_value.value): ensure_null_files_are_nullable(elem, orig_elem, expected_type.item_type) elif isinstance(value, WDL.Value.Map) and isinstance(expected_type, WDL.Type.Map): for pair, orig_pair in zip(value.value, original_value.value): # The key of the map cannot be optional or else it is not serializable, so we only need to check the value - ensure_null_files_are_nullable(pair[1], orig_pair[1], expected_type.item_type[1]) + ensure_null_files_are_nullable( + pair[1], orig_pair[1], expected_type.item_type[1] + ) elif isinstance(value, WDL.Value.Pair) and isinstance(expected_type, WDL.Type.Pair): - ensure_null_files_are_nullable(value.value[0], original_value.value[0], expected_type.left_type) - ensure_null_files_are_nullable(value.value[1], original_value.value[1], expected_type.right_type) - elif isinstance(value, WDL.Value.Struct) and isinstance(expected_type, WDL.Type.StructInstance): - for (k, v), (_, orig_v) in zip(value.value.items(), original_value.value.items()): + ensure_null_files_are_nullable( + value.value[0], original_value.value[0], expected_type.left_type + ) + ensure_null_files_are_nullable( + value.value[1], original_value.value[1], expected_type.right_type + ) + elif isinstance(value, WDL.Value.Struct) and isinstance( + expected_type, WDL.Type.StructInstance + ): + for (k, v), (_, orig_v) in zip( + value.value.items(), original_value.value.items() + ): # The parameters method for WDL.Type.StructInstance returns the values rather than the dictionary # While dictionaries are ordered, this should be more robust; the else branch should never be hit if expected_type.members is not None: ensure_null_files_are_nullable(v, orig_v, expected_type.members[k]) elif isinstance(value, WDL.Value.Null): if not expected_type.optional: - raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), original_value.value) + raise FileNotFoundError( + errno.ENOENT, os.strerror(errno.ENOENT), original_value.value + ) else: # Don't check other (unsupported?) types return + class WDLBaseJob(Job): """ Base job class for all WDL-related jobs. @@ -2471,8 +2853,8 @@ def __init__(self, wdl_options: WDLContext, **kwargs: Any) -> None: """ # Default everything to being a local job - if 'local' not in kwargs: - kwargs['local'] = True + if "local" not in kwargs: + kwargs["local"] = True super().__init__(**kwargs) @@ -2485,7 +2867,7 @@ def __init__(self, wdl_options: WDLContext, **kwargs: Any) -> None: # We need an ordered list of postprocessing steps to apply, because we # may have coalesced postprocessing steps deferred by several levels of # jobs returning other jobs' promised RVs. - self._postprocessing_steps: List[Tuple[str, Union[str, Promised[WDLBindings]]]] = [] + self._postprocessing_steps: list[tuple[str, str | Promised[WDLBindings]]] = [] self._wdl_options = wdl_options @@ -2571,7 +2953,7 @@ def postprocess(self, bindings: WDLBindings) -> WDLBindings: return bindings - def defer_postprocessing(self, other: "WDLBaseJob") -> None: + def defer_postprocessing(self, other: WDLBaseJob) -> None: """ Give our postprocessing steps to a different job. @@ -2583,6 +2965,7 @@ def defer_postprocessing(self, other: "WDLBaseJob") -> None: logger.debug("Assigned postprocessing steps from %s to %s", self, other) + class WDLTaskWrapperJob(WDLBaseJob): """ Job that determines the resources needed to run a WDL job. @@ -2594,7 +2977,14 @@ class WDLTaskWrapperJob(WDLBaseJob): All bindings are in terms of task-internal names. """ - def __init__(self, task: WDL.Tree.Task, prev_node_results: Sequence[Promised[WDLBindings]], task_id: List[str], wdl_options: WDLContext, **kwargs: Any) -> None: + def __init__( + self, + task: WDL.Tree.Task, + prev_node_results: Sequence[Promised[WDLBindings]], + task_id: list[str], + wdl_options: WDLContext, + **kwargs: Any, + ) -> None: """ Make a new job to determine resources and run a task. @@ -2602,9 +2992,18 @@ def __init__(self, task: WDL.Tree.Task, prev_node_results: Sequence[Promised[WDL The caller has alredy added the task's own name. """ # task_path in wdl_options is like the namespace, but including subscript numbers for scatters - super().__init__(unitName=wdl_options["task_path"] + ".inputs", displayName=wdl_options["namespace"] + ".inputs", wdl_options=wdl_options, **kwargs) + super().__init__( + unitName=wdl_options["task_path"] + ".inputs", + displayName=wdl_options["namespace"] + ".inputs", + wdl_options=wdl_options, + **kwargs, + ) - logger.info("Preparing to run task code for %s as %s", task.name, wdl_options["namespace"]) + logger.info( + "Preparing to run task code for %s as %s", + task.name, + wdl_options["namespace"], + ) log_bindings(logger.debug, "Incoming bindings:", prev_node_results) self._task = task @@ -2617,7 +3016,12 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: Evaluate inputs and runtime and schedule the task. """ super().run(file_store) - logger.info("Evaluating inputs and runtime for task %s (%s) called as %s", self._task.name, self._task_id, self._wdl_options["namespace"]) + logger.info( + "Evaluating inputs and runtime for task %s (%s) called as %s", + self._task.name, + self._task_id, + self._wdl_options["namespace"], + ) # Set up the WDL standard library standard_library = ToilWDLStdLibBase(file_store, self._wdl_options) @@ -2648,93 +3052,122 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: if self._task.inputs: logger.debug("Evaluating task code") # Evaluate all the inputs that aren't pre-set - bindings = evaluate_decls_to_bindings(self._task.inputs, bindings, standard_library, include_previous=True) + bindings = evaluate_decls_to_bindings( + self._task.inputs, bindings, standard_library, include_previous=True + ) if self._task.postinputs: # Evaluate all the postinput decls. # We need these in order to evaluate the runtime. # TODO: What if they wanted resources from the runtime? - bindings = evaluate_decls_to_bindings(self._task.postinputs, bindings, standard_library, include_previous=True) + bindings = evaluate_decls_to_bindings( + self._task.postinputs, bindings, standard_library, include_previous=True + ) log_bindings(logger.debug, "Task bindings:", [bindings]) # Evaluate the runtime section - runtime_bindings = evaluate_call_inputs(self._task, self._task.runtime, bindings, standard_library) + runtime_bindings = evaluate_call_inputs( + self._task, self._task.runtime, bindings, standard_library + ) # Fill these in with not-None if the workflow asks for each resource. - runtime_memory: Optional[int] = None - runtime_cores: Optional[float] = None - runtime_disk: Optional[int] = None - runtime_accelerators: Optional[List[AcceleratorRequirement]] = None + runtime_memory: int | None = None + runtime_cores: float | None = None + runtime_disk: int | None = None + runtime_accelerators: list[AcceleratorRequirement] | None = None - if runtime_bindings.has_binding('cpu'): - cpu_spec: int = runtime_bindings.resolve('cpu').value + if runtime_bindings.has_binding("cpu"): + cpu_spec: int = runtime_bindings.resolve("cpu").value runtime_cores = float(cpu_spec) - if runtime_bindings.has_binding('memory'): + if runtime_bindings.has_binding("memory"): # Get the memory requirement and convert to bytes - memory_spec: Union[int, str] = runtime_bindings.resolve('memory').value + memory_spec: int | str = runtime_bindings.resolve("memory").value if isinstance(memory_spec, str): memory_spec = human2bytes(memory_spec) runtime_memory = memory_spec - mount_spec: Dict[Optional[str], int] = dict() - if runtime_bindings.has_binding('disks'): + mount_spec: dict[str | None, int] = dict() + if runtime_bindings.has_binding("disks"): # Miniwdl doesn't have this, but we need to be able to parse things like: # local-disk 5 SSD # which would mean we need 5 GB space. Cromwell docs for this are at https://cromwell.readthedocs.io/en/stable/RuntimeAttributes/#disks # We ignore all disk types, and complain if the mount point is not `local-disk`. - disks_spec: Union[List[WDL.Value.String], str] = runtime_bindings.resolve('disks').value + disks_spec: list[WDL.Value.String] | str = runtime_bindings.resolve( + "disks" + ).value if isinstance(disks_spec, list): # SPEC says to use the first one # the parser gives an array of WDL string objects all_specs = [part.value for part in disks_spec] else: - all_specs = disks_spec.split(',') + all_specs = disks_spec.split(",") # Sum up the space in each disk specification total_bytes: float = 0 for spec in all_specs: - specified_mount_point, part_size, part_suffix = parse_disks(spec, disks_spec) + specified_mount_point, part_size, part_suffix = parse_disks( + spec, disks_spec + ) per_part_size = convert_units(part_size, part_suffix) total_bytes += per_part_size if mount_spec.get(specified_mount_point) is not None: if specified_mount_point is not None: # raise an error as all mount points must be unique - raise ValueError(f"Could not parse disks = {disks_spec} because the mount point {specified_mount_point} is specified multiple times") + raise ValueError( + f"Could not parse disks = {disks_spec} because the mount point {specified_mount_point} is specified multiple times" + ) else: - raise ValueError(f"Could not parse disks = {disks_spec} because the mount point is omitted more than once") + raise ValueError( + f"Could not parse disks = {disks_spec} because the mount point is omitted more than once" + ) # TODO: we always ignore the disk type and assume we have the right one. mount_spec[specified_mount_point] = int(per_part_size) runtime_disk = int(total_bytes) - if not runtime_bindings.has_binding("gpu") and self._task.effective_wdl_version in ('1.0', 'draft-2'): + if not runtime_bindings.has_binding( + "gpu" + ) and self._task.effective_wdl_version in ("1.0", "draft-2"): # For old WDL versions, guess whether the task wants GPUs if not specified. - use_gpus = (runtime_bindings.has_binding('gpuCount') or - runtime_bindings.has_binding('gpuType') or - runtime_bindings.has_binding('nvidiaDriverVersion')) + use_gpus = ( + runtime_bindings.has_binding("gpuCount") + or runtime_bindings.has_binding("gpuType") + or runtime_bindings.has_binding("nvidiaDriverVersion") + ) else: # The gpu field is the WDL 1.1 standard with a default value of false, # so in 1.1+ documents, this field will be the absolute # truth on whether to use GPUs or not. # Fields such as gpuType and gpuCount will control what GPUs are provided. - use_gpus = cast(WDL.Value.Boolean, runtime_bindings.get('gpu', WDL.Value.Boolean(False))).value + use_gpus = cast( + WDL.Value.Boolean, runtime_bindings.get("gpu", WDL.Value.Boolean(False)) + ).value if use_gpus: # We want to have GPUs # TODO: actually coerce types here instead of casting to detect user mistakes # Get the GPU count if set, or 1 if not, - gpu_count: int = cast(WDL.Value.Int, runtime_bindings.get('gpuCount', WDL.Value.Int(1))).value + gpu_count: int = cast( + WDL.Value.Int, runtime_bindings.get("gpuCount", WDL.Value.Int(1)) + ).value # Get the GPU model constraint if set, or None if not - gpu_model: Optional[str] = cast(Union[WDL.Value.String, WDL.Value.Null], runtime_bindings.get('gpuType', WDL.Value.Null())).value + gpu_model: str | None = cast( + Union[WDL.Value.String, WDL.Value.Null], + runtime_bindings.get("gpuType", WDL.Value.Null()), + ).value # We can't enforce a driver version, but if an nvidia driver # version is set, manually set nvidia brand - gpu_brand: Optional[str] = 'nvidia' if runtime_bindings.has_binding('nvidiaDriverVersion') else None + gpu_brand: str | None = ( + "nvidia" + if runtime_bindings.has_binding("nvidiaDriverVersion") + else None + ) # Make a dict from this - accelerator_spec: Dict[str, Union[str, int]] = {'kind': 'gpu', 'count': gpu_count} + accelerator_spec: dict[str, str | int] = {"kind": "gpu", "count": gpu_count} if gpu_model is not None: - accelerator_spec['model'] = gpu_model + accelerator_spec["model"] = gpu_model if gpu_brand is not None: - accelerator_spec['brand'] = gpu_brand + accelerator_spec["brand"] = gpu_brand accelerator_requirement = parse_accelerator(accelerator_spec) runtime_accelerators = [accelerator_requirement] @@ -2747,7 +3180,9 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: run_job = WDLTaskJob( self._task, virtualize_files(bindings, standard_library, enforce_existence=False), - virtualize_files(runtime_bindings, standard_library, enforce_existence=False), + virtualize_files( + runtime_bindings, standard_library, enforce_existence=False + ), self._task_id, cores=runtime_cores or self.cores, memory=runtime_memory or self.memory, @@ -2767,7 +3202,6 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: return run_job.rv() - class WDLTaskJob(WDLBaseJob): """ Job that runs a WDL task. @@ -2784,11 +3218,11 @@ def __init__( task: WDL.Tree.Task, task_internal_bindings: Promised[WDLBindings], runtime_bindings: Promised[WDLBindings], - task_id: List[str], - mount_spec: Dict[Optional[str], int], + task_id: list[str], + mount_spec: dict[str | None, int], wdl_options: WDLContext, - cache_key: Optional[str] = None, - **kwargs: Any + cache_key: str | None = None, + **kwargs: Any, ) -> None: """ Make a new job to run a task. @@ -2799,9 +3233,17 @@ def __init__( # This job should not be local because it represents a real workflow task. # task_path in wdl_options is like the namespace, but including subscript numbers for scatters - super().__init__(unitName=wdl_options["task_path"] + ".command", displayName=wdl_options["namespace"] + ".command", local=False, wdl_options=wdl_options, **kwargs) + super().__init__( + unitName=wdl_options["task_path"] + ".command", + displayName=wdl_options["namespace"] + ".command", + local=False, + wdl_options=wdl_options, + **kwargs, + ) - logger.info("Preparing to run task %s as %s", task.name, wdl_options["namespace"]) + logger.info( + "Preparing to run task %s as %s", task.name, wdl_options["namespace"] + ) log_bindings(logger.debug, "Internal bindings:", [task_internal_bindings]) log_bindings(logger.debug, "Runtime bindings:", [runtime_bindings]) @@ -2835,7 +3277,8 @@ def add_injections(self, command_string: str, task_container: TaskContainer) -> # and so on from inside the container, since it won't be attributed # to Toil child processes in the leader's self-monitoring. # TODO: Mount this from a file Toil installs instead or something. - script = textwrap.dedent("""\ + script = textwrap.dedent( + """\ function _toil_resource_monitor () { # Turn off error checking and echo in here set +ex @@ -2866,7 +3309,8 @@ def add_injections(self, command_string: str, task_container: TaskContainer) -> sleep 1 done } - """) + """ + ) parts.append(script) # Launch in a subshell so that it doesn't interfere with Bash "wait" in the main shell parts.append(f"(_toil_resource_monitor {self.INJECTED_MESSAGE_DIR} &)") @@ -2876,7 +3320,8 @@ def add_injections(self, command_string: str, task_container: TaskContainer) -> # being mounted may appear as size 0 in the container due to a race # condition. Check for this and produce an approperiate error. - script = textwrap.dedent("""\ + script = textwrap.dedent( + """\ function _toil_check_size () { TARGET_FILE="${1}" GOT_SIZE="$(stat -c %s "${TARGET_FILE}")" @@ -2889,23 +3334,28 @@ def add_injections(self, command_string: str, task_container: TaskContainer) -> exit 1 fi } - """) + """ + ) parts.append(script) for host_path, job_path in task_container.input_path_map.items(): expected_size = os.path.getsize(host_path) if expected_size != 0: - parts.append(f"_toil_check_size \"{job_path}\" {expected_size}") + parts.append(f'_toil_check_size "{job_path}" {expected_size}') parts.append(command_string) return "\n".join(parts) - def handle_injection_messages(self, outputs_library: ToilWDLStdLibTaskOutputs) -> None: + def handle_injection_messages( + self, outputs_library: ToilWDLStdLibTaskOutputs + ) -> None: """ Handle any data received from injected runtime code in the container. """ - message_files = outputs_library._glob(WDL.Value.String(os.path.join(self.INJECTED_MESSAGE_DIR, "*"))) + message_files = outputs_library._glob( + WDL.Value.String(os.path.join(self.INJECTED_MESSAGE_DIR, "*")) + ) logger.debug("Handling message files: %s", message_files) for message_file in message_files.value: self.handle_message_file(message_file.value) @@ -2918,9 +3368,9 @@ def handle_message_file(self, file_path: str) -> None: """ if os.path.basename(file_path) == "resources.tsv": # This is a TSV of resource usage info. - first_cpu_usec: Optional[int] = None - last_cpu_usec: Optional[int] = None - max_memory_bytes: Optional[int] = None + first_cpu_usec: int | None = None + last_cpu_usec: int | None = None + max_memory_bytes: int | None = None for line in open(file_path): if not line.endswith("\n"): @@ -2946,11 +3396,14 @@ def handle_message_file(self, file_path: str) -> None: max_memory_bytes = memory_bytes if max_memory_bytes is not None: - logger.info("Container used at about %s bytes of memory at peak", max_memory_bytes) + logger.info( + "Container used at about %s bytes of memory at peak", + max_memory_bytes, + ) # Treat it as if used by a child process ResourceMonitor.record_extra_memory(max_memory_bytes // 1024) if last_cpu_usec is not None: - assert(first_cpu_usec is not None) + assert first_cpu_usec is not None cpu_seconds = (last_cpu_usec - first_cpu_usec) / 1000000 logger.info("Container used about %s seconds of CPU time", cpu_seconds) # Treat it as if used by a child process @@ -2967,17 +3420,19 @@ def can_fake_root(self) -> bool: # We need to have an entry for our user in /etc/subuid to grant us a range of UIDs to use, for fakeroot to work. try: - subuid_file = open('/etc/subuid') + subuid_file = open("/etc/subuid") except OSError as e: - logger.warning('Cannot open /etc/subuid due to %s; assuming no subuids available', e) + logger.warning( + "Cannot open /etc/subuid due to %s; assuming no subuids available", e + ) return False username = get_user_name() for line in subuid_file: - if line.split(':')[0].strip() == username: + if line.split(":")[0].strip() == username: # We have a line assigning subuids return True # If there is no line, we have no subuids - logger.warning('No subuids are assigned to %s; cannot fake root.', username) + logger.warning("No subuids are assigned to %s; cannot fake root.", username) return False def can_mount_proc(self) -> bool: @@ -2990,7 +3445,9 @@ def can_mount_proc(self) -> bool: """ return "KUBERNETES_SERVICE_HOST" not in os.environ - def ensure_mount_point(self, file_store: AbstractFileStore, mount_spec: Dict[Optional[str], int]) -> Dict[str, str]: + def ensure_mount_point( + self, file_store: AbstractFileStore, mount_spec: dict[str | None, int] + ) -> dict[str, str]: """ Ensure the mount point sources are available. @@ -3016,23 +3473,39 @@ def ensure_mount_point(self, file_store: AbstractFileStore, mount_spec: Dict[Opt total_mount_size = sum(mount_spec.values()) try: # Use arguments from the df POSIX standard - df_line = subprocess.check_output(["df", "-k", "-P", tmpdir], encoding="utf-8").split("\n")[1] + df_line = subprocess.check_output( + ["df", "-k", "-P", tmpdir], encoding="utf-8" + ).split("\n")[1] m = re.match(regex_df, df_line) if m is None: logger.debug("Output of df may be malformed: %s", df_line) - logger.warning("Unable to check disk requirements as output of 'df' command is malformed. Will assume storage is always available.") + logger.warning( + "Unable to check disk requirements as output of 'df' command is malformed. Will assume storage is always available." + ) else: # Block size will always be 1024 available_space = int(m[1]) * 1024 if available_space < total_mount_size: # We do not have enough space available for this mount point # An omitted mount point is the task's execution directory so show that to the user instead - raise InsufficientMountDiskSpace([mount_point if mount_point is not None else "/mnt/miniwdl_task_container/work" for mount_point in mount_spec.keys()], - total_mount_size, available_space) + raise InsufficientMountDiskSpace( + [ + ( + mount_point + if mount_point is not None + else "/mnt/miniwdl_task_container/work" + ) + for mount_point in mount_spec.keys() + ], + total_mount_size, + available_space, + ) except subprocess.CalledProcessError as e: # If df somehow isn't available logger.debug("Unable to call df. stdout: %s stderr: %s", e.stdout, e.stderr) - logger.warning("Unable to check disk requirements as call to 'df' command failed. Will assume storage is always available.") + logger.warning( + "Unable to check disk requirements as call to 'df' command failed. Will assume storage is always available." + ) for mount_target in mount_spec.keys(): # Create a new subdirectory for each mount point source_location = os.path.join(tmpdir, str(uuid.uuid4())) @@ -3048,7 +3521,12 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: Actually run the task. """ super().run(file_store) - logger.info("Running task command for %s (%s) called as %s", self._task.name, self._task_id, self._wdl_options["namespace"]) + logger.info( + "Running task command for %s (%s) called as %s", + self._task.name, + self._task_id, + self._wdl_options["namespace"], + ) # Set up the WDL standard library # UUID to use for virtualizing files @@ -3068,7 +3546,10 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: # We have all the resources we need, so run the task - if shutil.which('singularity') and self._wdl_options.get("container") in ["singularity", "auto"]: + if shutil.which("singularity") and self._wdl_options.get("container") in [ + "singularity", + "auto", + ]: # Prepare to use Singularity. We will need plenty of space to # download images. # Default the Singularity and MiniWDL cache directories. This sets the cache to the same place as @@ -3080,38 +3561,50 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: miniwdl_singularity_cache = os.path.join(os.path.expanduser("~"), ".cache/miniwdl") # Cache Singularity's layers somewhere known to have space - os.environ['SINGULARITY_CACHEDIR'] = os.environ.get("SINGULARITY_CACHEDIR", singularity_cache) + os.environ["SINGULARITY_CACHEDIR"] = os.environ.get( + "SINGULARITY_CACHEDIR", singularity_cache + ) # Make sure it exists. - os.makedirs(os.environ['SINGULARITY_CACHEDIR'], exist_ok=True) + os.makedirs(os.environ["SINGULARITY_CACHEDIR"], exist_ok=True) # Cache Singularity images for the workflow on this machine. # Since MiniWDL does only within-process synchronization for pulls, # we also will need to pre-pull one image into here at a time. - os.environ['MINIWDL__SINGULARITY__IMAGE_CACHE'] = os.environ.get("MINIWDL__SINGULARITY__IMAGE_CACHE", miniwdl_singularity_cache) + os.environ["MINIWDL__SINGULARITY__IMAGE_CACHE"] = os.environ.get( + "MINIWDL__SINGULARITY__IMAGE_CACHE", miniwdl_singularity_cache + ) # Make sure it exists. - os.makedirs(os.environ['MINIWDL__SINGULARITY__IMAGE_CACHE'], exist_ok=True) + os.makedirs(os.environ["MINIWDL__SINGULARITY__IMAGE_CACHE"], exist_ok=True) # Run containers with Singularity - TaskContainerImplementation: Type[TaskContainer] = SingularityContainer + TaskContainerImplementation: type[TaskContainer] = SingularityContainer elif self._wdl_options.get("container") in ["docker", "auto"]: # Run containers with Docker # TODO: Poll if it is available and don't just try and fail. TaskContainerImplementation = SwarmContainer - if runtime_bindings.has_binding('gpuType') or runtime_bindings.has_binding('gpuCount') or runtime_bindings.has_binding('nvidiaDriverVersion'): + if ( + runtime_bindings.has_binding("gpuType") + or runtime_bindings.has_binding("gpuCount") + or runtime_bindings.has_binding("nvidiaDriverVersion") + ): # Complain to the user that this is unlikely to work. - logger.warning("Running job that might need accelerators with Docker. " - "Accelerator and GPU support " - "is not yet implemented in the MiniWDL Docker " - "containerization implementation.") + logger.warning( + "Running job that might need accelerators with Docker. " + "Accelerator and GPU support " + "is not yet implemented in the MiniWDL Docker " + "containerization implementation." + ) else: - raise RuntimeError(f"Could not find a working container engine to use; told to use {self._wdl_options.get('container')}") + raise RuntimeError( + f"Could not find a working container engine to use; told to use {self._wdl_options.get('container')}" + ) # Set up the MiniWDL container running stuff miniwdl_logger = logging.getLogger("MiniWDL") miniwdl_config = WDL.runtime.config.Loader(miniwdl_logger) - if not getattr(TaskContainerImplementation, 'toil_initialized__', False): + if not getattr(TaskContainerImplementation, "toil_initialized__", False): # Initialize the cointainer system TaskContainerImplementation.global_init(miniwdl_config, miniwdl_logger) @@ -3122,16 +3615,18 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: # up. If we don't do this, we error out trying to make # _SubprocessScheduler instances because its class-level condition # variable doesn't exist. - TaskContainerImplementation.detect_resource_limits(miniwdl_config, miniwdl_logger) + TaskContainerImplementation.detect_resource_limits( + miniwdl_config, miniwdl_logger + ) # And remember we did it - setattr(TaskContainerImplementation, 'toil_initialized__', True) + setattr(TaskContainerImplementation, "toil_initialized__", True) # TODO: not thread safe! # Records, if we use a container, where its workdir is on our # filesystem, so we can interpret file anmes and globs relative to # there. - workdir_in_container: Optional[str] = None + workdir_in_container: str | None = None task_path = self._wdl_options["task_path"] if self._task.command: @@ -3156,10 +3651,12 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: # TODO: What is this? run_id = str(uuid.uuid4()) # Directory on the host where the conteiner is allowed to put files. - host_dir = os.path.abspath('.') + host_dir = os.path.abspath(".") # Container working directory is guaranteed (?) to be at "work" inside there workdir_in_container = os.path.join(host_dir, "work") - task_container = TaskContainerImplementation(miniwdl_config, run_id, host_dir) + task_container = TaskContainerImplementation( + miniwdl_config, run_id, host_dir + ) if isinstance(task_container, SingularityContainer): # We need to patch the Singularity container run invocation @@ -3169,29 +3666,32 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: # --fakeroot if we lack sub-UIDs. So we sneakily monkey patch it # here. original_run_invocation = task_container._run_invocation - def patched_run_invocation(*args: Any, **kwargs: Any) -> List[str]: + + def patched_run_invocation(*args: Any, **kwargs: Any) -> list[str]: """ Invoke the original _run_invocation to get a base Singularity command line, and then adjust the result to pass GPUs and not fake root if needed. """ - command_line: List[str] = original_run_invocation(*args, **kwargs) + command_line: list[str] = original_run_invocation(*args, **kwargs) - logger.debug('MiniWDL wants to run command line: %s', command_line) + logger.debug("MiniWDL wants to run command line: %s", command_line) # "exec" can be at index 1 or 2 depending on if we have a --verbose. subcommand_index = 2 if command_line[1] == "--verbose" else 1 - if '--fakeroot' in command_line and not self.can_fake_root(): + if "--fakeroot" in command_line and not self.can_fake_root(): # We can't fake root so don't try. - command_line.remove('--fakeroot') + command_line.remove("--fakeroot") # If on Kubernetes and proc cannot be mounted, get rid of --containall - if '--containall' in command_line and not self.can_mount_proc(): - command_line.remove('--containall') + if "--containall" in command_line and not self.can_mount_proc(): + command_line.remove("--containall") - extra_flags: Set[str] = set() - accelerators_needed: Optional[List[AcceleratorRequirement]] = self.accelerators + extra_flags: set[str] = set() + accelerators_needed: list[AcceleratorRequirement] | None = ( + self.accelerators + ) local_accelerators = get_individual_local_accelerators() if accelerators_needed is not None: for accelerator in accelerators_needed: @@ -3199,25 +3699,32 @@ def patched_run_invocation(*args: Any, **kwargs: Any) -> List[str]: # Right now this assumes all GPUs on the node are the same; we only look at the first available GPU # and assume homogeneity # This shouldn't cause issues unless a user has a very odd machine setup, which should be rare - if accelerator['kind'] == 'gpu': + if accelerator["kind"] == "gpu": # Grab detected GPUs - local_gpus: List[Optional[str]] = [accel['brand'] for accel in local_accelerators if accel['kind'] == 'gpu'] or [None] + local_gpus: list[str | None] = [ + accel["brand"] + for accel in local_accelerators + if accel["kind"] == "gpu" + ] or [None] # Tell singularity the GPU type - gpu_brand = accelerator.get('brand') or local_gpus[0] - if gpu_brand == 'nvidia': + gpu_brand = accelerator.get("brand") or local_gpus[0] + if gpu_brand == "nvidia": # Tell Singularity to expose nvidia GPUs - extra_flags.add('--nv') - elif gpu_brand == 'amd': + extra_flags.add("--nv") + elif gpu_brand == "amd": # Tell Singularity to expose ROCm GPUs - extra_flags.add('--rocm') + extra_flags.add("--rocm") else: - raise RuntimeError('Cannot expose allocated accelerator %s to Singularity job', accelerator) + raise RuntimeError( + "Cannot expose allocated accelerator %s to Singularity job", + accelerator, + ) for flag in extra_flags: # Put in all those flags command_line.insert(subcommand_index + 1, flag) - logger.debug('Amended command line to: %s', command_line) + logger.debug("Amended command line to: %s", command_line) # Return the modified command line return command_line @@ -3227,47 +3734,65 @@ def patched_run_invocation(*args: Any, **kwargs: Any) -> List[str]: singularity_original_prepare_mounts = task_container.prepare_mounts - def patch_prepare_mounts_singularity() -> List[Tuple[str, str, bool]]: + def patch_prepare_mounts_singularity() -> list[tuple[str, str, bool]]: """ Mount the mount points specified from the disk requirements. The singularity and docker patch are separate as they have different function signatures """ - mounts: List[Tuple[str, str, bool]] = singularity_original_prepare_mounts() + mounts: list[tuple[str, str, bool]] = ( + singularity_original_prepare_mounts() + ) # todo: support AWS EBS/Kubernetes persistent volumes # this logic likely only works for local clusters as we don't deal with the size of each mount point for mount_point, source_location in mount_mapping.items(): mounts.append((mount_point, source_location, True)) return mounts + task_container.prepare_mounts = patch_prepare_mounts_singularity # type: ignore[method-assign] elif isinstance(task_container, SwarmContainer): docker_original_prepare_mounts = task_container.prepare_mounts try: # miniwdl depends on docker so this should be available but check just in case - import docker + pass # docker stubs are still WIP: https://github.com/docker/docker-py/issues/2796 from docker.types import Mount # type: ignore[import-untyped] - def patch_prepare_mounts_docker(logger: logging.Logger) -> List[Mount]: + def patch_prepare_mounts_docker( + logger: logging.Logger, + ) -> list[Mount]: """ Same as the singularity patch but for docker """ - mounts: List[Mount] = docker_original_prepare_mounts(logger) + mounts: list[Mount] = docker_original_prepare_mounts(logger) for mount_point, source_location in mount_mapping.items(): mounts.append( Mount( mount_point.rstrip("/").replace("{{", '{{"{{"}}'), - source_location.rstrip("/").replace("{{", '{{"{{"}}'), + source_location.rstrip("/").replace( + "{{", '{{"{{"}}' + ), type="bind", ) ) return mounts + task_container.prepare_mounts = patch_prepare_mounts_docker # type: ignore[method-assign] except ImportError: - logger.warning("Docker package not installed. Unable to add mount points.") + logger.warning( + "Docker package not installed. Unable to add mount points." + ) # Show the runtime info to the container - task_container.process_runtime(miniwdl_logger, {binding.name: binding.value for binding in devirtualize_files(runtime_bindings, standard_library)}) + task_container.process_runtime( + miniwdl_logger, + { + binding.name: binding.value + for binding in devirtualize_files( + runtime_bindings, standard_library + ) + }, + ) # Tell the container to take up all these files. It will assign # them all new paths in task_container.input_path_map which we can @@ -3278,19 +3803,38 @@ def patch_prepare_mounts_docker(logger: logging.Logger) -> List[Mount]: # Replace everything with in-container paths for the command. # TODO: MiniWDL deals with directory paths specially here. - def get_path_in_container(file: WDL.Value.File) -> Optional[WDL.Value.File]: + def get_path_in_container(file: WDL.Value.File) -> WDL.Value.File | None: if get_file_nonexistent(file) is False: - return set_file_value(file, task_container.input_path_map[file.value]) - contained_bindings = map_over_files_in_bindings(bindings, get_path_in_container) + return set_file_value( + file, task_container.input_path_map[file.value] + ) + return None + + contained_bindings = map_over_files_in_bindings( + bindings, get_path_in_container + ) # Make a new standard library for evaluating the command specifically, which only deals with in-container paths and out-of-container paths. command_wdl_options: WDLContext = self._wdl_options.copy() if workdir_in_container is not None: command_wdl_options["execution_dir"] = workdir_in_container - command_library = ToilWDLStdLibTaskCommand(file_store, task_container, wdl_options=command_wdl_options) + command_library = ToilWDLStdLibTaskCommand( + file_store, task_container, wdl_options=command_wdl_options + ) # Work out the command string, and unwrap it - command_string: str = evaluate_named_expression(self._task, "command", WDL.Type.String(), remove_common_leading_whitespace(self._task.command), contained_bindings, command_library).coerce(WDL.Type.String()).value + command_string: str = ( + evaluate_named_expression( + self._task, + "command", + WDL.Type.String(), + remove_common_leading_whitespace(self._task.command), + contained_bindings, + command_library, + ) + .coerce(WDL.Type.String()) + .value + ) # Do any command injection we might need to do command_string = self.add_injections(command_string, task_container) @@ -3299,27 +3843,35 @@ def get_path_in_container(file: WDL.Value.File) -> Optional[WDL.Value.File]: # them because in the current MiniWDL version they are untyped. # TODO: MyPy will complain if we accomodate this and they later # become typed. - host_stdout_txt: str = task_container.host_stdout_txt() # type: ignore - host_stderr_txt: str = task_container.host_stderr_txt() # type: ignore + host_stdout_txt: str = task_container.host_stdout_txt() # type: ignore + host_stderr_txt: str = task_container.host_stderr_txt() # type: ignore if isinstance(task_container, SingularityContainer): # Before running the command, we need to make sure the container's # image is already pulled, so MiniWDL doesn't try and pull it. # MiniWDL only locks its cache directory within a process, and we # need to coordinate with other processes sharing the cache. - with global_mutex(os.environ['MINIWDL__SINGULARITY__IMAGE_CACHE'], 'toil_miniwdl_sif_cache_mutex'): + with global_mutex( + os.environ["MINIWDL__SINGULARITY__IMAGE_CACHE"], + "toil_miniwdl_sif_cache_mutex", + ): # Also lock the Singularity layer cache in case it is shared with a different set of hosts # TODO: Will these locks work well across machines??? - with global_mutex(os.environ['SINGULARITY_CACHEDIR'], 'toil_singularity_cache_mutex'): + with global_mutex( + os.environ["SINGULARITY_CACHEDIR"], + "toil_singularity_cache_mutex", + ): with ExitStack() as cleanup: task_container._pull(miniwdl_logger, cleanup) # Log that we are about to run the command in the container - logger.info('Executing command in %s: %s', task_container, command_string) + logger.info("Executing command in %s: %s", task_container, command_string) # Now our inputs are all downloaded. Let debugging break in (after command is logged). # But we need to hint which host paths are meant to be which container paths - host_and_job_paths: List[Tuple[str, str]] = [(k, v) for k, v in task_container.input_path_map.items()] + host_and_job_paths: list[tuple[str, str]] = [ + (k, v) for k, v in task_container.input_path_map.items() + ] self.files_downloaded_hook(host_and_job_paths) # TODO: Really we might want to set up a fake container working directory, to actually help the user. @@ -3329,10 +3881,16 @@ def get_path_in_container(file: WDL.Value.File) -> Optional[WDL.Value.File]: except Exception: if os.path.exists(host_stderr_txt): size = os.path.getsize(host_stderr_txt) - logger.error('Failed task left standard error at %s of %d bytes', host_stderr_txt, size) + logger.error( + "Failed task left standard error at %s of %d bytes", + host_stderr_txt, + size, + ) if size > 0: # Send the whole error stream. - file_store.log_user_stream(task_path + '.stderr', open(host_stderr_txt, 'rb')) + file_store.log_user_stream( + task_path + ".stderr", open(host_stderr_txt, "rb") + ) if logger.isEnabledFor(logging.DEBUG): logger.debug("MiniWDL already logged standard error") else: @@ -3342,18 +3900,24 @@ def get_path_in_container(file: WDL.Value.File) -> Optional[WDL.Value.File]: # gets printed at the end of the workflow. So log # the error log ourselves. logger.error("====TASK ERROR LOG====") - for line in open(host_stderr_txt, 'r', errors="replace"): - logger.error("> %s", line.rstrip('\n')) + for line in open(host_stderr_txt, errors="replace"): + logger.error("> %s", line.rstrip("\n")) logger.error("====TASK ERROR LOG====") if os.path.exists(host_stdout_txt): size = os.path.getsize(host_stdout_txt) - logger.info('Failed task left standard output at %s of %d bytes', host_stdout_txt, size) + logger.info( + "Failed task left standard output at %s of %d bytes", + host_stdout_txt, + size, + ) if size > 0: # Save the whole output stream. # TODO: We can't tell if this was supposed to be # captured. It might really be huge binary data. - file_store.log_user_stream(task_path + '.stdout', open(host_stdout_txt, 'rb')) + file_store.log_user_stream( + task_path + ".stdout", open(host_stdout_txt, "rb") + ) # Keep crashing raise @@ -3373,38 +3937,59 @@ def get_path_in_container(file: WDL.Value.File) -> Optional[WDL.Value.File]: output_wdl_options: WDLContext = self._wdl_options.copy() if workdir_in_container is not None: output_wdl_options["execution_dir"] = workdir_in_container - outputs_library = ToilWDLStdLibTaskOutputs(file_store, host_stdout_txt, host_stderr_txt, task_container.input_path_map, wdl_options=output_wdl_options, share_files_with=standard_library) - output_bindings = evaluate_decls_to_bindings(self._task.outputs, bindings, outputs_library, drop_missing_files=True) + outputs_library = ToilWDLStdLibTaskOutputs( + file_store, + host_stdout_txt, + host_stderr_txt, + task_container.input_path_map, + wdl_options=output_wdl_options, + share_files_with=standard_library, + ) + output_bindings = evaluate_decls_to_bindings( + self._task.outputs, bindings, outputs_library, drop_missing_files=True + ) # Now we know if the standard output and error were sent somewhere by # the workflow. If not, we should report them to the leader. if not outputs_library.stderr_used() and os.path.exists(host_stderr_txt): size = os.path.getsize(host_stderr_txt) - logger.info('Unused standard error at %s of %d bytes', host_stderr_txt, size) + logger.info( + "Unused standard error at %s of %d bytes", host_stderr_txt, size + ) if size > 0: # Save the whole error stream because the workflow didn't capture it. - file_store.log_user_stream(task_path + '.stderr', open(host_stderr_txt, 'rb')) + file_store.log_user_stream( + task_path + ".stderr", open(host_stderr_txt, "rb") + ) if not outputs_library.stdout_used() and os.path.exists(host_stdout_txt): size = os.path.getsize(host_stdout_txt) - logger.info('Unused standard output at %s of %d bytes', host_stdout_txt, size) + logger.info( + "Unused standard output at %s of %d bytes", host_stdout_txt, size + ) if size > 0: # Save the whole output stream because the workflow didn't capture it. - file_store.log_user_stream(task_path + '.stdout', open(host_stdout_txt, 'rb')) + file_store.log_user_stream( + task_path + ".stdout", open(host_stdout_txt, "rb") + ) # Collect output messages from any code Toil injected into the task. self.handle_injection_messages(outputs_library) # Drop any files from the output which don't actually exist - output_bindings = drop_missing_files(output_bindings, standard_library=outputs_library) + output_bindings = drop_missing_files( + output_bindings, standard_library=outputs_library + ) for decl in self._task.outputs: if not decl.type.optional and output_bindings[decl.name].value is None: # todo: make recursive # We have an unacceptable null value. This can happen if a file # is missing but not optional. Don't let it out to annoy the # next task. - raise WDL.Error.EvalError(decl, f"non-optional value {decl.name} = {decl.expr} is missing") + raise WDL.Error.EvalError( + decl, f"non-optional value {decl.name} = {decl.expr} is missing" + ) # Upload any files in the outputs if not uploaded already. Accounts for # how relative paths may still need to be container-relative. @@ -3419,16 +4004,28 @@ def get_path_in_container(file: WDL.Value.File) -> Optional[WDL.Value.File]: return output_bindings + class WDLWorkflowNodeJob(WDLBaseJob): """ Job that evaluates a WDL workflow node. """ - def __init__(self, node: WDL.Tree.WorkflowNode, prev_node_results: Sequence[Promised[WDLBindings]], wdl_options: WDLContext, **kwargs: Any) -> None: + def __init__( + self, + node: WDL.Tree.WorkflowNode, + prev_node_results: Sequence[Promised[WDLBindings]], + wdl_options: WDLContext, + **kwargs: Any, + ) -> None: """ Make a new job to run a workflow node to completion. """ - super().__init__(unitName=node.workflow_node_id, displayName=node.workflow_node_id, wdl_options=wdl_options, **kwargs) + super().__init__( + unitName=node.workflow_node_id, + displayName=node.workflow_node_id, + wdl_options=wdl_options, + **kwargs, + ) self._node = node self._prev_node_results = prev_node_results @@ -3451,7 +4048,7 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: if isinstance(self._node, WDL.Tree.Decl): # This is a variable assignment - logger.info('Setting %s to %s', self._node.name, self._node.expr) + logger.info("Setting %s to %s", self._node.name, self._node.expr) value = evaluate_decl(self._node, incoming_bindings, standard_library) bindings = incoming_bindings.bind(self._node.name, value) return self.postprocess(bindings) @@ -3465,8 +4062,16 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: # This should never be None, but mypy gets unhappy and this is better than an assert inputs_mapping = None else: - inputs_mapping = {e.name: e.type for e in self._node.callee.inputs or []} - input_bindings = evaluate_call_inputs(self._node, self._node.inputs, incoming_bindings, standard_library, inputs_mapping) + inputs_mapping = { + e.name: e.type for e in self._node.callee.inputs or [] + } + input_bindings = evaluate_call_inputs( + self._node, + self._node.inputs, + incoming_bindings, + standard_library, + inputs_mapping, + ) # Bindings may also be added in from the enclosing workflow inputs # TODO: this is letting us also inject them from the workflow body. @@ -3476,19 +4081,33 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: task_path = self._wdl_options.get("task_path") namespace = self._wdl_options.get("namespace") wdl_options = self._wdl_options.copy() - wdl_options["task_path"] = f'{task_path}.{self._node.name}' - wdl_options["namespace"] = f'{namespace}.{self._node.name}' + wdl_options["task_path"] = f"{task_path}.{self._node.name}" + wdl_options["namespace"] = f"{namespace}.{self._node.name}" if isinstance(self._node.callee, WDL.Tree.Workflow): # This is a call of a workflow - subjob: WDLBaseJob = WDLWorkflowJob(self._node.callee, [input_bindings, passed_down_bindings], self._node.callee_id, wdl_options=wdl_options, local=True) + subjob: WDLBaseJob = WDLWorkflowJob( + self._node.callee, + [input_bindings, passed_down_bindings], + self._node.callee_id, + wdl_options=wdl_options, + local=True, + ) self.addChild(subjob) elif isinstance(self._node.callee, WDL.Tree.Task): # This is a call of a task - subjob = WDLTaskWrapperJob(self._node.callee, [input_bindings, passed_down_bindings], self._node.callee_id, wdl_options=wdl_options, local=True) + subjob = WDLTaskWrapperJob( + self._node.callee, + [input_bindings, passed_down_bindings], + self._node.callee_id, + wdl_options=wdl_options, + local=True, + ) self.addChild(subjob) else: - raise WDL.Error.InvalidType(self._node, "Cannot call a " + str(type(self._node.callee))) + raise WDL.Error.InvalidType( + self._node, "Cannot call a " + str(type(self._node.callee)) + ) # We need to agregate outputs namespaced with our node name, and existing bindings subjob.then_namespace(self._node.name) @@ -3496,21 +4115,34 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: self.defer_postprocessing(subjob) return subjob.rv() elif isinstance(self._node, WDL.Tree.Scatter): - subjob = WDLScatterJob(self._node, [incoming_bindings], wdl_options=self._wdl_options, local=True) + subjob = WDLScatterJob( + self._node, + [incoming_bindings], + wdl_options=self._wdl_options, + local=True, + ) self.addChild(subjob) # Scatters don't really make a namespace, just kind of a scope? # TODO: Let stuff leave scope! self.defer_postprocessing(subjob) return subjob.rv() elif isinstance(self._node, WDL.Tree.Conditional): - subjob = WDLConditionalJob(self._node, [incoming_bindings], wdl_options=self._wdl_options, local=True) + subjob = WDLConditionalJob( + self._node, + [incoming_bindings], + wdl_options=self._wdl_options, + local=True, + ) self.addChild(subjob) # Conditionals don't really make a namespace, just kind of a scope? # TODO: Let stuff leave scope! self.defer_postprocessing(subjob) return subjob.rv() else: - raise WDL.Error.InvalidType(self._node, "Unimplemented WorkflowNode: " + str(type(self._node))) + raise WDL.Error.InvalidType( + self._node, "Unimplemented WorkflowNode: " + str(type(self._node)) + ) + class WDLWorkflowNodeListJob(WDLBaseJob): """ @@ -3519,18 +4151,31 @@ class WDLWorkflowNodeListJob(WDLBaseJob): workflows or tasks or sections. """ - def __init__(self, nodes: List[WDL.Tree.WorkflowNode], prev_node_results: Sequence[Promised[WDLBindings]], wdl_options: WDLContext, **kwargs: Any) -> None: + def __init__( + self, + nodes: list[WDL.Tree.WorkflowNode], + prev_node_results: Sequence[Promised[WDLBindings]], + wdl_options: WDLContext, + **kwargs: Any, + ) -> None: """ Make a new job to run a list of workflow nodes to completion. """ - super().__init__(unitName=nodes[0].workflow_node_id + '+', displayName=nodes[0].workflow_node_id + '+', wdl_options=wdl_options, **kwargs) + super().__init__( + unitName=nodes[0].workflow_node_id + "+", + displayName=nodes[0].workflow_node_id + "+", + wdl_options=wdl_options, + **kwargs, + ) self._nodes = nodes self._prev_node_results = prev_node_results for n in self._nodes: if isinstance(n, (WDL.Tree.Call, WDL.Tree.Scatter, WDL.Tree.Conditional)): - raise RuntimeError("Node cannot be evaluated with other nodes: " + str(n)) + raise RuntimeError( + "Node cannot be evaluated with other nodes: " + str(n) + ) @report_wdl_errors("run workflow node list") def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: @@ -3547,11 +4192,13 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: for node in self._nodes: if isinstance(node, WDL.Tree.Decl): # This is a variable assignment - logger.info('Setting %s to %s', node.name, node.expr) + logger.info("Setting %s to %s", node.name, node.expr) value = evaluate_decl(node, current_bindings, standard_library) current_bindings = current_bindings.bind(node.name, value) else: - raise WDL.Error.InvalidType(node, "Unimplemented WorkflowNode: " + str(type(node))) + raise WDL.Error.InvalidType( + node, "Unimplemented WorkflowNode: " + str(type(node)) + ) return self.postprocess(current_bindings) @@ -3562,7 +4209,9 @@ class WDLCombineBindingsJob(WDLBaseJob): environment changes. """ - def __init__(self, prev_node_results: Sequence[Promised[WDLBindings]], **kwargs: Any) -> None: + def __init__( + self, prev_node_results: Sequence[Promised[WDLBindings]], **kwargs: Any + ) -> None: """ Make a new job to combine the results of previous jobs. @@ -3588,6 +4237,7 @@ def run(self, file_store: AbstractFileStore) -> WDLBindings: # Make sure to run the universal postprocessing steps return self.postprocess(combined) + class WDLWorkflowGraph: """ Represents a graph of WDL WorkflowNodes. @@ -3607,14 +4257,20 @@ def __init__(self, nodes: Sequence[WDL.Tree.WorkflowNode]) -> None: # For Gather nodes, the Toil interpreter handles them as part of their # associated section. So make a map from gather ID to the section node # ID. - self._gather_to_section: Dict[str, str] = {} + self._gather_to_section: dict[str, str] = {} for node in nodes: if isinstance(node, WDL.Tree.WorkflowSection): for gather_node in node.gathers.values(): - self._gather_to_section[gather_node.workflow_node_id] = node.workflow_node_id + self._gather_to_section[gather_node.workflow_node_id] = ( + node.workflow_node_id + ) # Store all the nodes by ID, except the gathers which we elide. - self._nodes: Dict[str, WDL.Tree.WorkflowNode] = {node.workflow_node_id: node for node in nodes if not isinstance(node, WDL.Tree.Gather)} + self._nodes: dict[str, WDL.Tree.WorkflowNode] = { + node.workflow_node_id: node + for node in nodes + if not isinstance(node, WDL.Tree.Gather) + } def real_id(self, node_id: str) -> str: """ @@ -3637,7 +4293,7 @@ def get(self, node_id: str) -> WDL.Tree.WorkflowNode: """ return self._nodes[self.real_id(node_id)] - def get_dependencies(self, node_id: str) -> Set[str]: + def get_dependencies(self, node_id: str) -> set[str]: """ Get all the nodes that a node depends on, recursively (into the node if it has a body) but not transitively. @@ -3667,13 +4323,13 @@ def get_dependencies(self, node_id: str) -> Set[str]: return dependencies - def get_transitive_dependencies(self, node_id: str) -> Set[str]: + def get_transitive_dependencies(self, node_id: str) -> set[str]: """ Get all the nodes that a node depends on, transitively. """ - dependencies: Set[str] = set() - visited: Set[str] = set() + dependencies: set[str] = set() + visited: set[str] = set() queue = [node_id] while len(queue) > 0: @@ -3695,18 +4351,18 @@ def get_transitive_dependencies(self, node_id: str) -> Set[str]: return dependencies - def topological_order(self) -> List[str]: + def topological_order(self) -> list[str]: """ Get a topological order of the nodes, based on their dependencies. """ - sorter : TopologicalSorter[str] = TopologicalSorter() + sorter: TopologicalSorter[str] = TopologicalSorter() for node_id in self._nodes.keys(): # Add all the edges sorter.add(node_id, *self.get_dependencies(node_id)) return list(sorter.static_order()) - def leaves(self) -> List[str]: + def leaves(self) -> list[str]: """ Get all the workflow node IDs that have no dependents in the graph. """ @@ -3733,7 +4389,9 @@ def __init__(self, wdl_options: WDLContext, **kwargs: Any) -> None: super().__init__(wdl_options=wdl_options, **kwargs) @staticmethod - def coalesce_nodes(order: List[str], section_graph: WDLWorkflowGraph) -> List[List[str]]: + def coalesce_nodes( + order: list[str], section_graph: WDLWorkflowGraph + ) -> list[list[str]]: """ Given a topological order of WDL workflow node IDs, produce a list of lists of IDs, still in topological order, where each list of IDs can be @@ -3741,16 +4399,20 @@ def coalesce_nodes(order: List[str], section_graph: WDLWorkflowGraph) -> List[Li """ # All the buckets of merged nodes - to_return: List[List[str]] = [] + to_return: list[list[str]] = [] # The nodes we are currently merging, in topological order - current_bucket: List[str] = [] + current_bucket: list[str] = [] # All the non-decl transitive dependencies of nodes in the bucket - current_bucket_dependencies: Set[str] = set() + current_bucket_dependencies: set[str] = set() for next_id in order: # Consider adding each node to the bucket # Get all the dependencies on things that aren't decls. - next_dependencies = {dep for dep in section_graph.get_transitive_dependencies(next_id) if not section_graph.is_decl(dep)} + next_dependencies = { + dep + for dep in section_graph.get_transitive_dependencies(next_id) + if not section_graph.is_decl(dep) + } if len(current_bucket) == 0: # This is the first thing for the bucket current_bucket.append(next_id) @@ -3759,7 +4421,9 @@ def coalesce_nodes(order: List[str], section_graph: WDLWorkflowGraph) -> List[Li # Get a node already in the bucket current_id = current_bucket[0] - if not section_graph.is_decl(current_id) or not section_graph.is_decl(next_id): + if not section_graph.is_decl(current_id) or not section_graph.is_decl( + next_id + ): # We can only combine decls with decls, so we can't go in # the bucket. @@ -3796,9 +4460,14 @@ def coalesce_nodes(order: List[str], section_graph: WDLWorkflowGraph) -> List[Li return to_return - - - def create_subgraph(self, nodes: Sequence[WDL.Tree.WorkflowNode], gather_nodes: Sequence[WDL.Tree.Gather], environment: WDLBindings, local_environment: Optional[WDLBindings] = None, subscript: Optional[int] = None) -> WDLBaseJob: + def create_subgraph( + self, + nodes: Sequence[WDL.Tree.WorkflowNode], + gather_nodes: Sequence[WDL.Tree.Gather], + environment: WDLBindings, + local_environment: WDLBindings | None = None, + subscript: int | None = None, + ) -> WDLBaseJob: """ Make a Toil job to evaluate a subgraph inside a workflow or workflow section. @@ -3822,7 +4491,7 @@ def create_subgraph(self, nodes: Sequence[WDL.Tree.WorkflowNode], gather_nodes: task_path = self._wdl_options["task_path"] if subscript is not None: # We need to include a scatter loop number. - task_path += f'.{subscript}' + task_path += f".{subscript}" if local_environment is not None: # Bring local environment into scope @@ -3836,12 +4505,12 @@ def create_subgraph(self, nodes: Sequence[WDL.Tree.WorkflowNode], gather_nodes: # properly. # When a WDL node depends on another, we need to be able to find the Toil job we need an rv from. - wdl_id_to_toil_job: Dict[str, WDLBaseJob] = {} + wdl_id_to_toil_job: dict[str, WDLBaseJob] = {} # We need the set of Toil jobs not depended on so we can wire them up to the sink. # This maps from Toil job store ID to job. - toil_leaves: Dict[Union[str, TemporaryID], WDLBaseJob] = {} + toil_leaves: dict[str | TemporaryID, WDLBaseJob] = {} - def get_job_set_any(wdl_ids: Set[str]) -> List[WDLBaseJob]: + def get_job_set_any(wdl_ids: set[str]) -> list[WDLBaseJob]: """ Get the distinct Toil jobs executing any of the given WDL nodes. """ @@ -3856,18 +4525,22 @@ def get_job_set_any(wdl_ids: Set[str]) -> List[WDLBaseJob]: return jobs creation_order = section_graph.topological_order() - logger.debug('Creation order: %s', creation_order) + logger.debug("Creation order: %s", creation_order) # Now we want to organize the linear list of nodes into collections of nodes that can be in the same Toil job. creation_jobs = self.coalesce_nodes(creation_order, section_graph) - logger.debug('Creation jobs: %s', creation_jobs) + logger.debug("Creation jobs: %s", creation_jobs) for node_ids in creation_jobs: - logger.debug('Make Toil job for %s', node_ids) + logger.debug("Make Toil job for %s", node_ids) # Collect the return values from previous jobs. Some nodes may have been inputs, without jobs. # Don't inlude stuff in the current batch. - prev_node_ids = {prev_node_id for node_id in node_ids for prev_node_id in section_graph.get_dependencies(node_id) if prev_node_id not in node_ids} - + prev_node_ids = { + prev_node_id + for node_id in node_ids + for prev_node_id in section_graph.get_dependencies(node_id) + if prev_node_id not in node_ids + } # Get the Toil jobs we depend on prev_jobs = get_job_set_any(prev_node_ids) @@ -3877,16 +4550,26 @@ def get_job_set_any(wdl_ids: Set[str]) -> List[WDLBaseJob]: del toil_leaves[prev_job.jobStoreID] # Get their return values to feed into the new job - rvs: List[Union[WDLBindings, Promise]] = [prev_job.rv() for prev_job in prev_jobs] + rvs: list[WDLBindings | Promise] = [prev_job.rv() for prev_job in prev_jobs] # We also need access to section-level bindings like inputs rvs.append(environment) if len(node_ids) == 1: # Make a one-node job - job: WDLBaseJob = WDLWorkflowNodeJob(section_graph.get(node_ids[0]), rvs, wdl_options=self._wdl_options, local=True) + job: WDLBaseJob = WDLWorkflowNodeJob( + section_graph.get(node_ids[0]), + rvs, + wdl_options=self._wdl_options, + local=True, + ) else: # Make a multi-node job - job = WDLWorkflowNodeListJob([section_graph.get(node_id) for node_id in node_ids], rvs, wdl_options=self._wdl_options, local=True) + job = WDLWorkflowNodeListJob( + [section_graph.get(node_id) for node_id in node_ids], + rvs, + wdl_options=self._wdl_options, + local=True, + ) for prev_job in prev_jobs: # Connect up the happens-after relationships to make sure the # return values are available. @@ -3911,12 +4594,16 @@ def get_job_set_any(wdl_ids: Set[str]) -> List[WDLBaseJob]: else: # We need to bring together with a new sink # Make the sink job to collect all their results. - leaf_rvs: List[Union[WDLBindings, Promise]] = [leaf_job.rv() for leaf_job in toil_leaves.values()] + leaf_rvs: list[WDLBindings | Promise] = [ + leaf_job.rv() for leaf_job in toil_leaves.values() + ] # Make sure to also send the section-level bindings leaf_rvs.append(environment) # And to fill in bindings from code not executed in this instantiation # with Null, and filter out stuff that should leave scope. - sink = WDLCombineBindingsJob(leaf_rvs, wdl_options=self._wdl_options, local=True) + sink = WDLCombineBindingsJob( + leaf_rvs, wdl_options=self._wdl_options, local=True + ) # It runs inside us self.addChild(sink) for leaf_job in toil_leaves.values(): @@ -3925,7 +4612,6 @@ def get_job_set_any(wdl_ids: Set[str]) -> List[WDLBaseJob]: logger.debug("Sink job is: %s", sink) - # Apply the final postprocessing for leaving the section. sink.then_underlay(self.make_gather_bindings(gather_nodes, WDL.Value.Null())) if local_environment is not None: @@ -3933,7 +4619,9 @@ def get_job_set_any(wdl_ids: Set[str]) -> List[WDLBaseJob]: return sink - def make_gather_bindings(self, gathers: Sequence[WDL.Tree.Gather], undefined: WDL.Value.Base) -> WDLBindings: + def make_gather_bindings( + self, gathers: Sequence[WDL.Tree.Gather], undefined: WDL.Value.Base + ) -> WDLBindings: """ Given a collection of Gathers, create bindings from every identifier gathered, to the given "undefined" placeholder (which would be Null for @@ -3972,10 +4660,13 @@ def make_gather_bindings(self, gathers: Sequence[WDL.Tree.Gather], undefined: WD new_bindings = new_bindings.bind(call_binding.name, undefined) else: # Either something unrecognized or final_referee lied and gave us a Gather. - raise TypeError(f"Cannot generate bindings for a gather over a {type(bindings_source)}") + raise TypeError( + f"Cannot generate bindings for a gather over a {type(bindings_source)}" + ) return new_bindings + class WDLScatterJob(WDLSectionJob): """ Job that evaluates a scatter in a WDL workflow. Runs the body for each @@ -3983,11 +4674,23 @@ class WDLScatterJob(WDLSectionJob): instance of the body. If an instance of the body doesn't create a binding, it gets a null value in the corresponding array. """ - def __init__(self, scatter: WDL.Tree.Scatter, prev_node_results: Sequence[Promised[WDLBindings]], wdl_options: WDLContext, **kwargs: Any) -> None: + + def __init__( + self, + scatter: WDL.Tree.Scatter, + prev_node_results: Sequence[Promised[WDLBindings]], + wdl_options: WDLContext, + **kwargs: Any, + ) -> None: """ Create a subtree that will run a WDL scatter. The scatter itself and the contents live in the given namespace. """ - super().__init__(**kwargs, unitName=scatter.workflow_node_id, displayName=scatter.workflow_node_id, wdl_options=wdl_options) + super().__init__( + **kwargs, + unitName=scatter.workflow_node_id, + displayName=scatter.workflow_node_id, + wdl_options=wdl_options, + ) # Because we need to return the return value of the workflow, we need # to return a Toil promise for the last/sink job in the workflow's @@ -4018,13 +4721,24 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: # Get what to scatter over try: - scatter_value = evaluate_named_expression(self._scatter, self._scatter.variable, None, self._scatter.expr, bindings, standard_library) + scatter_value = evaluate_named_expression( + self._scatter, + self._scatter.variable, + None, + self._scatter.expr, + bindings, + standard_library, + ) finally: # Report all files are downloaded now that all expressions are evaluated. - self.files_downloaded_hook([(p, p) for p in standard_library.get_local_paths()]) + self.files_downloaded_hook( + [(p, p) for p in standard_library.get_local_paths()] + ) if not isinstance(scatter_value, WDL.Value.Array): - raise RuntimeError("The returned value from a scatter is not an Array type.") + raise RuntimeError( + "The returned value from a scatter is not an Array type." + ) scatter_jobs = [] for subscript, item in enumerate(scatter_value.value): @@ -4036,7 +4750,15 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: # TODO: We need to turn values() into a list because MyPy seems to # think a dict_values isn't a Sequence. This is a waste of time to # appease MyPy but probably better than a cast? - scatter_jobs.append(self.create_subgraph(self._scatter.body, list(self._scatter.gathers.values()), bindings, local_bindings, subscript=subscript)) + scatter_jobs.append( + self.create_subgraph( + self._scatter.body, + list(self._scatter.gathers.values()), + bindings, + local_bindings, + subscript=subscript, + ) + ) if len(scatter_jobs) == 0: # No scattering is needed. We just need to bind all the names. @@ -4047,7 +4769,9 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: # if nothing in the scatter actually runs. This should be some kind of # empty array. empty_array = WDL.Value.Array(WDL.Type.Any(optional=True, null=True), []) - return self.make_gather_bindings(list(self._scatter.gathers.values()), empty_array) + return self.make_gather_bindings( + list(self._scatter.gathers.values()), empty_array + ) # Otherwise we actually have some scatter jobs. @@ -4056,13 +4780,16 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: # of maybe-optional values. Each body execution will define names it # doesn't make as nulls, so we don't have to worry about # totally-missing names. - gather_job = WDLArrayBindingsJob([j.rv() for j in scatter_jobs], bindings, wdl_options=self._wdl_options) + gather_job = WDLArrayBindingsJob( + [j.rv() for j in scatter_jobs], bindings, wdl_options=self._wdl_options + ) self.addChild(gather_job) for j in scatter_jobs: j.addFollowOn(gather_job) self.defer_postprocessing(gather_job) return gather_job.rv() + class WDLArrayBindingsJob(WDLBaseJob): """ Job that takes all new bindings created in an array of input environments, @@ -4073,7 +4800,12 @@ class WDLArrayBindingsJob(WDLBaseJob): Useful for producing the results of a scatter. """ - def __init__(self, input_bindings: Sequence[Promised[WDLBindings]], base_bindings: WDLBindings, **kwargs: Any) -> None: + def __init__( + self, + input_bindings: Sequence[Promised[WDLBindings]], + base_bindings: WDLBindings, + **kwargs: Any, + ) -> None: """ Make a new job to array-ify the given input bindings. @@ -4095,7 +4827,10 @@ def run(self, file_store: AbstractFileStore) -> WDLBindings: super().run(file_store) # Subtract base bindings to get just the new bindings created in each input - new_bindings = [env.subtract(self._base_bindings) for env in unwrap_all(self._input_bindings)] + new_bindings = [ + env.subtract(self._base_bindings) + for env in unwrap_all(self._input_bindings) + ] # Make a set of all the new names. # TODO: They ought to maybe have types? Spec just says "any scalar # outputs of these tasks is now an array", with no hint on what to do @@ -4109,27 +4844,51 @@ def run(self, file_store: AbstractFileStore) -> WDLBindings: # Problem: the WDL type types are not hashable, so we need to do bad N^2 deduplication observed_types = [] for env in new_bindings: - binding_type = env.resolve(name).type if env.has_binding(name) else WDL.Type.Any() + binding_type = ( + env.resolve(name).type if env.has_binding(name) else WDL.Type.Any() + ) if binding_type not in observed_types: observed_types.append(binding_type) # Get the supertype of those types supertype: WDL.Type.Base = get_supertype(observed_types) # Bind an array of the values # TODO: We should be able to assume the binding is always there if this is a scatter, because we create and underlay bindings based on the gathers. - result = result.bind(name, WDL.Value.Array(supertype, [env.resolve(name) if env.has_binding(name) else WDL.Value.Null() for env in new_bindings])) + result = result.bind( + name, + WDL.Value.Array( + supertype, + [ + env.resolve(name) if env.has_binding(name) else WDL.Value.Null() + for env in new_bindings + ], + ), + ) # Base bindings are already included so return the result return self.postprocess(result) + class WDLConditionalJob(WDLSectionJob): """ Job that evaluates a conditional in a WDL workflow. """ - def __init__(self, conditional: WDL.Tree.Conditional, prev_node_results: Sequence[Promised[WDLBindings]], wdl_options: WDLContext, **kwargs: Any) -> None: + + def __init__( + self, + conditional: WDL.Tree.Conditional, + prev_node_results: Sequence[Promised[WDLBindings]], + wdl_options: WDLContext, + **kwargs: Any, + ) -> None: """ Create a subtree that will run a WDL conditional. The conditional itself and its contents live in the given namespace. """ - super().__init__(**kwargs, unitName=conditional.workflow_node_id, displayName=conditional.workflow_node_id, wdl_options=wdl_options) + super().__init__( + **kwargs, + unitName=conditional.workflow_node_id, + displayName=conditional.workflow_node_id, + wdl_options=wdl_options, + ) # Once again we need to ship the whole body template to be instantiated # into Toil jobs only if it will actually run. @@ -4146,7 +4905,11 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: """ super().run(file_store) - logger.info("Checking condition for %s: %s", self._conditional.workflow_node_id, self._conditional.expr) + logger.info( + "Checking condition for %s: %s", + self._conditional.workflow_node_id, + self._conditional.expr, + ) # Combine the bindings we get from previous jobs. # For a task we only see the insode-the-task namespace. @@ -4156,31 +4919,54 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: # Get the expression value. Fake a name. try: - expr_value = evaluate_named_expression(self._conditional, "", WDL.Type.Boolean(), self._conditional.expr, bindings, standard_library) + expr_value = evaluate_named_expression( + self._conditional, + "", + WDL.Type.Boolean(), + self._conditional.expr, + bindings, + standard_library, + ) finally: # Report all files are downloaded now that all expressions are evaluated. - self.files_downloaded_hook([(p, p) for p in standard_library.get_local_paths()]) + self.files_downloaded_hook( + [(p, p) for p in standard_library.get_local_paths()] + ) if expr_value.value: # Evaluated to true! - logger.info('Condition is true') + logger.info("Condition is true") # Run the body and return its effects - body_job = self.create_subgraph(self._conditional.body, list(self._conditional.gathers.values()), bindings) + body_job = self.create_subgraph( + self._conditional.body, + list(self._conditional.gathers.values()), + bindings, + ) self.defer_postprocessing(body_job) return body_job.rv() else: - logger.info('Condition is false') + logger.info("Condition is false") # Return the input bindings and null bindings for all our gathers. # Should not collide at all. - gather_bindings = self.make_gather_bindings(list(self._conditional.gathers.values()), WDL.Value.Null()) + gather_bindings = self.make_gather_bindings( + list(self._conditional.gathers.values()), WDL.Value.Null() + ) return self.postprocess(combine_bindings([bindings, gather_bindings])) + class WDLWorkflowJob(WDLSectionJob): """ Job that evaluates an entire WDL workflow. """ - def __init__(self, workflow: WDL.Tree.Workflow, prev_node_results: Sequence[Promised[WDLBindings]], workflow_id: List[str], wdl_options: WDLContext, **kwargs: Any) -> None: + def __init__( + self, + workflow: WDL.Tree.Workflow, + prev_node_results: Sequence[Promised[WDLBindings]], + workflow_id: list[str], + wdl_options: WDLContext, + **kwargs: Any, + ) -> None: """ Create a subtree that will run a WDL workflow. The job returns the return value of the workflow. @@ -4210,7 +4996,12 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: """ super().run(file_store) - logger.info("Running workflow %s (%s) called as %s", self._workflow.name, self._workflow_id, self._wdl_options["namespace"]) + logger.info( + "Running workflow %s (%s) called as %s", + self._workflow.name, + self._workflow_id, + self._wdl_options["namespace"], + ) # Set up the WDL standard library standard_library = ToilWDLStdLibWorkflow(file_store, self._wdl_options) @@ -4227,10 +5018,17 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: if self._workflow.inputs: try: - bindings = evaluate_decls_to_bindings(self._workflow.inputs, bindings, standard_library, include_previous=True) + bindings = evaluate_decls_to_bindings( + self._workflow.inputs, + bindings, + standard_library, + include_previous=True, + ) finally: # Report all files are downloaded now that all expressions are evaluated. - self.files_downloaded_hook([(p, p) for p in standard_library.get_local_paths()]) + self.files_downloaded_hook( + [(p, p) for p in standard_library.get_local_paths()] + ) bindings = virtualize_files(bindings, standard_library, enforce_existence=False) # Make jobs to run all the parts of the workflow @@ -4250,19 +5048,21 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: self.defer_postprocessing(outputs_job) return outputs_job.rv() + class WDLOutputsJob(WDLBaseJob): """ Job which evaluates an outputs section for a workflow. Returns an environment with just the outputs bound, in no namespace. """ + def __init__( self, workflow: WDL.Tree.Workflow, bindings: Promised[WDLBindings], wdl_options: WDLContext, - cache_key: Optional[str] = None, - **kwargs: Any + cache_key: str | None = None, + **kwargs: Any, ): """ Make a new WDLWorkflowOutputsJob for the given workflow, with the given set of bindings after its body runs. @@ -4292,12 +5092,16 @@ def run(self, file_store: AbstractFileStore) -> WDLBindings: # Output section is declared and is nonempty, so evaluate normally # Combine the bindings from the previous job - output_bindings = evaluate_decls_to_bindings(self._workflow.outputs, unwrap(self._bindings), standard_library) + output_bindings = evaluate_decls_to_bindings( + self._workflow.outputs, unwrap(self._bindings), standard_library + ) else: # If no output section is present, start with an empty bindings output_bindings = WDL.Env.Bindings() - if self._workflow.outputs is None or self._wdl_options.get("all_call_outputs", False): + if self._workflow.outputs is None or self._wdl_options.get( + "all_call_outputs", False + ): # The output section is not declared, or we want to keep task outputs anyway. # Get all task outputs and return that @@ -4316,7 +5120,9 @@ def run(self, file_store: AbstractFileStore) -> WDLBindings: # *this* workflow? for type_binding in node.effective_outputs: output_set.add(type_binding.name) - elif isinstance(node, WDL.Tree.Scatter) or isinstance(node, WDL.Tree.Conditional): + elif isinstance(node, WDL.Tree.Scatter) or isinstance( + node, WDL.Tree.Conditional + ): # For scatters and conditionals, recurse looking for calls. for subnode in node.body: stack.append(subnode) @@ -4324,12 +5130,19 @@ def run(self, file_store: AbstractFileStore) -> WDLBindings: for binding in unwrap(self._bindings): if binding.name in output_set: # The bindings will already be namespaced with the task namespaces - output_bindings = output_bindings.bind(binding.name, binding.value) + output_bindings = output_bindings.bind( + binding.name, binding.value + ) else: # Output section is declared and is nonempty, so evaluate normally # Combine the bindings from the previous job - output_bindings = evaluate_decls_to_bindings(self._workflow.outputs, unwrap(self._bindings), standard_library, drop_missing_files=True) + output_bindings = evaluate_decls_to_bindings( + self._workflow.outputs, + unwrap(self._bindings), + standard_library, + drop_missing_files=True, + ) finally: # We don't actually know when all our files are downloaded since # anything we evaluate might devirtualize inside any expression. @@ -4339,16 +5152,21 @@ def run(self, file_store: AbstractFileStore) -> WDLBindings: # # Make sure to feed in all the paths we devirtualized as if they # were mounted into a container at their actual paths. - self.files_downloaded_hook([(p, p) for p in standard_library.get_local_paths()]) + self.files_downloaded_hook( + [(p, p) for p in standard_library.get_local_paths()] + ) # Null nonexistent optional values and error on the rest - output_bindings = drop_missing_files(output_bindings, standard_library=standard_library) + output_bindings = drop_missing_files( + output_bindings, standard_library=standard_library + ) if self._cache_key is not None: output_bindings = fill_execution_cache(self._cache_key, output_bindings, file_store, self._wdl_options) return self.postprocess(output_bindings) + class WDLStartJob(WDLSectionJob): """ Job that evaluates an entire WDL workflow, and returns the workflow outputs @@ -4356,7 +5174,13 @@ class WDLStartJob(WDLSectionJob): the workflow name; both forms are accepted. """ - def __init__(self, target: Union[WDL.Tree.Workflow, WDL.Tree.Task], inputs: WDLBindings, wdl_options: WDLContext, **kwargs: Any) -> None: + def __init__( + self, + target: WDL.Tree.Workflow | WDL.Tree.Task, + inputs: WDLBindings, + wdl_options: WDLContext, + **kwargs: Any, + ) -> None: """ Create a subtree to run the workflow and namespace the outputs. """ @@ -4375,26 +5199,46 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: if isinstance(self._target, WDL.Tree.Workflow): # Create a workflow job. We rely in this to handle entering the input # namespace if needed, or handling free-floating inputs. - job: WDLBaseJob = WDLWorkflowJob(self._target, [self._inputs], [self._target.name], wdl_options=self._wdl_options, local=True) + job: WDLBaseJob = WDLWorkflowJob( + self._target, + [self._inputs], + [self._target.name], + wdl_options=self._wdl_options, + local=True, + ) else: # There is no workflow. Create a task job. - job = WDLTaskWrapperJob(self._target, [self._inputs], [self._target.name], wdl_options=self._wdl_options, local=True) + job = WDLTaskWrapperJob( + self._target, + [self._inputs], + [self._target.name], + wdl_options=self._wdl_options, + local=True, + ) # Run the task or workflow job.then_namespace(self._wdl_options["namespace"]) self.addChild(job) self.defer_postprocessing(job) return job.rv() + class WDLImportJob(WDLSectionJob): - def __init__(self, target: Union[WDL.Tree.Workflow, WDL.Tree.Task], inputs: WDLBindings, wdl_options: WDLContext, path: Optional[List[str]] = None, skip_remote: bool = False, - disk_size: Optional[ParseableIndivisibleResource] = None, **kwargs: Any): + def __init__( + self, + target: WDL.Tree.Workflow | WDL.Tree.Task, + inputs: WDLBindings, + wdl_options: WDLContext, + path: list[str] | None = None, + skip_remote: bool = False, + disk_size: ParseableIndivisibleResource | None = None, + **kwargs: Any, + ): """ Job to take the inputs from the WDL workflow and import them on a worker instead of a leader. Assumes all local and cloud files are accessible. This class is only used when runImportsOnWorkers is enabled. """ - super().__init__(wdl_options=wdl_options, local=False, - disk=disk_size, **kwargs) + super().__init__(wdl_options=wdl_options, local=False, disk=disk_size, **kwargs) self._target = target self._inputs = inputs self._path = path @@ -4405,20 +5249,49 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: Import the workflow inputs and then create and run the workflow. :return: Promise of workflow outputs """ - imported_inputs = convert_remote_files(self._inputs, file_store.jobStore, self._target.name, self._path, self._skip_remote, self._wdl_options.get("execution_dir")) - root_job = WDLStartJob(self._target, imported_inputs, wdl_options=self._wdl_options) + imported_inputs = convert_remote_files( + self._inputs, + file_store.jobStore, + self._target.name, + self._path, + self._skip_remote, + self._wdl_options.get("execution_dir") + ) + root_job = WDLStartJob( + self._target, imported_inputs, wdl_options=self._wdl_options + ) self.addChild(root_job) return root_job.rv() -def make_root_job(target: Union[WDL.Tree.Workflow, WDL.Tree.Task], inputs: WDLBindings, inputs_search_path: List[str], toil: Toil, wdl_options: WDLContext, options: Namespace) -> WDLSectionJob: +def make_root_job( + target: WDL.Tree.Workflow | WDL.Tree.Task, + inputs: WDLBindings, + inputs_search_path: list[str], + toil: Toil, + wdl_options: WDLContext, + options: Namespace, +) -> WDLSectionJob: if options.run_imports_on_workers: # Run WDL imports on a worker instead - root_job: WDLSectionJob = WDLImportJob(target, inputs, wdl_options=wdl_options, path=inputs_search_path, skip_remote=options.reference_inputs, disk_size=options.import_workers_disk) + root_job: WDLSectionJob = WDLImportJob( + target, + inputs, + wdl_options=wdl_options, + path=inputs_search_path, + skip_remote=options.reference_inputs, + disk_size=options.import_workers_disk, + ) else: # Run WDL imports on leader # Import any files in the bindings - imported_inputs = convert_remote_files(inputs, toil._jobStore, target.name, inputs_search_path, import_remote_files=options.reference_inputs) + imported_inputs = convert_remote_files( + inputs, + toil._jobStore, + target.name, + inputs_search_path, + import_remote_files=options.reference_inputs, + ) # Run the workflow and get its outputs namespaced with the workflow name. root_job = WDLStartJob(target, imported_inputs, wdl_options=wdl_options) return root_job @@ -4431,7 +5304,7 @@ def main() -> None: """ args = sys.argv[1:] - parser = ArgParser(description='Runs WDL files with toil.') + parser = ArgParser(description="Runs WDL files with toil.") addOptions(parser, jobstore_as_flag=True, wdl=True) options = parser.parse_args(args) @@ -4439,16 +5312,22 @@ def main() -> None: # Make sure we have a jobStore if options.jobStore is None: # TODO: Move cwltoil's generate_default_job_store where we can use it - options.jobStore = os.path.join(mkdtemp(), 'tree') + options.jobStore = os.path.join(mkdtemp(), "tree") # Take care of incompatible arguments related to file imports if options.run_imports_on_workers is True and options.import_workers_disk is None: - raise RuntimeError("Commandline arguments --runImportsOnWorkers and --importWorkersDisk must both be set to run file imports on workers.") + raise RuntimeError( + "Commandline arguments --runImportsOnWorkers and --importWorkersDisk must both be set to run file imports on workers." + ) # Make sure we have an output directory (or URL prefix) and we don't need # to ever worry about a None, and MyPy knows it. # If we don't have a directory assigned, make one in the current directory. - output_directory: str = options.output_directory if options.output_directory else mkdtemp(prefix='wdl-out-', dir=os.getcwd()) + output_directory: str = ( + options.output_directory + if options.output_directory + else mkdtemp(prefix="wdl-out-", dir=os.getcwd()) + ) try: with Toil(options) as toil: @@ -4456,16 +5335,20 @@ def main() -> None: output_bindings = toil.restart() else: # Load the WDL document - document: WDL.Tree.Document = WDL.load(options.wdl_uri, read_source=toil_read_source) + document: WDL.Tree.Document = WDL.load( + options.wdl_uri, read_source=toil_read_source + ) # See if we're going to run a workflow or a task - target: Union[WDL.Tree.Workflow, WDL.Tree.Task] + target: WDL.Tree.Workflow | WDL.Tree.Task if document.workflow: target = document.workflow elif len(document.tasks) == 1: target = document.tasks[0] elif len(document.tasks) > 1: - raise WDL.Error.InputError("Multiple tasks found with no workflow! Either add a workflow or keep one task.") + raise WDL.Error.InputError( + "Multiple tasks found with no workflow! Either add a workflow or keep one task." + ) else: raise WDL.Error.InputError("WDL document is empty!") @@ -4475,7 +5358,9 @@ def main() -> None: # . # # TODO: We don't support generating anything that CROO can read. - logger.warning("This WDL expects to be used with the Cromwell Output Organizer (croo) . Toil cannot yet produce the outputs that croo requires. You will not be able to use croo on the output of this Toil run!") + logger.warning( + "This WDL expects to be used with the Cromwell Output Organizer (croo) . Toil cannot yet produce the outputs that croo requires. You will not be able to use croo on the output of this Toil run!" + ) # But we can assume that we need to preserve individual # taks outputs since the point of CROO is fetching those @@ -4485,10 +5370,11 @@ def main() -> None: # runs of the popular # if options.all_call_outputs is None: - logger.warning("Inferring --allCallOutputs=True to preserve probable actual outputs of a croo WDL file.") + logger.warning( + "Inferring --allCallOutputs=True to preserve probable actual outputs of a croo WDL file." + ) options.all_call_outputs = True - if options.inputs_uri: # Load the inputs. Use the same loading mechanism, which means we # have to break into async temporarily. @@ -4497,14 +5383,30 @@ def main() -> None: elif options.inputs_uri == "-": input_json = sys.stdin.read() else: - input_json = asyncio.run(toil_read_source(options.inputs_uri, [], None)).source_text + input_json = asyncio.run( + toil_read_source(options.inputs_uri, [], None) + ).source_text try: inputs = json.loads(input_json) except json.JSONDecodeError as e: # Complain about the JSON document. # We need the absolute path or URL to raise the error - inputs_abspath = options.inputs_uri if not os.path.exists(options.inputs_uri) else os.path.abspath(options.inputs_uri) - raise WDL.Error.ValidationError(WDL.Error.SourcePosition(options.inputs_uri, inputs_abspath, e.lineno, e.colno, e.lineno, e.colno + 1), "Cannot parse input JSON: " + e.msg) from e + inputs_abspath = ( + options.inputs_uri + if not os.path.exists(options.inputs_uri) + else os.path.abspath(options.inputs_uri) + ) + raise WDL.Error.ValidationError( + WDL.Error.SourcePosition( + options.inputs_uri, + inputs_abspath, + e.lineno, + e.colno, + e.lineno, + e.colno + 1, + ), + "Cannot parse input JSON: " + e.msg, + ) from e else: inputs = {} @@ -4514,12 +5416,14 @@ def main() -> None: # have to cast from more specific to less specific ones here. # The miniwld values_from_json function can evaluate # expressions in the inputs or something. - WDLTypeDeclBindings = Union[WDL.Env.Bindings[WDL.Tree.Decl], WDL.Env.Bindings[WDL.Type.Base]] + WDLTypeDeclBindings = Union[ + WDL.Env.Bindings[WDL.Tree.Decl], WDL.Env.Bindings[WDL.Type.Base] + ] input_bindings = WDL.values_from_json( inputs, cast(WDLTypeDeclBindings, target.available_inputs), cast(Optional[WDLTypeDeclBindings], target.required_inputs), - target.name + target.name, ) # Determine where to look for files referenced in the inputs, in addition to here. @@ -4527,12 +5431,17 @@ def main() -> None: if options.inputs_uri: inputs_search_path.append(options.inputs_uri) - match = re.match(r'https://raw\.githubusercontent\.com/[^/]*/[^/]*/[^/]*/', options.inputs_uri) + match = re.match( + r"https://raw\.githubusercontent\.com/[^/]*/[^/]*/[^/]*/", + options.inputs_uri, + ) if match: # Special magic for Github repos to make e.g. # https://raw.githubusercontent.com/vgteam/vg_wdl/44a03d9664db3f6d041a2f4a69bbc4f65c79533f/params/giraffe.json # work when it references things relative to repo root. - logger.info("Inputs appear to come from a Github repository; adding repository root to file search path") + logger.info( + "Inputs appear to come from a Github repository; adding repository root to file search path" + ) inputs_search_path.append(match.group(0)) # TODO: Automatically set a good MINIWDL__SINGULARITY__IMAGE_CACHE ? @@ -4540,22 +5449,40 @@ def main() -> None: # Get the execution directory execution_dir = os.getcwd() - imported_inputs = convert_remote_files(input_bindings, toil._jobStore, task_path=target.name, search_paths=inputs_search_path, import_remote_files=options.reference_inputs) + imported_inputs = convert_remote_files( + input_bindings, + toil._jobStore, + task_path=target.name, + search_paths=inputs_search_path, + import_remote_files=options.reference_inputs, + ) # Configure workflow interpreter options - wdl_options: WDLContext = {"execution_dir": execution_dir, "container": options.container, "task_path": target.name, - "namespace": target.name, "all_call_outputs": options.all_call_outputs} + wdl_options: WDLContext = { + "execution_dir": execution_dir, + "container": options.container, + "task_path": target.name, + "namespace": target.name, + "all_call_outputs": options.all_call_outputs, + } assert wdl_options.get("container") is not None # Run the workflow and get its outputs namespaced with the workflow name. - root_job = make_root_job(target, imported_inputs, inputs_search_path, toil, wdl_options, options) + root_job = make_root_job( + target, + imported_inputs, + inputs_search_path, + toil, + wdl_options, + options, + ) output_bindings = toil.start(root_job) if not isinstance(output_bindings, WDL.Env.Bindings): raise RuntimeError("The output of the WDL job is not a binding.") devirtualization_state: DirectoryNamingStateDict = {} - devirtualized_to_virtualized: Dict[str, str] = dict() - virtualized_to_devirtualized: Dict[str, str] = dict() + devirtualized_to_virtualized: dict[str, str] = dict() + virtualized_to_devirtualized: dict[str, str] = dict() # Fetch all the output files def devirtualize_output(file: WDL.Value.File) -> WDL.Value.File: @@ -4575,17 +5502,19 @@ def devirtualize_output(file: WDL.Value.File) -> WDL.Value.File: wdl_options, devirtualized_to_virtualized, virtualized_to_devirtualized, - export=True + export=True, ) return set_file_value(file, new_value) # Make all the files local files - output_bindings = map_over_files_in_bindings(output_bindings, devirtualize_output) + output_bindings = map_over_files_in_bindings( + output_bindings, devirtualize_output + ) # Report the result in the right format outputs = WDL.values_to_json(output_bindings) - if options.output_dialect == 'miniwdl': - outputs = {'dir': output_directory, 'outputs': outputs} + if options.output_dialect == "miniwdl": + outputs = {"dir": output_directory, "outputs": outputs} if options.output_file is None: # Send outputs to standard out print(json.dumps(outputs)) @@ -4593,10 +5522,10 @@ def devirtualize_output(file: WDL.Value.File) -> WDL.Value.File: # Export output to path or URL. # So we need to import and then export. fd, filename = mkstemp() - with open(fd, 'w') as handle: + with open(fd, "w") as handle: # Populate the file handle.write(json.dumps(outputs)) - handle.write('\n') + handle.write("\n") # Import it. Don't link because the temp file will go away. file_id = toil.import_file(filename, symlink=False) # Delete the temp file @@ -4610,8 +5539,3 @@ def devirtualize_output(file: WDL.Value.File) -> WDL.Value.File: if __name__ == "__main__": main() - - - - - diff --git a/src/toil/worker.py b/src/toil/worker.py index c880ad23e0..614aae9a35 100644 --- a/src/toil/worker.py +++ b/src/toil/worker.py @@ -25,23 +25,31 @@ import sys import time import traceback +from collections.abc import Iterator from contextlib import contextmanager -from typing import Any, Callable, Iterator, List, Set, Optional +from typing import Any, Callable, Optional from configargparse import ArgParser from toil import logProcessContext from toil.common import Config, Toil, safeUnpickleFromStream -from toil.cwl.utils import (CWL_UNSUPPORTED_REQUIREMENT_EXCEPTION, - CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE) +from toil.cwl.utils import ( + CWL_UNSUPPORTED_REQUIREMENT_EXCEPTION, + CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE, +) from toil.deferred import DeferredFunctionManager from toil.fileStores.abstractFileStore import AbstractFileStore -from toil.job import CheckpointJobDescription, Job, JobDescription, DebugStoppingPointReached +from toil.job import ( + CheckpointJobDescription, + DebugStoppingPointReached, + Job, + JobDescription, +) from toil.jobStores.abstractJobStore import AbstractJobStore from toil.lib.expando import MagicExpando from toil.lib.io import make_public_dir from toil.lib.resources import ResourceMonitor -from toil.statsAndLogging import configure_root_logger, set_log_level, install_log_color +from toil.statsAndLogging import configure_root_logger, install_log_color, set_log_level logger = logging.getLogger(__name__) @@ -49,10 +57,12 @@ class StatsDict(MagicExpando): """Subclass of MagicExpando for type-checking purposes.""" - jobs: List[MagicExpando] + jobs: list[MagicExpando] -def nextChainable(predecessor: JobDescription, job_store: AbstractJobStore, config: Config) -> Optional[JobDescription]: +def nextChainable( + predecessor: JobDescription, job_store: AbstractJobStore, config: Config +) -> Optional[JobDescription]: """ Returns the next chainable job's JobDescription after the given predecessor JobDescription, if one exists, or None if the chain must terminate. @@ -61,24 +71,41 @@ def nextChainable(predecessor: JobDescription, job_store: AbstractJobStore, conf :param job_store: The JobStore to fetch JobDescriptions from. :param config: The configuration for the current run. """ - #If no more jobs to run or services not finished, quit - if predecessor.nextSuccessors() is None or len(predecessor.services) > 0 or (isinstance(predecessor, CheckpointJobDescription) and predecessor.checkpoint is not None): - logger.debug("Stopping running chain of jobs: no successors: %s, services: %s, checkpoint: %s", - predecessor.nextSuccessors() is None, len(predecessor.services), (isinstance(predecessor, CheckpointJobDescription) and predecessor.checkpoint is not None)) + # If no more jobs to run or services not finished, quit + if ( + predecessor.nextSuccessors() is None + or len(predecessor.services) > 0 + or ( + isinstance(predecessor, CheckpointJobDescription) + and predecessor.checkpoint is not None + ) + ): + logger.debug( + "Stopping running chain of jobs: no successors: %s, services: %s, checkpoint: %s", + predecessor.nextSuccessors() is None, + len(predecessor.services), + ( + isinstance(predecessor, CheckpointJobDescription) + and predecessor.checkpoint is not None + ), + ) return None - - #Get the next set of jobs to run + # Get the next set of jobs to run jobs = list(predecessor.nextSuccessors() or set()) if len(jobs) == 0: # If there are no jobs, we might just not have any children. - logger.debug("Stopping running chain of jobs because job has no ready children or follow-ons") + logger.debug( + "Stopping running chain of jobs because job has no ready children or follow-ons" + ) return None - #If there are 2 or more jobs to run in parallel we quit + # If there are 2 or more jobs to run in parallel we quit if len(jobs) >= 2: - logger.debug("No more jobs can run in series by this worker," - " it's got %i successors", len(jobs)) + logger.debug( + "No more jobs can run in series by this worker," " it's got %i successors", + len(jobs), + ) logger.debug("Two distinct successors are %s and %s", jobs[0], jobs[1]) return None @@ -90,8 +117,8 @@ def nextChainable(predecessor: JobDescription, job_store: AbstractJobStore, conf # Load the successor JobDescription successor = job_store.load_job(successorID) - #We check the requirements of the successor to see if we can run it - #within the current worker + # We check the requirements of the successor to see if we can run it + # within the current worker if successor.memory > predecessor.memory: logger.debug("We need more memory for the next job, so finishing") return None @@ -102,14 +129,20 @@ def nextChainable(predecessor: JobDescription, job_store: AbstractJobStore, conf logger.debug("We need more disk for the next job, so finishing") return None if successor.preemptible != predecessor.preemptible: - logger.debug("Preemptibility is different for the next job, returning to the leader") + logger.debug( + "Preemptibility is different for the next job, returning to the leader" + ) return None if successor.predecessorNumber > 1: - logger.debug("The next job has multiple predecessors; we must return to the leader.") + logger.debug( + "The next job has multiple predecessors; we must return to the leader." + ) return None if len(successor.services) > 0: - logger.debug("The next job requires services that will not yet be started; we must return to the leader.") + logger.debug( + "The next job requires services that will not yet be started; we must return to the leader." + ) return None if isinstance(successor, CheckpointJobDescription): @@ -117,7 +150,11 @@ def nextChainable(predecessor: JobDescription, job_store: AbstractJobStore, conf logger.debug("Next job is checkpoint, so finishing") return None - if not config.run_local_jobs_on_workers and predecessor.local and not successor.local: + if ( + not config.run_local_jobs_on_workers + and predecessor.local + and not successor.local + ): # This job might be running on the leader, but the next job may not. # # TODO: Optimize by detecting whether we actually are on the leader, @@ -128,6 +165,7 @@ def nextChainable(predecessor: JobDescription, job_store: AbstractJobStore, conf # Made it through! This job is chainable. return successor + def workerScript( job_store: AbstractJobStore, config: Config, @@ -135,7 +173,7 @@ def workerScript( job_store_id: str, redirect_output_to_log_file: bool = True, local_worker_temp_dir: Optional[str] = None, - debug_flags: Optional[Set[str]] = None + debug_flags: Optional[set[str]] = None, ) -> int: """ Worker process script, runs a job. @@ -162,7 +200,7 @@ def workerScript( logger.debug("Worker started for job %s...", job_name) ########################################## - #Create the worker killer, if requested + # Create the worker killer, if requested ########################################## logFileByteReportLimit = config.maxLogFileSize @@ -203,10 +241,10 @@ def workerScript( # before it does. Either way, init will have to clean it up for us. ########################################## - #Load the environment for the job + # Load the environment for the job ########################################## - #First load the environment for the job. + # First load the environment for the job. with job_store.read_shared_file_stream("environment.pickle") as fileHandle: environment = safeUnpickleFromStream(fileHandle) env_reject = { @@ -224,15 +262,15 @@ def workerScript( "XDG_SESSION_ID", "XDG_RUNTIME_DIR", "XDG_DATA_DIRS", - "DBUS_SESSION_BUS_ADDRESS" + "DBUS_SESSION_BUS_ADDRESS", } for i in environment: if i == "PATH": # Handle path specially. Sometimes e.g. leader may not include # /bin, but the Toil appliance needs it. - if i in os.environ and os.environ[i] != '': + if i in os.environ and os.environ[i] != "": # Use the provided PATH and then the local system's PATH - os.environ[i] = environment[i] + ':' + os.environ[i] + os.environ[i] = environment[i] + ":" + os.environ[i] else: # Use the provided PATH only os.environ[i] = environment[i] @@ -240,41 +278,45 @@ def workerScript( os.environ[i] = environment[i] # sys.path is used by __import__ to find modules if "PYTHONPATH" in environment: - for e in environment["PYTHONPATH"].split(':'): - if e != '': + for e in environment["PYTHONPATH"].split(":"): + if e != "": sys.path.append(e) ########################################## - #Setup the temporary directories. + # Setup the temporary directories. ########################################## # Dir to put all this worker's temp files in. if config.workflowID is None: raise RuntimeError("The worker workflow ID was never set.") toilWorkflowDir = Toil.getLocalWorkflowDir(config.workflowID, config.workDir) # Dir to put lock files in, ideally not on NFS. - toil_coordination_dir = Toil.get_local_workflow_coordination_dir(config.workflowID, config.workDir, config.coordination_dir) + toil_coordination_dir = Toil.get_local_workflow_coordination_dir( + config.workflowID, config.workDir, config.coordination_dir + ) if local_worker_temp_dir is None: # Invent a temp directory to work in local_worker_temp_dir = make_public_dir(toilWorkflowDir) os.chmod(local_worker_temp_dir, 0o755) ########################################## - #Setup the logging + # Setup the logging ########################################## - #This is mildly tricky because we don't just want to - #redirect stdout and stderr for this Python process; we want to redirect it - #for this process and all children. Consequently, we can't just replace - #sys.stdout and sys.stderr; we need to mess with the underlying OS-level - #file descriptors. See + # This is mildly tricky because we don't just want to + # redirect stdout and stderr for this Python process; we want to redirect it + # for this process and all children. Consequently, we can't just replace + # sys.stdout and sys.stderr; we need to mess with the underlying OS-level + # file descriptors. See - #When we start, standard input is file descriptor 0, standard output is - #file descriptor 1, and standard error is file descriptor 2. + # When we start, standard input is file descriptor 0, standard output is + # file descriptor 1, and standard error is file descriptor 2. # Do we even want to redirect output? Let the config make us not do it. - redirect_output_to_log_file = redirect_output_to_log_file and not config.disableWorkerOutputCapture + redirect_output_to_log_file = ( + redirect_output_to_log_file and not config.disableWorkerOutputCapture + ) - #What file do we want to point FDs 1 and 2 to? + # What file do we want to point FDs 1 and 2 to? tempWorkerLogPath = os.path.join(local_worker_temp_dir, "worker_log.txt") if redirect_output_to_log_file: @@ -322,6 +364,7 @@ def workerScript( def blockFn() -> bool: return True + job = None try: @@ -365,7 +408,10 @@ def blockFn() -> bool: # If a checkpoint exists, restart from the checkpoint ########################################## - if isinstance(jobDesc, CheckpointJobDescription) and jobDesc.checkpoint is not None: + if ( + isinstance(jobDesc, CheckpointJobDescription) + and jobDesc.checkpoint is not None + ): # The job is a checkpoint, and is being restarted after previously completing logger.debug("Job is a checkpoint") # If the checkpoint still has extant successors or services, its @@ -381,12 +427,23 @@ def blockFn() -> bool: # Otherwise, the job and successors are done, and we can cleanup stuff we couldn't clean # because of the job being a checkpoint else: - logger.debug("The checkpoint jobs seems to have completed okay, removing any checkpoint files to delete.") - #Delete any remnant files - list(map(job_store.delete_file, list(filter(job_store.file_exists, jobDesc.checkpointFilesToDelete)))) + logger.debug( + "The checkpoint jobs seems to have completed okay, removing any checkpoint files to delete." + ) + # Delete any remnant files + list( + map( + job_store.delete_file, + list( + filter( + job_store.file_exists, jobDesc.checkpointFilesToDelete + ) + ), + ) + ) ########################################## - #Setup the stats, if requested + # Setup the stats, if requested ########################################## if config.stats: @@ -397,7 +454,7 @@ def blockFn() -> bool: startTime = time.time() while True: ########################################## - #Run the job body, if there is one + # Run the job body, if there is one ########################################## logger.info("Working on job %s", jobDesc) @@ -417,11 +474,17 @@ def blockFn() -> bool: job.set_debug_flag(flag) # Create a fileStore object for the job - fileStore = AbstractFileStore.createFileStore(job_store, jobDesc, local_worker_temp_dir, blockFn, - caching=config.caching) + fileStore = AbstractFileStore.createFileStore( + job_store, + jobDesc, + local_worker_temp_dir, + blockFn, + caching=config.caching, + ) try: - with job._executor(stats=statsDict if config.stats else None, - fileStore=fileStore): + with job._executor( + stats=statsDict if config.stats else None, fileStore=fileStore + ): with deferredFunctionManager.open() as defer: with fileStore.open(job): # Get the next block function to wait on committing this job @@ -436,7 +499,12 @@ def blockFn() -> bool: # wants across multiple Toil versions. We also # still pass a jobGraph argument to placate old # versions of Cactus. - job._runner(jobGraph=None, jobStore=job_store, fileStore=fileStore, defer=defer) + job._runner( + jobGraph=None, + jobStore=job_store, + fileStore=fileStore, + defer=defer, + ) # When the executor for the job finishes it will # kick off a commit with the link to the job body @@ -445,7 +513,9 @@ def blockFn() -> bool: # Accumulate messages from this job & any subsequent chained jobs. # Keep the messages even if the job fails. statsDict.workers.logs_to_leader += fileStore.logging_messages - statsDict.workers.logging_user_streams += fileStore.logging_user_streams + statsDict.workers.logging_user_streams += ( + fileStore.logging_user_streams + ) logger.info("Completed body for %s", jobDesc) @@ -460,7 +530,7 @@ def blockFn() -> bool: raise RuntimeError("The termination flag is set") ########################################## - #Establish if we can run another job within the worker + # Establish if we can run another job within the worker ########################################## successor = nextChainable(jobDesc, job_store, config) if successor is None or config.disableChaining: @@ -483,9 +553,13 @@ def blockFn() -> bool: # Make sure nothing has gone wrong and we can really chain if jobDesc.memory < successor.memory: - raise RuntimeError("Cannot chain jobs. A job's memory cannot be less than it's successor.") + raise RuntimeError( + "Cannot chain jobs. A job's memory cannot be less than it's successor." + ) if jobDesc.cores < successor.cores: - raise RuntimeError("Cannot chain jobs. A job's cores cannot be less than it's successor.") + raise RuntimeError( + "Cannot chain jobs. A job's cores cannot be less than it's successor." + ) # Save the successor's original ID, so we can clean it (and its # body) up after we finish executing it. @@ -501,8 +575,13 @@ def blockFn() -> bool: # Build a fileStore to update the job and commit the replacement. # TODO: can we have a commit operation without an entire FileStore??? - fileStore = AbstractFileStore.createFileStore(job_store, jobDesc, local_worker_temp_dir, blockFn, - caching=config.caching) + fileStore = AbstractFileStore.createFileStore( + job_store, + jobDesc, + local_worker_temp_dir, + blockFn, + caching=config.caching, + ) # Update blockFn to wait for that commit operation. blockFn = fileStore.waitForCommit @@ -513,10 +592,12 @@ def blockFn() -> bool: logger.debug("Starting the next job") ########################################## - #Finish up the stats + # Finish up the stats ########################################## if config.stats: - totalCPUTime, totalMemoryUsage = ResourceMonitor.get_total_cpu_time_and_memory_usage() + totalCPUTime, totalMemoryUsage = ( + ResourceMonitor.get_total_cpu_time_and_memory_usage() + ) statsDict.workers.time = str(time.time() - startTime) statsDict.workers.clock = str(totalCPUTime - startClock) statsDict.workers.memory = str(totalMemoryUsage) @@ -528,25 +609,37 @@ def blockFn() -> bool: statsDict.workers.disk = str(max_bytes) # Count the jobs executed. # TODO: toil stats could compute this but its parser is too general to hook into simply. - statsDict.workers.jobs_run = len(statsDict.jobs) - + statsDict.workers.jobs_run = len(statsDict.jobs) # log the worker log path here so that if the file is truncated the path can still be found if redirect_output_to_log_file: - logger.info("Worker log can be found at %s. Set --cleanWorkDir to retain this log", local_worker_temp_dir) - - logger.info("Finished running the chain of jobs on this node, we ran for a total of %f seconds", time.time() - startTime) + logger.info( + "Worker log can be found at %s. Set --cleanWorkDir to retain this log", + local_worker_temp_dir, + ) + + logger.info( + "Finished running the chain of jobs on this node, we ran for a total of %f seconds", + time.time() - startTime, + ) ########################################## - #Trapping where worker goes wrong + # Trapping where worker goes wrong ########################################## except DebugStoppingPointReached: # Job wants the worker to stop for debugging raise - except BaseException as e: #Case that something goes wrong in worker, or we are asked to stop + except ( + BaseException + ) as e: # Case that something goes wrong in worker, or we are asked to stop if not isinstance(e, SystemExit): - logger.critical("Worker crashed with traceback:\n%s", traceback.format_exc()) - logger.error("Exiting the worker because of a failed job on host %s", socket.gethostname()) + logger.critical( + "Worker crashed with traceback:\n%s", traceback.format_exc() + ) + logger.error( + "Exiting the worker because of a failed job on host %s", + socket.gethostname(), + ) if isinstance(e, CWL_UNSUPPORTED_REQUIREMENT_EXCEPTION): # We need to inform the leader that this is a CWL workflow problem # and it needs to inform its caller. @@ -557,6 +650,7 @@ def blockFn() -> bool: else: try: from WDL.runtime.error import CommandFailed + if isinstance(e, CommandFailed): failure_exit_code = e.exit_status except ImportError: @@ -577,16 +671,15 @@ def blockFn() -> bool: logger.debug("cwltool.main._terminate_processess exception: %s", (e)) raise e - ########################################## - #Wait for the asynchronous chain of writes/updates to finish + # Wait for the asynchronous chain of writes/updates to finish ########################################## blockFn() ########################################## - #All the asynchronous worker/update threads must be finished now, - #so safe to test if they completed okay + # All the asynchronous worker/update threads must be finished now, + # so safe to test if they completed okay ########################################## if AbstractFileStore._terminateEvent.is_set(): @@ -599,7 +692,7 @@ def blockFn() -> bool: jobAttemptFailed = True ########################################## - #Cleanup + # Cleanup ########################################## # Close the worker logging @@ -638,32 +731,48 @@ def blockFn() -> bool: jobDesc.jobStoreID, cleanup=True ) with job_store.update_file_stream(logJobStoreFileID) as w: - with open(tempWorkerLogPath, 'rb') as f: - if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit !=0: + with open(tempWorkerLogPath, "rb") as f: + if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit != 0: if logFileByteReportLimit > 0: - f.seek(-logFileByteReportLimit, 2) # seek to last tooBig bytes of file + f.seek( + -logFileByteReportLimit, 2 + ) # seek to last tooBig bytes of file elif logFileByteReportLimit < 0: - f.seek(logFileByteReportLimit, 0) # seek to first tooBig bytes of file + f.seek( + logFileByteReportLimit, 0 + ) # seek to first tooBig bytes of file # Dump the possibly-invalid-Unicode bytes into the log file - w.write(f.read()) # TODO load file using a buffer + w.write(f.read()) # TODO load file using a buffer # Commit log file reference back to JobStore job_store.update_job(jobDesc) - elif ((debugging or (config.writeLogsFromAllJobs and not jobDesc.local)) - and redirect_output_to_log_file): # write log messages - with open(tempWorkerLogPath, 'rb') as logFile: + elif ( + debugging or (config.writeLogsFromAllJobs and not jobDesc.local) + ) and redirect_output_to_log_file: # write log messages + with open(tempWorkerLogPath, "rb") as logFile: if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit != 0: if logFileByteReportLimit > 0: - logFile.seek(-logFileByteReportLimit, 2) # seek to last tooBig bytes of file + logFile.seek( + -logFileByteReportLimit, 2 + ) # seek to last tooBig bytes of file elif logFileByteReportLimit < 0: - logFile.seek(logFileByteReportLimit, 0) # seek to first tooBig bytes of file + logFile.seek( + logFileByteReportLimit, 0 + ) # seek to first tooBig bytes of file # Make sure lines are Unicode so they can be JSON serialized as part of the dict. # We may have damaged the Unicode text by cutting it at an arbitrary byte so we drop bad characters. - logMessages = [line.decode('utf-8', 'skip') for line in logFile.read().splitlines()] + logMessages = [ + line.decode("utf-8", "skip") for line in logFile.read().splitlines() + ] statsDict.logs.names = [names.stats_name for names in jobDesc.get_chain()] statsDict.logs.messages = logMessages - if debugging or config.stats or statsDict.workers.logs_to_leader or statsDict.workers.logging_user_streams: + if ( + debugging + or config.stats + or statsDict.workers.logs_to_leader + or statsDict.workers.logging_user_streams + ): # We have stats/logging to report back. # We report even if the job attempt failed. # TODO: Will that upset analysis of the stats? @@ -671,7 +780,12 @@ def blockFn() -> bool: # Remove the temp dir cleanUp = config.cleanWorkDir - if cleanUp == 'always' or (cleanUp == 'onSuccess' and not jobAttemptFailed) or (cleanUp == 'onError' and jobAttemptFailed): + if ( + cleanUp == "always" + or (cleanUp == "onSuccess" and not jobAttemptFailed) + or (cleanUp == "onError" and jobAttemptFailed) + ): + def make_parent_writable(func: Callable[[str], Any], path: str, _: Any) -> None: """ When encountering an error removing a file or directory, make sure @@ -682,9 +796,17 @@ def make_parent_writable(func: Callable[[str], Any], path: str, _: Any) -> None: """ # Just chmod it for rwx for user. This can't work anyway if it isn't ours. try: - os.chmod(os.path.dirname(path), stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR) + os.chmod( + os.path.dirname(path), stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR + ) except PermissionError as e: - logger.error('Could not set permissions on %s to allow cleanup of %s: %s', os.path.dirname(path), path, e) + logger.error( + "Could not set permissions on %s to allow cleanup of %s: %s", + os.path.dirname(path), + path, + e, + ) + shutil.rmtree(local_worker_temp_dir, onerror=make_parent_writable) # This must happen after the log file is done with, else there is no place to put the log @@ -693,13 +815,13 @@ def make_parent_writable(func: Callable[[str], Any], path: str, _: Any) -> None: # We can now safely get rid of the JobDescription, and all jobs it chained up job_store.delete_job(merged_in.job_store_id) - if jobAttemptFailed: return failure_exit_code else: return 0 -def parse_args(args: List[str]) -> Any: + +def parse_args(args: list[str]) -> Any: """ Parse command-line arguments to the worker. """ @@ -713,26 +835,33 @@ def parse_args(args: List[str]) -> Any: # Now add all the options to it # Base required job information - parser.add_argument("jobName", type=str, - help="Text name of the job being run") - parser.add_argument("jobStoreLocator", type=str, - help="Information required to connect to the job store") - parser.add_argument("jobStoreID", type=str, - help="ID of the job within the job store") + parser.add_argument("jobName", type=str, help="Text name of the job being run") + parser.add_argument( + "jobStoreLocator", + type=str, + help="Information required to connect to the job store", + ) + parser.add_argument( + "jobStoreID", type=str, help="ID of the job within the job store" + ) # Additional worker abilities - parser.add_argument("--context", default=[], action="append", + parser.add_argument( + "--context", + default=[], + action="append", help="""Pickled, base64-encoded context manager(s) to run job inside of. Allows the Toil leader to pass setup and cleanup work provided by the batch system, in the form of pickled Python context manager objects, that the worker can then run before/after the job on the batch - system's behalf.""") + system's behalf.""", + ) return parser.parse_args(args) @contextmanager -def in_contexts(contexts: List[str]) -> Iterator[None]: +def in_contexts(contexts: list[str]) -> Iterator[None]: """ Unpickle and enter all the pickled, base64-encoded context managers in the given list. Then do the body, then leave them all. @@ -746,10 +875,12 @@ def in_contexts(contexts: List[str]) -> Iterator[None]: rest = contexts[1:] try: - manager = pickle.loads(base64.b64decode(first.encode('utf-8'))) + manager = pickle.loads(base64.b64decode(first.encode("utf-8"))) except: exc_info = sys.exc_info() - logger.error('Exception while unpickling context manager: ', exc_info=exc_info) + logger.error( + "Exception while unpickling context manager: ", exc_info=exc_info + ) raise with manager: @@ -759,14 +890,14 @@ def in_contexts(contexts: List[str]) -> Iterator[None]: yield -def main(argv: Optional[List[str]] = None) -> None: +def main(argv: Optional[list[str]] = None) -> None: if argv is None: argv = sys.argv # Parse our command line options = parse_args(argv) ########################################## - #Load the jobStore/config file + # Load the jobStore/config file ########################################## job_store = Toil.resumeJobStore(options.jobStoreLocator) diff --git a/version_template.py b/version_template.py index 59855d7496..24a3caea58 100644 --- a/version_template.py +++ b/version_template.py @@ -28,8 +28,8 @@ # - don't import even standard modules at global scope without renaming them # to have leading/trailing underscores -baseVersion = '7.1.0a1' -cgcloudVersion = '1.6.0a1.dev393' +baseVersion = "7.1.0a1" +cgcloudVersion = "1.6.0a1.dev393" def version(): @@ -37,7 +37,10 @@ def version(): A version identifier that includes the full-length commit SHA1 and an optional suffix to indicate that the working copy is dirty. """ - return '-'.join(filter(None, [distVersion(), currentCommit(), ('dirty' if dirty() else None)])) + return "-".join( + filter(None, [distVersion(), currentCommit(), ("dirty" if dirty() else None)]) + ) + def cacheTag(): """ @@ -45,22 +48,27 @@ def cacheTag(): """ import os - return ''.join([ - "cache-", - # Pick up branch or tag from Gitlagb CI, or just use "local" for everyone. - ((os.getenv('CI_COMMIT_BRANCH', '') + os.getenv('CI_COMMIT_TAG', '')) or 'local').replace('/', '-'), - _pythonVersionSuffix() - ]) + + return "".join( + [ + "cache-", + # Pick up branch or tag from Gitlagb CI, or just use "local" for everyone. + ( + (os.getenv("CI_COMMIT_BRANCH", "") + os.getenv("CI_COMMIT_TAG", "")) + or "local" + ).replace("/", "-"), + _pythonVersionSuffix(), + ] + ) + def mainCacheTag(): """ A Docker tag where the Toil mainline builds cache their layers. """ - return ''.join([ - "cache-master", - _pythonVersionSuffix() - ]) + return "".join(["cache-master", _pythonVersionSuffix()]) + def distVersion(): """The distribution version identifying a published release on PyPI.""" @@ -73,7 +81,8 @@ def exactPython(): for. Something like 'python2.7' or 'python3.6'. """ import sys - return f'python{sys.version_info[0]}.{sys.version_info[1]}' + + return f"python{sys.version_info[0]}.{sys.version_info[1]}" def python(): @@ -99,7 +108,7 @@ def _pythonVersionSuffix(): # For now, we assume all Python 3 releases are intercompatible. # We also only tag the Python 2 releases specially, since Python 2 is old and busted. - return f'-py{sys.version_info[0]}.{sys.version_info[1]}' + return f"-py{sys.version_info[0]}.{sys.version_info[1]}" def dockerTag(): @@ -110,37 +119,49 @@ def dockerTag(): def currentCommit(): import os from subprocess import check_output + try: git_root_dir = os.path.dirname(os.path.abspath(__file__)) - output = check_output(f'git log --pretty=oneline -n 1 -- {git_root_dir}', - shell=True, - cwd=git_root_dir).decode('utf-8').split()[0] + output = ( + check_output( + f"git log --pretty=oneline -n 1 -- {git_root_dir}", + shell=True, + cwd=git_root_dir, + ) + .decode("utf-8") + .split()[0] + ) except: # Return this if we are not in a git environment. - return '000' + return "000" if isinstance(output, bytes): - return output.decode('utf-8') + return output.decode("utf-8") return str(output) def dockerRegistry(): import os - return os.getenv('TOIL_DOCKER_REGISTRY', 'quay.io/ucsc_cgl') + + return os.getenv("TOIL_DOCKER_REGISTRY", "quay.io/ucsc_cgl") def dockerName(): import os - return os.getenv('TOIL_DOCKER_NAME', 'toil') + + return os.getenv("TOIL_DOCKER_NAME", "toil") def dirty(): import os from subprocess import call + try: git_root_dir = os.path.dirname(os.path.abspath(__file__)) - return 0 != call('(git diff --exit-code && git diff --cached --exit-code) > /dev/null', - shell=True, - cwd=git_root_dir) + return 0 != call( + "(git diff --exit-code && git diff --cached --exit-code) > /dev/null", + shell=True, + cwd=git_root_dir, + ) except: return False # In case the git call fails. @@ -154,8 +175,11 @@ def expand_(name=None, others=None): :param dict others: A dictionary of additional variables to be included in the return value. """ - variables = {k: v for k, v in globals().items() - if not k.startswith('_') and not k.endswith('_')} + variables = { + k: v + for k, v in globals().items() + if not k.startswith("_") and not k.endswith("_") + } if others is not None: variables.update(others) @@ -167,15 +191,16 @@ def resolve(k): return v if name is None: - return ''.join(f"{k} = {repr(resolve(k))}\n" for k, v in variables.items()) + return "".join(f"{k} = {repr(resolve(k))}\n" for k, v in variables.items()) else: return resolve(name) def _main(): import sys + sys.stdout.write(expand_(*sys.argv[1:])) -if __name__ == '__main__': +if __name__ == "__main__": _main()