From 2c5368790b4b72a81d8e3a3070e04e3779675695 Mon Sep 17 00:00:00 2001
From: Kiuk Chung
Date: Wed, 16 Feb 2022 12:58:29 -0800
Subject: [PATCH] (torchx/components) expose raw resource params for dist.ddp

Summary:
Exposes raw resources (cpu, gpu, memMB) in addition to "host" (named resource) for
dist.ddp and utils.python to make it convenient for users who don't want to register
named resources in entrypoint.

Reviewed By: aivanou

Differential Revision: D34260962

fbshipit-source-id: 3d9db25b60b0a3aef5a0ecf5962f20d68d776afe
---
 torchx/components/dist.py     | 16 +++++++++---
 torchx/components/utils.py    | 16 +++++++++---
 torchx/specs/__init__.py      | 47 ++++++++++++++++++++++++++++++++++-
 torchx/specs/test/api_test.py | 14 ++++++++++-
 4 files changed, 85 insertions(+), 8 deletions(-)

diff --git a/torchx/components/dist.py b/torchx/components/dist.py
index 69b9b41c7..e369ec5b7 100644
--- a/torchx/components/dist.py
+++ b/torchx/components/dist.py
@@ -134,7 +134,10 @@ def ddp(
     script: str,
     image: str = torchx.IMAGE,
     name: Optional[str] = None,
-    h: str = "aws_t3.medium",
+    cpu: int = 2,
+    gpu: int = 0,
+    memMB: int = 1024,
+    h: Optional[str] = None,
     j: str = "1x2",
     rdzv_endpoint: str = "etcd-server.default.svc.cluster.local:2379",
 ) -> specs.AppDef:
     """
     Distributed data parallel style application (one role, multi-replica).
     Uses `torch.distributed.run `_ to launch and coordinate pytorch worker processes.
 
+    Note: (cpu, gpu, memMB) parameters are mutually exclusive with ``h`` (named resource) where
+          ``h`` takes precedence if specified for setting resource requirements.
+          See `registering named resources `_.
+
     Args:
         script_args: arguments to the main module
         script: script or binary to run within the image
         image: image (e.g. docker)
         name: job name override (uses the script name if not specified)
-        h: a registered named resource
+        cpu: number of cpus per replica
+        gpu: number of gpus per replica
+        memMB: cpu memory in MB per replica
+        h: a registered named resource (if specified takes precedence over cpu, gpu, memMB)
         j: {nnodes}x{nproc_per_node}, for gpu hosts, nproc_per_node must not exceed num gpus
         rdzv_endpoint: etcd server endpoint (only matters when nnodes > 1)
     """
@@ -172,7 +182,7 @@ def ddp(
                 image=image,
                 entrypoint="python",
                 num_replicas=nnodes,
-                resource=specs.named_resources[h],
+                resource=specs.resource(cpu=cpu, gpu=gpu, memMB=memMB, h=h),
                 args=[
                     "-m",
                     "torch.distributed.run",
diff --git a/torchx/components/utils.py b/torchx/components/utils.py
index 7fbaed67a..a8267981b 100644
--- a/torchx/components/utils.py
+++ b/torchx/components/utils.py
@@ -101,7 +101,10 @@ def python(
     c: Optional[str] = None,
     image: str = torchx.IMAGE,
     name: str = "torchx_utils_python",
-    host: str = "aws_t3.medium",
+    cpu: int = 2,
+    gpu: int = 0,
+    memMB: int = 1024,
+    h: Optional[str] = None,
     num_replicas: int = 1,
 ) -> specs.AppDef:
     """
     image and host. Use ``--`` to separate component args and program args
     (e.g. ``torchx run utils.python --m foo.main -- --args to --main``)
 
+    Note: (cpu, gpu, memMB) parameters are mutually exclusive with ``h`` (named resource) where
+          ``h`` takes precedence if specified for setting resource requirements.
+          See `registering named resources `_.
+
     Args:
         args: arguments passed to the program in sys.argv[1:] (ignored with `--c`)
         m: run library module as a script
         c: program passed as string (may error if scheduler has a length limit on args)
         image: image to run on
         name: name of the job
-        host: a registered named resource
+        cpu: number of cpus per replica
+        gpu: number of gpus per replica
+        memMB: cpu memory in MB per replica
+        h: a registered named resource (if specified takes precedence over cpu, gpu, memMB)
         num_replicas: number of copies to run (each on its own container)
     :return:
     """
@@ -134,7 +144,7 @@ def python(
                 image=image,
                 entrypoint="python",
                 num_replicas=num_replicas,
-                resource=specs.named_resources[host],
+                resource=specs.resource(cpu=cpu, gpu=gpu, memMB=memMB, h=h),
                 # pyre-ignore[6]: one of (only one of) m or c HAS to be not null
                 args=[
                     "-m" if m else "-c",
diff --git a/torchx/specs/__init__.py b/torchx/specs/__init__.py
index 4b462163f..b26614057 100644
--- a/torchx/specs/__init__.py
+++ b/torchx/specs/__init__.py
@@ -11,7 +11,7 @@
 scheduler or pipeline adapter.
 """
 
-from typing import Dict
+from typing import Dict, Optional
 
 import torchx.specs.named_resources_aws as aws_resources
 from torchx.util.entrypoints import load_group
@@ -72,6 +72,51 @@ def _load_named_resources() -> Dict[str, Resource]:
 named_resources: Dict[str, Resource] = _load_named_resources()
 
 
+def resource(
+    cpu: Optional[int] = None,
+    gpu: Optional[int] = None,
+    memMB: Optional[int] = None,
+    h: Optional[str] = None,
+) -> Resource:
+    """
+    Convenience method to create a ``Resource`` object from either the
+    raw resource specs (cpu, gpu, memMB) or the registered named resource (``h``).
+    Note that (cpu, gpu, memMB) are mutually exclusive with ``h``,
+    with ``h`` taking precedence if specified.
+
+    If ``h`` is specified then it is used to look up the
+    resource specs from the list of registered named resources.
+    See `registering named resource `_.
+
+    Otherwise a ``Resource`` object is created from the raw resource specs.
+
+    Example:
+
+    .. code-block:: python
+
+         resource(cpu=1)  # returns Resource(cpu=1)
+         resource(h="foobar")  # returns the registered named resource "foobar"
+         resource(cpu=1, h="foobar")  # returns the registered named resource "foobar" (cpu=1 ignored)
+         resource()  # returns default resource values
+         resource(cpu=None, gpu=None, memMB=None)  # same as resource() -- returns default resource values
+    """
+
+    if h:
+        return get_named_resources(h)
+    else:
+        # could make these defaults customizable via entrypoint
+        # not doing that now since it's not a requested feature and may just overcomplicate things
+        # keeping these defaults method-local so that no one else takes a dep on them
+        DEFAULT_CPU = 2
+        DEFAULT_GPU = 0
+        DEFAULT_MEM_MB = 1024
+
+        return Resource(
+            cpu=cpu or DEFAULT_CPU,
+            gpu=gpu or DEFAULT_GPU,
+            memMB=memMB or DEFAULT_MEM_MB,
+        )
+
+
 def get_named_resources(res: str) -> Resource:
     """
     Get resource object based on the string definition registered via entrypoints.txt.
diff --git a/torchx/specs/test/api_test.py b/torchx/specs/test/api_test.py
index e10acb547..af9409171 100644
--- a/torchx/specs/test/api_test.py
+++ b/torchx/specs/test/api_test.py
@@ -14,7 +14,7 @@
 import torchx.specs.named_resources_aws as named_resources_aws
 from pyre_extensions import none_throws
 
-from torchx.specs import named_resources
+from torchx.specs import named_resources, resource
 from torchx.specs.api import (
     _TERMINAL_STATES,
     MISSING,
@@ -124,6 +124,18 @@ def test_named_resources(self) -> None:
             named_resources_aws.aws_p3_8xlarge(), named_resources["aws_p3.8xlarge"]
         )
 
+    def test_resource_util_fn(self) -> None:
+        self.assertEqual(Resource(cpu=2, gpu=0, memMB=1024), resource())
+        self.assertEqual(Resource(cpu=1, gpu=0, memMB=1024), resource(cpu=1))
+        self.assertEqual(Resource(cpu=2, gpu=1, memMB=1024), resource(cpu=2, gpu=1))
+        self.assertEqual(
+            Resource(cpu=2, gpu=1, memMB=2048), resource(cpu=2, gpu=1, memMB=2048)
+        )
+
+        h = "aws_t3.medium"
+        self.assertEqual(named_resources[h], resource(h=h))
+        self.assertEqual(named_resources[h], resource(cpu=16, gpu=4, h="aws_t3.medium"))
+
 
 class RoleBuilderTest(unittest.TestCase):
     def test_defaults(self) -> None:
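
Usage sketch (not part of the patch): a minimal example of how the raw resource parameters
and the specs.resource() helper introduced above might be called. The component and helper
names (dist.ddp, specs.resource, specs.named_resources) come from the diff; the script name
"train.py" and the resource values below are made-up for illustration.

    from torchx import specs
    from torchx.components import dist

    # raw resource specs -- no named resource registration required
    app = dist.ddp(
        script="train.py",   # hypothetical training script inside the default image
        cpu=4,
        gpu=1,
        memMB=8192,
        j="2x1",             # {nnodes}x{nproc_per_node}: 2 nodes, 1 process per node
    )

    # a registered named resource still wins when both are given,
    # mirroring test_resource_util_fn above
    res = specs.resource(cpu=16, gpu=4, h="aws_t3.medium")
    assert res == specs.named_resources["aws_t3.medium"]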