diff --git a/torchx/components/dist.py b/torchx/components/dist.py
index 69b9b41c7..e369ec5b7 100644
--- a/torchx/components/dist.py
+++ b/torchx/components/dist.py
@@ -134,7 +134,10 @@ def ddp(
     script: str,
     image: str = torchx.IMAGE,
     name: Optional[str] = None,
-    h: str = "aws_t3.medium",
+    cpu: int = 2,
+    gpu: int = 0,
+    memMB: int = 1024,
+    h: Optional[str] = None,
     j: str = "1x2",
     rdzv_endpoint: str = "etcd-server.default.svc.cluster.local:2379",
 ) -> specs.AppDef:
@@ -143,12 +146,19 @@ def ddp(
     Uses `torch.distributed.run `_ to launch and coordinate
     pytorch worker processes.
 
+    Note: (cpu, gpu, memMB) parameters are mutually exclusive with ``h`` (named resource) where
+    ``h`` takes precedence if specified for setting resource requirements.
+    See `registering named resources `_.
+
     Args:
         script_args: arguments to the main module
         script: script or binary to run within the image
         image: image (e.g. docker)
         name: job name override (uses the script name if not specified)
-        h: a registered named resource
+        cpu: number of cpus per replica
+        gpu: number of gpus per replica
+        memMB: cpu memory in MB per replica
+        h: a registered named resource (if specified takes precedence over cpu, gpu, memMB)
         j: {nnodes}x{nproc_per_node}, for gpu hosts, nproc_per_node must not exceed num gpus
         rdzv_endpoint: etcd server endpoint (only matters when nnodes > 1)
     """
@@ -172,7 +182,7 @@ def ddp(
         image=image,
         entrypoint="python",
         num_replicas=nnodes,
-        resource=specs.named_resources[h],
+        resource=specs.resource(cpu=cpu, gpu=gpu, memMB=memMB, h=h),
         args=[
             "-m",
             "torch.distributed.run",
diff --git a/torchx/components/utils.py b/torchx/components/utils.py
index 7fbaed67a..a8267981b 100644
--- a/torchx/components/utils.py
+++ b/torchx/components/utils.py
@@ -101,7 +101,10 @@ def python(
     c: Optional[str] = None,
     image: str = torchx.IMAGE,
     name: str = "torchx_utils_python",
-    host: str = "aws_t3.medium",
+    cpu: int = 2,
+    gpu: int = 0,
+    memMB: int = 1024,
+    h: Optional[str] = None,
     num_replicas: int = 1,
 ) -> specs.AppDef:
     """
@@ -108,14 +111,21 @@ def python(
     Runs ``python`` with the specified module, method or script on the specified
     image and host. Use ``--`` to separate component args and program args
     (e.g. ``torchx run utils.python --m foo.main -- --args to --main``)
 
+    Note: (cpu, gpu, memMB) parameters are mutually exclusive with ``h`` (named resource) where
+    ``h`` takes precedence if specified for setting resource requirements.
+    See `registering named resources `_.
+
     Args:
         args: arguments passed to the program in sys.argv[1:] (ignored with `--c`)
         m: run library module as a script
         c: program passed as string (may error if scheduler has a length limit on args)
         image: image to run on
         name: name of the job
-        host: a registered named resource
+        cpu: number of cpus per replica
+        gpu: number of gpus per replica
+        memMB: cpu memory in MB per replica
+        h: a registered named resource (if specified takes precedence over cpu, gpu, memMB)
         num_replicas: number of copies to run (each on its own container)
         :return:
     """
@@ -134,7 +144,7 @@ def python(
         image=image,
         entrypoint="python",
         num_replicas=num_replicas,
-        resource=specs.named_resources[host],
+        resource=specs.resource(cpu=cpu, gpu=gpu, memMB=memMB, h=h),
         # pyre-ignore[6]: one of (only one of) m or c HAS to be not null
         args=[
             "-m" if m else "-c",
diff --git a/torchx/specs/__init__.py b/torchx/specs/__init__.py
index 4b462163f..b26614057 100644
--- a/torchx/specs/__init__.py
+++ b/torchx/specs/__init__.py
@@ -11,7 +11,7 @@
 scheduler or pipeline adapter.
""" -from typing import Dict +from typing import Dict, Optional import torchx.specs.named_resources_aws as aws_resources from torchx.util.entrypoints import load_group @@ -72,6 +72,51 @@ def _load_named_resources() -> Dict[str, Resource]: named_resources: Dict[str, Resource] = _load_named_resources() +def resource( + cpu: Optional[int] = None, + gpu: Optional[int] = None, + memMB: Optional[int] = None, + h: Optional[str] = None, +) -> Resource: + """ + Convenience method to create a ``Resource`` object from either the + raw resource specs (cpu, gpu, memMB) or the registered named resource (``h``). + Note that the (cpu, gpu, memMB) is mutually exclusive with ``h`` + with ``h`` taking predecence if specified. + + If ``h`` is specified then it is used to look up the + resource specs from the list of registered named resources. + See `registering named resource `_. + + Otherwise a ``Resource`` object is created from the raw resource specs. + + Example: + + .. code-block:: python + resource(cpu=1) # returns Resource(cpu=1) + resource(named_resource="foobar") # returns registered named resource "foo" + resource(cpu=1, named_resource="foobar") # returns registered named resource "foo" (cpu=1 ignored) + resource() # returns default resource values + resource(cpu=None, gpu=None, memMB=None) # throws + """ + + if h: + return get_named_resources(h) + else: + # could make these defaults customizable via entrypoint + # not doing that now since its not a requested feature and may just over complicate things + # keeping these defaults method local so that no one else takes a dep on it + DEFAULT_CPU = 2 + DEFAULT_GPU = 0 + DEFAULT_MEM_MB = 1024 + + return Resource( + cpu=cpu or DEFAULT_CPU, + gpu=gpu or DEFAULT_GPU, + memMB=memMB or DEFAULT_MEM_MB, + ) + + def get_named_resources(res: str) -> Resource: """ Get resource object based on the string definition registered via entrypoints.txt. diff --git a/torchx/specs/test/api_test.py b/torchx/specs/test/api_test.py index e10acb547..af9409171 100644 --- a/torchx/specs/test/api_test.py +++ b/torchx/specs/test/api_test.py @@ -14,7 +14,7 @@ import torchx.specs.named_resources_aws as named_resources_aws from pyre_extensions import none_throws -from torchx.specs import named_resources +from torchx.specs import named_resources, resource from torchx.specs.api import ( _TERMINAL_STATES, MISSING, @@ -124,6 +124,18 @@ def test_named_resources(self) -> None: named_resources_aws.aws_p3_8xlarge(), named_resources["aws_p3.8xlarge"] ) + def test_resource_util_fn(self) -> None: + self.assertEqual(Resource(cpu=2, gpu=0, memMB=1024), resource()) + self.assertEqual(Resource(cpu=1, gpu=0, memMB=1024), resource(cpu=1)) + self.assertEqual(Resource(cpu=2, gpu=1, memMB=1024), resource(cpu=2, gpu=1)) + self.assertEqual( + Resource(cpu=2, gpu=1, memMB=2048), resource(cpu=2, gpu=1, memMB=2048) + ) + + h = "aws_t3.medium" + self.assertEqual(named_resources[h], resource(h=h)) + self.assertEqual(named_resources[h], resource(cpu=16, gpu=4, h="aws_t3.medium")) + class RoleBuilderTest(unittest.TestCase): def test_defaults(self) -> None: