Skip to content

(torchx/components) expose raw resource params for dist.ddp #395

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions torchx/components/dist.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,10 @@ def ddp(
script: str,
image: str = torchx.IMAGE,
name: Optional[str] = None,
h: str = "aws_t3.medium",
cpu: int = 2,
gpu: int = 0,
memMB: int = 1024,
h: Optional[str] = None,
j: str = "1x2",
rdzv_endpoint: str = "etcd-server.default.svc.cluster.local:2379",
) -> specs.AppDef:
Expand All @@ -143,12 +146,19 @@ def ddp(
Uses `torch.distributed.run <https://pytorch.org/docs/stable/distributed.elastic.html>`_
to launch and coordinate pytorch worker processes.

Note: (cpu, gpu, memMB) parameters are mutually exclusive with ``h`` (named resource) where
``h`` takes precedence if specified for setting resource requirements.
See `registering named resources <https://pytorch.org/torchx/latest/advanced.html#registering-named-resources>`_.

Args:
script_args: arguments to the main module
script: script or binary to run within the image
image: image (e.g. docker)
name: job name override (uses the script name if not specified)
h: a registered named resource
cpu: number of cpus per replica
gpu: number of gpus per replica
memMB: cpu memory in MB per replica
h: a registered named resource (if specified takes precedence over cpu, gpu, memMB)
j: {nnodes}x{nproc_per_node}, for gpu hosts, nproc_per_node must not exceed num gpus
rdzv_endpoint: etcd server endpoint (only matters when nnodes > 1)
"""
Expand All @@ -172,7 +182,7 @@ def ddp(
image=image,
entrypoint="python",
num_replicas=nnodes,
resource=specs.named_resources[h],
resource=specs.resource(cpu=cpu, gpu=gpu, memMB=memMB, h=h),
args=[
"-m",
"torch.distributed.run",
Expand Down
16 changes: 13 additions & 3 deletions torchx/components/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,21 +101,31 @@ def python(
c: Optional[str] = None,
image: str = torchx.IMAGE,
name: str = "torchx_utils_python",
host: str = "aws_t3.medium",
cpu: int = 2,
gpu: int = 0,
memMB: int = 1024,
h: Optional[str] = None,
num_replicas: int = 1,
) -> specs.AppDef:
"""
Runs ``python -c CMD`` or ``python -m MODULE`` on the specified
image and host. Use ``--`` to separate component args and program args
(e.g. ``torchx run utils.python --m foo.main -- --args to --main``)

Note: (cpu, gpu, memMB) parameters are mutually exclusive with ``h`` (named resource) where
``h`` takes precedence if specified for setting resource requirements.
See `registering named resources <https://pytorch.org/torchx/latest/advanced.html#registering-named-resources>`_.

Args:
args: arguments passed to the program in sys.argv[1:] (ignored with `--c`)
m: run library module as a script
c: program passed as string (may error if scheduler has a length limit on args)
image: image to run on
name: name of the job
host: a registered named resource
cpu: number of cpus per replica
gpu: number of gpus per replica
memMB: cpu memory in MB per replica
h: a registered named resource (if specified takes precedence over cpu, gpu, memMB)
num_replicas: number of copies to run (each on its own container)
:return:
"""
Expand All @@ -134,7 +144,7 @@ def python(
image=image,
entrypoint="python",
num_replicas=num_replicas,
resource=specs.named_resources[host],
resource=specs.resource(cpu=cpu, gpu=gpu, memMB=memMB, h=h),
# pyre-ignore[6]: one of (only one of) m or c HAS to be not null
args=[
"-m" if m else "-c",
Expand Down
47 changes: 46 additions & 1 deletion torchx/specs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
scheduler or pipeline adapter.
"""

from typing import Dict
from typing import Dict, Optional

import torchx.specs.named_resources_aws as aws_resources
from torchx.util.entrypoints import load_group
Expand Down Expand Up @@ -72,6 +72,51 @@ def _load_named_resources() -> Dict[str, Resource]:
named_resources: Dict[str, Resource] = _load_named_resources()


def resource(
    cpu: Optional[int] = None,
    gpu: Optional[int] = None,
    memMB: Optional[int] = None,
    h: Optional[str] = None,
) -> Resource:
    """
    Convenience method to create a ``Resource`` object from either the
    raw resource specs (cpu, gpu, memMB) or a registered named resource (``h``).
    Note that the (cpu, gpu, memMB) parameters are mutually exclusive with ``h``,
    with ``h`` taking precedence if specified.

    If ``h`` is specified then it is used to look up the resource specs from
    the list of registered named resources.
    See `registering named resource <https://pytorch.org/torchx/latest/advanced.html#registering-named-resources>`_.

    Otherwise a ``Resource`` object is created from the raw resource specs,
    with any unspecified field falling back to a built-in default.

    Example:

    .. code-block:: python

        resource(cpu=1)  # returns Resource(cpu=1, gpu=0, memMB=1024)
        resource(h="foobar")  # returns the registered named resource "foobar"
        resource(cpu=1, h="foobar")  # returns named resource "foobar" (cpu=1 ignored)
        resource()  # returns Resource with all default values
    """

    if h:
        return get_named_resources(h)

    # could make these defaults customizable via entrypoint
    # not doing that now since its not a requested feature and may just over complicate things
    # keeping these defaults method local so that no one else takes a dep on it
    DEFAULT_CPU = 2
    DEFAULT_GPU = 0
    DEFAULT_MEM_MB = 1024

    # use explicit `is None` checks (not `or`) so that an explicitly
    # passed 0 is honored rather than silently replaced by the default
    return Resource(
        cpu=DEFAULT_CPU if cpu is None else cpu,
        gpu=DEFAULT_GPU if gpu is None else gpu,
        memMB=DEFAULT_MEM_MB if memMB is None else memMB,
    )


def get_named_resources(res: str) -> Resource:
"""
Get resource object based on the string definition registered via entrypoints.txt.
Expand Down
14 changes: 13 additions & 1 deletion torchx/specs/test/api_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

import torchx.specs.named_resources_aws as named_resources_aws
from pyre_extensions import none_throws
from torchx.specs import named_resources
from torchx.specs import named_resources, resource
from torchx.specs.api import (
_TERMINAL_STATES,
MISSING,
Expand Down Expand Up @@ -124,6 +124,18 @@ def test_named_resources(self) -> None:
named_resources_aws.aws_p3_8xlarge(), named_resources["aws_p3.8xlarge"]
)

def test_resource_util_fn(self) -> None:
    # raw specs: any unspecified field falls back to its default
    raw_spec_cases = [
        ({}, Resource(cpu=2, gpu=0, memMB=1024)),
        ({"cpu": 1}, Resource(cpu=1, gpu=0, memMB=1024)),
        ({"cpu": 2, "gpu": 1}, Resource(cpu=2, gpu=1, memMB=1024)),
        ({"cpu": 2, "gpu": 1, "memMB": 2048}, Resource(cpu=2, gpu=1, memMB=2048)),
    ]
    for kwargs, expected in raw_spec_cases:
        self.assertEqual(expected, resource(**kwargs))

    # named resource takes precedence over raw specs when both are given
    h = "aws_t3.medium"
    self.assertEqual(named_resources[h], resource(h=h))
    self.assertEqual(named_resources[h], resource(cpu=16, gpu=4, h="aws_t3.medium"))


class RoleBuilderTest(unittest.TestCase):
def test_defaults(self) -> None:
Expand Down