Skip to content

(torchx/components) expose raw resource params for dist.ddp #395

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions torchx/components/dist.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,10 @@ def ddp(
script: str,
image: str = torchx.IMAGE,
name: Optional[str] = None,
h: str = "aws_t3.medium",
cpu: int = 2,
gpu: int = 0,
memMB: int = 1024,
h: Optional[str] = None,
j: str = "1x2",
rdzv_endpoint: str = "etcd-server.default.svc.cluster.local:2379",
) -> specs.AppDef:
Expand All @@ -143,12 +146,19 @@ def ddp(
Uses `torch.distributed.run <https://pytorch.org/docs/stable/distributed.elastic.html>`_
to launch and coordinate pytorch worker processes.

Note: (cpu, gpu, memMB) parameters are mutually exclusive with ``h`` (named resource) where
``h`` takes precedence if specified for setting resource requirements.
See `registering named resources <https://pytorch.org/torchx/latest/advanced.html#registering-named-resources>`_.

Args:
script_args: arguments to the main module
script: script or binary to run within the image
image: image (e.g. docker)
name: job name override (uses the script name if not specified)
h: a registered named resource
cpu: number of cpus per replica
gpu: number of gpus per replica
memMB: cpu memory in MB per replica
h: a registered named resource (if specified takes precedence over cpu, gpu, memMB)
j: {nnodes}x{nproc_per_node}, for gpu hosts, nproc_per_node must not exceed num gpus
rdzv_endpoint: etcd server endpoint (only matters when nnodes > 1)
"""
Expand All @@ -172,7 +182,7 @@ def ddp(
image=image,
entrypoint="python",
num_replicas=nnodes,
resource=specs.named_resources[h],
resource=specs.resource(cpu=cpu, gpu=gpu, memMB=memMB, h=h),
args=[
"-m",
"torch.distributed.run",
Expand Down
16 changes: 13 additions & 3 deletions torchx/components/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,21 +101,31 @@ def python(
c: Optional[str] = None,
image: str = torchx.IMAGE,
name: str = "torchx_utils_python",
host: str = "aws_t3.medium",
cpu: int = 2,
gpu: int = 0,
memMB: int = 1024,
h: Optional[str] = None,
num_replicas: int = 1,
) -> specs.AppDef:
"""
Runs ``python -c CMD`` or ``python -m MODULE`` on the specified
image and host. Use ``--`` to separate component args and program args
(e.g. ``torchx run utils.python --m foo.main -- --args to --main``)

Note: (cpu, gpu, memMB) parameters are mutually exclusive with ``h`` (named resource) where
``h`` takes precedence if specified for setting resource requirements.
See `registering named resources <https://pytorch.org/torchx/latest/advanced.html#registering-named-resources>`_.

Args:
args: arguments passed to the program in sys.argv[1:] (ignored with `--c`)
m: run library module as a script
c: program passed as string (may error if scheduler has a length limit on args)
image: image to run on
name: name of the job
host: a registered named resource
cpu: number of cpus per replica
gpu: number of gpus per replica
memMB: cpu memory in MB per replica
h: a registered named resource (if specified takes precedence over cpu, gpu, memMB)
num_replicas: number of copies to run (each on its own container)
:return:
"""
Expand All @@ -134,7 +144,7 @@ def python(
image=image,
entrypoint="python",
num_replicas=num_replicas,
resource=specs.named_resources[host],
resource=specs.resource(cpu=cpu, gpu=gpu, memMB=memMB, h=h),
# pyre-ignore[6]: one of (only one of) m or c HAS to be not null
args=[
"-m" if m else "-c",
Expand Down
47 changes: 46 additions & 1 deletion torchx/specs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
scheduler or pipeline adapter.
"""

from typing import Dict
from typing import Dict, Optional

import torchx.specs.named_resources_aws as aws_resources
from torchx.util.entrypoints import load_group
Expand Down Expand Up @@ -72,6 +72,51 @@ def _load_named_resources() -> Dict[str, Resource]:
named_resources: Dict[str, Resource] = _load_named_resources()


def resource(
    cpu: Optional[int] = None,
    gpu: Optional[int] = None,
    memMB: Optional[int] = None,
    h: Optional[str] = None,
) -> Resource:
    """
    Convenience method to create a ``Resource`` object from either the
    raw resource specs (cpu, gpu, memMB) or a registered named resource (``h``).
    Note that the (cpu, gpu, memMB) parameters are mutually exclusive with ``h``,
    with ``h`` taking precedence if specified.

    If ``h`` is specified then it is used to look up the resource specs from
    the list of registered named resources.
    See `registering named resource <https://pytorch.org/torchx/latest/advanced.html#registering-named-resources>`_.

    Otherwise a ``Resource`` object is created from the raw resource specs,
    with any unspecified field falling back to a built-in default.

    Example:

    .. code-block:: python

        resource(cpu=1)  # returns Resource(cpu=1, gpu=0, memMB=1024)
        resource(h="foobar")  # returns the registered named resource "foobar"
        resource(cpu=1, h="foobar")  # returns named resource "foobar" (cpu=1 ignored)
        resource()  # returns Resource with all default values
    """

    if h:
        return get_named_resources(h)

    # could make these defaults customizable via entrypoint
    # not doing that now since its not a requested feature and may just over complicate things
    # keeping these defaults method local so that no one else takes a dep on it
    DEFAULT_CPU = 2
    DEFAULT_GPU = 0
    DEFAULT_MEM_MB = 1024

    # use explicit `is None` checks (not `or`) so that an explicitly
    # passed 0 is honored rather than silently replaced by the default
    return Resource(
        cpu=DEFAULT_CPU if cpu is None else cpu,
        gpu=DEFAULT_GPU if gpu is None else gpu,
        memMB=DEFAULT_MEM_MB if memMB is None else memMB,
    )


def get_named_resources(res: str) -> Resource:
"""
Get resource object based on the string definition registered via entrypoints.txt.
Expand Down
14 changes: 13 additions & 1 deletion torchx/specs/test/api_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

import torchx.specs.named_resources_aws as named_resources_aws
from pyre_extensions import none_throws
from torchx.specs import named_resources
from torchx.specs import named_resources, resource
from torchx.specs.api import (
_TERMINAL_STATES,
MISSING,
Expand Down Expand Up @@ -124,6 +124,18 @@ def test_named_resources(self) -> None:
named_resources_aws.aws_p3_8xlarge(), named_resources["aws_p3.8xlarge"]
)

def test_resource_util_fn(self) -> None:
    # raw specs: any unspecified field falls back to its default
    raw_spec_cases = [
        ({}, Resource(cpu=2, gpu=0, memMB=1024)),
        ({"cpu": 1}, Resource(cpu=1, gpu=0, memMB=1024)),
        ({"cpu": 2, "gpu": 1}, Resource(cpu=2, gpu=1, memMB=1024)),
        ({"cpu": 2, "gpu": 1, "memMB": 2048}, Resource(cpu=2, gpu=1, memMB=2048)),
    ]
    for kwargs, expected in raw_spec_cases:
        self.assertEqual(expected, resource(**kwargs))

    # named resource takes precedence over raw specs when both are given
    h = "aws_t3.medium"
    self.assertEqual(named_resources[h], resource(h=h))
    self.assertEqual(named_resources[h], resource(cpu=16, gpu=4, h="aws_t3.medium"))


class RoleBuilderTest(unittest.TestCase):
def test_defaults(self) -> None:
Expand Down