Skip to content

Commit

Permalink
Merge pull request #1068 from Libensemble/feature/gen_procs_gpus
Browse files Browse the repository at this point in the history
Add gen options for num_procs and num_gpus
  • Loading branch information
shuds13 authored Sep 1, 2023
2 parents 1de569c + 51d102c commit aa2447c
Show file tree
Hide file tree
Showing 11 changed files with 334 additions and 47 deletions.
1 change: 1 addition & 0 deletions .spell
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ apoints
numer
hist
inout
slac
10 changes: 10 additions & 0 deletions docs/data_structures/libE_specs.rst
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,16 @@ the ``LibeSpecs`` class. When provided as a Python class, options are validated
By default resources will be divided by workers (excluding
``zero_resource_workers``).

"gen_num_procs" [int] = ``0``:
The default number of processors (MPI ranks) required by generators. Unless
overridden by the equivalent `persis_info` settings, generators will be allocated
this many processors for applications launched via the MPIExecutor.

"gen_num_gpus" [int] = ``0``:
The default number of GPUs required by generators. Unless overridden by
the equivalent `persis_info` settings, generators will be allocated this
many GPUs.

"enforce_worker_core_bounds" [bool] = ``False``:
Permit submission of tasks with a
higher processor count than the CPUs available to the worker.
Expand Down
15 changes: 11 additions & 4 deletions docs/resource_manager/overview.rst
Original file line number Diff line number Diff line change
Expand Up @@ -193,10 +193,17 @@ if ``split2fit`` is *False*, as this could otherwise never be scheduled.
Varying generator resources
^^^^^^^^^^^^^^^^^^^^^^^^^^^

For all supporting allocation functions, setting the ``persis_info["gen_resources"]``
to an integer value will provide resource sets to generators when they are started,
with the default to provide no resources. This could be set in the calling script
or inside the allocation function.
By default, generators are not allocated resources in dynamic mode. Fixed resources
for the generator can be set using the *libE_specs* options
``gen_num_procs`` and ``gen_num_gpus``, which take integer values.
If only ``gen_num_gpus`` is set, then the number of processors will match.

To vary generator resources, ``persis_info`` settings can be used in allocation
functions before calling the ``gen_work`` support function. This takes the
same options (``gen_num_procs`` and ``gen_num_gpus``).

Alternatively, the setting ``persis_info["gen_resources"]`` can also be set to
a number of resource sets.

Note that persistent workers maintain their resources until coming out of a
persistent state.
Expand Down
14 changes: 11 additions & 3 deletions docs/resource_manager/zero_resource_workers.rst
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,17 @@ worker for the persistent generator - a common use-case.

In general, the number of resource sets should be set to enable the maximum
concurrency desired by the ensemble, taking into account generators and simulators.
Users can set generator resources by setting ``persis_info["gen_resources"]``
to an integer value, representing the number of resource sets to give to the
generator. The default is zero.

Users can set generator resources using the *libE_specs* options
``gen_num_procs`` and/or ``gen_num_gpus``, which take integer values.
If only ``gen_num_gpus`` is set, then the number of processors will match.

To vary generator resources, ``persis_info`` settings can be used in allocation
functions before calling the ``gen_work`` support function. This takes the
same options (``gen_num_procs`` and ``gen_num_gpus``).

Alternatively, the setting ``persis_info["gen_resources"]`` can also be set to
a number of resource sets.

The available nodes are always divided by the number of resource sets, and there
may be multiple nodes or a partition of a node in each resource set. If the split
Expand Down
51 changes: 51 additions & 0 deletions libensemble/gen_funcs/persistent_sampling_var_resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,17 @@

import numpy as np

from libensemble.executors.executor import Executor
from libensemble.message_numbers import EVAL_GEN_TAG, FINISHED_PERSISTENT_GEN_TAG, PERSIS_STOP, STOP_TAG
from libensemble.tools.persistent_support import PersistentSupport
from libensemble.tools.test_support import check_gpu_setting

__all__ = [
"uniform_sample",
"uniform_sample_with_procs_gpus",
"uniform_sample_with_var_priorities",
"uniform_sample_diff_simulations",
"uniform_sample_with_sim_gen_resources",
]


Expand Down Expand Up @@ -145,3 +148,51 @@ def uniform_sample_diff_simulations(_, persis_info, gen_specs, libE_info):
b = len(calc_in)

return H_o, persis_info, FINISHED_PERSISTENT_GEN_TAG


def uniform_sample_with_sim_gen_resources(_, persis_info, gen_specs, libE_info):
    """
    Persistent generator that launches an application itself on every
    iteration and then requests a random number of processors and GPUs for
    the evaluation of each generated point.

    Each batch holds uniformly sampled points ``x``. The ``num_procs`` and
    ``num_gpus`` fields of a point are set to the same random integer, drawn
    between 1 and ``gen_specs["user"]["max_procs"]`` (inclusive).

    If ``gen_specs["user"]["dry_run"]`` is true, the generator's application
    is submitted as a dry run and is not waited on.

    .. seealso::
        `test_GPU_variable_resources.py <https://github.com/Libensemble/libensemble/blob/develop/libensemble/tests/regression_tests/test_GPU_variable_resources.py>`_
    """  # noqa

    user = gen_specs["user"]
    b, n, lb, ub = _get_user_params(user)
    rng = persis_info["rand_stream"]
    support = PersistentSupport(libE_info, EVAL_GEN_TAG)
    dry_run = user.get("dry_run", False)  # logs run lines instead of running

    stop_tags = (STOP_TAG, PERSIS_STOP)
    tag = None

    while tag not in stop_tags:
        # Sample the next batch of points
        H_o = np.zeros(b, dtype=gen_specs["out"])
        H_o["x"] = rng.uniform(lb, ub, (b, n))

        # The generator runs an app with the resources it was given via
        # libE_specs or persis_info (test purposes only)
        task = Executor.executor.submit(
            app_name="six_hump_camel",
            app_args="-0.99 -0.19",
            stdout="out.txt",
            stderr="err.txt",
            dry_run=dry_run,
        )
        if not dry_run:
            task.wait()  # Block until the run has completed

        # Asserts GPU set correctly (for known MPI runners)
        check_gpu_setting(task, print_setting=True)

        # Resources for the sims: one GPU per processor.
        # num_procs would get matched to GPUs anyway, if no other config given.
        nprocs = rng.integers(1, user["max_procs"] + 1, b)
        for field in ("num_procs", "num_gpus"):
            H_o[field] = nprocs
        print(f"GEN created {b} sims requiring {nprocs} procs. One GPU per proc", flush=True)

        tag, _work, calc_in = support.send_recv(H_o)
        if hasattr(calc_in, "__len__"):
            # The next batch is as large as the set of results just returned
            b = len(calc_in)

    return H_o, persis_info, FINISHED_PERSISTENT_GEN_TAG
4 changes: 4 additions & 0 deletions libensemble/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,8 @@ def __init__(
dyn_keys = ("resource_sets", "num_procs", "num_gpus")
dyn_keys_in_H = any(k in self.hist.H.dtype.names for k in dyn_keys)
self.use_resource_sets = dyn_keys_in_H or self.libE_specs.get("num_resource_sets")
self.gen_num_procs = libE_specs.get("gen_num_procs", 0)
self.gen_num_gpus = libE_specs.get("gen_num_gpus", 0)

self.W = np.zeros(len(self.wcomms), dtype=Manager.worker_dtype)
self.W["worker_id"] = np.arange(len(self.wcomms)) + 1
Expand Down Expand Up @@ -571,6 +573,8 @@ def _get_alloc_libE_info(self) -> dict:
"sim_ended_count": self.hist.sim_ended_count,
"sim_max_given": self._sim_max_given(),
"use_resource_sets": self.use_resource_sets,
"gen_num_procs": self.gen_num_procs,
"gen_num_gpus": self.gen_num_gpus,
}

def _alloc_work(self, H: npt.NDArray, persis_info: dict) -> dict:
Expand Down
14 changes: 14 additions & 0 deletions libensemble/specs.py
Original file line number Diff line number Diff line change
Expand Up @@ -448,6 +448,20 @@ class LibeSpecs(BaseModel):
If not set, resources will be divided evenly (excluding zero_resource_workers).
"""

# Explicit default of 0 (no generator resources), matching the documented
# default and the fallback used when the manager reads these options.
gen_num_procs: Optional[int] = 0
"""
The default number of processors (MPI ranks) required by generators. Unless
overridden by the equivalent `persis_info` settings, generators will be
allocated this many processors for applications launched via the MPIExecutor.
"""

gen_num_gpus: Optional[int] = 0
"""
The default number of GPUs required by generators. Unless overridden by
the equivalent `persis_info` settings, generators will be allocated this
many GPUs.
"""

enforce_worker_core_bounds: Optional[bool] = False
"""
If ``False``, the Executor will permit submission of tasks with a
Expand Down
130 changes: 130 additions & 0 deletions libensemble/tests/functionality_tests/test_GPU_gen_resources.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
"""
Tests variable resource detection and automatic GPU assignment in both
generator and simulators.
The persistent generator creates simulations with variable resource requirements,
while also requiring resources itself. The resources required by a sim must
not be larger than what remains once the generator resources are assigned.
The sim_f (gpu_variable_resources_from_gen) asserts that GPU assignment
is correct for the default method for the MPI runner. GPUs are not actually
used for default application. Four GPUs per node is mocked up below (if this line
is removed, libEnsemble will detect any GPUs available).
A dry_run option is provided. This can be set in the calling script, and will
just print run-lines and GPU settings. This may be used for testing run-lines
produced and GPU settings for different MPI runners.
Execute via one of the following commands (e.g. 4 workers):
mpiexec -np 5 python test_GPU_gen_resources.py
python test_GPU_gen_resources.py --comms local --nworkers 4
When running with the above command, the number of concurrent evaluations of
the objective function will be 4, as one of the five workers will be the
persistent generator.
"""

# Do not change these lines - they are parsed by run-tests.sh
# TESTSUITE_COMMS: mpi local
# TESTSUITE_NPROCS: 5

import sys

import numpy as np

from libensemble.alloc_funcs.start_only_persistent import only_persistent_gens as alloc_f
from libensemble.executors.mpi_executor import MPIExecutor
from libensemble.gen_funcs.persistent_sampling_var_resources import uniform_sample_with_sim_gen_resources as gen_f

# Import libEnsemble items for this test
from libensemble.libE import libE
from libensemble.sim_funcs import six_hump_camel
from libensemble.sim_funcs.var_resources import gpu_variable_resources_from_gen as sim_f
from libensemble.tools import add_unique_random_streams, parse_args

# from libensemble import logger
# logger.set_level("DEBUG") # For testing the test


# Main block is necessary only when using local comms with spawn start method (default on macOS and Windows).
if __name__ == "__main__":
    nworkers, is_manager, libE_specs, _ = parse_args()

    if libE_specs["comms"] == "tcp":
        sys.exit("This test only runs with MPI or local -- aborting...")

    dry_run = True  # Only print run lines and GPU settings; do not launch the apps

    libE_specs.update(
        {
            "num_resource_sets": nworkers,  # Persistent gen DOES need resources
            "sim_dirs_make": True,  # Will only contain files if dry_run is False
            "gen_dirs_make": True,  # Will only contain files if dry_run is False
            "ensemble_dir_path": "./ensemble_GPU_gen_resources_w" + str(nworkers),
            "reuse_output_dir": True,
            # Mock up the node resources (cores and GPUs). Remove this entry
            # to let libEnsemble detect any GPUs available.
            "resource_info": {"cores_on_node": (nworkers * 2, nworkers * 4), "gpus_on_node": nworkers},
        }
    )

    # Register the application that both the generator and the sims launch
    exctr = MPIExecutor()
    exctr.register_app(full_path=six_hump_camel.__file__, app_name="six_hump_camel")

    n = 2  # Dimension of each sampled point

    sim_user = {"dry_run": dry_run}
    sim_specs = {
        "sim_f": sim_f,
        "in": ["x"],
        "out": [("f", float)],
        "user": sim_user,
    }

    gen_user = {
        "initial_batch_size": nworkers - 1,
        "max_procs": nworkers - 1,  # Any sim created can req. 1 worker up to all.
        "lb": np.array([-3, -2]),
        "ub": np.array([3, 2]),
        "dry_run": dry_run,
    }
    gen_specs = {
        "gen_f": gen_f,
        "persis_in": ["f", "x", "sim_id"],
        "out": [("num_procs", int), ("num_gpus", int), ("x", float, n)],
        "user": gen_user,
    }

    alloc_user = {
        "give_all_with_same_priority": False,
        "async_return": False,  # False batch returns
    }
    alloc_specs = {"alloc_f": alloc_f, "user": alloc_user}

    exit_criteria = {"sim_max": 20}

    # NOTE(review): dict.copy() is shallow, so the nested "resource_info" dict
    # is shared by base_libE_specs and every per-run copy. The "gpus_on_node"
    # change made in run 3 is therefore still in effect for run 4 -- confirm
    # that this is intended.
    base_libE_specs = libE_specs.copy()

    for run in range(5):
        # Start each run from the base options and fresh random streams
        libE_specs = base_libE_specs.copy()
        persis_info = add_unique_random_streams({}, nworkers + 1)

        if run == 0:
            # Generator processors given via libE_specs
            libE_specs["gen_num_procs"] = 2
        elif run == 1:
            # Generator GPUs given via libE_specs
            libE_specs["gen_num_gpus"] = 1
        elif run == 2:
            # Generator GPUs given via persis_info
            persis_info["gen_num_gpus"] = 1
        elif run == 3:
            # Two GPUs per resource set
            libE_specs["resource_info"]["gpus_on_node"] = nworkers * 2
            persis_info["gen_num_gpus"] = 1
        elif run == 4:
            # Two GPUs requested for gen
            persis_info["gen_num_procs"] = 2
            persis_info["gen_num_gpus"] = 2
            gen_specs["user"]["max_procs"] = max(nworkers - 2, 1)

        # Perform the run
        H, persis_info, flag = libE(
            sim_specs, gen_specs, exit_criteria, persis_info, libE_specs=libE_specs, alloc_specs=alloc_specs
        )

# All asserts are in gen and sim funcs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
The persistent generator creates simulations with variable resource requirements.
The sim_f (gpu_variable_resources) asserts that GPUs assignment
The sim_f (gpu_variable_resources_from_gen) asserts that GPUs assignment
is correct for the default method for the MPI runner. GPUs are not actually
used for default application. Four GPUs per node is mocked up below (if this line
is removed, libEnsemble will detect any GPUs available).
Expand Down
39 changes: 31 additions & 8 deletions libensemble/tests/unit_tests/test_allocation_funcs_and_support.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,9 @@


def initialize_resources():
Resources.init_resources({"comms": "local", "nworkers": 4, "num_resource_sets": 4})
platform_info = {"cores_per_node": 8, "gpus_per_node": 4}
libE_specs = {"comms": "local", "nworkers": 4, "num_resource_sets": 4}
Resources.init_resources(libE_specs=libE_specs, platform_info=platform_info)
Resources.resources.set_resource_manager(4)


Expand Down Expand Up @@ -218,8 +220,20 @@ def test_als_gen_work():
als = AllocSupport(W, True, persis_info=persis_info)
Work = {}
Work[1] = als.gen_work(1, ["sim_id"], range(0, 5), persis_info[1])
assert Work[1]["libE_info"]["rset_team"] == [0], "Resource set should be assigned in libE_info"
del persis_info["gen_resources"]

assert len(Work[1]["libE_info"]["rset_team"]), "Resource set should be assigned in libE_info"
persis_info["gen_num_procs"] = 2
Work[2] = als.gen_work(1, ["sim_id"], range(0, 5), persis_info[2])
assert Work[2]["libE_info"]["rset_team"] == [1], "Resource set should be assigned in libE_info"
assert Work[2]["libE_info"]["num_procs"] == 2, "num_procs set should be assigned in libE_info"

persis_info["gen_num_procs"] = 2
persis_info["gen_num_gpus"] = 2
Work[3] = als.gen_work(1, ["sim_id"], range(0, 5), persis_info[3])
assert Work[3]["libE_info"]["rset_team"] == [2, 3], "Resource set should be assigned in libE_info"
assert Work[3]["libE_info"]["num_procs"] == 2, "num_procs set should be assigned in libE_info"
assert Work[3]["libE_info"]["num_gpus"] == 2, "num_procs set should be assigned in libE_info"

clear_resources()

Expand Down Expand Up @@ -424,30 +438,39 @@ def test_als_points_by_priority():

def test_convert_to_rsets():
user_params = []
libE_info = {}
gen_fields = [("num_procs", int), ("num_gpus", int)]
H = np.zeros(5, dtype=libE_fields + gen_fields)

H_rows = 1
H[H_rows]["num_gpus"] = 3
num_gpus = 3
H[H_rows]["num_gpus"] = num_gpus
units_str = "num_gpus"

gpus_per_rset = 1
num_rsets = AllocSupport._convert_to_rsets(libE_info, user_params, H, H_rows, gpus_per_rset, units_str)
libE_info, num_rsets = {}, None # Reset
num_rsets = AllocSupport._convert_to_rsets(libE_info, user_params, gpus_per_rset, num_gpus, units_str)
assert num_rsets == 3, f"Unexpected number of rsets {num_rsets}"
assert libE_info["num_gpus"] == 3, f"Unexpected number for num_gpus {libE_info['num_gpus']}"

libE_info, num_rsets = {}, None # Reset
num_rsets = AllocSupport._convert_rows_to_rsets(libE_info, user_params, H, H_rows, gpus_per_rset, units_str)
assert num_rsets == 3, f"Unexpected number of rsets {num_rsets}"
assert libE_info["num_gpus"] == 3, f"Unexpected number for num_gpus {libE_info['num_gpus']}"

gpus_per_rset = 2
num_rsets = AllocSupport._convert_to_rsets(libE_info, user_params, H, H_rows, gpus_per_rset, units_str)
libE_info, num_rsets = {}, None # Reset
num_rsets = AllocSupport._convert_rows_to_rsets(libE_info, user_params, H, H_rows, gpus_per_rset, units_str)
assert num_rsets == 2, f"Unexpected number of rsets {num_rsets}"
assert libE_info["num_gpus"] == 3, f"Unexpected number for num_gpus {libE_info['num_gpus']}"

gpus_per_rset = 0
libE_info, num_rsets = {}, None # Reset
with pytest.raises(InsufficientResourcesError):
num_rsets = AllocSupport._convert_to_rsets(libE_info, user_params, H, H_rows, gpus_per_rset, units_str)
num_rsets = AllocSupport._convert_rows_to_rsets(libE_info, user_params, H, H_rows, gpus_per_rset, units_str)

H[H_rows]["num_gpus"] = 0
num_rsets = AllocSupport._convert_to_rsets(libE_info, user_params, H, H_rows, gpus_per_rset, units_str)
libE_info, num_rsets = {}, None # Reset
num_rsets = AllocSupport._convert_rows_to_rsets(libE_info, user_params, H, H_rows, gpus_per_rset, units_str)
assert num_rsets == 0, f"Unexpected number of rsets {num_rsets}"
assert libE_info["num_gpus"] == 0, f"Unexpected number for num_gpus {libE_info['num_gpus']}"

Expand Down
Loading

0 comments on commit aa2447c

Please sign in to comment.