diff --git a/doc/source/rllib.rst b/doc/source/rllib.rst index e76ee354b72f..953c808ea6fd 100644 --- a/doc/source/rllib.rst +++ b/doc/source/rllib.rst @@ -300,9 +300,10 @@ in the ``config`` section of the experiments. 'cartpole-ppo': { 'run': 'PPO', 'env': 'CartPole-v0', - 'resources': { - 'cpu': 2, - 'driver_cpu_limit': 1}, + 'trial_resources': { + 'cpu': 1, + 'extra_cpu': 2, # for workers + }, 'stop': { 'episode_reward_mean': 200, 'time_total_s': 180 diff --git a/doc/source/tune-config.rst b/doc/source/tune-config.rst index 0ed21d9e1b45..f00538421cb0 100644 --- a/doc/source/tune-config.rst +++ b/doc/source/tune-config.rst @@ -31,7 +31,7 @@ dictionary. run_experiments({ "my_experiment_name": { "run": "my_func", - "resources": { "cpu": 1, "gpu": 0 }, + "trial_resources": { "cpu": 1, "gpu": 0 }, "stop": { "mean_accuracy": 100 }, "config": { "alpha": grid_search([0.2, 0.4, 0.6]), @@ -73,9 +73,9 @@ For more information on variant generation, see `variant_generator.py `__. For example, if a trainable class requires 1 GPU itself, but will launch 4 actors each using another GPU, then it should set ``"gpu": 5, "driver_gpu_limit": 1``. +If your trainable function / class creates further Ray actors or tasks that also consume CPU / GPU resources, you will also want to set ``extra_cpu`` or ``extra_gpu`` to reserve extra resource slots for the actors you will create. For example, if a trainable class requires 1 GPU itself, but will launch 4 actors each using another GPU, then it should set ``"gpu": 1, "extra_gpu": 4``. diff --git a/examples/carla/a3c_lane_keep.py b/examples/carla/a3c_lane_keep.py index 555ddbe8f457..1338736d23f5 100644 --- a/examples/carla/a3c_lane_keep.py +++ b/examples/carla/a3c_lane_keep.py @@ -31,7 +31,7 @@ "carla-a3c": { "run": "A3C", "env": "carla_env", - "resources": {"cpu": 4, "gpu": 1}, + "trial_resources": {"cpu": 4, "gpu": 1}, "config": { "env_config": env_config, "model": { diff --git a/examples/carla/dqn_lane_keep.py b/examples/carla/dqn_lane_keep.py index 19733864c78c..2746a1c4bbd8 100644 --- a/examples/carla/dqn_lane_keep.py +++ b/examples/carla/dqn_lane_keep.py @@ -31,7 +31,7 @@ "carla-dqn": { "run": "DQN", "env": "carla_env", - "resources": {"cpu": 4, "gpu": 1}, + "trial_resources": {"cpu": 4, "gpu": 1}, "config": { "env_config": env_config, "model": { diff --git a/examples/carla/ppo_lane_keep.py b/examples/carla/ppo_lane_keep.py index 0ebf65bc2972..25e5acbf328c 100644 --- a/examples/carla/ppo_lane_keep.py +++ b/examples/carla/ppo_lane_keep.py @@ -31,7 +31,7 @@ "carla-ppo": { "run": "PPO", "env": "carla_env", - "resources": {"cpu": 4, "gpu": 1}, + "trial_resources": {"cpu": 4, "gpu": 1}, "config": { "env_config": env_config, "model": { diff --git a/examples/carla/train_a3c.py b/examples/carla/train_a3c.py index 58bdc0da7e3f..75856aef266e 100644 --- a/examples/carla/train_a3c.py +++ b/examples/carla/train_a3c.py @@ -32,7 +32,7 @@ "carla-a3c": { "run": "A3C", "env": "carla_env", - "resources": {"cpu": 5, "gpu": 2, "driver_gpu_limit": 0}, + "trial_resources": {"cpu": 5, "extra_gpu": 2}, "config": { "env_config": env_config, "use_gpu_for_workers": True, diff --git a/examples/carla/train_dqn.py b/examples/carla/train_dqn.py index 7289ce2ca5a1..6180ca48f0dd 100644 --- a/examples/carla/train_dqn.py +++ b/examples/carla/train_dqn.py @@ -29,7 +29,7 @@ "carla-dqn": { "run": "DQN", "env": "carla_env", - "resources": {"cpu": 4, "gpu": 1}, + "trial_resources": {"cpu": 4, "gpu": 1}, "config": { "env_config": env_config, "model": { diff --git a/examples/carla/train_ppo.py b/examples/carla/train_ppo.py index cdf77c173082..4f3ebf5eab83 100644 --- a/examples/carla/train_ppo.py +++ b/examples/carla/train_ppo.py @@ -28,7 +28,7 @@ "carla": { "run": "PPO", "env": "carla_env", - "resources": {"cpu": 4, "gpu": 1}, + "trial_resources": {"cpu": 4, "gpu": 1}, "config": { "env_config": env_config, "model": { diff --git a/python/ray/rllib/train.py b/python/ray/rllib/train.py index 309c8a6d9ced..a5e5f283a471 100755 --- a/python/ray/rllib/train.py +++ b/python/ray/rllib/train.py @@ -62,7 +62,7 @@ "run": args.run, "checkpoint_freq": args.checkpoint_freq, "local_dir": args.local_dir, - "resources": resources_to_json(args.resources), + "trial_resources": resources_to_json(args.trial_resources), "stop": args.stop, "config": dict(args.config, env=args.env), "restore": args.restore, diff --git a/python/ray/rllib/tuned_examples/cartpole-grid-search-example.yaml b/python/ray/rllib/tuned_examples/cartpole-grid-search-example.yaml index 7aa56af317c2..f677d3eea5ae 100644 --- a/python/ray/rllib/tuned_examples/cartpole-grid-search-example.yaml +++ b/python/ray/rllib/tuned_examples/cartpole-grid-search-example.yaml @@ -4,9 +4,9 @@ cartpole-ppo: stop: episode_reward_mean: 200 time_total_s: 180 - resources: - cpu: 3 - driver_cpu_limit: 1 + trial_resources: + cpu: 1 + extra_cpu: 1 config: num_workers: 2 num_sgd_iter: diff --git a/python/ray/rllib/tuned_examples/hopper-ppo.yaml b/python/ray/rllib/tuned_examples/hopper-ppo.yaml index cf9f35e96142..ef3627202bbe 100644 --- a/python/ray/rllib/tuned_examples/hopper-ppo.yaml +++ b/python/ray/rllib/tuned_examples/hopper-ppo.yaml @@ -1,9 +1,8 @@ hopper-ppo: env: Hopper-v1 run: PPO - resources: - cpu: 65 + trial_resources: + cpu: 1 gpu: 4 - driver_cpu_limit: 1 - driver_gpu_limit: 4 + extra_cpu: 64 config: {"gamma": 0.995, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": .0001, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 160000, "num_workers": 64} diff --git a/python/ray/rllib/tuned_examples/humanoid-es.yaml b/python/ray/rllib/tuned_examples/humanoid-es.yaml index 793363eee951..1c499d17e549 100644 --- a/python/ray/rllib/tuned_examples/humanoid-es.yaml +++ b/python/ray/rllib/tuned_examples/humanoid-es.yaml @@ -1,9 +1,9 @@ humanoid-es: env: Humanoid-v1 run: ES - resources: - cpu: 101 - driver_cpu_limit: 1 + trial_resources: + cpu: 1 + extra_cpu: 100 stop: episode_reward_mean: 6000 config: diff --git a/python/ray/rllib/tuned_examples/humanoid-ppo-gae.yaml b/python/ray/rllib/tuned_examples/humanoid-ppo-gae.yaml index 007c6349a6b8..b316a085a81c 100644 --- a/python/ray/rllib/tuned_examples/humanoid-ppo-gae.yaml +++ b/python/ray/rllib/tuned_examples/humanoid-ppo-gae.yaml @@ -3,9 +3,9 @@ humanoid-ppo-gae: run: PPO stop: episode_reward_mean: 6000 - resources: - cpu: 65 + trial_resources: + cpu: 1 gpu: 4 - driver_cpu_limit: 1 + extra_cpu: 64 config: {"lambda": 0.95, "clip_param": 0.2, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": .0001, "sgd_batchsize": 32768, "horizon": 5000, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64, "model": {"free_log_std": true}, "write_logs": false} diff --git a/python/ray/rllib/tuned_examples/humanoid-ppo.yaml b/python/ray/rllib/tuned_examples/humanoid-ppo.yaml index a5f55b1a4532..22c3d8a0b465 100644 --- a/python/ray/rllib/tuned_examples/humanoid-ppo.yaml +++ b/python/ray/rllib/tuned_examples/humanoid-ppo.yaml @@ -3,8 +3,8 @@ humanoid-ppo: run: PPO stop: episode_reward_mean: 6000 - resources: - cpu: 65 + trial_resources: + cpu: 1 gpu: 4 - driver_cpu_limit: 1 + extra_cpu: 64 config: {"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": .0001, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64, "model": {"free_log_std": true}, "use_gae": false} diff --git a/python/ray/rllib/tuned_examples/hyperband-cartpole.yaml b/python/ray/rllib/tuned_examples/hyperband-cartpole.yaml index a6cb718dbae3..e21b2a6b4368 100644 --- a/python/ray/rllib/tuned_examples/hyperband-cartpole.yaml +++ b/python/ray/rllib/tuned_examples/hyperband-cartpole.yaml @@ -5,9 +5,9 @@ cartpole-ppo: stop: episode_reward_mean: 200 time_total_s: 180 - resources: - cpu: 2 - driver_cpu_limit: 1 + trial_resources: + cpu: 1 + extra_cpu: 1 config: num_workers: 1 num_sgd_iter: diff --git a/python/ray/rllib/tuned_examples/pendulum-ppo.yaml b/python/ray/rllib/tuned_examples/pendulum-ppo.yaml index 9e8ad92217fa..c100c9f593bb 100644 --- a/python/ray/rllib/tuned_examples/pendulum-ppo.yaml +++ b/python/ray/rllib/tuned_examples/pendulum-ppo.yaml @@ -2,9 +2,9 @@ pendulum-ppo: env: Pendulum-v0 run: PPO - resources: - cpu: 5 - driver_cpu_limit: 1 + trial_resources: + cpu: 1 + extra_cpu: 4 config: timesteps_per_batch: 2048 num_workers: 4 diff --git a/python/ray/rllib/tuned_examples/pong-a3c-pytorch.yaml b/python/ray/rllib/tuned_examples/pong-a3c-pytorch.yaml index 59c3a1b9c0da..55e8f0169391 100644 --- a/python/ray/rllib/tuned_examples/pong-a3c-pytorch.yaml +++ b/python/ray/rllib/tuned_examples/pong-a3c-pytorch.yaml @@ -1,9 +1,9 @@ pong-a3c-pytorch-cnn: env: PongDeterministic-v4 run: A3C - resources: - cpu: 17 - driver_cpu_limit: 1 + trial_resources: + cpu: 1 + extra_cpu: 16 config: num_workers: 16 batch_size: 20 diff --git a/python/ray/rllib/tuned_examples/pong-a3c.yaml b/python/ray/rllib/tuned_examples/pong-a3c.yaml index 0d261a3cc750..8f196b5c46d7 100644 --- a/python/ray/rllib/tuned_examples/pong-a3c.yaml +++ b/python/ray/rllib/tuned_examples/pong-a3c.yaml @@ -1,9 +1,9 @@ pong-a3c: env: PongDeterministic-v4 run: A3C - resources: - cpu: 17 - driver_cpu_limit: 1 + trial_resources: + cpu: 1 + extra_cpu: 16 config: num_workers: 16 batch_size: 20 diff --git a/python/ray/rllib/tuned_examples/pong-apex.yaml b/python/ray/rllib/tuned_examples/pong-apex.yaml index 1eaa104806ae..6f7cf7974d21 100644 --- a/python/ray/rllib/tuned_examples/pong-apex.yaml +++ b/python/ray/rllib/tuned_examples/pong-apex.yaml @@ -4,11 +4,11 @@ pong-apex: env: PongNoFrameskip-v4 run: APEX - resources: - cpu: - eval: 1 + spec.config.num_workers - driver_cpu_limit: 1 + trial_resources: + cpu: 1 gpu: 1 + extra_cpu: + eval: 4 + spec.config.num_workers config: target_network_update_freq: 50000 num_workers: 32 diff --git a/python/ray/rllib/tuned_examples/pong-dqn.yaml b/python/ray/rllib/tuned_examples/pong-dqn.yaml index 396f3ea5ada9..51e1bd2e66c2 100644 --- a/python/ray/rllib/tuned_examples/pong-dqn.yaml +++ b/python/ray/rllib/tuned_examples/pong-dqn.yaml @@ -2,7 +2,7 @@ pong-deterministic-dqn: env: PongDeterministic-v4 run: DQN - resources: + trial_resources: cpu: 1 gpu: 1 stop: diff --git a/python/ray/rllib/tuned_examples/pong-ppo.yaml b/python/ray/rllib/tuned_examples/pong-ppo.yaml index 58956bc0519c..0a221601468c 100644 --- a/python/ray/rllib/tuned_examples/pong-ppo.yaml +++ b/python/ray/rllib/tuned_examples/pong-ppo.yaml @@ -8,10 +8,10 @@ pong-deterministic-ppo: env: PongDeterministic-v4 run: PPO - resources: - cpu: 5 + trial_resources: + cpu: 1 gpu: 1 - driver_cpu_limit: 1 + extra_cpu: 4 stop: episode_reward_mean: 21 config: diff --git a/python/ray/rllib/tuned_examples/regression_tests/cartpole-a3c.yaml b/python/ray/rllib/tuned_examples/regression_tests/cartpole-a3c.yaml index 2b16e48e05fc..f94c8ec7a5e1 100644 --- a/python/ray/rllib/tuned_examples/regression_tests/cartpole-a3c.yaml +++ b/python/ray/rllib/tuned_examples/regression_tests/cartpole-a3c.yaml @@ -4,7 +4,7 @@ cartpole-a3c: stop: episode_reward_mean: 200 time_total_s: 600 - resources: + trial_resources: cpu: 2 config: num_workers: 4 diff --git a/python/ray/rllib/tuned_examples/regression_tests/cartpole-dqn.yaml b/python/ray/rllib/tuned_examples/regression_tests/cartpole-dqn.yaml index da916312f126..1e78dfa12414 100644 --- a/python/ray/rllib/tuned_examples/regression_tests/cartpole-dqn.yaml +++ b/python/ray/rllib/tuned_examples/regression_tests/cartpole-dqn.yaml @@ -4,7 +4,7 @@ cartpole-dqn: stop: episode_reward_mean: 200 time_total_s: 600 - resources: + trial_resources: cpu: 1 config: n_step: 3 diff --git a/python/ray/rllib/tuned_examples/regression_tests/cartpole-es.yaml b/python/ray/rllib/tuned_examples/regression_tests/cartpole-es.yaml index 45392e156fd8..1ed5caa1e65f 100644 --- a/python/ray/rllib/tuned_examples/regression_tests/cartpole-es.yaml +++ b/python/ray/rllib/tuned_examples/regression_tests/cartpole-es.yaml @@ -4,7 +4,7 @@ cartpole-es: stop: episode_reward_mean: 200 time_total_s: 300 - resources: + trial_resources: cpu: 2 config: num_workers: 2 diff --git a/python/ray/rllib/tuned_examples/regression_tests/cartpole-ppo.yaml b/python/ray/rllib/tuned_examples/regression_tests/cartpole-ppo.yaml index 0f7c23ff2135..aeddb12556de 100644 --- a/python/ray/rllib/tuned_examples/regression_tests/cartpole-ppo.yaml +++ b/python/ray/rllib/tuned_examples/regression_tests/cartpole-ppo.yaml @@ -4,7 +4,7 @@ cartpole-ppo: stop: episode_reward_mean: 200 time_total_s: 300 - resources: + trial_resources: cpu: 1 config: num_workers: 1 diff --git a/python/ray/rllib/tuned_examples/walker2d-ppo.yaml b/python/ray/rllib/tuned_examples/walker2d-ppo.yaml index 95fbeeb51dc2..35efb128d17f 100644 --- a/python/ray/rllib/tuned_examples/walker2d-ppo.yaml +++ b/python/ray/rllib/tuned_examples/walker2d-ppo.yaml @@ -1,8 +1,8 @@ walker2d-v1-ppo: env: Walker2d-v1 run: PPO - resources: - cpu: 65 + trial_resources: + cpu: 1 gpu: 4 - driver_cpu_limit: 1 + extra_cpu: 64 config: {"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": .0001, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64} diff --git a/python/ray/tune/config_parser.py b/python/ray/tune/config_parser.py index ae7ab35c0014..d4f851b63b12 100644 --- a/python/ray/tune/config_parser.py +++ b/python/ray/tune/config_parser.py @@ -15,24 +15,34 @@ def json_to_resources(data): if type(data) is str: data = json.loads(data) for k in data: + if k in ["driver_cpu_limit", "driver_gpu_limit"]: + raise TuneError( + "The field `{}` is no longer supported. Use `extra_cpu` " + "or `extra_gpu` instead.".format(k)) if k not in Resources._fields: raise TuneError( "Unknown resource type {}, must be one of {}".format( k, Resources._fields)) return Resources( data.get("cpu", 1), data.get("gpu", 0), - data.get("driver_cpu_limit"), data.get("driver_gpu_limit")) + data.get("extra_cpu", 0), data.get("extra_gpu", 0)) def resources_to_json(resources): + if resources is None: + resources = Resources(cpu=1, gpu=0) return { "cpu": resources.cpu, "gpu": resources.gpu, - "driver_cpu_limit": resources.driver_cpu_limit, - "driver_gpu_limit": resources.driver_gpu_limit, + "extra_cpu": resources.extra_cpu, + "extra_gpu": resources.extra_gpu, } +def _tune_error(msg): + raise TuneError(msg) + + def make_parser(**kwargs): """Returns a base argument parser for the ray.tune tool.""" @@ -56,7 +66,12 @@ def make_parser(**kwargs): help="Algorithm-specific configuration (e.g. env, hyperparams), " "specified in JSON.") parser.add_argument( - "--resources", default='{"cpu": 1}', type=json_to_resources, + "--resources", help="Deprecated, use --trial-resources.", + type=lambda v: _tune_error( + "The `resources` argument is no longer supported. " + "Use `trial_resources` or --trial-resources instead.")) + parser.add_argument( + "--trial-resources", default='{"cpu": 1}', type=json_to_resources, help="Machine resources to allocate per trial, e.g. " "'{\"cpu\": 64, \"gpu\": 8}'. Note that GPUs will not be assigned " "unless you specify them here.") diff --git a/python/ray/tune/examples/async_hyperband_example.py b/python/ray/tune/examples/async_hyperband_example.py index f6abe27f84b7..444e3bbc70d1 100644 --- a/python/ray/tune/examples/async_hyperband_example.py +++ b/python/ray/tune/examples/async_hyperband_example.py @@ -68,7 +68,7 @@ def _restore(self, checkpoint_path): "run": "my_class", "stop": {"training_iteration": 1 if args.smoke_test else 99999}, "repeat": 20, - "resources": {"cpu": 1, "gpu": 0}, + "trial_resources": {"cpu": 1, "gpu": 0}, "config": { "width": lambda spec: 10 + int(90 * random.random()), "height": lambda spec: int(100 * random.random()), diff --git a/python/ray/tune/examples/pbt_example.py b/python/ray/tune/examples/pbt_example.py index 089f181b1d61..0f7a5eb6585c 100755 --- a/python/ray/tune/examples/pbt_example.py +++ b/python/ray/tune/examples/pbt_example.py @@ -79,7 +79,7 @@ def _restore(self, checkpoint_path): "run": "my_class", "stop": {"training_iteration": 2 if args.smoke_test else 99999}, "repeat": 10, - "resources": {"cpu": 1, "gpu": 0}, + "trial_resources": {"cpu": 1, "gpu": 0}, "config": { "factor_1": 4.0, "factor_2": 1.0, diff --git a/python/ray/tune/examples/pbt_ppo_example.py b/python/ray/tune/examples/pbt_ppo_example.py index 02843e7a98d4..4afac136e6c2 100755 --- a/python/ray/tune/examples/pbt_ppo_example.py +++ b/python/ray/tune/examples/pbt_ppo_example.py @@ -50,7 +50,7 @@ def explore(config): "run": "PPO", "env": "Humanoid-v1", "repeat": 8, - "resources": {"cpu": 4, "gpu": 1}, + "trial_resources": {"cpu": 4, "gpu": 1}, "config": { "kl_coeff": 1.0, "num_workers": 8, diff --git a/python/ray/tune/experiment.py b/python/ray/tune/experiment.py index 0ebfa0aabb7d..134e8720768c 100644 --- a/python/ray/tune/experiment.py +++ b/python/ray/tune/experiment.py @@ -20,7 +20,7 @@ class Experiment(object): empty dict. config (dict): Algorithm-specific configuration (e.g. env, hyperparams). Defaults to empty dict. - resources (dict): Machine resources to allocate per trial, + trial_resources (dict): Machine resources to allocate per trial, e.g. ``{"cpu": 64, "gpu": 8}``. Note that GPUs will not be assigned unless you specify them here. Defaults to 1 CPU and 0 GPUs. @@ -36,13 +36,13 @@ class Experiment(object): checkpointing is enabled. Defaults to 3. """ def __init__(self, name, run, stop=None, config=None, - resources=None, repeat=1, local_dir=None, + trial_resources=None, repeat=1, local_dir=None, upload_dir="", checkpoint_freq=0, max_failures=3): spec = { "run": run, "stop": stop or {}, "config": config or {}, - "resources": resources or {"cpu": 1, "gpu": 0}, + "trial_resources": trial_resources or {"cpu": 1, "gpu": 0}, "repeat": repeat, "local_dir": local_dir or DEFAULT_RESULTS_DIR, "upload_dir": upload_dir, diff --git a/python/ray/tune/test/trial_runner_test.py b/python/ray/tune/test/trial_runner_test.py index cb858403fe7f..bea3d98bde01 100644 --- a/python/ray/tune/test/trial_runner_test.py +++ b/python/ray/tune/test/trial_runner_test.py @@ -130,7 +130,7 @@ def testBadParams6(self): def f(): run_experiments({"foo": { "run": "PPO", - "resources": {"asdf": 1} + "trial_resources": {"asdf": 1} }}) self.assertRaises(TuneError, f) @@ -453,6 +453,27 @@ def testTrialErrorOnStart(self): except Exception as e: self.assertIn("a class", str(e)) + def testExtraResources(self): + ray.init(num_cpus=4, num_gpus=2) + runner = TrialRunner() + kwargs = { + "stopping_criterion": {"training_iteration": 1}, + "resources": Resources(cpu=1, gpu=0, extra_cpu=3, extra_gpu=1), + } + trials = [ + Trial("__fake", **kwargs), + Trial("__fake", **kwargs)] + for t in trials: + runner.add_trial(t) + + runner.step() + self.assertEqual(trials[0].status, Trial.RUNNING) + self.assertEqual(trials[1].status, Trial.PENDING) + + runner.step() + self.assertEqual(trials[0].status, Trial.TERMINATED) + self.assertEqual(trials[1].status, Trial.PENDING) + def testResourceScheduler(self): ray.init(num_cpus=4, num_gpus=1) runner = TrialRunner() diff --git a/python/ray/tune/test/tune_server_test.py b/python/ray/tune/test/tune_server_test.py index d0c0fe0ad5b3..735399251324 100644 --- a/python/ray/tune/test/tune_server_test.py +++ b/python/ray/tune/test/tune_server_test.py @@ -62,7 +62,7 @@ def testAddTrial(self): spec = { "run": "__fake", "stop": {"training_iteration": 3}, - "resources": dict(cpu=1, gpu=1), + "trial_resources": dict(cpu=1, gpu=1), } client.add_trial("test", spec) runner.step() diff --git a/python/ray/tune/trial.py b/python/ray/tune/trial.py index caa920e9a0b4..80adbc0b5fec 100644 --- a/python/ray/tune/trial.py +++ b/python/ray/tune/trial.py @@ -28,34 +28,32 @@ def date_str(): class Resources( - namedtuple("Resources", [ - "cpu", "gpu", "driver_cpu_limit", "driver_gpu_limit"])): + namedtuple("Resources", ["cpu", "gpu", "extra_cpu", "extra_gpu"])): """Ray resources required to schedule a trial. Attributes: - cpu (int): Number of CPUs required for the trial total. - gpu (int): Number of GPUs required for the trial total. - driver_cpu_limit (int): Max CPUs allocated to the driver. - Defaults to all of the required CPUs. - driver_gpu_limit (int): Max GPUs allocated to the driver. - Defaults to all of the required GPUs. + cpu (int): Number of CPUs to allocate to the trial. + gpu (int): Number of GPUs to allocate to the trial. + extra_cpu (int): Extra CPUs to reserve in case the trial needs to + launch additional Ray actors that use CPUs. + extra_gpu (int): Extra GPUs to reserve in case the trial needs to + launch additional Ray actors that use GPUs. """ __slots__ = () - def __new__(cls, cpu, gpu, driver_cpu_limit=None, driver_gpu_limit=None): - if driver_cpu_limit is not None: - assert driver_cpu_limit <= cpu - else: - driver_cpu_limit = cpu - if driver_gpu_limit is not None: - assert driver_gpu_limit <= gpu - else: - driver_gpu_limit = gpu + def __new__(cls, cpu, gpu, extra_cpu=0, extra_gpu=0): return super(Resources, cls).__new__( - cls, cpu, gpu, driver_cpu_limit, driver_gpu_limit) + cls, cpu, gpu, extra_cpu, extra_gpu) def summary_string(self): - return "{} CPUs, {} GPUs".format(self.cpu, self.gpu) + return "{} CPUs, {} GPUs".format( + self.cpu + self.extra_cpu, self.gpu + self.extra_gpu) + + def cpu_total(self): + return self.cpu + self.extra_cpu + + def gpu_total(self): + return self.gpu + self.extra_gpu class Trial(object): @@ -66,9 +64,6 @@ class Trial(object): Trials start in the PENDING state, and transition to RUNNING once started. On error it transitions to ERROR, otherwise TERMINATED on success. - - The driver for the trial will be allocated at most `driver_cpu_limit` and - `driver_gpu_limit` CPUs and GPUs. """ PENDING = "PENDING" @@ -79,7 +74,7 @@ class Trial(object): def __init__( self, trainable_name, config=None, local_dir=DEFAULT_RESULTS_DIR, - experiment_tag=None, resources=Resources(cpu=1, gpu=0), + experiment_tag="", resources=Resources(cpu=1, gpu=0), stopping_criterion=None, checkpoint_freq=0, restore_path=None, upload_dir=None, max_failures=0): """Initialize a new trial. @@ -347,8 +342,8 @@ def _setup_runner(self): trainable_cls = ray.tune.registry.get_registry().get( ray.tune.registry.TRAINABLE_CLASS, self.trainable_name) cls = ray.remote( - num_cpus=self.resources.driver_cpu_limit, - num_gpus=self.resources.driver_gpu_limit)(trainable_cls) + num_cpus=self.resources.cpu, + num_gpus=self.resources.gpu)(trainable_cls) if not self.result_logger: if not os.path.exists(self.local_dir): os.makedirs(self.local_dir) diff --git a/python/ray/tune/trial_runner.py b/python/ray/tune/trial_runner.py index 61b16e450204..5b4c77789e14 100644 --- a/python/ray/tune/trial_runner.py +++ b/python/ray/tune/trial_runner.py @@ -181,7 +181,9 @@ def has_resources(self, resources): cpu_avail = self._avail_resources.cpu - self._committed_resources.cpu gpu_avail = self._avail_resources.gpu - self._committed_resources.gpu - return resources.cpu <= cpu_avail and resources.gpu <= gpu_avail + return ( + resources.cpu_total() <= cpu_avail and + resources.gpu_total() <= gpu_avail) def _can_launch_more(self): self._update_avail_resources() @@ -265,13 +267,13 @@ def _get_runnable(self): def _commit_resources(self, resources): self._committed_resources = Resources( - self._committed_resources.cpu + resources.cpu, - self._committed_resources.gpu + resources.gpu) + self._committed_resources.cpu + resources.cpu_total(), + self._committed_resources.gpu + resources.gpu_total()) def _return_resources(self, resources): self._committed_resources = Resources( - self._committed_resources.cpu - resources.cpu, - self._committed_resources.gpu - resources.gpu) + self._committed_resources.cpu - resources.cpu_total(), + self._committed_resources.gpu - resources.gpu_total()) assert self._committed_resources.cpu >= 0 assert self._committed_resources.gpu >= 0 diff --git a/python/ray/tune/variant_generator.py b/python/ray/tune/variant_generator.py index b12b6ca6825a..aec308fd5b17 100644 --- a/python/ray/tune/variant_generator.py +++ b/python/ray/tune/variant_generator.py @@ -58,7 +58,7 @@ def generate_trials(unresolved_spec, output_path=''): config=spec.get("config", {}), local_dir=os.path.join(args.local_dir, output_path), experiment_tag=experiment_tag, - resources=json_to_resources(spec.get("resources", {})), + resources=json_to_resources(spec.get("trial_resources", {})), stopping_criterion=spec.get("stop", {}), checkpoint_freq=args.checkpoint_freq, restore_path=spec.get("restore"), @@ -118,7 +118,7 @@ def grid_search(values): def _format_vars(resolved_vars): out = [] for path, value in sorted(resolved_vars.items()): - if path[0] in ["run", "env", "resources"]: + if path[0] in ["run", "env", "trial_resources"]: continue # TrialRunner already has these in the experiment_tag pieces = [] last_string = True