
Commit d2427e0

justinvyu authored and WeichenXu123 committed
[Tune] Add option to not override the working directory (ray-project#29258)
See ray-project#29128 for more context on the problem. This PR does the following:

1. **Fix `TUNE_ORIG_WORKING_DIR`** to pull the correct current working directory within the worker process when the Tune trial logdir is being created. This PR deprecates this environment variable, as it's confusing to get Tune metadata from sources other than `session`.
2. **Introduce a `chdir_to_trial_dir` flag** in the TuneConfig that defaults to `True`, which configures whether or not Tune should change the working directory of each worker to its corresponding trial directory.
   - If this flag is set to `False`, the user may still want to access the Tune trial directory. This can be done with a **newly added `session.get_trial_dir()` API.**
3. Make the `TUNE_ORIG_WORKING_DIR` deprecation, the `chdir_to_trial_dir` flag, and `session.get_trial_dir()` more visible in the documentation with an **example in the Tune FAQ.**

Signed-off-by: Justin Yu <justinvyu@berkeley.edu>
Signed-off-by: Weichen Xu <weichen.xu@databricks.com>
1 parent c1792bf commit d2427e0
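
As a rough end-to-end sketch of the user-facing surface described in the commit message (the trainable, metric name, and `num_samples` value below are illustrative, not taken from this diff):

```python
import os

from ray import tune
from ray.air import session


def train_func(config):
    # With chdir_to_trial_dir=False, the worker keeps the launch directory,
    # so relative input paths resolve the same way they do for the driver script.
    # Trial-specific outputs should go to the per-trial directory instead:
    with open(os.path.join(session.get_trial_dir(), "artifact.txt"), "w") as f:
        f.write("per-trial output")
    session.report({"score": 1.0})


tuner = tune.Tuner(
    train_func,
    tune_config=tune.TuneConfig(num_samples=2, chdir_to_trial_dir=False),
)
tuner.fit()
```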

File tree

14 files changed, +278 -54 lines changed


doc/source/tune/api_docs/trainable.rst (+4 -1)

@@ -7,7 +7,7 @@
 Training (tune.Trainable, session.report)
 ==========================================

-Training can be done with either a **Class API** (``tune.Trainable``) or **function API** (``session.report``).
+Training can be done with either a **Function API** (``session.report``) or **Class API** (``tune.Trainable``).

 For the sake of example, let's maximize this objective function:

@@ -327,6 +327,9 @@ session (Function API)
 .. autofunction:: ray.air.session.get_trial_resources
     :noindex:

+.. autofunction:: ray.air.session.get_trial_dir
+    :noindex:
+
 tune.Trainable (Class API)
 --------------------------

doc/source/tune/doc_code/faq.py (+25 -0)

@@ -411,3 +411,28 @@ def wait(self):
         ),
     )
 # __grid_search_2_end__
+
+if not MOCK:
+    import os
+    from pathlib import Path
+
+    # __no_chdir_start__
+    def train_func(config):
+        # Read from relative paths
+        print(open("./read.txt").read())
+
+        # The working directory shouldn't have changed from the original
+        # NOTE: The `TUNE_ORIG_WORKING_DIR` environment variable is deprecated.
+        assert os.getcwd() == os.environ["TUNE_ORIG_WORKING_DIR"]
+
+        # Write to the Tune trial directory, not the shared working dir
+        tune_trial_dir = Path(session.get_trial_dir())
+        with open(tune_trial_dir / "write.txt", "w") as f:
+            f.write("trial saved artifact")
+
+    tuner = tune.Tuner(
+        train_func,
+        tune_config=tune.TuneConfig(..., chdir_to_trial_dir=False),
+    )
+    tuner.fit()
+    # __no_chdir_end__

doc/source/tune/faq.rst (+32 -0)

@@ -733,3 +733,35 @@ If `grid_search` is provided as an argument, the grid will be repeated ``num_sam
 Note that search spaces may not be interoperable across different search algorithms.
 For example, for many search algorithms, you will not be able to use a ``grid_search`` or ``sample_from`` parameters.
 Read about this in the :ref:`Search Space API <tune-search-space>` page.
+
+.. _tune-working-dir:
+
+How do I access relative filepaths in my Tune training function?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Let's say you launch a Tune experiment from ``~/code/my_script.py``. By default, Tune
+changes the working directory of each worker from ``~/code`` to its corresponding trial
+directory (e.g. ``~/ray_results/exp_name/trial_0000x``). This default
+guarantees separate working directories for each worker process, avoiding conflicts when
+saving trial-specific outputs.
+
+You can configure this by setting `chdir_to_trial_dir=False` in `tune.TuneConfig`.
+This explicitly tells Tune to not change the working directory
+to the trial directory, giving access to paths relative to the original working directory.
+One caveat is that the working directory is now shared between workers, so the
+:meth:`session.get_trial_dir() <ray.air.session.get_trial_dir>`
+API should be used to get the path for saving trial-specific outputs.
+
+.. literalinclude:: doc_code/faq.py
+    :dedent:
+    :emphasize-lines: 3, 10, 11, 12, 16
+    :language: python
+    :start-after: __no_chdir_start__
+    :end-before: __no_chdir_end__
+
+.. warning::
+
+    The `TUNE_ORIG_WORKING_DIR` environment variable was the original workaround for
+    accessing paths relative to the original working directory. This environment
+    variable is deprecated, and the `chdir_to_trial_dir` flag described above should be
+    used instead.
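
For contrast with the opt-out example added above, here is a minimal sketch of working under the default `chdir_to_trial_dir=True`, where each worker's working directory is its trial directory. The file names and the driver-side `os.path.abspath` pattern are assumptions for illustration, not part of this diff:

```python
import os

from ray import tune
from ray.air import session

# Resolve the input path on the driver, before Tune changes each worker's
# working directory to its trial directory (the default behavior).
DATA_PATH = os.path.abspath("./read.txt")


def train_func(config):
    data = open(DATA_PATH).read()
    # Relative writes land in the trial directory, so trials don't overwrite
    # each other's output files.
    with open("./write.txt", "w") as f:
        f.write(data)
    session.report({"done": 1})


tuner = tune.Tuner(train_func)
tuner.fit()
```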

python/ray/air/_internal/session.py (+5 -0)

@@ -74,6 +74,11 @@ def trial_resources(self) -> "PlacementGroupFactory":
         """Trial resources for the corresponding trial."""
         raise NotImplementedError

+    @property
+    def trial_dir(self) -> str:
+        """Trial-level log directory for the corresponding trial."""
+        raise NotImplementedError
+

 def _get_session() -> Optional[Session]:
     from ray.train._internal.session import _session_v2 as train_session

python/ray/air/session.py (+54 -34)

@@ -68,44 +68,43 @@ def get_checkpoint() -> Optional[Checkpoint]:
         Checkpoint object if the session is currently being resumed.
         Otherwise, return None.

-    Example:
-        .. code-block: python
-
-            ######## Using it in the *per worker* train loop (TrainSession) ######
-            from ray.air import session
-            from ray.air.checkpoint import Checkpoint
-            from ray.air.config import ScalingConfig
-            def train_func():
-                ckpt = session.get_checkpoint()
-                if ckpt:
-                    with ckpt.as_directory() as loaded_checkpoint_dir:
-                        import tensorflow as tf
-
-                        model = tf.keras.models.load_model(loaded_checkpoint_dir)
-                else:
-                    model = build_model()
+    .. code-block:: python

-                model.save("my_model", overwrite=True)
-                session.report(
-                    metrics={"iter": 1},
-                    checkpoint=Checkpoint.from_directory("my_model")
-                )
+        ######## Using it in the *per worker* train loop (TrainSession) ######
+        from ray.air import session
+        from ray.air.checkpoint import Checkpoint
+        from ray.air.config import ScalingConfig
+        def train_func():
+            ckpt = session.get_checkpoint()
+            if ckpt:
+                with ckpt.as_directory() as loaded_checkpoint_dir:
+                    import tensorflow as tf
+
+                    model = tf.keras.models.load_model(loaded_checkpoint_dir)
+            else:
+                model = build_model()

-            scaling_config = ScalingConfig(num_workers=2)
-            trainer = TensorflowTrainer(
-                train_loop_per_worker=train_func, scaling_config=scaling_config
+            model.save("my_model", overwrite=True)
+            session.report(
+                metrics={"iter": 1},
+                checkpoint=Checkpoint.from_directory("my_model")
             )
-            result = trainer.fit()

-            # trainer2 will pick up from the checkpoint saved by trainer1.
-            trainer2 = TensorflowTrainer(
-                train_loop_per_worker=train_func,
-                scaling_config=scaling_config,
-                # this is ultimately what is accessed through
-                # ``Session.get_checkpoint()``
-                resume_from_checkpoint=result.checkpoint,
-            )
-            result2 = trainer2.fit()
+        scaling_config = ScalingConfig(num_workers=2)
+        trainer = TensorflowTrainer(
+            train_loop_per_worker=train_func, scaling_config=scaling_config
+        )
+        result = trainer.fit()
+
+        # trainer2 will pick up from the checkpoint saved by trainer1.
+        trainer2 = TensorflowTrainer(
+            train_loop_per_worker=train_func,
+            scaling_config=scaling_config,
+            # this is ultimately what is accessed through
+            # ``Session.get_checkpoint()``
+            resume_from_checkpoint=result.checkpoint,
+        )
+        result2 = trainer2.fit()
     """

     return _get_session().loaded_checkpoint

@@ -126,6 +125,27 @@ def get_trial_resources() -> "PlacementGroupFactory":
     return _get_session().trial_resources


+def get_trial_dir() -> str:
+    """Log directory corresponding to the trial directory for a Tune session.
+    If calling from a Train session, this will give the trial directory of its parent
+    Tune session.
+
+    .. code-block:: python
+
+        from ray import tune
+        from ray.air import session
+
+        def train_func():
+            # Example:
+            # >>> session.get_trial_dir()
+            # ~/ray_results/<exp-name>/<trial-dir>
+
+        tuner = tune.Tuner(train_func)
+        tuner.fit()
+    """
+    return _get_session().trial_dir
+
+
 def get_world_size() -> int:
     """Get the current world size (i.e. total number of workers) for this run.
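
A note on the docstring added above: a Train session reports the trial directory of its parent Tune session. A minimal sketch of what that looks like from a trainer's per-worker loop, mirroring the `TensorflowTrainer` example already in this file (the metric name and worker count are illustrative, not from this diff):

```python
from ray.air import session
from ray.air.config import ScalingConfig
from ray.train.tensorflow import TensorflowTrainer


def train_loop_per_worker():
    # Inside a Train worker, this resolves to the trial directory of the
    # Tune trial driving this trainer.
    trial_dir = session.get_trial_dir()
    # Use the worker rank to avoid collisions when writing per-worker files.
    print(trial_dir, session.get_world_rank())
    session.report({"iter": 1})


trainer = TensorflowTrainer(
    train_loop_per_worker=train_loop_per_worker,
    scaling_config=ScalingConfig(num_workers=2),
)
result = trainer.fit()
```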

python/ray/train/data_parallel_trainer.py (+1 -2)

@@ -1,6 +1,5 @@
 import inspect
 import logging
-import os
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Type, Union
 from tabulate import tabulate

@@ -338,7 +337,7 @@ def training_loop(self) -> None:
             name=session.get_trial_name(),
             id=session.get_trial_id(),
             resources=session.get_trial_resources(),
-            logdir=os.getcwd(),
+            logdir=session.get_trial_dir(),
         )

         backend_executor = self._backend_executor_cls(

python/ray/train/session.py (+4 -0)

@@ -45,6 +45,10 @@ def trial_id(self) -> str:
     def trial_resources(self) -> "PlacementGroupFactory":
         return self._session.trial_info.resources

+    @property
+    def trial_dir(self) -> str:
+        return self._session.trial_info.logdir
+
     @property
     def world_size(self) -> int:
         return self._session.world_size

python/ray/tune/execution/ray_trial_executor.py (+18 -6)

@@ -72,7 +72,6 @@ def __init__(self):
     def get(self, trainable_cls):
         """Gets the wrapped trainable_cls, otherwise calls ray.remote."""
         env_vars = DEFAULT_ENV_VARS.copy()
-        env_vars["TUNE_ORIG_WORKING_DIR"] = os.getcwd()

         runtime_env = {"env_vars": env_vars}
         if trainable_cls not in self._cache:

@@ -145,11 +144,17 @@ def is_empty(self):
         return len(self._future_to_insert_time) == 0


-def _noop_logger_creator(config, logdir):
-    # Set the working dir in the remote process, for user file writes
+def _noop_logger_creator(config, logdir, should_chdir: bool = True):
+    # Upon remote process setup, record the actor's original working dir before
+    # changing the working dir to the Tune logdir
+    os.environ["TUNE_ORIG_WORKING_DIR"] = os.getcwd()
+
     os.makedirs(logdir, exist_ok=True)
-    if not ray._private.worker._mode() == ray._private.worker.LOCAL_MODE:
-        os.chdir(logdir)
+    if should_chdir:
+        # Set the working dir to the trial directory in the remote process,
+        # for user file writes
+        if not ray._private.worker._mode() == ray._private.worker.LOCAL_MODE:
+            os.chdir(logdir)
     return NoopLogger(config, logdir)

@@ -205,6 +210,7 @@ def __init__(
         reuse_actors: bool = False,
         result_buffer_length: Optional[int] = None,
         refresh_period: Optional[float] = None,
+        chdir_to_trial_dir: bool = False,
     ):
         self._cached_trial_state = {}
         self._trials_to_cache = set()

@@ -245,6 +251,8 @@ def __init__(
         )
         self._trainable_kwargs = {}

+        self._chdir_to_trial_dir = chdir_to_trial_dir
+
     def setup(
         self, max_pending_trials: int, trainable_kwargs: Optional[Dict] = None
     ) -> None:

@@ -335,7 +343,11 @@ def _setup_remote_runner(self, trial):
         trial.init_logdir()
         # We checkpoint metadata here to try mitigating logdir duplication
         self._trials_to_cache.add(trial)
-        logger_creator = partial(_noop_logger_creator, logdir=trial.logdir)
+        logger_creator = partial(
+            _noop_logger_creator,
+            logdir=trial.logdir,
+            should_chdir=self._chdir_to_trial_dir,
+        )

         if len(self._cached_actor_pg) > 0:
             assert self._reuse_actors
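
The executor change above pre-binds the per-trial `logdir` and the new `should_chdir` flag with `functools.partial`, so the remote trainable only ever calls `logger_creator(config)`. A simplified standalone sketch of that pattern, not the actual Tune internals (the path and config values are made up):

```python
import os
from functools import partial


def noop_logger_creator(config, logdir, should_chdir=True):
    # Record the original working directory, then optionally chdir into the
    # per-trial log directory (mirroring the behavior gated above).
    os.environ["TUNE_ORIG_WORKING_DIR"] = os.getcwd()
    os.makedirs(logdir, exist_ok=True)
    if should_chdir:
        os.chdir(logdir)
    return {"config": config, "logdir": logdir}  # stand-in for NoopLogger


# The executor binds these per trial; the trainable supplies only the config.
logger_creator = partial(
    noop_logger_creator, logdir="/tmp/trial_00000", should_chdir=False
)
logger = logger_creator({"lr": 0.01})  # cwd unchanged because should_chdir=False
```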

python/ray/tune/impl/tuner_internal.py (+2 -0)

@@ -22,6 +22,7 @@
 if TYPE_CHECKING:
     from ray.train.trainer import BaseTrainer

+
 _TRAINABLE_PKL = "trainable.pkl"
 _TUNER_PKL = "tuner.pkl"
 _TRAINABLE_KEY = "_trainable"

@@ -382,6 +383,7 @@ def _get_tune_run_arguments(self, trainable) -> Dict[str, Any]:
             reuse_actors=self._tune_config.reuse_actors,
             max_concurrent_trials=self._tune_config.max_concurrent_trials,
             time_budget_s=self._tune_config.time_budget_s,
+            chdir_to_trial_dir=self._tune_config.chdir_to_trial_dir,
         )

     def _fit_internal(self, trainable, param_space) -> ExperimentAnalysis:
