
[Refactor] Upgrade ray from 1.9.1 to 2.1.0 #81

Merged · 31 commits · Dec 19, 2022

Commits (the diff below shows changes from 25 of the 31 commits)
a234607  Bump ray from 1.9.1 to 2.1.0 (KKIEEK, Nov 30, 2022)
f6e85e4  Init (Dec 1, 2022)
cac34ea  Update mmseg config (Dec 1, 2022)
756147b  Fix deprecated warning (Dec 1, 2022)
73aa245  Fix trainable function signature (Dec 1, 2022)
f8fa7b2  Fix rewriter (Dec 1, 2022)
5d3ac5b  Fix minor (Dec 1, 2022)
04a5250  Fix reporter (Dec 2, 2022)
59a86da  Fix apis (Dec 2, 2022)
4fb42dd  Fix RayCheckpointHook (Dec 2, 2022)
2c1215c  Fix requirements (Dec 2, 2022)
709bb9c  Fix test code for rewriters (Dec 2, 2022)
940320b  Fix test code for hooks (Dec 2, 2022)
cddfc3c  Fix test code for tasks (Dec 2, 2022)
b47f3c0  Fix test code for apis (Dec 2, 2022)
7fd3e67  Merge branch 'main' into ray/v2.1.0 (KKIEEK, Dec 14, 2022)
ca42bfc  :memo: Del checkpoint for base task proc (yhna940, Dec 15, 2022)
411f307  Update siatune/apis/analysis.py (KKIEEK, Dec 15, 2022)
791111d  Update siatune/mm/tasks/mmtrainbase.py (KKIEEK, Dec 15, 2022)
703d5a1  Update siatune/mm/tasks/mmtrainbase.py (KKIEEK, Dec 15, 2022)
9eda02d  Support custom trainer and backend (#91) (KKIEEK, Dec 15, 2022)
082ea7b  Update siatune/mm/tasks/mmtrainbase.py (KKIEEK, Dec 15, 2022)
2cbe000  Merge branch 'main' into ray/v2.1.0 (KKIEEK, Dec 15, 2022)
789ca62  Upgrade MMTask (#97) (KKIEEK, Dec 16, 2022)
d2ff007  Fix minor (#100) (KKIEEK, Dec 16, 2022)
e63911d  Update siatune/mm/tasks/mmtrainbase.py (KKIEEK, Dec 16, 2022)
9d4f5e6  Fix typo (Dec 16, 2022)
cf5a79b  Supplement documentations (#102) (KKIEEK, Dec 17, 2022)
ab04069  Update siatune/ray/tuner.py (KKIEEK, Dec 17, 2022)
9ac7b32  Support resume (#104) (KKIEEK, Dec 19, 2022)
8574379  Merge branch 'main' into ray/v2.1.0 (KKIEEK, Dec 19, 2022)
2 changes: 1 addition & 1 deletion configs/_base_/scheduler/asynchb.py
@@ -1,4 +1,4 @@
-scheduler = dict(
+trial_scheduler = dict(
     type='AsyncHyperBandScheduler',
     time_attr='training_iteration',
     max_t=20,
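The key rename (`scheduler` → `trial_scheduler`) separates Ray Tune's trial scheduler from the LR schedulers that appear elsewhere in MM* configs. As a rough sketch of what this entry resolves to — assuming siatune's registry dispatches the `type` string to the Ray class of the same name:

```python
# Hedged sketch: the Ray 2.1 object this config plausibly builds.
# AsyncHyperBandScheduler is the real Ray class; the lookup by `type`
# string is siatune registry behavior assumed from the config style.
from ray.tune.schedulers import AsyncHyperBandScheduler

trial_scheduler = AsyncHyperBandScheduler(
    time_attr='training_iteration',  # measure trial age in reported iterations
    max_t=20,                        # hard cap per trial before termination
)
```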
7 changes: 2 additions & 5 deletions configs/mmcls/mmcls_cifar_100_asynchb_nevergrad_pso.py
@@ -5,14 +5,11 @@
 ]

 space = {
+    'data.samples_per_gpu': {{_base_.batch_size}},
     'model': {{_base_.model}},
     'model.head.num_classes': 100,
     'optimizer': {{_base_.optimizer}},
-    'data.samples_per_gpu': {{_base_.batch_size}},
 }

 task = dict(type='MMClassification')
-metric = 'val/accuracy_top-1'
-mode = 'max'
-raise_on_failed_trial = False
-num_samples = 256
+tune_cfg = dict(num_samples=8, metric='val/accuracy_top-1', mode='max')
7 changes: 2 additions & 5 deletions configs/mmdet/mmdet_asynchb_nevergrad_pso.py
@@ -5,13 +5,10 @@
 ]

 space = {
+    'data.samples_per_gpu': {{_base_.batch_size}},
     'model': {{_base_.model}},
     'optimizer': {{_base_.optimizer}},
-    'data.samples_per_gpu': {{_base_.batch_size}},
 }

 task = dict(type='MMDetection')
-metric = 'val/AP'
-mode = 'max'
-raise_on_failed_trial = False
-num_samples = 256
+tune_cfg = dict(num_samples=8, metric='val/AP', mode='max')
9 changes: 3 additions & 6 deletions configs/mmseg/mmseg_asynchb_nevergrad_pso.py
@@ -5,15 +5,12 @@
 ]

 space = {
-    'model': {{_base_.model}},
-    'optimizer': {{_base_.optimizer}},
     'data.samples_per_gpu': {{_base_.batch_size}},
+    'model': {{_base_.model}},
     'model.decode_head.num_classes': 21,
     'model.auxiliary_head.num_classes': 21,
+    'optimizer': {{_base_.optimizer}},
 }

 task = dict(type='MMSegmentation')
-metric = 'val/mIoU'
-mode = 'max'
-raise_on_failed_trial = False
-num_samples = 256
+tune_cfg = dict(num_samples=8, metric='val/mIoU', mode='max')
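All three example configs make the same move: the top-level `metric`, `mode`, `num_samples`, and `raise_on_failed_trial` keys collapse into a single `tune_cfg` dict, and the search-space keys are sorted. The new dict lines up with Ray 2.x `tune.TuneConfig`; a minimal sketch of that correspondence (the mapping is an assumption here, not shown in this diff):

```python
# Minimal sketch: a tune_cfg dict of this shape can be splatted directly
# into Ray 2.1's TuneConfig (assumed mapping, not part of this diff).
from ray import tune

tune_cfg = dict(num_samples=8, metric='val/mIoU', mode='max')
tune_config = tune.TuneConfig(**tune_cfg)
```

The old `raise_on_failed_trial` flag has no direct `TuneConfig` counterpart; in the Tuner API, trial failures surface on the returned `ResultGrid` (e.g. `ResultGrid.errors`) instead of raising by default.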
2 changes: 1 addition & 1 deletion requirements/optional.txt
@@ -1,5 +1,5 @@
 bayesian-optimization==1.2.0
-flaml==0.9.7
+flaml==1.0.14
 hyperopt==0.2.5
 mlflow==1.23.1
 nevergrad==0.4.3.post7
3 changes: 2 additions & 1 deletion requirements/runtime.txt
@@ -1,4 +1,5 @@
 pandas
 protobuf<=3.20
-ray[default]==1.9.1
+pyarrow
+ray[default]==2.1.0
 tabulate
4 changes: 4 additions & 0 deletions setup.cfg
@@ -1,3 +1,7 @@
+[flake8]
+per-file-ignores =
+    siatune/mm/tasks/mm*.py: E251,E501
+
 [isort]
 line_length = 79
 multi_line_output = 0
19 changes: 9 additions & 10 deletions siatune/apis/analysis.py
@@ -5,19 +5,19 @@
 from typing import Optional

 from mmcv.utils import Config, get_logger
-from ray import tune
+from ray.tune import ResultGrid

 from siatune.utils import ImmutableContainer, dump_cfg


-def log_analysis(analysis: tune.ExperimentAnalysis,
+def log_analysis(results: ResultGrid,
                  tune_config: Config,
                  task_config: Optional[Config] = None,
                  log_dir: Optional[str] = None) -> None:
     """Log the analysis of the experiment.

     Args:
-        analysis (tune.ExperimentAnalysis): The analysis of the experiment.
+        results (ResultGrid): Experiment results of `Tuner.fit()`.
         tune_config (Config): The tune config.
         task_config (Optional[Config]): The task config. Defaults to None.
         log_dir (Optional[str]): The log dir. Defaults to None.
@@ -33,10 +33,9 @@ def log_analysis(analysis: tune.ExperimentAnalysis,
     logger = get_logger(
         'siatune', log_file=osp.join(log_dir, f'{timestamp}.log'))

-    logger.info(
-        f'Best Hyperparam: \n'
-        f'{pformat(ImmutableContainer.decouple(analysis.best_config))}')
-    logger.info(
-        f'Best Results: \n'
-        f'{pformat(ImmutableContainer.decouple(analysis.best_result))}')
-    logger.info(f'Best Logdir: {analysis.best_logdir}')
+    result = results.get_best_result()
+    logger.info(f'Best Result: \n'
+                f'{pformat(ImmutableContainer.decouple(result))}')
+    logger.info(f'Best Hyperparam: \n'
+                f'{pformat(ImmutableContainer.decouple(result.config))}')
+    logger.info(f'Best Logdir: {result.log_dir}')
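`ray.tune.run`'s `ExperimentAnalysis` is superseded by the `ResultGrid` that `Tuner.fit()` returns, and the best-trial attributes move onto a single `Result` object. A sketch of the access pattern this relies on, assuming Ray 2.1:

```python
# Sketch of the ResultGrid -> Result mapping used above (Ray 2.1).
from ray.tune import ResultGrid

def summarize(results: ResultGrid) -> None:
    # metric/mode default to the values given in TuneConfig
    best = results.get_best_result()
    print(best.metrics)   # last reported metrics     (was analysis.best_result)
    print(best.config)    # winning hyperparameters   (was analysis.best_config)
    print(best.log_dir)   # Path to the trial dir     (was analysis.best_logdir)
```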
58 changes: 3 additions & 55 deletions siatune/apis/tune.py
@@ -1,16 +1,10 @@
 # Copyright (c) SI-Analytics. All rights reserved.
-from os import path as osp
-
-import mmcv
-import ray
 from mmcv.utils import Config

 from siatune.mm.tasks import BaseTask
-from siatune.ray.callbacks import build_callback
-from siatune.ray.schedulers import build_scheduler
-from siatune.ray.searchers import build_searcher
-from siatune.ray.spaces import build_space
-from siatune.ray.stoppers import build_stopper
+from siatune.ray import Tuner


 def tune(task_processor: BaseTask, tune_config: Config,
@@ -29,51 +23,5 @@ def tune(task_processor: BaseTask, tune_config: Config,
     trainable_cfg = tune_config.get('trainable', dict())
     trainable = task_processor.create_trainable(**trainable_cfg)

-    assert hasattr(tune_config, 'metric')
-    assert hasattr(tune_config, 'mode') and tune_config.mode in ['min', 'max']
-
-    tune_artifact_dir = osp.join(tune_config.work_dir, 'artifact')
-    mmcv.mkdir_or_exist(tune_artifact_dir)
-
-    stopper = tune_config.get('stop', None)
-    if stopper is not None:
-        stopper = build_stopper(stopper)
-
-    space = tune_config.get('space', None)
-    if space is not None:
-        space = build_space(space)
-
-    resources_per_trial = None
-    if not hasattr(trainable, 'default_resource_request'):
-        resources_per_trial = dict(
-            gpu=task_processor.num_workers *
-            task_processor.num_gpus_per_worker,
-            cpu=task_processor.num_workers *
-            task_processor.num_cpus_per_worker)
-
-    searcher = tune_config.get('searcher', None)
-    if searcher is not None:
-        searcher = build_searcher(searcher)
-
-    scheduler = tune_config.get('scheduler', None)
-    if scheduler is not None:
-        scheduler = build_scheduler(scheduler)
-
-    callbacks = tune_config.get('callbacks', None)
-    if callbacks is not None:
-        callbacks = [build_callback(callback) for callback in callbacks]
-
-    return ray.tune.run(
-        trainable,
-        name=exp_name,
-        metric=tune_config.metric,
-        mode=tune_config.mode,
-        stop=stopper,
-        config=space,
-        resources_per_trial=resources_per_trial,
-        num_samples=tune_config.get('num_samples', -1),
-        local_dir=tune_artifact_dir,
-        search_alg=searcher,
-        scheduler=scheduler,
-        raise_on_failed_trial=tune_config.get('raise_on_failed_trial', False),
-        callbacks=callbacks)
+    tuner = Tuner.from_cfg(tune_config, trainable)
+    return tuner.fit()
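Roughly fifty lines of manual assembly (stopper, space, resources, searcher, scheduler, callbacks, then `ray.tune.run`) become two lines against siatune's new `Tuner` facade; `Tuner.from_cfg` itself lives in `siatune/ray/tuner.py`, touched later in this PR. The sketch below shows only the generic `tune.run` → `tune.Tuner` correspondence in plain Ray 2.1, with placeholder names:

```python
# Hedged sketch of the Ray 1.9 -> 2.1 API mapping that Tuner.from_cfg
# presumably encapsulates. Names like `objective` and 'demo' are placeholders.
from ray import air, tune
from ray.air import session

def objective(config):                        # stand-in trainable
    session.report({'val/mIoU': config['lr']})

# Ray 1.9 equivalent, for comparison:
#   tune.run(objective, config=space, metric='val/mIoU', mode='max',
#            num_samples=8, local_dir='./artifact')

tuner = tune.Tuner(
    objective,
    param_space={'lr': tune.uniform(0.0, 1.0)},         # was `config=`
    tune_config=tune.TuneConfig(
        metric='val/mIoU', mode='max', num_samples=8),  # was tune.run kwargs
    run_config=air.RunConfig(name='demo', local_dir='./artifact'),
)
results = tuner.fit()                                   # returns a ResultGrid
```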
4 changes: 2 additions & 2 deletions siatune/mm/context/rewriters/dump.py
@@ -3,7 +3,7 @@
 from os import path as osp
 from typing import Dict

-import ray
+from ray.air import session

 from siatune.utils import dump_cfg
 from .base import BaseRewriter
@@ -46,7 +46,7 @@ def __call__(self, context: Dict) -> Dict:
             Dict: The context after rewriting.
         """
         cfg = context.pop(self.key)
-        trial_id = ray.tune.get_trial_id()
+        trial_id = session.get_trial_id()
         tmp_path = self.get_temporary_path(f'{trial_id}.py')
         setattr(context.get('args'), self.arg_name, tmp_path)
         dump_cfg(cfg, tmp_path)
4 changes: 2 additions & 2 deletions siatune/mm/context/rewriters/path.py
@@ -1,7 +1,7 @@
 # Copyright (c) SI-Analytics. All rights reserved.
 from os import path as osp

-import ray
+from ray.air import session

 from .base import BaseRewriter
 from .builder import REWRITERS
@@ -31,5 +31,5 @@ def __call__(self, context: dict) -> dict:
         """
         value = getattr(context['args'], self.arg_name)
         setattr(context['args'], self.arg_name,
-                osp.join(value, ray.tune.get_trial_id()))
+                osp.join(value, session.get_trial_id()))
         return context
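Both rewriters make the same substitution: the free functions under `ray.tune` give way to `ray.air.session`, the single per-trial context API in Ray 2.x. A minimal sketch of the calls involved, valid only inside a trainable that Tune is running:

```python
# Minimal sketch (Ray 2.1): per-trial context comes from ray.air.session.
from ray.air import session

def objective(config):
    trial_id = session.get_trial_id()    # replaces ray.tune.get_trial_id()
    trial_dir = session.get_trial_dir()  # per-trial working/log directory
    session.report({'started': 1.0})
```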
11 changes: 6 additions & 5 deletions siatune/mm/hooks/checkpoint.py
@@ -4,13 +4,13 @@
 from typing import Optional

 import mmcv
+import ray.tune as tune
 import torch
 from mmcv.parallel import is_module_wrapper
 from mmcv.runner import HOOKS, BaseRunner
 from mmcv.runner.checkpoint import get_state_dict, weights_to_cpu
 from mmcv.runner.dist_utils import master_only
 from mmcv.runner.hooks import CheckpointHook as _CheckpointHook
-from ray.tune.integration.torch import distributed_checkpoint_dir
 from torch.optim import Optimizer
@@ -100,9 +100,10 @@ def _save_checkpoint(self, runner: BaseRunner) -> None:
         for name, optim in optimizer.items():
             checkpoint['optimizer'][name] = optim.state_dict()

-        with distributed_checkpoint_dir(
-                step=(runner.epoch + 1) //
-                self.interval if self.by_epoch else (runner.iter + 1) //
-                self.interval) as checkpoint_dir:
+        step = (runner.epoch + 1) // self.interval
+        if not self.by_epoch:
+            step //= runner.iter + 1
+
+        with tune.checkpoint_dir(step=step) as checkpoint_dir:
             path = os.path.join(checkpoint_dir, 'ray_ckpt.pth')
             torch.save(checkpoint, path)
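`distributed_checkpoint_dir` from `ray.tune.integration.torch` is gone in Ray 2.x, so the hook computes the step itself and falls back to the generic `tune.checkpoint_dir` context manager, which is still shipped in Ray 2.1 as part of the legacy function-trainable API. A standalone sketch of that call:

```python
# Hedged sketch of tune.checkpoint_dir as used above (legacy function API,
# still available in Ray 2.1). Tune tracks and syncs whatever is written
# inside the yielded directory for the current trial.
import os

import ray.tune as tune
import torch

def save_ray_checkpoint(state: dict, step: int) -> None:
    with tune.checkpoint_dir(step=step) as ckpt_dir:
        torch.save(state, os.path.join(ckpt_dir, 'ray_ckpt.pth'))
```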
4 changes: 2 additions & 2 deletions siatune/mm/hooks/reporter.py
@@ -1,8 +1,8 @@
 # Copyright (c) SI-Analytics. All rights reserved.
-import ray
 from mmcv.runner import HOOKS, BaseRunner
 from mmcv.runner.dist_utils import get_dist_info
 from mmcv.runner.hooks.logger import LoggerHook
+from ray.air import session
 from torch import distributed as dist
@@ -90,4 +90,4 @@ def log(self, runner: BaseRunner) -> None:
                 filter(lambda elem: self.filtering_key in elem, tags.keys())):
             return
         tags['global_step'] = self.get_iter(runner)
-        ray.tune.report(**tags)
+        session.report(tags)
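The call-site change is more than a rename: `ray.tune.report` took metrics as keyword arguments, while `session.report` takes a single dict. A sketch of the before/after, assuming Ray 2.1:

```python
# Before (Ray 1.9): metrics passed as kwargs
#   ray.tune.report(**tags)
# After (Ray 2.1): metrics passed as one dict
from ray.air import session

def objective(config):
    tags = {'val/accuracy_top-1': 0.9, 'global_step': 100}  # illustrative tags
    session.report(tags)
```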
16 changes: 5 additions & 11 deletions siatune/mm/tasks/base.py
@@ -4,7 +4,7 @@
 from copy import deepcopy
 from typing import Any, Dict, List, Optional, Sequence

-import ray
+from ray.tune import Trainable

 from siatune.mm.context import ContextManager
 from siatune.utils import ImmutableContainer
@@ -29,9 +29,7 @@ class BaseTask(metaclass=ABCMeta):
     1. args (argparse.Namespace): The low level CLI arguments.
     2. searched_cfg (Dict):
         The configuration searched by the algorithm.
-    3. checkpoint_dir (Optional[str]):
-        The directory of checkpoints that contains the states.
-    Inputs: searched_cfg (Dict), checkpoint_dir (Optional[str])
+    Inputs: searched_cfg (Dict)
     Outputs: None
     """
@@ -106,16 +104,14 @@ def parse_args(self, args: Sequence[str]) -> argparse.Namespace:
         """
         pass

-    def context_aware_run(self,
-                          searched_cfg: Dict,
-                          checkpoint_dir: Optional[str] = None,
-                          **kwargs) -> Any:
+    def context_aware_run(self, searched_cfg: Dict) -> Any:
         """Gather and refine the information received by users and Ray.tune to
         execute the objective task.

         Args:
             searched_cfg (Dict): The searched configuration.
             kwargs (**kwargs): The kwargs.
+
         Returns:
             Any: The result of the objective task.
         """
@@ -124,9 +120,7 @@ def context_aware_run(self,
         context = dict(
             args=deepcopy(self.args),
             searched_cfg=deepcopy(ImmutableContainer.decouple(searched_cfg)),
-            checkpoint_dir=checkpoint_dir,
         )
-        context.update(kwargs)
         return context_manager(self.run)(**context)
@@ -140,7 +134,7 @@ def run(self, *, args: argparse.Namespace, **kwargs) -> None:
         pass

     @abstractmethod
-    def create_trainable(self, *args, **kwargs) -> ray.tune.Trainable:
+    def create_trainable(self, *args, **kwargs) -> Trainable:
         """Get ray trainable task.

         Args:
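Dropping `checkpoint_dir` from `context_aware_run` follows the Ray 2.x function-trainable contract: a trainable now receives only `config`, and restore state is pulled from `session.get_checkpoint()` instead of an injected directory argument. A sketch of the new-style contract (illustrative, not siatune's actual trainable):

```python
# Ray 1.9 function trainable:  def trainable(config, checkpoint_dir=None): ...
# Ray 2.1 function trainable (hedged sketch):
from ray.air import session

def trainable(config):
    ckpt = session.get_checkpoint()   # None on a fresh (non-resumed) trial
    if ckpt is not None:
        state = ckpt.to_dict()        # e.g. restore a dict-backed checkpoint
    session.report({'score': 0.0})
```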