# DanLing
# Copyright (C) 2022-Present DanLing

# This program is free software: you can redistribute it and/or modify
# it under the terms of the following licenses:
# - The Unlicense
# - GNU Affero General Public License v3.0 or later
# - GNU General Public License v2.0 or later
# - BSD 4-Clause "Original" or "Old" License
# - MIT License
# - Apache License 2.0

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the LICENSE file for more details.

from __future__ import annotations

import os
import shutil

import torch
from chanfig import NestedDict
from lazy_imports import try_import
from torch import distributed as dist
from torch import nn
from torch.nn.utils import clip_grad_value_

from danling.runner.config import Config
from danling.utils import catch

from .torch_runner import TorchRunner

with try_import() as ds:
    import deepspeed


class DeepSpeedRunner(TorchRunner):
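    r"""
    Runner powered by DeepSpeed.

    Compared with `TorchRunner`, this runner wraps the model, optimizer, and scheduler
    into a single `deepspeed.DeepSpeedEngine` and delegates backward, stepping, and
    checkpointing to it.
    """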

    def __init__(self, config: Config) -> None:
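        # Verify that DeepSpeed was imported successfully before building the runner.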
        ds.check()
        super().__init__(config)

    def init_distributed(self) -> None:
        r"""
        Set up distributed training.

        Initialise process group and set up DDP variables.
        """

        backend = self.config.get("backend", os.getenv("BACKEND"))
        init_method = self.config.get("init_method", os.getenv("INIT_METHOD"))
        world_size = int(self.config.get("world_size", os.getenv("WORLD_SIZE", "1")))
        rank = int(self.config.get("rank", os.getenv("RANK", "0")))
        if world_size > 1:
            if torch.cuda.is_available():
                torch.cuda.set_device(self.get_local_rank())
            deepspeed.init_distributed(dist_backend=backend, init_method=init_method, world_size=world_size, rank=rank)
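            # Broadcast the run id and timestamp from rank 0 so that every process
            # agrees on the experiment identifier and output paths.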
            object_list = [self.id, self.timestamp]
            dist.broadcast_object_list(object_list)
            self.id, self.timestamp = object_list

    def __post_init__(self):
        super().__post_init__()
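        # Resolve the DeepSpeed config from the runner config, then let DeepSpeed wrap
        # model, optimizer, and scheduler into a single engine.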
        self.config.deepspeed = self.get_deepspeed_config()
        self.model, self.optimizer, _, self.scheduler = deepspeed.initialize(
            model=self.model,
            optimizer=self.optimizer,
            lr_scheduler=self.scheduler,
            config=self.config.deepspeed,
        )

    def advance(self, loss) -> None:
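        r"""
        Backward the loss and step the DeepSpeed engine.

        `DeepSpeedEngine.step()` performs the optimizer and scheduler steps and zeroes
        the gradients, so no explicit `optimizer.step()` is needed here.
        """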
        self.backward(loss)
        if self.config.get("max_grad_value") is not None:
            clip_grad_value_(self.model.parameters(), self.config["max_grad_value"])
        self.model.step()
        if self.ema is not None:
            self.ema.update()
        self.config.steps = self.model.global_steps

    def backward(self, loss: torch.Tensor) -> None:
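        r"""
        Backward the loss through the DeepSpeed engine, which handles loss scaling
        and gradient accumulation internally.
        """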
        return self.model.backward(loss)

    def get_local_rank(self) -> int:
        local_rank = self.config.get("local_rank", os.getenv("LOCAL_RANK"))
        if local_rank is not None:
            return int(local_rank)
        rank = self.config.get("rank", os.getenv("RANK"))
        world_size = self.config.get("world_size", os.getenv("WORLD_SIZE"))
        if world_size is None or rank is None:
            raise ValueError("Please provide either `local_rank` or `world_size` and `rank`")
        # Fall back to deriving the local rank from the global rank and world size.
        return int(rank) % int(world_size)

    def unwrap(self, model: nn.Module) -> nn.Module:
        while isinstance(model, (deepspeed.DeepSpeedEngine, nn.parallel.DistributedDataParallel)):
            model = model.module
        return model

    @property
    def deepspeed(self) -> NestedDict | None:
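        r"""
        The active DeepSpeed config if the model has been wrapped into a
        `deepspeed.DeepSpeedEngine`, otherwise `None`.
        """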
        if isinstance(self.model, deepspeed.DeepSpeedEngine):
            return self.model.config
        return None

    @catch
    def save_checkpoint(self, name: str = "latest", epoch: int | None = None, save_best: bool = True) -> None:
        r"""
        Save checkpoint to `self.checkpoint_dir`.

        Args:
            name: Name of the checkpoint. Defaults to `"latest"`.
            epoch: Epoch to save. Defaults to `self.epochs`.
            save_best: If `True`, when `self.is_best` is `True`, the checkpoint will also be copied to
                `self.checkpoint_dir/best`.

        If `self.config.save_interval` is positive and `epoch + 1` is a multiple of `save_interval`,
        the checkpoint will also be copied to `self.checkpoint_dir/epoch-{epoch}`.
        """

        epoch = epoch if epoch is not None else self.epochs
        save_interval = self.config.get("save_interval", -1)
        latest_path = os.path.join(self.checkpoint_dir, name)
        os.makedirs(latest_path, exist_ok=True)
        self.yaml(os.path.join(latest_path, "runner.yaml"))
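        # DeepSpeed writes the (possibly sharded) engine state under `checkpoint_dir/name`;
        # the runner config is stored alongside it in `runner.yaml`.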
        self.model.save_checkpoint(
            self.checkpoint_dir, tag=name, client_state={"runner": self.config.dict()}, save_latest=False
        )
        if save_interval > 0 and (epoch + 1) % save_interval == 0:
            save_path = os.path.join(self.checkpoint_dir, f"epoch-{epoch}")
            shutil.copytree(latest_path, save_path, dirs_exist_ok=True)
        if save_best and self.is_best:
            best_path = os.path.join(self.checkpoint_dir, "best")
            shutil.copytree(latest_path, best_path, dirs_exist_ok=True)

    def load_checkpoint(self, checkpoint: bytes | str | os.PathLike, *args, **kwargs) -> None:  # type: ignore[override]
        """
        Load model, optimizer, and scheduler from checkpoint.

        Args:
            checkpoint: Checkpoint (or its path) to load.
            *args: Additional arguments to pass to `self.load`.
            **kwargs: Additional keyword arguments to pass to `self.load`.

        Raises:
            ValueError: If `model` is not defined.
            ValueError: If `model` is not an instance of `deepspeed.DeepSpeedEngine`.

        See Also:
            [`from_checkpoint`][danling.BaseRunner.from_checkpoint]: Build runner from checkpoint.
            [`load_pretrained`][danling.BaseRunner.load_pretrained]: Load model parameters from pretrained checkpoint.
        """

        if self.model is None:
            raise ValueError("model is not defined")
        if not isinstance(self.model, deepspeed.DeepSpeedEngine):
            raise ValueError("model is not an instance of `deepspeed.DeepSpeedEngine`")

        self.model.load_checkpoint(checkpoint)
        self.config.checkpoint = checkpoint

    def load_pretrained(self, checkpoint: bytes | str | os.PathLike, *args, **kwargs) -> None:  # type: ignore[override]
        """
        Load model from pretrained checkpoint.

        This method only loads the model weights.

        Args:
            checkpoint: Pretrained checkpoint directory.
            *args: Additional arguments to pass to `self.load`.
            **kwargs: Additional keyword arguments to pass to `self.load`.

        Raises:
            ValueError: If `model` is not defined.

        See Also:
            [`load_checkpoint`][danling.BaseRunner.load_checkpoint]: Load model, optimizer, and scheduler from
                checkpoint.
        """

        if self.model is None:
            raise ValueError("model is not defined")

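        # `load_module_only=True` restores only the module weights and skips
        # optimizer and scheduler state.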
        self.model.load_checkpoint(checkpoint, load_module_only=True)
        self.config.pretrained = checkpoint

    def load_config(
        self, checkpoint: bytes | str | os.PathLike, overwrite: bool = False, *args, **kwargs  # type: ignore[override]
    ) -> None:
        r"""
        Load config from checkpoint.

        Args:
            checkpoint: Checkpoint (or its path) to load.
            overwrite: If `True`, overwrite the current config with the loaded config.
                Defaults to `False`.
            *args: Additional arguments to pass to `self.load`.
            **kwargs: Additional keyword arguments to pass to `self.load`.

        Raises:
            FileNotFoundError: If `checkpoint` does not exist.
        """

        if isinstance(checkpoint, bytes):
            checkpoint = checkpoint.decode()

        config = self.load(os.path.join(checkpoint, "runner.yaml"), *args, **kwargs)
        self.config.merge(config, overwrite=overwrite)
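        # Resume from the step/epoch after the one recorded in the checkpoint.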
        self.step_begin = config["steps"] + 1
        self.epoch_begin = config["epochs"] + 1