polish colossalai plugin's code (#2)
Polish the ColossalAI plugin's code so that it passes the pre-commit hooks.
1SAA committed Sep 13, 2022
1 parent d78eb95 commit 5742d32
Showing 3 changed files with 24 additions and 11 deletions.
src/pytorch_lightning/plugins/precision/colossalai.py: 6 changes (6 additions, 0 deletions)
@@ -3,6 +3,7 @@
 from torch import Tensor
 from torch.optim import Optimizer
 from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin
+from pytorch_lightning.utilities.exceptions import MisconfigurationException
 
 
 class ColossalAIPrecisionPlugin(PrecisionPlugin):
@@ -19,4 +20,9 @@ def clip_grad_by_norm(self, optimizer: Optimizer, clip_val: Union[int, float]) -
     def optimizer_step(self, model, optimizer, optimizer_idx: int, closure, **kwargs: Any) -> Any:
         closure_result = closure()
         self._after_closure(model, optimizer, optimizer_idx)
+        skipped_backward = closure_result is None
+        if isinstance(model, pl.LightningModule) and model.automatic_optimization and skipped_backward:
+            raise MisconfigurationException(
+                "Skipping backward by returning `None` from your `training_step` is not supported by `Colossalai`"
+            )
         optimizer.step()
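
For context, the guard added above targets a pattern PyTorch Lightning otherwise allows: under automatic optimization, returning ``None`` from ``training_step`` skips the backward pass, and with the ColossalAI precision plugin that now raises ``MisconfigurationException``. A minimal sketch of a module that would trigger it (the module, shapes, and skip condition are hypothetical, not part of this commit; the ``HybridAdam`` import path is the one named in the strategy docstring):

import pytorch_lightning as pl
import torch
from colossalai.nn.optimizer import HybridAdam  # optimizer class named in the strategy docstring


class SkippingModule(pl.LightningModule):
    """Hypothetical module that skips a batch by returning None."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def training_step(self, batch, batch_idx):
        x, y = batch
        loss = torch.nn.functional.cross_entropy(self.layer(x), y)
        # Under automatic optimization, returning None skips backward;
        # with ColossalAIPrecisionPlugin this now raises MisconfigurationException.
        return None if bool(loss.isnan()) else loss

    def configure_optimizers(self):
        # The strategy supports only a single CPUAdam/HybridAdam optimizer.
        return HybridAdam(self.parameters(), lr=1e-3)
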
src/pytorch_lightning/strategies/__init__.py: 2 changes (1 addition, 1 deletion)
@@ -31,7 +31,7 @@
 from pytorch_lightning.strategies.strategy import Strategy  # noqa: F401
 from pytorch_lightning.strategies.strategy_registry import call_register_strategies, StrategyRegistry  # noqa: F401
 from pytorch_lightning.strategies.tpu_spawn import TPUSpawnStrategy  # noqa: F401
-from pytorch_lightning.strategies.colossalai import ColossalAIStrategy
+from pytorch_lightning.strategies.colossalai import ColossalAIStrategy  # noqa: F401
 
 STRATEGIES_BASE_MODULE = "pytorch_lightning.strategies"
 
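With the re-export above, the strategy stays importable from ``pytorch_lightning.strategies`` and can be handed to the ``Trainer``. A minimal usage sketch based on this commit's docstring; ``devices`` and ``max_epochs`` are illustrative values, and accelerator and precision are deliberately not passed because the strategy configures them itself:

import pytorch_lightning as pl
from pytorch_lightning.strategies import ColossalAIStrategy

# Per the strategy docstring, accelerator and precision are set up by the
# strategy itself and should not be configured on the Trainer.
trainer = pl.Trainer(strategy=ColossalAIStrategy(), devices=1, max_epochs=1)
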
src/pytorch_lightning/strategies/colossalai.py: 27 changes (17 additions, 10 deletions)
@@ -26,7 +26,8 @@ def _post_init_method(self, module: torch.nn.Module, *args, **kwargs):
 
 class ColossalAIStrategy(DDPStrategy):
     """ColossalAI strategy.
-    It only supports single optimizer which must be `colossalai.nn.optimizer.CPUAdam`_ or `colossalai.nn.optimizer.HybridAdam`_ now.
+    It only supports single optimizer which must be `colossalai.nn.optimizer.CPUAdam`_ or
+    `colossalai.nn.optimizer.HybridAdam`_ now.
     You must initialize your model in ``configure_sharded_model()``.
     It configures accelerator and precision, and you should not configure them when initializing ``Trainer``.
@@ -48,32 +49,38 @@ def on_load_checkpoint(self, checkpoint) -> None:
             It can speed up training, but slightly more memory will be used. Defaults to True.
         chunk_size (Optional[int], optional): The size of a chunk.
             It will be ignored when ``use_chunk=False``.
-            If it's None, a best chunk size will be searched out based on ``chunk_search_range``, ``chunk_search_n_grids`` and ``min_chunk_size``.
+            If it's None, a best chunk size will be searched out based on ``chunk_search_range``,
+            ``chunk_search_n_grids`` and ``min_chunk_size``.
             Defaults to None.
         enable_distributed_storage (bool, optional): Whether to storage model in a distributed manner.
             It reduces memory from 1 to 1/N, but it may slow down training.
             Defaults to True.
         placement_policy (str, optional): It can be "cpu", "cuda" and "auto".
-            If it's "cpu", parameters, gradients and optimizer states will be offloaded to CPU, which means min CUDA memory will be used.
+            If it's "cpu", parameters, gradients and optimizer states will be offloaded to CPU,
+            which means min CUDA memory will be used.
             If it's "cuda", they won't be offloaded, which means max CUDA memory will be used. It's the fastest.
-            If it's "auto", they are moving dynamically based on CPU and CUDA memory usage. It will utilize heterogeneous memory space evenly and well.
+            If it's "auto", they are moving dynamically based on CPU and CUDA memory usage.
+            It will utilize heterogeneous memory space evenly and well.
             Note that "auto" policy can only work well when no other processes use CUDA during your training.
             Defaults to 'auto'.
         force_outputs_fp32 (bool, optional): Whether to cast outputs to fp32. Defaults to False.
-        gpu_margin_mem_ratio (float, optional): The ratio of GPU remaining memory (after the first forward-backward)
-                which will be used by optimizer.
+        gpu_margin_mem_ratio (float, optional): The ratio of GPU remaining memory (after the first forward-backward)
+            which will be used by optimizer.
             This argument will be ignored when ``placement_policy`` is not "auto".
             Defaults to 0.0.
         chunk_search_range (int, optional): The range of chunk size to search.
-            The actual search range will be from ``max(min_chunk_size, max_param_size)`` to ``max(min_chunk_size, max_param_size) + chunk_search_range``.
+            The actual search range will be from
+            ``max(min_chunk_size, max_param_size)`` to ``max(min_chunk_size, max_param_size) + chunk_search_range``.
             Defaults to 64*1024**2.
         chunk_search_n_grids (int, optional): The number of intervals in the search range. Defaults to 1024.
         min_chunk_size (Optional[int], optional): The minimum size for a chunk. Defaults to None.
         initial_scale (float, optional): The initial dynamic loss scale value. Defaults to 2**32.
         min_scale (float, optional): The minimum dynamic loss scaling value. Defaults to 1.
         growth_factor (float, optional): The multiplication factor for increasing loss scale. Defaults to 2.
         backoff_factor (float, optional): The multiplication factor for decreasing loss scale. Defaults to 0.5.
-        growth_interval (int, optional): The number of steps to increase loss scale when no overflow occurs. Defaults to 1000.
+        growth_interval (int, optional):
+            The number of steps to increase loss scale when no overflow occurs.
+            Defaults to 1000.
         hysteresis (int, optional): The number of overflows before decreasing loss scale. Defaults to 2.
         max_scale (float, optional): The maximum dynamic loss scaling value. Defaults to 2**32.
@@ -150,8 +157,8 @@ def setup_precision_plugin(self) -> None:
         super().setup_precision_plugin()
         assert len(self.optimizers) == 1, 'ColossalAIStrategy only supports single Optimizer now.'
         optimizer = self.optimizers[0]
-        assert isinstance(optimizer, (CPUAdam, HybridAdam)
-                          ), 'ColossalAIStrategy only supports colossalai.nn.optimizer.CPUAdam and colossalai.nn.optimizer.HybridAdam now'
+        assert isinstance(optimizer, (CPUAdam, HybridAdam)), \
+            'ColossalAIStrategy only supports colossalai.nn.optimizer.CPUAdam and colossalai.nn.optimizer.HybridAdam.'
         if self.use_chunk:
             chunk_size = self.chunk_size or ChunkManager.search_chunk_size(self.model, **self.chunk_size_search_kwargs)
         else:
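Tying the docstring together, a hedged configuration sketch: the argument names come from the docstring in this diff, the chosen values are illustrative, and building the model inside ``configure_sharded_model()`` plus using a single ``HybridAdam`` follow the requirements it states.

import pytorch_lightning as pl
import torch
from colossalai.nn.optimizer import HybridAdam
from pytorch_lightning.strategies import ColossalAIStrategy


class ShardedModule(pl.LightningModule):
    """Hypothetical module; layers are created in configure_sharded_model()."""

    def configure_sharded_model(self):
        # The strategy requires the model to be built here, not in __init__.
        self.model = torch.nn.Sequential(
            torch.nn.Linear(32, 64),
            torch.nn.ReLU(),
            torch.nn.Linear(64, 2),
        )

    def training_step(self, batch, batch_idx):
        x, y = batch
        return torch.nn.functional.cross_entropy(self.model(x), y)

    def configure_optimizers(self):
        # Only a single CPUAdam/HybridAdam optimizer is supported.
        return HybridAdam(self.parameters(), lr=1e-3)


# Argument names are documented in the docstring above; the values are made up.
strategy = ColossalAIStrategy(
    use_chunk=True,
    chunk_size=None,  # None lets the strategy search for a chunk size
    enable_distributed_storage=True,
    placement_policy="auto",
    gpu_margin_mem_ratio=0.0,
)
trainer = pl.Trainer(strategy=strategy, devices=1, max_epochs=1)
# trainer.fit(ShardedModule(), train_dataloaders=...)  # supply your own DataLoader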
