This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Multi-GPU support of one-shot NAS #4603

Merged: 99 commits, May 20, 2022

Commits
156fe72
init
v-fangdong Jan 11, 2022
6ec5f85
init
v-fangdong Jan 11, 2022
0fd40e8
differentiable and sampling-based ontshot algs
v-fangdong Jan 13, 2022
32ac1ab
fix import mistake
v-fangdong Jan 13, 2022
8091169
proxyless inherits darts
v-fangdong Jan 17, 2022
f29c39a
test files
v-fangdong Jan 17, 2022
d60f87e
add more comments and fix test files
v-fangdong Jan 18, 2022
09a065f
revert unrelated changes
v-fangdong Jan 20, 2022
a2e2fa8
unify comments style and remove debug codes
v-fangdong Jan 20, 2022
2289c19
remove unnecessary methods
v-fangdong Jan 20, 2022
9193af6
fix link
v-fangdong Jan 21, 2022
50ae803
unify code style
v-fangdong Jan 21, 2022
559518c
SNAS
v-fangdong Jan 21, 2022
1744e74
fix pylint, configure optimizers and training step loss
v-fangdong Jan 23, 2022
e4d5feb
fix pylint
v-fangdong Jan 24, 2022
61150e5
disable pylint unsubscriptable-opject warning
v-fangdong Jan 24, 2022
95038b3
use lib
v-fangdong Jan 25, 2022
6159639
use metrics
v-fangdong Jan 26, 2022
48081c9
remove unused
v-fangdong Jan 26, 2022
badcc83
fix bugs
v-fangdong Jan 26, 2022
a7fb932
fix lint
v-fangdong Jan 27, 2022
0a60dee
solve lr_scheduler
v-fangdong Jan 27, 2022
e1d2995
fix pylint
v-fangdong Jan 28, 2022
f763aaf
fix pylint
v-fangdong Jan 28, 2022
de23dfa
remove validation_step
v-fangdong Jan 28, 2022
1229bc6
fix pylint
v-fangdong Jan 28, 2022
9bebecf
fix bug
v-fangdong Jan 28, 2022
09da5c3
fix bug
v-fangdong Jan 28, 2022
3e71de6
fix bug
v-fangdong Jan 28, 2022
2ac3d3d
fix bug
v-fangdong Jan 28, 2022
54d9c1b
fix legacy bugs
v-fangdong Jan 28, 2022
3cb92d4
fix legacy bugs
v-fangdong Jan 28, 2022
0f7ad72
fix legacy bugs
v-fangdong Jan 29, 2022
1c01442
fix legacy bugs
v-fangdong Jan 29, 2022
32acdc5
fix bugs
v-fangdong Jan 30, 2022
2d99c6e
annealing for snas
v-fangdong Jan 30, 2022
4a1eba1
rerun ut
v-fangdong Feb 2, 2022
c1dce81
fix SNAS loss
v-fangdong Feb 7, 2022
b3ede07
remove too long lines
v-fangdong Feb 7, 2022
f2d56a5
fix snas temp
v-fangdong Feb 7, 2022
4a038b2
unify comments style, abstract optimizer-related behaviours into base…
v-fangdong Feb 8, 2022
284fe44
fix pylint
v-fangdong Feb 8, 2022
6068064
add more comments
v-fangdong Feb 10, 2022
05429ed
add input choice test case
v-fangdong Feb 10, 2022
ef84ae0
add inputchoice testcase
v-fangdong Feb 10, 2022
1c5839a
fix pylint
v-fangdong Feb 10, 2022
ee1120c
fix pylin
v-fangdong Feb 11, 2022
80845ff
fix typos
v-fangdong Feb 14, 2022
af3ae61
resolve conversations
v-fangdong Feb 14, 2022
0e4b0e5
add comments
v-fangdong Feb 15, 2022
897b2ec
update
ultmaster Feb 15, 2022
5e794ff
documents
ultmaster Feb 15, 2022
dfc82d6
Delete irrelevant files
ultmaster Feb 15, 2022
2a99859
Merge remote-tracking branch 'ultmaster/fix-4434' into cherrypick-4434
v-fangdong Feb 15, 2022
35dbf11
correct typos
v-fangdong Feb 15, 2022
0851ec8
solve dict issue
ultmaster Feb 15, 2022
c21ce0f
update testing
ultmaster Feb 15, 2022
3bee06f
update serializer on windows (attempt)
ultmaster Feb 16, 2022
1433559
Merge branch 'fix-4434' of https://github.com/ultmaster/nni into fix-…
ultmaster Feb 16, 2022
a6a8061
update test
ultmaster Feb 16, 2022
6db030a
Avoid abc
ultmaster Feb 16, 2022
213e5b8
fix is_traceable
ultmaster Feb 16, 2022
f590c93
update is_traceable
ultmaster Feb 16, 2022
df3e9a4
fix all
ultmaster Feb 16, 2022
e328f05
update
ultmaster Feb 16, 2022
577e771
fix lightning problem on linux
ultmaster Feb 17, 2022
278b5c9
fix super
ultmaster Feb 17, 2022
7c51470
fix lightning issue
ultmaster Feb 17, 2022
0bfe695
!= linux
ultmaster Feb 17, 2022
d49f6e8
Merge remote-tracking branch 'ultmaster/fix-4434' into cherrypick-4434
v-fangdong Feb 17, 2022
150279a
add handler for trainer
ultmaster Feb 17, 2022
ec9d7fa
Merge remote-tracking branch 'ultmaster/fix-4434' into cherrypick-4434
v-fangdong Feb 17, 2022
abfd0a8
multicard
v-fangdong Feb 17, 2022
124232c
Merge branch 'lightning' into cherrypick-4434
v-fangdong Feb 17, 2022
f279ab0
pseudo dataset
v-fangdong Feb 17, 2022
84b01bd
fix pickle (again)
ultmaster Feb 17, 2022
d2ecda8
Merge remote-tracking branch 'ultmaster/fix-4434' into cherrypick-4434
v-fangdong Feb 17, 2022
146ddc4
enas gpu
v-fangdong Feb 17, 2022
3e6178d
reinit dataloaders
v-fangdong Feb 23, 2022
0603606
always shuffle for trianing
v-fangdong Feb 24, 2022
7a45b8f
Merge branch 'lightning' of github.com:Frandium/nni into multicard-on…
ultmaster Mar 3, 2022
af2124c
Merge branch 'master' of https://github.com/microsoft/nni into multic…
ultmaster Mar 3, 2022
8fc6fff
Merge branch 'master' of github.com:microsoft/nni into multicard-oneshot
ultmaster May 9, 2022
9d9beaf
revert
ultmaster May 9, 2022
77a8c8b
revert
ultmaster May 9, 2022
001b1b4
.
ultmaster May 10, 2022
a4e245b
checkpoint
ultmaster May 10, 2022
6a1974b
finish up
ultmaster May 10, 2022
f31759c
.
ultmaster May 10, 2022
6966aca
add tests
ultmaster May 10, 2022
78374b1
revert
ultmaster May 10, 2022
f191ce6
revert
ultmaster May 10, 2022
4d3f187
revert
ultmaster May 10, 2022
1c66ff0
compat fix
ultmaster May 10, 2022
74edc65
fix tests
ultmaster May 10, 2022
5465417
fix
ultmaster May 11, 2022
7dec65c
Merge branch 'master' of github.com:microsoft/nni into multicard-oneshot
ultmaster May 11, 2022
cad1b2c
fix comments
ultmaster May 19, 2022
b53f98b
.
ultmaster May 19, 2022
84 changes: 60 additions & 24 deletions nni/retiarii/evaluator/pytorch/lightning.py
@@ -4,7 +4,7 @@
import os
import warnings
from pathlib import Path
from typing import Dict, Union, Optional, List, Callable, Type
from typing import Any, Dict, Union, Optional, List, Callable, Type

import pytorch_lightning as pl
import torch.nn as nn
@@ -22,6 +22,7 @@
cgo_import_failed = True

from nni.retiarii.graph import Evaluator
from nni.typehint import Literal


__all__ = ['LightningModule', 'Trainer', 'DataLoader', 'Lightning', 'Classification', 'Regression']
@@ -36,6 +37,11 @@ class LightningModule(pl.LightningModule):
See https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html
"""

running_mode: Literal['multi', 'oneshot'] = 'multi'
Review comment (Contributor): is it possible that a lightning class can be used both for multi and oneshot?

Reply (@matluster, May 19, 2022): When I used this flag, I meant to check it before two actions:

  1. Whether to save an ONNX graph.
  2. Whether to report intermediate / final results.

But on second thought, maybe the evaluator could figure out by itself whether its inner module is a one-shot supernet, so this flag might not be needed.

Follow-up: No, we can't. Revert.

"""An indicator of whether current module is running in a multi-trial experiment or an one-shot.
This flag should be automatically set by experiments when they start to run.
"""

def set_model(self, model: Union[Callable[[], nn.Module], nn.Module]) -> None:
"""Set the inner model (architecture) to train / evaluate.

@@ -59,6 +65,7 @@ def set_model(self, model: Union[Callable[[], nn.Module], nn.Module]) -> None:
Traced version of ``torch.utils.data.DataLoader``. See https://pytorch.org/docs/stable/data.html
"""


@nni.trace
class Lightning(Evaluator):
"""
@@ -74,51 +81,67 @@ class Lightning(Evaluator):

Parameters
----------
lightning_module : LightningModule
lightning_module
Review comment (Contributor): can docs auto-add the type hints for these parameters?

Reply (Contributor): yes
Lightning module that defines the training logic.
trainer : Trainer
trainer
Lightning trainer that handles the training.
train_dataloders : DataLoader
train_dataloders
Used in ``trainer.fit()``. A PyTorch DataLoader with training samples.
If the ``lightning_module`` has a predefined train_dataloader method this will be skipped.
val_dataloaders : DataLoader or List of DataLoader
It can be `any types of dataloader supported by Lightning <https://pytorch-lightning.readthedocs.io/en/stable/guides/data.html>`__.
val_dataloaders
Used in ``trainer.fit()``. Either a single PyTorch Dataloader or a list of them, specifying validation samples.
If the ``lightning_module`` has a predefined val_dataloaders method this will be skipped.
It can be `any types of dataloader supported by Lightning <https://pytorch-lightning.readthedocs.io/en/stable/guides/data.html>`__.
"""

def __init__(self, lightning_module: LightningModule, trainer: Trainer,
train_dataloader: Optional[DataLoader] = None,
val_dataloaders: Union[DataLoader, List[DataLoader], None] = None):
train_dataloaders: Optional[Any] = None,
val_dataloaders: Optional[Any] = None,
train_dataloader: Optional[Any] = None):
Review comment (Contributor): why are there both train_dataloaders and train_dataloader?

Follow-up (same reviewer): I see, backward compatibility.
assert isinstance(lightning_module, LightningModule), f'Lightning module must be an instance of {__name__}.LightningModule.'
if train_dataloader is not None:
warnings.warn('`train_dataloader` is deprecated and replaced with `train_dataloaders`.', DeprecationWarning)
train_dataloaders = train_dataloader
if cgo_import_failed:
assert isinstance(trainer, pl.Trainer) and is_traceable(trainer), f'Trainer must be imported from {__name__}'
else:
# this is not isinstance(trainer, Trainer) because with a different trace call, it can be different
assert (isinstance(trainer, pl.Trainer) and is_traceable(trainer)) or isinstance(trainer, cgo_trainer.Trainer), \
f'Trainer must be imported from {__name__} or nni.retiarii.evaluator.pytorch.cgo.trainer'
assert _check_dataloader(train_dataloader), f'Wrong dataloader type. Try import DataLoader from {__name__}.'
assert _check_dataloader(val_dataloaders), f'Wrong dataloader type. Try import DataLoader from {__name__}.'
if not _check_dataloader(train_dataloaders):
warnings.warn(f'Please try to wrap PyTorch DataLoader with nni.trace or '
f'import DataLoader from {__name__}: {train_dataloaders}',
RuntimeWarning)
if not _check_dataloader(val_dataloaders):
warnings.warn(f'Please try to wrap PyTorch DataLoader with nni.trace or '
f'import DataLoader from {__name__}: {val_dataloaders}',
RuntimeWarning)
self.module = lightning_module
self.trainer = trainer
self.train_dataloader = train_dataloader
self.train_dataloaders = train_dataloaders
self.val_dataloaders = val_dataloaders

@staticmethod
def _load(ir):
return Lightning(ir['module'], ir['trainer'], ir['train_dataloader'], ir['val_dataloaders'])
return Lightning(ir['module'], ir['trainer'], ir['train_dataloaders'], ir['val_dataloaders'])

def _dump(self):
return {
'type': self.__class__,
'module': self.module,
'trainer': self.trainer,
'train_dataloader': self.train_dataloader,
'train_dataloaders': self.train_dataloaders,
'val_dataloaders': self.val_dataloaders
}

def _execute(self, model_cls):
return self.fit(model_cls)

@property
def train_dataloader(self):
warnings.warn('train_dataloader is deprecated, please use `train_dataloaders`.', DeprecationWarning)

def __eq__(self, other):
eq_func = False
eq_args = False
@@ -146,15 +169,18 @@ def fit(self, model):
The model to fit.
"""
self.module.set_model(model)
return self.trainer.fit(self.module, self.train_dataloader, self.val_dataloaders)
return self.trainer.fit(self.module, self.train_dataloaders, self.val_dataloaders)


def _check_dataloader(dataloader):
if dataloader is None:
return True
# Check the type of dataloader recursively.
if isinstance(dataloader, list):
return all([_check_dataloader(d) for d in dataloader])
return isinstance(dataloader, torch_data.DataLoader) and is_traceable(dataloader)
if isinstance(dataloader, dict):
return all([_check_dataloader(v) for v in dataloader.values()])
if isinstance(dataloader, torch_data.DataLoader):
return is_traceable(dataloader)
return True
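The warnings above ask users to wrap plain PyTorch DataLoaders with nni.trace (or import the traced DataLoader re-exported by this module) so that _check_dataloader passes. A small usage sketch, assuming nni and torch are installed; the toy dataset is illustrative.

```python
import nni
import torch
from torch.utils.data import TensorDataset

dataset = TensorDataset(torch.randn(8, 2), torch.randint(0, 2, (8,)))

# Option 1 (assumed import path): use the traced DataLoader re-exported by NNI.
# from nni.retiarii.evaluator.pytorch import DataLoader
# loader = DataLoader(dataset, batch_size=4)

# Option 2: wrap the stock PyTorch DataLoader with nni.trace yourself.
loader = nni.trace(torch.utils.data.DataLoader)(dataset, batch_size=4)
```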


### The following are some commonly used Lightning modules ###
@@ -176,7 +202,6 @@ def __init__(self, criterion: Type[nn.Module], metrics: Dict[str, Type[torchmetr

if export_onnx is None or export_onnx is True:
self.export_onnx = Path(os.environ.get('NNI_OUTPUT_DIR', '.')) / 'model.onnx'
self.export_onnx.parent.mkdir(exist_ok=True)
elif export_onnx:
self.export_onnx = Path(export_onnx)
else:
@@ -199,7 +224,8 @@ def validation_step(self, batch, batch_idx):
x, y = batch
y_hat = self(x)

if self.export_onnx is not None:
if self.running_mode == 'multi' and self.export_onnx is not None:
self.export_onnx.parent.mkdir(exist_ok=True)
try:
self.to_onnx(self.export_onnx, x, export_params=True)
except RuntimeError as e:
@@ -221,10 +247,12 @@ def configure_optimizers(self):
return self.optimizer(self.parameters(), lr=self.hparams.learning_rate, weight_decay=self.hparams.weight_decay) # type: ignore

def on_validation_epoch_end(self):
nni.report_intermediate_result(self._get_validation_metrics())
if self.running_mode == 'multi':
nni.report_intermediate_result(self._get_validation_metrics())

def on_fit_end(self):
nni.report_final_result(self._get_validation_metrics())
if self.running_mode == 'multi':
nni.report_final_result(self._get_validation_metrics())

def _get_validation_metrics(self):
if len(self.metrics) == 1:
@@ -283,14 +311,18 @@ def __init__(self, criterion: Type[nn.Module] = nn.CrossEntropyLoss,
learning_rate: float = 0.001,
weight_decay: float = 0.,
optimizer: Type[optim.Optimizer] = optim.Adam,
train_dataloader: Optional[DataLoader] = None,
train_dataloaders: Optional[DataLoader] = None,
val_dataloaders: Union[DataLoader, List[DataLoader], None] = None,
export_onnx: bool = True,
train_dataloader: Optional[DataLoader] = None,
**trainer_kwargs):
if train_dataloader is not None:
warnings.warn('`train_dataloader` is deprecated and replaced with `train_dataloaders`.', DeprecationWarning)
train_dataloaders = train_dataloader
module = _ClassificationModule(criterion=criterion, learning_rate=learning_rate,
weight_decay=weight_decay, optimizer=optimizer, export_onnx=export_onnx)
super().__init__(module, Trainer(**trainer_kwargs),
train_dataloader=train_dataloader, val_dataloaders=val_dataloaders)
train_dataloaders=train_dataloaders, val_dataloaders=val_dataloaders)


@nni.trace
Expand Down Expand Up @@ -336,11 +368,15 @@ def __init__(self, criterion: Type[nn.Module] = nn.MSELoss,
learning_rate: float = 0.001,
weight_decay: float = 0.,
optimizer: Type[optim.Optimizer] = optim.Adam,
train_dataloader: Optional[DataLoader] = None,
train_dataloaders: Optional[DataLoader] = None,
val_dataloaders: Union[DataLoader, List[DataLoader], None] = None,
export_onnx: bool = True,
train_dataloader: Optional[DataLoader] = None,
**trainer_kwargs):
if train_dataloader is not None:
warnings.warn('`train_dataloader` is deprecated and replaced with `train_dataloaders`.', DeprecationWarning)
train_dataloaders = train_dataloader
module = _RegressionModule(criterion=criterion, learning_rate=learning_rate,
weight_decay=weight_decay, optimizer=optimizer, export_onnx=export_onnx)
super().__init__(module, Trainer(**trainer_kwargs),
train_dataloader=train_dataloader, val_dataloaders=val_dataloaders)
train_dataloaders=train_dataloaders, val_dataloaders=val_dataloaders)
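For reference, a hedged construction example for the updated evaluators (Classification shown; Regression is analogous): the plural train_dataloaders keyword is now preferred, and extra keyword arguments are forwarded to the Lightning Trainer. The toy tensors and the export_onnx/max_epochs settings below are illustrative.

```python
import torch
from torch.utils.data import TensorDataset
from nni.retiarii.evaluator.pytorch import Classification, DataLoader

train_set = TensorDataset(torch.randn(64, 10), torch.randint(0, 2, (64,)))
val_set = TensorDataset(torch.randn(16, 10), torch.randint(0, 2, (16,)))

evaluator = Classification(
    train_dataloaders=DataLoader(train_set, batch_size=16),
    val_dataloaders=DataLoader(val_set, batch_size=16),
    export_onnx=False,
    max_epochs=1,   # forwarded to the Lightning Trainer via **trainer_kwargs
)
```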
18 changes: 10 additions & 8 deletions nni/retiarii/oneshot/pytorch/base_lightning.py
@@ -18,6 +18,7 @@
from nni.common.hpo_utils import ParameterSpec
from nni.common.serializer import is_traceable
from nni.retiarii.nn.pytorch.api import ValueChoiceX
from nni.typehint import Literal
from .supermodule.base import BaseSuperNetModule

__all__ = ['MutationHook', 'BaseSuperNetModule', 'BaseOneShotLightningModule', 'traverse_and_mutate_submodules']
@@ -334,28 +335,29 @@ def configure_optimizers(self):
return arc_optimizers + w_optimizers, lr_schedulers

def on_train_start(self):
# redirect the access to trainer/log to this module
# but note that we might be missing other attributes,
# which could potentially be a problem
self.model.trainer = self.trainer # type: ignore
self.model.log = self.log
return self.model.on_train_start()

def on_train_end(self):
return self.model.on_train_end()

def on_fit_start(self):
return self.model.on_train_start()
# redirect the access to trainer/log to this module
# but note that we might be missing other attributes,
# which could potentially be a problem
self.model.trainer = self.trainer # type: ignore
self.model.log = self.log
return self.model.on_fit_start()

def on_fit_end(self):
return self.model.on_train_end()
return self.model.on_fit_end()

def on_train_batch_start(self, batch, batch_idx, unused=0):
return self.model.on_train_batch_start(batch, batch_idx, unused)

def on_train_batch_end(self, outputs, batch, batch_idx, unused=0):
return self.model.on_train_batch_end(outputs, batch, batch_idx, unused)

# Deprecated hooks in pytorch-lightning
def on_epoch_start(self):
return self.model.on_epoch_start()

Expand Down Expand Up @@ -427,7 +429,7 @@ def apply(lr_scheduler):
else:
apply(lr_schedulers)

def call_weight_optimizers(self, method):
def call_weight_optimizers(self, method: Literal['step', 'zero_grad']):
"""
Function that imitates lightning trainer's behavior of calling user's optimizers. Since auto_optimization is turned off by this
class, you can use this function to make user optimizers behave as they were automatically handled by the lightning trainer.
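To make the contract of call_weight_optimizers concrete, here is a self-contained sketch of the dispatch it performs: applying a named method ('step' or 'zero_grad') to every user weight optimizer. The optimizer list below is illustrative; the real implementation operates on the optimizers returned by the wrapped module.

```python
from typing import Literal
import torch

params = [torch.nn.Parameter(torch.randn(2, 2))]
weight_optimizers = [torch.optim.SGD(params, lr=0.1), torch.optim.Adam(params, lr=1e-3)]

def call_weight_optimizers(method: Literal['step', 'zero_grad']) -> None:
    for opt in weight_optimizers:
        getattr(opt, method)()   # opt.step() or opt.zero_grad()

call_weight_optimizers('zero_grad')
params[0].grad = torch.ones_like(params[0])  # pretend backward() has produced gradients
call_weight_optimizers('step')
```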
77 changes: 77 additions & 0 deletions nni/retiarii/oneshot/pytorch/dataloader.py
@@ -0,0 +1,77 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from __future__ import annotations

from typing import Any

from pytorch_lightning.trainer.supporters import CombinedLoader, CombinedLoaderIterator


class ConcatLoader(CombinedLoader):
"""This loader is same as CombinedLoader in PyTorch-Lightning, but concatenate sub-loaders
instead of loading them in parallel.

Parameters
----------
loaders
For example, ::

{
"train": DataLoader(train_dataset),
"val": DataLoader(val_dataset)
}

In this example, the loader will first produce the batches from "train", then "val".

mode
Only support "min_size" for now.
"""

def __init__(self, loaders: dict[str, Any], mode: str = 'min_size'):
# FIXME: max_cycle will make dataloaders cycle iterators,
# causing extra problems.
if mode != 'min_size':
raise ValueError('Only min_size mode is supported now.')
super().__init__(loaders, mode)

def __iter__(self) -> Any:
"""Replace the super-class iterator with ours."""
self._try_to_patch_pytorch_dataloader()
iterator = ConcatLoaderIterator(self.loaders)
# handle fault tolerant restart.
self.on_restart(iterator)
self._iterator = iterator
return iterator

@staticmethod
def _try_to_patch_pytorch_dataloader():
"""Copied from CombinedLoader."""
from torch.utils.data.dataloader import _BaseDataLoaderIter

# prevent `NotImplementedError` from PyTorch:
# https://github.com/pytorch/pytorch/blob/v1.9.0/torch/utils/data/dataloader.py#L541
def __getstate__patch__(*_):
return {}

_BaseDataLoaderIter.__getstate__ = __getstate__patch__ # type: ignore

def __len__(self) -> int:
return int(sum(self._calc_num_batches(loader) for loader in self.loaders.values()))


class ConcatLoaderIterator(CombinedLoaderIterator):
"""Similar to CombinedLoaderIterator in Lightning, but in a concat manner."""

def __next__(self) -> Any:
"""Fetches the next batch from multiple data loaders,
by looking for the first iterator that isn't exhausted yet.
"""
if not len(self.loader_iters) == len(self.loaders):
raise RuntimeError('loader_iters must have the same length as loaders.')
for i, (loader_name, iterator) in enumerate(self.loader_iters.items()):
try:
return (self.request_next_batch(iterator), loader_name)
except StopIteration:
if i + 1 == len(self.loader_iters):
raise
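A hedged usage sketch for the new ConcatLoader, assuming it is imported from the module added above (nni.retiarii.oneshot.pytorch.dataloader): batches are drawn from the "train" loader until it is exhausted, then from the "val" loader, and each item comes back as a (batch, loader_name) pair. The toy datasets are illustrative.

```python
import torch
from torch.utils.data import DataLoader, TensorDataset
from nni.retiarii.oneshot.pytorch.dataloader import ConcatLoader

loaders = {
    'train': DataLoader(TensorDataset(torch.arange(4.)), batch_size=2),
    'val': DataLoader(TensorDataset(torch.arange(2.)), batch_size=2),
}
loader = ConcatLoader(loaders)  # 'min_size' is the only supported mode
print(len(loader))              # 2 train batches + 1 val batch = 3
for batch, name in loader:
    print(name, batch)          # 'train', 'train', then 'val'
```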
5 changes: 3 additions & 2 deletions nni/retiarii/oneshot/pytorch/differentiable.py
@@ -75,8 +75,9 @@ def training_step(self, batch, batch_idx):
if not isinstance(arc_optim, optim.Optimizer):
raise TypeError(f'Expect arc_optim to be a single Optimizer, but found: {arc_optim}')

# The InterleavedTrainValDataLoader yields both train and val data in a batch
trn_batch, val_batch = batch
# DARTS strategy makes sure that ``train`` and ``val`` must be in the batch
trn_batch = batch['train']
val_batch = batch['val']

# phase 1: architecture step
# The _resample hook is kept for some darts-based NAS methods like proxyless.
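Finally, a small sketch of the batch layout the updated DARTS training_step consumes: a dict carrying both a training mini-batch and a validation mini-batch, from which the two phases (architecture step on 'val', weight step on 'train') are fed. The tensors below are illustrative placeholders.

```python
import torch

# Shape of the combined batch handed to training_step (illustrative tensors).
batch = {
    'train': (torch.randn(4, 3), torch.randint(0, 2, (4,))),
    'val': (torch.randn(4, 3), torch.randint(0, 2, (4,))),
}

trn_batch = batch['train']  # used for the weight-optimization phase
val_batch = batch['val']    # used for the architecture-optimization phase
```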