Remove numpy from src/lightning/pytorch and use torch only #17278

Merged · 36 commits · Apr 15, 2023
Changes from 17 commits

Commits (36):
3bc4545  Replace numpy with torch in logger.py and document code (ishandutta0098, Apr 4, 2023)
3808fa5  Replace numpy with torch in simple.py (ishandutta0098, Apr 4, 2023)
fd66621  Replace numpy with torch in multiprocessing.py (ishandutta0098, Apr 4, 2023)
8ffdf10  Replace numpy with torch in model_summary.py (ishandutta0098, Apr 4, 2023)
5aad2db  round aggregated_value to 2 decimal places (ishandutta0098, Apr 4, 2023)
c5c72e9  Merge branch 'master' into remove_numpy_pytorch (ishandutta0098, Apr 5, 2023)
571c115  Update the doctest for merge_dicts() (ishandutta0098, Apr 5, 2023)
d11a092  Merge branch 'remove_numpy_pytorch' of github.com:ishandutta0098/ligh… (ishandutta0098, Apr 5, 2023)
5cb95a6  Update doctest for LayerSummary Class in model_summary.py (ishandutta0098, Apr 5, 2023)
14bffc2  Update typehint for default_func in merge_dicts() and remove __main__ (ishandutta0098, Apr 5, 2023)
6453731  Remove __main__ (ishandutta0098, Apr 5, 2023)
22f7d3b  Remove assert statements from merge_dicts() doctest (ishandutta0098, Apr 6, 2023)
5086854  Merge branch 'master' into remove_numpy_pytorch (ishandutta0098, Apr 6, 2023)
c6d90d8  Remove doctest checks (ishandutta0098, Apr 6, 2023)
43a08e9  Update LayerSummary doctest and num_parameters property (ishandutta0098, Apr 6, 2023)
b89404a  Merge branch 'master' into remove_numpy_pytorch (ishandutta0098, Apr 6, 2023)
424428b  Merge branch 'master' into remove_numpy_pytorch (ishandutta0098, Apr 7, 2023)
fd64560  Revert changes in src/lightning/pytorch/strategies/launchers/multipro… (ishandutta0098, Apr 7, 2023)
ae53048  Use Python functions for min, max and sum in merge_dicts() (ishandutta0098, Apr 7, 2023)
11406a1  Replace torch with math in model_summary.py (ishandutta0098, Apr 7, 2023)
df3e86e  Replace torch with math (ishandutta0098, Apr 7, 2023)
73a467d  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Apr 7, 2023)
8d1096e  Remove additional comments from merge_dicts() (ishandutta0098, Apr 7, 2023)
b75db82  Merge branch 'remove_numpy_pytorch' of github.com:ishandutta0098/ligh… (ishandutta0098, Apr 7, 2023)
5a39da8  import math in model_summary.py (ishandutta0098, Apr 7, 2023)
6afd390  Remove manual int() in num_parameters() (ishandutta0098, Apr 7, 2023)
ada4b8e  Simplify implementation of merge_dicts() without torch and restore or… (ishandutta0098, Apr 7, 2023)
3d2f238  Restore original merge_dicts() (ishandutta0098, Apr 7, 2023)
3b0c083  Revert changes for logger.py (ishandutta0098, Apr 11, 2023)
d32e4e7  Change list comprehension to loops for _make_report_extended and _mak… (ishandutta0098, Apr 11, 2023)
9f3bd78  Merge branch 'master' into remove_numpy_pytorch (ishandutta0098, Apr 11, 2023)
31a1eea  Merge branch 'master' into remove_numpy_pytorch (ishandutta0098, Apr 12, 2023)
c3bde08  Remove separate mean calculation for _make_report_extended and _make_… (ishandutta0098, Apr 12, 2023)
9d6695e  Update src/lightning/pytorch/utilities/model_summary/model_summary.py (carmocca, Apr 12, 2023)
d02532a  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Apr 12, 2023)
92ef081  Merge branch 'master' into remove_numpy_pytorch (Borda, Apr 14, 2023)
45 changes: 35 additions & 10 deletions src/lightning/pytorch/loggers/logger.py
@@ -20,7 +20,7 @@
from collections import defaultdict
from typing import Any, Callable, Dict, Mapping, Optional, Sequence

import numpy as np
import torch

from lightning.fabric.loggers import Logger as FabricLogger
from lightning.fabric.loggers.logger import _DummyExperiment as DummyExperiment # for backward compatibility
@@ -94,7 +94,7 @@ def method(*args: Any, **kwargs: Any) -> None:
def merge_dicts(  # pragma: no cover
    dicts: Sequence[Mapping],
    agg_key_funcs: Optional[Mapping] = None,
    default_func: Callable[[Sequence[float]], float] = np.mean,
    default_func: Callable[[torch.Tensor], torch.Tensor] = torch.mean,
) -> Dict:
    """Merge a sequence with dictionaries into one dictionary by aggregating the same keys with some given
    function.
@@ -120,25 +120,50 @@
        >>> d1 = {'a': 1.7, 'b': 2.0, 'c': 1, 'd': {'d1': 1, 'd3': 3}}
        >>> d2 = {'a': 1.1, 'b': 2.2, 'v': 1, 'd': {'d1': 2, 'd2': 3}}
        >>> d3 = {'a': 1.1, 'v': 2.3, 'd': {'d3': 3, 'd4': {'d5': 1}}}
        >>> dflt_func = min
        >>> agg_funcs = {'a': np.mean, 'v': max, 'd': {'d1': sum}}
        >>> dflt_func = torch.min
        >>> agg_funcs = {'a': torch.mean, 'v': torch.max, 'd': {'d1': torch.sum}}
        >>> pprint.pprint(merge_dicts([d1, d2, d3], agg_funcs, dflt_func))
        {'a': 1.3,
         'b': 2.0,
         'c': 1,
         'd': {'d1': 3, 'd2': 3, 'd3': 3, 'd4': {'d5': 1}},
         'v': 2.3}
        {'a': tensor(1.3000),
         'b': tensor(2.),
         'c': tensor(1.),
         'd': {'d1': tensor(3.),
               'd2': tensor(3.),
               'd3': tensor(3.),
               'd4': {'d5': tensor(1.)}},
         'v': tensor(2.3000)}
"""
    # If agg_key_funcs is not provided, initialize it as an empty dictionary
    agg_key_funcs = agg_key_funcs or {}

    # Collect all unique keys from the input dictionaries
    keys = list(functools.reduce(operator.or_, [set(d.keys()) for d in dicts]))

    # Initialize the output dictionary using defaultdict
    d_out: Dict = defaultdict(dict)

    # Iterate over all unique keys
    for k in keys:
        # Get the aggregation function for the current key, if available
        fn = agg_key_funcs.get(k)

        # Collect values associated with the current key from all input dictionaries
        values_to_agg = [v for v in [d_in.get(k) for d_in in dicts] if v is not None]

        # Check if the values to aggregate are dictionaries
        if isinstance(values_to_agg[0], dict):
            # Call the merge_dicts function recursively for nested dictionaries
            d_out[k] = merge_dicts(values_to_agg, fn, default_func)

        else:
            d_out[k] = (fn or default_func)(values_to_agg)
            # Convert values_to_agg to a tensor with float32 data type
            values_to_agg_tensor = torch.tensor(values_to_agg, dtype=torch.float32)

            # Apply the aggregation function (fn) or the default function (default_func) to the tensor
            aggregated_value = (fn or default_func)(values_to_agg_tensor)

            # Assign the aggregated value to the output dictionary
            d_out[k] = aggregated_value

    # Convert the defaultdict to a regular dictionary and return it
    return dict(d_out)
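The behavioral change above is that merge_dicts() now packs the collected scalars into a float32 tensor before aggregating, so merged leaves come back as 0-dim tensors rather than plain floats (hence the doctest now prints tensor(...) values). A minimal standalone sketch of just that step, on toy values rather than the library API:

import torch

# Values gathered for one key across the input dicts (toy data)
values_to_agg = [1.7, 1.1, 1.1]
values_tensor = torch.tensor(values_to_agg, dtype=torch.float32)

print(torch.mean(values_tensor))  # tensor(1.3000); np.mean returned the plain float 1.3
print(torch.min(values_tensor))   # tensor(1.1000); the builtin min returned 1.1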
15 changes: 12 additions & 3 deletions src/lightning/pytorch/profilers/simple.py
@@ -19,7 +19,7 @@
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import torch

from lightning.pytorch.profilers.profiler import Profiler

@@ -80,15 +80,24 @@ def stop(self, action_name: str) -> None:
    def _make_report_extended(self) -> Tuple[_TABLE_DATA_EXTENDED, float, float]:
        total_duration = time.monotonic() - self.start_time
        report = [
            (a, np.mean(d), len(d), np.sum(d), 100.0 * np.sum(d) / total_duration)
            (
                a,
                torch.mean(torch.tensor(d)).item(),
                len(d),
                torch.sum(torch.tensor(d)).item(),
                100.0 * torch.sum(torch.tensor(d)).item() / total_duration,
            )
            for a, d in self.recorded_durations.items()
        ]
        report.sort(key=lambda x: x[4], reverse=True)
        total_calls = sum(x[2] for x in report)
        return report, total_calls, total_duration

    def _make_report(self) -> _TABLE_DATA:
        report = [(action, np.mean(d), np.sum(d)) for action, d in self.recorded_durations.items()]
        report = [
            (action, torch.mean(torch.tensor(d)).item(), torch.sum(torch.tensor(d)).item())
            for action, d in self.recorded_durations.items()
        ]
        report.sort(key=lambda x: x[1], reverse=True)
        return report

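One readability note on the hunk above: the extended report converts the same duration list with torch.tensor(d) up to three times per action. A sketch of the single-conversion form, on toy data and assumed rather than taken from the merged code (later commits d32e4e7 and c3bde08 rework these comprehensions along these lines):

import torch

recorded_durations = {"training_step": [0.10, 0.12, 0.11]}  # toy data
total_duration = 10.0  # toy wall-clock total

report = []
for action, d in recorded_durations.items():
    d_tensor = torch.tensor(d)  # convert once, reuse for both mean and sum
    d_sum = torch.sum(d_tensor).item()
    report.append((action, torch.mean(d_tensor).item(), len(d), d_sum, 100.0 * d_sum / total_duration))
report.sort(key=lambda x: x[4], reverse=True)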
8 changes: 2 additions & 6 deletions src/lightning/pytorch/strategies/launchers/multiprocessing.py
@@ -19,7 +19,6 @@
from multiprocessing.queues import SimpleQueue
from typing import Any, Callable, Dict, List, Literal, NamedTuple, Optional

import numpy as np
import torch
import torch.backends.cudnn
import torch.multiprocessing as mp
@@ -210,7 +209,6 @@ def _check_torchdistx_support(self) -> None:

    def get_extra_results(self, trainer: "pl.Trainer") -> Dict[str, Any]:
        """Gather extra state from the Trainer and return it as a dictionary for sending back to the main process.
        To avoid issues with memory sharing, we cast the data to numpy.

        Args:
            trainer: reference to the Trainer.
@@ -219,9 +217,7 @@ def get_extra_results(self, trainer: "pl.Trainer") -> Dict[str, Any]:

        Returns:
            A dictionary with items to send back to the main process where :meth:`update_main_process_results` will
            process this output.
        """
        callback_metrics: dict = apply_to_collection(
            trainer.callback_metrics, Tensor, lambda x: x.cpu().numpy()
        )  # send as numpy to avoid issues with memory sharing
        callback_metrics: dict = apply_to_collection(trainer.callback_metrics, Tensor, lambda x: x.cpu())
        return {"callback_metrics": callback_metrics}

    def update_main_process_results(self, trainer: "pl.Trainer", extra: Dict[str, Any]) -> None:
@@ -235,7 +231,7 @@ def update_main_process_results(self, trainer: "pl.Trainer", extra: Dict[str, Any]) -> None:
"""
# NOTE: `get_extra_results` needs to be called before
callback_metrics = extra["callback_metrics"]
trainer.callback_metrics.update(apply_to_collection(callback_metrics, np.ndarray, lambda x: torch.tensor(x)))
trainer.callback_metrics.update(callback_metrics)

def kill(self, signum: _SIGNUM) -> None:
for proc in self.procs:
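The numpy round trip removed here existed only to decouple returned metrics from worker device memory; a plain .cpu() call achieves the same thing, so the main process can consume the tensors directly. A toy before/after sketch (the apply_to_collection import path is an assumption here; the file above already has it imported):

import torch
from torch import Tensor
from lightning_utilities.core.apply_func import apply_to_collection  # import path assumed

callback_metrics = {"val_loss": torch.tensor(0.25), "train_acc": torch.tensor(0.9)}  # toy metrics

# Before: Tensor -> numpy in the worker, then numpy -> Tensor again in the main process.
# After: a single .cpu() in the worker; update_main_process_results() updates directly.
to_send = apply_to_collection(callback_metrics, Tensor, lambda x: x.cpu())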
14 changes: 9 additions & 5 deletions src/lightning/pytorch/utilities/model_summary/model_summary.py
@@ -17,7 +17,6 @@
from collections import OrderedDict
from typing import Any, cast, Dict, List, Optional, Tuple, Union

import numpy as np
import torch
import torch.nn as nn
from torch import Tensor
@@ -120,8 +119,13 @@ def layer_type(self) -> str:
    @property
    def num_parameters(self) -> int:
        """Returns the number of parameters in this module."""
        return sum(
            cast(int, np.prod(p.shape)) if not _is_lazy_weight_tensor(p) else 0 for p in self._module.parameters()
        return int(
            sum(
                cast(int, torch.prod(torch.tensor(p.shape, dtype=torch.float32)))
                if not _is_lazy_weight_tensor(p)
                else 0
                for p in self._module.parameters()
            )
        )
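Routing the shape product through a float32 tensor is roundabout and can lose precision for very large dimensions; later commits in this PR (11406a1, df3e86e, 5a39da8) move this file to math instead. A sketch of that integer-only direction, where count_parameters is a hypothetical helper (the _is_lazy_weight_tensor guard is omitted for brevity):

import math

import torch.nn as nn

def count_parameters(module: nn.Module) -> int:  # hypothetical helper, not the class property
    # math.prod over p.shape stays in exact integer arithmetic
    return sum(math.prod(p.shape) for p in module.parameters())

print(count_parameters(nn.Linear(10, 5)))  # 55: a 5x10 weight plus 5 biases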


@@ -392,8 +396,8 @@ def get_human_readable_count(number: int) -> str:
"""
assert number >= 0
labels = PARAMETER_NUM_UNITS
num_digits = int(np.floor(np.log10(number)) + 1 if number > 0 else 1)
num_groups = int(np.ceil(num_digits / 3))
num_digits = int(torch.floor(torch.log10(torch.tensor(number, dtype=torch.float32))) + 1 if number > 0 else 1)
num_groups = int(torch.ceil(torch.tensor(num_digits, dtype=torch.float32) / 3))
ishandutta0098 marked this conversation as resolved.
Show resolved Hide resolved
num_groups = min(num_groups, len(labels)) # don't abbreviate beyond trillions
shift = -3 * (num_groups - 1)
number = number * (10**shift)
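The same scalar arithmetic works without constructing tensors at all, which is the direction the later "Replace torch with math" commits take. A sketch under that assumption, with a toy count (PARAMETER_NUM_UNITS is assumed to hold the five units ' ', 'K', 'M', 'B', 'T'):

import math

number = 3_000_000  # toy parameter count
num_digits = int(math.floor(math.log10(number)) + 1) if number > 0 else 1
num_groups = min(math.ceil(num_digits / 3), 5)  # cap at 5 units so we don't abbreviate beyond trillions
shift = -3 * (num_groups - 1)
print(f"{number * (10 ** shift):.1f}", num_groups)  # prints: 3.0 3, rendered as "3.0 M"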