Add built-in cprofile tooling #278

Merged: 5 commits, Jan 31, 2024
71 changes: 65 additions & 6 deletions dedalus/core/solvers.py
@@ -6,14 +6,23 @@
import h5py
import pathlib
import scipy.linalg
import cProfile
import pstats
from math import prod
from collections import defaultdict
import pickle

from . import subsystems
from . import timesteppers
from .evaluator import Evaluator
from ..libraries.matsolvers import matsolvers
from ..tools.config import config
from ..tools.array import scipy_sparse_eigs
from ..tools.parallel import ProfileWrapper, parallel_mkdir

PROFILE_DEFAULT = config['profiling'].getboolean('PROFILE_DEFAULT')
PARALLEL_PROFILE_DEFAULT = config['profiling'].getboolean('PARALLEL_PROFILE_DEFAULT')
PROFILE_DIRECTORY = pathlib.Path(config['profiling'].get('PROFILE_DIRECTORY'))

import logging
logger = logging.getLogger(__name__.split('.')[-1])
@@ -485,6 +494,10 @@ class InitialValueSolver(SolverBase):
Iteration cadence for enforcing Hermitian symmetry on real variables (default: 100).
warmup_iterations : int, optional
Number of warmup iterations to disregard when computing runtime statistics (default: 10).
profile : bool, optional
Save accumulated profiles with cProfile (default: False).
parallel_profile : bool, optional
Save per-process and accumulated profiles with cProfile (default: False).
**kw :
Other options passed to ProblemBase.

@@ -510,15 +523,22 @@ class InitialValueSolver(SolverBase):
matsolver_default = 'MATRIX_FACTORIZER'
matrices = ['M', 'L']

def __init__(self, problem, timestepper, enforce_real_cadence=100, warmup_iterations=10, **kw):
def __init__(self, problem, timestepper, enforce_real_cadence=100, warmup_iterations=10, profile=PROFILE_DEFAULT, parallel_profile=PARALLEL_PROFILE_DEFAULT, **kw):
logger.debug('Beginning IVP instantiation')
super().__init__(problem, **kw)
if np.isrealobj(self.dtype.type()):
self.enforce_real_cadence = enforce_real_cadence
else:
self.enforce_real_cadence = None
# Setup timing and profiling
self.dist = problem.dist
self._bcast_array = np.zeros(1, dtype=float)
self.init_time = self.world_time
if profile or parallel_profile:
parallel_mkdir(PROFILE_DIRECTORY, comm=self.dist.comm)
self.profile = True
self.parallel_profile = parallel_profile
self.setup_profiler = cProfile.Profile()
self.warmup_profiler = cProfile.Profile()
self.run_profiler = cProfile.Profile()
self.setup_profiler.enable()
# Build subsystems and subproblems
super().__init__(problem, **kw)
# Build LHS matrices
self.build_matrices(self.subproblems, ['M', 'L'])
# Compute total modes
@@ -538,6 +558,10 @@ def __init__(self, problem, timestepper, enforce_real_cadence=100, warmup_iterat
self.sim_time = self.initial_sim_time = problem.time.allreduce_data_max(layout='g')
self.iteration = self.initial_iteration = 0
self.warmup_iterations = warmup_iterations
if np.isrealobj(self.dtype.type()):
self.enforce_real_cadence = enforce_real_cadence
else:
self.enforce_real_cadence = None
# Default integration parameters
self.stop_sim_time = np.inf
self.stop_wall_time = np.inf
@@ -648,8 +672,14 @@ def step(self, dt):
wall_time = self.wall_time
if self.iteration == self.initial_iteration:
self.start_time = wall_time
if self.profile:
self.dump_profiles(self.setup_profiler, "setup")
self.warmup_profiler.enable()
if self.iteration == self.initial_iteration + self.warmup_iterations:
self.warmup_time = wall_time
if self.profile:
self.dump_profiles(self.warmup_profiler, "warmup")
self.run_profiler.enable()
# Advance using timestepper
self.timestepper.step(dt, wall_time)
# Update iteration
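
The step() changes above hand off between three accumulated profilers: the setup profiler covers everything from solver construction to the first call to step(), the warmup profiler covers the first warmup_iterations steps, and the run profiler covers the remaining steps until log_stats(). A minimal sketch of the usual Dedalus run loop, annotated with when each profile file is written; it assumes a solver built with profile=True, the standard solver.proceed loop, the default PROFILE_DIRECTORY ("profiles"), and a placeholder timestep:

# Usual IVP run loop; comments mark when dump_profiles is triggered.
while solver.proceed:
    solver.step(timestep)
    # Start of the first step: setup profile written to profiles/setup.prof,
    # then the warmup profiler is enabled.
    # Start of the step after warmup_iterations: warmup profile written to
    # profiles/warmup.prof, then the run profiler is enabled.
solver.log_stats()  # run profile written to profiles/runtime.prof
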
@@ -704,6 +734,8 @@ def log_stats(self, format=".4g"):
logger.info(f"Final iteration: {self.iteration}")
logger.info(f"Final sim time: {self.sim_time}")
logger.info(f"Setup time (init - iter 0): {self.start_time:{format}} sec")
if self.profile:
self.dump_profiles(self.run_profiler, "runtime")
if self.iteration >= self.initial_iteration + self.warmup_iterations:
warmup_time = self.warmup_time - self.start_time
run_time = log_time - self.warmup_time
@@ -716,3 +748,30 @@
logger.info(f"Speed: {(modes*stages/cpus/run_time):{format}} mode-stages/cpu-sec")
else:
logger.info(f"Timings unavailable because warmup did not complete.")

def dump_profiles(self, profiler, name):
"Save profiling data to disk."
comm = self.dist.comm
# Disable and create stats on each process
profiler.create_stats()
p = pstats.Stats(profiler)
p.strip_dirs()
# Gather using wrapper class to avoid pickling issues
profiles = comm.gather(ProfileWrapper(p.stats), root=0)
# Sum stats on root process
if comm.rank == 0:
if self.parallel_profile:
stats = {'primcalls': defaultdict(list),
'totcalls': defaultdict(list),
'tottime': defaultdict(list),
'cumtime': defaultdict(list)}
for profile in profiles:
for func, (primcalls, totcalls, tottime, cumtime, callers) in profile.stats.items():
stats['primcalls'][func].append(primcalls)
stats['totcalls'][func].append(totcalls)
stats['tottime'][func].append(tottime)
stats['cumtime'][func].append(cumtime)
pickle.dump(stats, open(PROFILE_DIRECTORY / f"{name}_parallel.pickle", 'wb'))
# Creation of joint_stats destroys profiles, so do this second
joint_stats = pstats.Stats(*profiles)
joint_stats.dump_stats(PROFILE_DIRECTORY / f"{name}.prof")
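
The files written by dump_profiles can be inspected afterwards with the standard library: the joint .prof file loads directly into pstats, and the _parallel.pickle file (written only when parallel_profile is enabled) is a plain dict of per-process call counts and timings keyed by function. A minimal sketch, assuming a completed run with the default "profiles" directory:

import pickle
import pstats

# Accumulated profile across all processes: sort by cumulative time, show top 20.
stats = pstats.Stats("profiles/runtime.prof")
stats.sort_stats("cumulative").print_stats(20)

# Per-process data: {metric: {func: [value for each rank]}}.
with open("profiles/runtime_parallel.pickle", "rb") as f:
    parallel = pickle.load(f)
slowest = sorted(parallel["tottime"].items(), key=lambda item: -max(item[1]))[:5]
for func, times in slowest:
    print(func, "max tottime over ranks:", max(times))
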
12 changes: 12 additions & 0 deletions dedalus/dedalus.cfg
@@ -118,3 +118,15 @@
# This works around NFS caching issues
FILEHANDLER_TOUCH_TMPFILE = False

[profiling]

# Default profile setting for solvers
# This saves accumulated profiling data using cProfile
PROFILE_DEFAULT = False

# Default parallel profile setting for solvers
# This saves per-process and accumulated profiling data using cProfile
PARALLEL_PROFILE_DEFAULT = False

# Profile directory base (will be expanded to <PROFILE_DIRECTORY>/runtime.prof, etc)
PROFILE_DIRECTORY = profiles
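
These defaults are read at import time into PROFILE_DEFAULT, PARALLEL_PROFILE_DEFAULT, and PROFILE_DIRECTORY in solvers.py, so profiling can be switched on globally by editing the [profiling] section of a user-level dedalus.cfg, or per-solver via the new constructor keywords. A minimal per-solver sketch (problem construction not shown; the RK222 timestepper is only an example choice):

from dedalus.core import timesteppers
from dedalus.core.solvers import InitialValueSolver

# Override the config defaults for this solver only: save both the accumulated
# and the per-process profiles.
solver = InitialValueSolver(problem, timesteppers.RK222,
                            profile=True, parallel_profile=True)
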
21 changes: 21 additions & 0 deletions dedalus/tools/parallel.py
@@ -3,6 +3,7 @@

"""

import pathlib
from mpi4py import MPI


@@ -56,3 +57,23 @@ def __enter__(self):
def __exit__(self, type, value, traceback):
for i in range(self.size-self.rank):
self.comm.Barrier()


class ProfileWrapper:
"""Pickleable wrapper for cProfile.Profile for use with pstats.Stats"""

def __init__(self, stats):
self.stats = stats

def create_stats(self):
pass


def parallel_mkdir(path, comm=MPI.COMM_WORLD):
"""Create a directory from root process."""
path = pathlib.Path(path)
with Sync(comm=comm, enter=False, exit=True) as sync:
if sync.comm.rank == 0:
if not path.exists():
path.mkdir()
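
ProfileWrapper works because pstats.Stats accepts, in place of a filename, any object exposing a create_stats() method and a stats attribute; the wrapper carries only the plain stats dict across MPI (which pickles cleanly), whereas a live cProfile.Profile does not. A small standalone sketch of the same combining pattern, without MPI and with a hypothetical workload:

import cProfile
import pstats

from dedalus.tools.parallel import ProfileWrapper

def work(n):
    return sum(i * i for i in range(n))

# Profile two runs separately, then merge them the way dump_profiles does:
# wrap the raw stats dicts and hand them all to pstats.Stats.
wrappers = []
for n in (10_000, 20_000):
    profiler = cProfile.Profile()
    profiler.enable()
    work(n)
    profiler.disable()
    wrappers.append(ProfileWrapper(pstats.Stats(profiler).strip_dirs().stats))

joint = pstats.Stats(*wrappers)  # sums call counts and timings across wrappers
joint.sort_stats("cumulative").print_stats(5)
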