improves overall runtime of analysis pass

- makes the step through the trajectory for analysis by default gather 500 frames of data - for standard settings openmm rfe runs, this is a step of 10 - improve performance by loading PDB topology once, not 11 (=nlambda) times - add progress bar to cli call for sanity
OpenFreeEnergy · Nov 15, 2023 · b4a4ec8 · b4a4ec8
1 parent bddd32d
commit b4a4ec8
Showing 2 changed files with 29 additions and 8 deletions.
diff --git a/environment.yml b/environment.yml
@@ -7,6 +7,7 @@ dependencies:
   - netCDF4
   - openff-units
   - pip
+  - tqdm
   # for testing
   - coverage
   - pooch

diff --git a/src/openfe_analysis/rmsd.py b/src/openfe_analysis/rmsd.py
@@ -6,6 +6,7 @@
 from numpy import typing as npt
 import pathlib
 from typing import Optional
+import tqdm
 
 from .reader import FEReader
 from .transformations import (
@@ -67,7 +68,8 @@ def make_Universe(top: pathlib.Path,
 
 
 def gather_rms_data(pdb_topology: pathlib.Path,
-                    dataset: pathlib.Path) -> dict[str, list[float]]:
+                    dataset: pathlib.Path,
+                    skip: Optional[int] = None) -> dict[str, list[float]]:
     """Generate structural analysis of RBFE simulation
 
     Parameters
@@ -76,6 +78,9 @@ def gather_rms_data(pdb_topology: pathlib.Path,
       path to pdb topology
     dataset : pathlib.Path
       path to nc trajectory
+    skip : int, optional
+      step at which to progress through the trajectory.  by default, selects a
+      step that produces roughly 500 frames of analysis per replicate
 
     Produces, for each lambda state:
     - 1D protein RMSD timeseries 'protein_RMSD'
@@ -92,15 +97,27 @@ def gather_rms_data(pdb_topology: pathlib.Path,
 
     ds = nc.Dataset(dataset)
     n_lambda = ds.dimensions['state'].size
+    n_frames = ds.dimensions['iteration'].size
+    if skip is None:
+        # find skip that would give ~500 frames of output
+        # max against 1 to avoid skip=0 case
+        skip = max(n_frames // 500, 1)
+
+    pb = tqdm.tqdm(total=int(n_frames / skip) * n_lambda)
+
+    u_top = mda.Universe(pdb_topology)
+
     for i in range(n_lambda):
-        u = make_Universe(pdb_topology, ds, state=i)
+        # cheeky, but we can read the PDB topology once and reuse per universe
+        # this then only hits the PDB file once for all replicas
+        u = make_Universe(u_top._topology, ds, state=i)
 
         prot = u.select_atoms('protein and name CA')
         ligand = u.select_atoms('resname UNK')
 
         # save coordinates for 2D RMSD matrix
         # TODO: Some smart guard to avoid allocating a silly amount of memory?
-        prot2d = np.empty((len(u.trajectory), len(prot), 3), dtype=np.float32)
+        prot2d = np.empty((len(u.trajectory[::skip]), len(prot), 3), dtype=np.float32)
 
         prot_start = prot.positions
         # prot_weights = prot.masses / np.mean(prot.masses)
@@ -112,9 +129,11 @@ def gather_rms_data(pdb_topology: pathlib.Path,
         this_ligand_rmsd = []
         this_ligand_wander = []
 
-        for ts in u.trajectory:
+        for ts_i, ts in enumerate(u.trajectory[::skip]):
+            pb.update()
+
             if prot:
-                prot2d[ts.frame, :, :] = prot.positions
+                prot2d[ts_i, :, :] = prot.positions
                 this_protein_rmsd.append(
                     rms.rmsd(prot.positions, prot_start, None,  # prot_weights,
                              center=False, superposition=False)
@@ -132,14 +151,15 @@ def gather_rms_data(pdb_topology: pathlib.Path,
                 )
 
         if prot:
-            rmsd2d = twoD_RMSD(prot2d, None)  # prot_weights)
+            # can ignore weights here as it's all Ca
+            rmsd2d = twoD_RMSD(prot2d, w=None)  # prot_weights)
             output['protein_RMSD'].append(this_protein_rmsd)
             output['protein_2D_RMSD'].append(rmsd2d)
         if ligand:
             output['ligand_RMSD'].append(this_ligand_rmsd)
             output['ligand_wander'].append(this_ligand_wander)
 
-        output['time(ps)'] = list(np.arange(len(u.trajectory)) * u.trajectory.dt)
+        output['time(ps)'] = list(np.arange(len(u.trajectory))[::skip] * u.trajectory.dt)
 
     return output
 
@@ -151,7 +171,7 @@ def twoD_RMSD(positions, w: Optional[npt.NDArray]) -> list[float]:
     ----------
     positions : np.ndarray
       the protein positions for the entire trajectory
-    w : np.ndarray
+    w : np.ndarray, optional
       weights array
 
     Returns