Skip to content

Commit

Permalink
improves overall runtime of analysis pass
Browse files Browse the repository at this point in the history
- makes the step through the trajectory for analysis by default gather 500 frames of data
- for standard settings openmm rfe runs, this is a step of 10
- improve performance by loading PDB topology once, not 11 (=nlambda) times
- add progress bar to cli call for sanity
richardjgowers committed Nov 15, 2023
1 parent bddd32d commit b4a4ec8
Showing 2 changed files with 29 additions and 8 deletions.
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
@@ -7,6 +7,7 @@ dependencies:
- netCDF4
- openff-units
- pip
- tqdm
# for testing
- coverage
- pooch
36 changes: 28 additions & 8 deletions src/openfe_analysis/rmsd.py
Original file line number Diff line number Diff line change
@@ -6,6 +6,7 @@
from numpy import typing as npt
import pathlib
from typing import Optional
import tqdm

from .reader import FEReader
from .transformations import (
@@ -67,7 +68,8 @@ def make_Universe(top: pathlib.Path,


def gather_rms_data(pdb_topology: pathlib.Path,
dataset: pathlib.Path) -> dict[str, list[float]]:
dataset: pathlib.Path,
skip: Optional[int] = None) -> dict[str, list[float]]:
"""Generate structural analysis of RBFE simulation
Parameters
@@ -76,6 +78,9 @@ def gather_rms_data(pdb_topology: pathlib.Path,
path to pdb topology
dataset : pathlib.Path
path to nc trajectory
skip : int, optional
step at which to progress through the trajectory. by default, selects a
step that produces roughly 500 frames of analysis per replicate
Produces, for each lambda state:
- 1D protein RMSD timeseries 'protein_RMSD'
@@ -92,15 +97,27 @@ def gather_rms_data(pdb_topology: pathlib.Path,

ds = nc.Dataset(dataset)
n_lambda = ds.dimensions['state'].size
n_frames = ds.dimensions['iteration'].size
if skip is None:
# find skip that would give ~500 frames of output
# max against 1 to avoid skip=0 case
skip = max(n_frames // 500, 1)

pb = tqdm.tqdm(total=int(n_frames / skip) * n_lambda)

u_top = mda.Universe(pdb_topology)

for i in range(n_lambda):
u = make_Universe(pdb_topology, ds, state=i)
# cheeky, but we can read the PDB topology once and reuse per universe
# this then only hits the PDB file once for all replicas
u = make_Universe(u_top._topology, ds, state=i)

prot = u.select_atoms('protein and name CA')
ligand = u.select_atoms('resname UNK')

# save coordinates for 2D RMSD matrix
# TODO: Some smart guard to avoid allocating a silly amount of memory?
prot2d = np.empty((len(u.trajectory), len(prot), 3), dtype=np.float32)
prot2d = np.empty((len(u.trajectory[::skip]), len(prot), 3), dtype=np.float32)

prot_start = prot.positions
# prot_weights = prot.masses / np.mean(prot.masses)
@@ -112,9 +129,11 @@ def gather_rms_data(pdb_topology: pathlib.Path,
this_ligand_rmsd = []
this_ligand_wander = []

for ts in u.trajectory:
for ts_i, ts in enumerate(u.trajectory[::skip]):
pb.update()

if prot:
prot2d[ts.frame, :, :] = prot.positions
prot2d[ts_i, :, :] = prot.positions
this_protein_rmsd.append(
rms.rmsd(prot.positions, prot_start, None, # prot_weights,
center=False, superposition=False)
@@ -132,14 +151,15 @@ def gather_rms_data(pdb_topology: pathlib.Path,
)

if prot:
rmsd2d = twoD_RMSD(prot2d, None) # prot_weights)
# can ignore weights here as it's all Ca
rmsd2d = twoD_RMSD(prot2d, w=None) # prot_weights)
output['protein_RMSD'].append(this_protein_rmsd)
output['protein_2D_RMSD'].append(rmsd2d)
if ligand:
output['ligand_RMSD'].append(this_ligand_rmsd)
output['ligand_wander'].append(this_ligand_wander)

output['time(ps)'] = list(np.arange(len(u.trajectory)) * u.trajectory.dt)
output['time(ps)'] = list(np.arange(len(u.trajectory))[::skip] * u.trajectory.dt)

return output

@@ -151,7 +171,7 @@ def twoD_RMSD(positions, w: Optional[npt.NDArray]) -> list[float]:
----------
positions : np.ndarray
the protein positions for the entire trajectory
w : np.ndarray
w : np.ndarray, optional
weights array
Returns

0 comments on commit b4a4ec8

Please sign in to comment.