From 05ffc371ef07525f690438b18ffbf474a72dea10 Mon Sep 17 00:00:00 2001
From: VGPReys
Date: Wed, 17 Apr 2024 17:13:05 +0200
Subject: [PATCH 01/36] first draft voroscoring module

---
 .../modules/scoring/voroscoring/__init__.py   |  71 +++++
 .../modules/scoring/voroscoring/defaults.yaml |  77 +++++
 .../scoring/voroscoring/voroscoring.py        | 267 ++++++++++++++++++
 3 files changed, 415 insertions(+)
 create mode 100644 src/haddock/modules/scoring/voroscoring/__init__.py
 create mode 100644 src/haddock/modules/scoring/voroscoring/defaults.yaml
 create mode 100644 src/haddock/modules/scoring/voroscoring/voroscoring.py

diff --git a/src/haddock/modules/scoring/voroscoring/__init__.py b/src/haddock/modules/scoring/voroscoring/__init__.py
new file mode 100644
index 0000000000..2c8288c359
--- /dev/null
+++ b/src/haddock/modules/scoring/voroscoring/__init__.py
@@ -0,0 +1,71 @@
+"""
+"""
+from pathlib import Path
+
+from haddock.core.typing import FilePath
+from haddock.modules import get_engine
+from haddock.modules.scoring import ScoringModule
+from haddock.modules.scoring.voroscoring.voroscoring import (
+    VoroMQA,
+    update_models_with_scores,
+    write_models_scores,
+    )
+
+RECIPE_PATH = Path(__file__).resolve().parent
+DEFAULT_CONFIG = Path(RECIPE_PATH, "defaults.yaml")
+
+
+class HaddockModule(ScoringModule):
+    """."""
+
+    name = RECIPE_PATH.name
+
+    def __init__(self,
+                 order: int,
+                 path: Path,
+                 initial_params: FilePath = DEFAULT_CONFIG) -> None:
+        super().__init__(order, path, initial_params=initial_params)
+
+    @classmethod
+    def confirm_installation(cls) -> None:
+        """Confirm module is installed."""
+        # FIXME ? Check if conda env is accessible
+        return
+
+    def _run(self) -> None:
+        """Execute module."""
+        # Retrieve previous models
+        try:
+            models_to_score = self.previous_io.retrieve_models(
+                individualize=True
+                )
+        except Exception as e:
+            self.finish_with_error(e)
+
+        jobs: list[VoroMQA] = []
+        output_fname = f"{RECIPE_PATH.name}_voro.tsv"
+        voromqa = VoroMQA(
+            models_to_score,
+            './',
+            self.params,
+            output_filepath=output_fname,
+            )
+        jobs: list[VoroMQA] = [voromqa]
+
+        # Run CNS Jobs
+        self.log(f"Running Voro-mqa scoring")
+        Engine = get_engine(self.params['mode'], self.params)
+        engine = Engine(jobs)
+        engine.run()
+        self.log("Voro-mqa scoring finished!")
+
+        # Update score of output models
+        self.output_models, models_scores = update_models_with_scores(
+            output_fname,
+            models_to_score,
+            )
+        # Write output file
+        scoring_tsv_fpath = f"{RECIPE_PATH.name}.tsv"
+        write_models_scores(models_scores, scoring_tsv_fpath)
+        # Export to next module
+        self.export_io_models(faulty_tolerance=self.params["tolerance"])
diff --git a/src/haddock/modules/scoring/voroscoring/defaults.yaml b/src/haddock/modules/scoring/voroscoring/defaults.yaml
new file mode 100644
index 0000000000..a33f12fdca
--- /dev/null
+++ b/src/haddock/modules/scoring/voroscoring/defaults.yaml
@@ -0,0 +1,77 @@
+metric:
+  default: jury_score
+  type: string
+  choices:
+    - jury_score
+    - GNN_sum_score
+    - GNN_pcadscore
+    - voromqa_dark
+    - voromqa_light
+    - voromqa_energy
+    - gen_voromqa_energy
+    - clash_score
+    - area
+  minchars: 1
+  maxchars: 50
+  title: VoroMQA metric used to score.
+  short: VoroMQA metric used to score.
+  long: VoroMQA metric used to score.
+  group: analysis
+  explevel: easy
+
+conda_install_dir:
+  default: "/trinity/login/vreys/miniconda3/"
+  type: string
+  minchars: 1
+  maxchars: 158
+  title: Path to conda intall directory.
+  short: Absolute path to conda intall directory.
+  long: Absolute path to conda intall directory.
+  group: execution
+  explevel: easy
+
+conda_env_name:
+  default: "ftdmp5"
+  type: string
+  minchars: 1
+  maxchars: 100
+  title: Name of the ftdmp conda env.
+  short: Name of the ftdmp conda env.
+  long: Name of the ftdmp conda env.
+  group: execution
+  explevel: easy
+
+ftdmp_install_dir:
+  default: "/trinity/login/vreys/Venclovas/ftdmp/"
+  type: string
+  minchars: 1
+  maxchars: 158
+  title: Path to ftdmp intall directory.
+  short: Absolute path to ftdmp intall directory.
+  long: Absolute path to ftdmp intall directory.
+  group: execution
+  explevel: easy
+
+nb_gpus:
+  default: 1
+  type: integer
+  min: 1
+  max: 420
+  title: Number of accessible gpu on the device.
+  short: Number of accessible gpu on the device.
+  long: Number of accessible gpu on the device.
+  group: execution
+  explevel: easy
+
+concat_chain_:
+  default: []
+  type: list
+  minitems: 0
+  maxitems: 100
+  title: List of residues supposed to be buried
+  short: List of residues supposed to be buried
+  long: concat_chain_* is an expandable parameter. You can provide concat_chain_1,
+    concat_chain_2, concat_chain_3, etc. For each selection, enlisted chains will
+    be concatenated as one prior to scoring.
+  group: analysis
+  explevel: expert
diff --git a/src/haddock/modules/scoring/voroscoring/voroscoring.py b/src/haddock/modules/scoring/voroscoring/voroscoring.py
new file mode 100644
index 0000000000..a235984f15
--- /dev/null
+++ b/src/haddock/modules/scoring/voroscoring/voroscoring.py
@@ -0,0 +1,267 @@
+import os
+import subprocess
+import glob
+import time
+
+from random import randint
+
+from haddock import log
+from haddock.core.typing import Any, Path, Union
+from haddock.libs.libontology import PDBFile
+
+
+VOROMQA_CFG_TEMPLATE = """#!/bin/bash
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=1
+#SBATCH --partition=gpu
+#SBATCH --gres=gpu:{GPUID}
+#SBATCH --mem-per-gpu=1GB
+
+# Where to do the work
+WORKDIR="{WORKDIR}"
+
+# Name of the outputfile (.ssv for space separated values)
+OUTPUT_FNAME="voro_scores.ssv"
+
+# Define Constants
+CONDA_INSTALL_DIR="{CONDA_INSTALL_DIR}"
+CONDA_ENV_NAME="{CONDA_ENV_NAME}"
+FTDMP_INSTALL_DIR="{FTDMP_INSTALL_DIR}"
+VOROMQA_SCRIPT="ftdmp-qa-all"
+
+# Define workflow variables
+OUTPUT_FPATH="$WORKDIR/$OUTPUT_FNAME"
+PDB_LIST_PATH="{PDB_LIST_PATH}"
+OUT_MSG="Output file is here: $OUTPUT_FPATH"
+
+# 1. Setup enviroments
+# Load the gnu12 module...
+# NOTE: specific to tintin users...
+module load gnu12
+# Activate conda env
+source "$CONDA_INSTALL_DIR/bin/activate"
+conda activate $CONDA_ENV_NAME
+echo "conda env: $CONDA_PREFIX"
+
+# 2. Setup run directory
+# Create working directory
+mkdir -p $WORKDIR
+
+# 3. Run voro-mqa (model quality assessment)
+# Go to ftdmp install directory
+cd $FTDMP_INSTALL_DIR
+echo "Directory: $PWD"
+# run voro-mqa
+echo "./$VOROMQA_SCRIPT --conda-path $CONDA_INSTALL_DIR --conda-env $CONDA_ENV_NAME --workdir '$WORKDIR' --rank-names 'protein_protein_voromqa_and_global_and_gnn_no_sr' < $PDB_LIST_PATH > $OUTPUT_FPATH"
+./$VOROMQA_SCRIPT --conda-path $CONDA_INSTALL_DIR --conda-env $CONDA_ENV_NAME --workdir $WORKDIR --rank-names 'protein_protein_voromqa_and_global_and_gnn_no_sr' --output-redundancy-threshold 1.0 < $PDB_LIST_PATH > $OUTPUT_FPATH
+# Let the magic happen..
+
+# 4.
Analyze results +# Print final ouput file +echo $OUT_MSG +""" + + +class VoroMQA(): + + def __init__( + self, + models: list[Union[str, Path, PDBFile]], + workdir: Union[str, Path], + params: dict[str, Any], + output_filepath: Union[str, Path] = "voroscoring.tsv", + ): + self.models = models + self.workdir = workdir + self.params = params + self.output_filepath = output_filepath + + def run(self): + # Obtain absolute paths + self.workdir = Path(self.workdir).resolve() + all_pdbs = [ + Path(mdl.path, mdl.file_name).resolve() + for mdl in self.models + ] + # Loop over batches + for bi, batch in enumerate(self.batched(all_pdbs, size=300)): + # Run slurm + self.run_voro_batch( + batch, + batch_index=bi, + gpuid=bi % self.params['nb_gpus'], + ) + # Recombine all batches output files + scores_fpath = self.recombine_batches(self.workdir) + log.info(f"Generated output file: {scores_fpath}") + + def run_voro_batch( + self, + pdb_filepaths, + base_workdir, + batch_index: int = 1, + gpuid: int = -1, + ): + # Create workdir + batch_workdir = Path(base_workdir, f"batch_{batch_index}") + batch_workdir.mkdir(parents=True) + + # Create list of pdb files + pdb_files_list_path = Path(batch_workdir, "pdbs.list") + pdb_files_list_path.write_text(os.linesep.join(pdb_filepaths)) + + # Get GPU id + if gpuid < 0: + gpuid = randint(0, self.params["nb_gpus"] - 1) + + # Format config file + batch_cfg = VOROMQA_CFG_TEMPLATE.format( + CONDA_INSTALL_DIR=self.params["conda_install_dir"], + CONDA_ENV_NAME=self.params["conda_env_name"], + FTDMP_INSTALL_DIR=self.params["ftdmp_install_dir"], + GPUID=gpuid, + WORKDIR=batch_workdir, + PDB_LIST_PATH=pdb_files_list_path, + ) + + # Write it + batch_cfg_fpath = Path(batch_workdir, "vorobatchcfg.job") + batch_cfg_fpath.write_text(batch_cfg) + + # Launch slurm + initdir = os.getcwd() + os.chdir(batch_workdir) + log.info(f"sbatch {batch_cfg_fpath}") + subprocess.run(f"sbatch {batch_cfg_fpath}", shell=True) + os.chdir(initdir) + + def recombine_batches(self) -> str: + # Wait for all results to be obtained + batches_result_paths = self.wait_for_termination() + # Loop over them + all_predictions: list[dict[str, str]] = [] + combined_header: list[str] = [] + for batch_results in batches_result_paths: + # Read voro results + with open(batch_results, 'r') as filin: + header = filin.readline().strip().split(' ') + for head in header: + if head not in combined_header: + combined_header.append(head) + for line in filin: + s_ = line.strip().split(' ') + all_predictions.append({ + head: s_[header.index(head)] + for head in header + }) + + # Sort all batches entries + sorted_entries = sorted( + all_predictions, + key=lambda k: float(k[self.params["metric"]]), + reverse="_energy" not in self.params["metric"], + ) + + # Write final output file + finale_output_fpath = f"{self.workdir}/{self.output_filepath}" + with open(finale_output_fpath, "w") as filout: + header = '\t'.join(combined_header) + filout.write(header+os.linesep) + for entry in sorted_entries: + ordered_data = [ + entry[h] if h in entry.keys() else '-' + for h in combined_header + ] + line = '\t'.join(ordered_data) + filout.write(line+os.linesep) + return finale_output_fpath + + def wait_for_termination(self, wait_time: int = 60): + batches_dirpath = glob.glob(f"{self.workdir}/batch_*/") + while True: + try: + output_files = [] + for batch_dir in batches_dirpath: + expected_outputfile = Path(batch_dir, "voro_scores.ssv") + assert expected_outputfile.exists() + assert expected_outputfile.stat().st_size != 0 + 
output_files.append(expected_outputfile) + except AssertionError as _e: + log.info(f"Waiting {wait_time} sec...") + time.sleep(wait_time) + else: + return output_files + + @staticmethod + def batched(entries: str, size: int = 300): + batch = [] + for pdb in entries: + batch.append(pdb) + if len(batch) == size: + yield batch + batch = [] + yield batch + +def update_models_with_scores( + output_fname: Union[str, Path], + models: list[PDBFile], + metric: str = "jury_score", + ) -> list[PDBFile]: + scores_mapper: dict[str, float] = {} + # Read output file + with open(output_fname, 'r') as filin: + for i, line in enumerate(filin): + s_ = line.strip().split('\t') + # Extract header + if i == 0: + header = s_ + continue + # Extract data + modelpath = s_[header["ID"]] + score = float(s_[header[metric]]) + # Only extract model filename + model_filename = modelpath.split('/')[-1] + # Hold score + scores_mapper[model_filename] = score + + # Compute rankings + ranking_mapper = { + model_filename: rank + for rank, model_filename in enumerate( + sorted( + scores_mapper, + reverse="_energy" not in metric, + ), + start=1, + ) + } + + data_mapper = { + model_filename: { + "score": scores_mapper[model_filename], + "rank": ranking_mapper[model_filename], + } + } + + # Loop over input models + for model in models: + # only modify the model score + model.score = data_mapper[model.file_name]["score"] + model.rank = data_mapper[model.file_name]["rank"] + + return models, data_mapper + +def write_models_scores( + models_scores: dict[str, dict[str, Union[float, int]]], + filepath: Union[str, Path], + ) -> None: + header = ("structure", "original_name", "md5", "score", "rank", ) + with open(filepath, 'w') as filout: + filout.write('\t'.join(header) + os.linesep) + for modelname, scores in sorted( + models_scores, + key=lambda k: models_scores[k]['rank'], + ): + newline_dt = f"{modelname}\t{modelname}\t-\t{scores['score']}\t{scores['rank']}" # noqa : E501 + filout.write(newline_dt + os.linesep) \ No newline at end of file From 1c8e16061bc1f7885d9144a0692bd2219420dbb5 Mon Sep 17 00:00:00 2001 From: VGPReys Date: Thu, 18 Apr 2024 09:01:02 +0200 Subject: [PATCH 02/36] fix writing --- .../modules/scoring/voroscoring/voroscoring.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/haddock/modules/scoring/voroscoring/voroscoring.py b/src/haddock/modules/scoring/voroscoring/voroscoring.py index a235984f15..93d293db79 100644 --- a/src/haddock/modules/scoring/voroscoring/voroscoring.py +++ b/src/haddock/modules/scoring/voroscoring/voroscoring.py @@ -259,9 +259,12 @@ def write_models_scores( header = ("structure", "original_name", "md5", "score", "rank", ) with open(filepath, 'w') as filout: filout.write('\t'.join(header) + os.linesep) - for modelname, scores in sorted( - models_scores, - key=lambda k: models_scores[k]['rank'], - ): + # sort models by keys + sorted_models = sorted( + models_scores, + key=lambda k: models_scores[k]['rank'], + ) + for modelname in sorted_models: + scores = models_scores[modelname] newline_dt = f"{modelname}\t{modelname}\t-\t{scores['score']}\t{scores['rank']}" # noqa : E501 filout.write(newline_dt + os.linesep) \ No newline at end of file From c37370dccf9ddeef90d8ff98d96fbfeb1afbab58 Mon Sep 17 00:00:00 2001 From: VGPReys Date: Thu, 18 Apr 2024 09:35:13 +0200 Subject: [PATCH 03/36] fix types --- .../modules/scoring/voroscoring/__init__.py | 1 - .../scoring/voroscoring/voroscoring.py | 27 ++++++++++--------- 2 files changed, 15 insertions(+), 13 deletions(-) diff 
--git a/src/haddock/modules/scoring/voroscoring/__init__.py b/src/haddock/modules/scoring/voroscoring/__init__.py index 2c8288c359..dab7faaf64 100644 --- a/src/haddock/modules/scoring/voroscoring/__init__.py +++ b/src/haddock/modules/scoring/voroscoring/__init__.py @@ -42,7 +42,6 @@ def _run(self) -> None: except Exception as e: self.finish_with_error(e) - jobs: list[VoroMQA] = [] output_fname = f"{RECIPE_PATH.name}_voro.tsv" voromqa = VoroMQA( models_to_score, diff --git a/src/haddock/modules/scoring/voroscoring/voroscoring.py b/src/haddock/modules/scoring/voroscoring/voroscoring.py index 93d293db79..613845e7b0 100644 --- a/src/haddock/modules/scoring/voroscoring/voroscoring.py +++ b/src/haddock/modules/scoring/voroscoring/voroscoring.py @@ -6,7 +6,7 @@ from random import randint from haddock import log -from haddock.core.typing import Any, Path, Union +from haddock.core.typing import Any, Generator, Path, Union from haddock.libs.libontology import PDBFile @@ -98,11 +98,11 @@ def run(self): def run_voro_batch( self, - pdb_filepaths, - base_workdir, + pdb_filepaths: list[Union[str, Path]], + base_workdir: Union[str, Path], batch_index: int = 1, gpuid: int = -1, - ): + ) -> None: # Create workdir batch_workdir = Path(base_workdir, f"batch_{batch_index}") batch_workdir.mkdir(parents=True) @@ -166,8 +166,8 @@ def recombine_batches(self) -> str: # Write final output file finale_output_fpath = f"{self.workdir}/{self.output_filepath}" with open(finale_output_fpath, "w") as filout: - header = '\t'.join(combined_header) - filout.write(header+os.linesep) + file_header = '\t'.join(combined_header) + filout.write(file_header + os.linesep) for entry in sorted_entries: ordered_data = [ entry[h] if h in entry.keys() else '-' @@ -177,11 +177,14 @@ def recombine_batches(self) -> str: filout.write(line+os.linesep) return finale_output_fpath - def wait_for_termination(self, wait_time: int = 60): + def wait_for_termination( + self, + wait_time: int = 60, + ) -> list[Union[str, Path]]: batches_dirpath = glob.glob(f"{self.workdir}/batch_*/") while True: try: - output_files = [] + output_files: list[Union[str, Path]] = [] for batch_dir in batches_dirpath: expected_outputfile = Path(batch_dir, "voro_scores.ssv") assert expected_outputfile.exists() @@ -194,7 +197,7 @@ def wait_for_termination(self, wait_time: int = 60): return output_files @staticmethod - def batched(entries: str, size: int = 300): + def batched(entries: str, size: int = 300) -> Generator[list, None, None]: batch = [] for pdb in entries: batch.append(pdb) @@ -207,7 +210,7 @@ def update_models_with_scores( output_fname: Union[str, Path], models: list[PDBFile], metric: str = "jury_score", - ) -> list[PDBFile]: + ) -> tuple[list[PDBFile], dict[str, dict[str, float]]]: scores_mapper: dict[str, float] = {} # Read output file with open(output_fname, 'r') as filin: @@ -218,8 +221,8 @@ def update_models_with_scores( header = s_ continue # Extract data - modelpath = s_[header["ID"]] - score = float(s_[header[metric]]) + modelpath = str(s_[header.index("ID")]) + score = float(s_[header.index(metric)]) # Only extract model filename model_filename = modelpath.split('/')[-1] # Hold score From ef856c9dda1ae5c7d716e62475f46324f78dd181 Mon Sep 17 00:00:00 2001 From: VGPReys Date: Thu, 18 Apr 2024 09:45:17 +0200 Subject: [PATCH 04/36] upgrade haddock module init --- .../modules/scoring/voroscoring/__init__.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/haddock/modules/scoring/voroscoring/__init__.py 
b/src/haddock/modules/scoring/voroscoring/__init__.py index dab7faaf64..37081f2677 100644 --- a/src/haddock/modules/scoring/voroscoring/__init__.py +++ b/src/haddock/modules/scoring/voroscoring/__init__.py @@ -20,11 +20,17 @@ class HaddockModule(ScoringModule): name = RECIPE_PATH.name - def __init__(self, - order: int, - path: Path, - initial_params: FilePath = DEFAULT_CONFIG) -> None: - super().__init__(order, path, initial_params=initial_params) + def __init__( + self, + order: int, + path: Path, + *ignore: Any, + init_params: FilePath = DEFAULT_CONFIG, + **everything: Any, + ) -> None: + """Initialize class.""" + super().__init__(order, path, init_params) + @classmethod def confirm_installation(cls) -> None: From 75a69ad5534a5584ddd48d556960c1a9a4d23f5e Mon Sep 17 00:00:00 2001 From: VGPReys Date: Thu, 18 Apr 2024 09:45:55 +0200 Subject: [PATCH 05/36] import Any type --- src/haddock/modules/scoring/voroscoring/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/haddock/modules/scoring/voroscoring/__init__.py b/src/haddock/modules/scoring/voroscoring/__init__.py index 37081f2677..5c5e98c2ce 100644 --- a/src/haddock/modules/scoring/voroscoring/__init__.py +++ b/src/haddock/modules/scoring/voroscoring/__init__.py @@ -2,7 +2,7 @@ """ from pathlib import Path -from haddock.core.typing import FilePath +from haddock.core.typing import Any, FilePath from haddock.modules import get_engine from haddock.modules.scoring import ScoringModule from haddock.modules.scoring.voroscoring.voroscoring import ( From 8ba5ae783ab458ecbc701ed57778277d47052b86 Mon Sep 17 00:00:00 2001 From: VGPReys Date: Thu, 18 Apr 2024 11:27:30 +0200 Subject: [PATCH 06/36] redefine scoring modules classes --- src/haddock/modules/scoring/__init__.py | 6 +++++- src/haddock/modules/scoring/emscoring/__init__.py | 4 ++-- src/haddock/modules/scoring/mdscoring/__init__.py | 4 ++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/haddock/modules/scoring/__init__.py b/src/haddock/modules/scoring/__init__.py index f78f47035f..16a1c3b82d 100644 --- a/src/haddock/modules/scoring/__init__.py +++ b/src/haddock/modules/scoring/__init__.py @@ -3,9 +3,10 @@ from haddock.core.typing import FilePath from haddock.modules.base_cns_module import BaseCNSModule +from haddock.modules import BaseHaddockModule -class ScoringModule(BaseCNSModule): +class ScoringModule(BaseHaddockModule): """Parent class for Scoring modules.""" def output(self, output_fname: FilePath, sep: str = "\t") -> None: @@ -23,3 +24,6 @@ def output(self, output_fname: FilePath, sep: str = "\t") -> None: df_sc_sorted.to_csv(output_fname, sep=sep, index=False, na_rep="None") return + +class CNSScoringModule(BaseCNSModule, ScoringModule): + """Parent class for Scoring modules.""" diff --git a/src/haddock/modules/scoring/emscoring/__init__.py b/src/haddock/modules/scoring/emscoring/__init__.py index ca5c539299..ab92af314d 100644 --- a/src/haddock/modules/scoring/emscoring/__init__.py +++ b/src/haddock/modules/scoring/emscoring/__init__.py @@ -10,14 +10,14 @@ from haddock.libs.libcns import prepare_cns_input, prepare_expected_pdb from haddock.libs.libsubprocess import CNSJob from haddock.modules import get_engine -from haddock.modules.scoring import ScoringModule +from haddock.modules.scoring import CNSScoringModule RECIPE_PATH = Path(__file__).resolve().parent DEFAULT_CONFIG = Path(RECIPE_PATH, "defaults.yaml") -class HaddockModule(ScoringModule): +class HaddockModule(CNSScoringModule): """HADDOCK3 module to perform energy minimization 
scoring.""" name = RECIPE_PATH.name diff --git a/src/haddock/modules/scoring/mdscoring/__init__.py b/src/haddock/modules/scoring/mdscoring/__init__.py index a30582134a..cbe2d0b0f7 100644 --- a/src/haddock/modules/scoring/mdscoring/__init__.py +++ b/src/haddock/modules/scoring/mdscoring/__init__.py @@ -9,14 +9,14 @@ from haddock.libs.libcns import prepare_cns_input, prepare_expected_pdb from haddock.libs.libsubprocess import CNSJob from haddock.modules import get_engine -from haddock.modules.scoring import ScoringModule +from haddock.modules.scoring import CNSScoringModule RECIPE_PATH = Path(__file__).resolve().parent DEFAULT_CONFIG = Path(RECIPE_PATH, "defaults.yaml") -class HaddockModule(ScoringModule): +class HaddockModule(CNSScoringModule): """HADDOCK3 module to perform energy minimization scoring.""" name = RECIPE_PATH.name From 232035721c643d4951db4116c0741c7d1a12ffaf Mon Sep 17 00:00:00 2001 From: VGPReys Date: Thu, 18 Apr 2024 11:35:54 +0200 Subject: [PATCH 07/36] add output attribute --- src/haddock/modules/scoring/voroscoring/voroscoring.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/haddock/modules/scoring/voroscoring/voroscoring.py b/src/haddock/modules/scoring/voroscoring/voroscoring.py index 613845e7b0..a4313c1d18 100644 --- a/src/haddock/modules/scoring/voroscoring/voroscoring.py +++ b/src/haddock/modules/scoring/voroscoring/voroscoring.py @@ -75,7 +75,7 @@ def __init__( self.models = models self.workdir = workdir self.params = params - self.output_filepath = output_filepath + self.output = output_filepath def run(self): # Obtain absolute paths From 6757ae95c98a7d5bd85d83c0f35c81aa6a19bdd6 Mon Sep 17 00:00:00 2001 From: VGPReys Date: Thu, 18 Apr 2024 11:57:49 +0200 Subject: [PATCH 08/36] output Path type --- src/haddock/modules/scoring/voroscoring/__init__.py | 4 ++-- src/haddock/modules/scoring/voroscoring/voroscoring.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/haddock/modules/scoring/voroscoring/__init__.py b/src/haddock/modules/scoring/voroscoring/__init__.py index 5c5e98c2ce..cc98d2e510 100644 --- a/src/haddock/modules/scoring/voroscoring/__init__.py +++ b/src/haddock/modules/scoring/voroscoring/__init__.py @@ -48,12 +48,12 @@ def _run(self) -> None: except Exception as e: self.finish_with_error(e) - output_fname = f"{RECIPE_PATH.name}_voro.tsv" + output_fname = Path(f"{RECIPE_PATH.name}_voro.tsv") voromqa = VoroMQA( models_to_score, './', self.params, - output_filepath=output_fname, + output=output_fname, ) jobs: list[VoroMQA] = [voromqa] diff --git a/src/haddock/modules/scoring/voroscoring/voroscoring.py b/src/haddock/modules/scoring/voroscoring/voroscoring.py index a4313c1d18..97e54371df 100644 --- a/src/haddock/modules/scoring/voroscoring/voroscoring.py +++ b/src/haddock/modules/scoring/voroscoring/voroscoring.py @@ -70,7 +70,7 @@ def __init__( models: list[Union[str, Path, PDBFile]], workdir: Union[str, Path], params: dict[str, Any], - output_filepath: Union[str, Path] = "voroscoring.tsv", + output_filepath: Path = Path("voroscoring.tsv"), ): self.models = models self.workdir = workdir From 86016499584ed02607d54c94c0390693b98fb516 Mon Sep 17 00:00:00 2001 From: VGPReys Date: Thu, 18 Apr 2024 11:59:13 +0200 Subject: [PATCH 09/36] output var name --- src/haddock/modules/scoring/voroscoring/voroscoring.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/haddock/modules/scoring/voroscoring/voroscoring.py b/src/haddock/modules/scoring/voroscoring/voroscoring.py index 
97e54371df..bb9e54c1ab 100644 --- a/src/haddock/modules/scoring/voroscoring/voroscoring.py +++ b/src/haddock/modules/scoring/voroscoring/voroscoring.py @@ -70,12 +70,12 @@ def __init__( models: list[Union[str, Path, PDBFile]], workdir: Union[str, Path], params: dict[str, Any], - output_filepath: Path = Path("voroscoring.tsv"), + output: Path = Path("voroscoring.tsv"), ): self.models = models self.workdir = workdir self.params = params - self.output = output_filepath + self.output = output def run(self): # Obtain absolute paths @@ -164,7 +164,7 @@ def recombine_batches(self) -> str: ) # Write final output file - finale_output_fpath = f"{self.workdir}/{self.output_filepath}" + finale_output_fpath = f"{self.workdir}/{self.output}" with open(finale_output_fpath, "w") as filout: file_header = '\t'.join(combined_header) filout.write(file_header + os.linesep) From e1ab6c8fa15fd5c80409e044b658c57f3bec710c Mon Sep 17 00:00:00 2001 From: VGPReys Date: Thu, 18 Apr 2024 12:00:55 +0200 Subject: [PATCH 10/36] get base_workdir from class attribute --- src/haddock/modules/scoring/voroscoring/voroscoring.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/haddock/modules/scoring/voroscoring/voroscoring.py b/src/haddock/modules/scoring/voroscoring/voroscoring.py index bb9e54c1ab..6bead42eaf 100644 --- a/src/haddock/modules/scoring/voroscoring/voroscoring.py +++ b/src/haddock/modules/scoring/voroscoring/voroscoring.py @@ -99,12 +99,11 @@ def run(self): def run_voro_batch( self, pdb_filepaths: list[Union[str, Path]], - base_workdir: Union[str, Path], batch_index: int = 1, gpuid: int = -1, ) -> None: # Create workdir - batch_workdir = Path(base_workdir, f"batch_{batch_index}") + batch_workdir = Path(self.workdir, f"batch_{batch_index}") batch_workdir.mkdir(parents=True) # Create list of pdb files From 44b41e0cc4813f537f5539cabf8145bfb98d3787 Mon Sep 17 00:00:00 2001 From: VGPReys Date: Thu, 18 Apr 2024 12:09:27 +0200 Subject: [PATCH 11/36] Path to str for .join() method --- src/haddock/modules/scoring/voroscoring/voroscoring.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/haddock/modules/scoring/voroscoring/voroscoring.py b/src/haddock/modules/scoring/voroscoring/voroscoring.py index 6bead42eaf..123c43119c 100644 --- a/src/haddock/modules/scoring/voroscoring/voroscoring.py +++ b/src/haddock/modules/scoring/voroscoring/voroscoring.py @@ -81,7 +81,7 @@ def run(self): # Obtain absolute paths self.workdir = Path(self.workdir).resolve() all_pdbs = [ - Path(mdl.path, mdl.file_name).resolve() + str(Path(mdl.path, mdl.file_name).resolve()) for mdl in self.models ] # Loop over batches @@ -89,7 +89,7 @@ def run(self): # Run slurm self.run_voro_batch( batch, - batch_index=bi, + batch_index=bi + 1, gpuid=bi % self.params['nb_gpus'], ) # Recombine all batches output files @@ -98,7 +98,7 @@ def run(self): def run_voro_batch( self, - pdb_filepaths: list[Union[str, Path]], + pdb_filepaths: list[str], batch_index: int = 1, gpuid: int = -1, ) -> None: From 1e9d6124563def46f984cb66f626c7060f448b85 Mon Sep 17 00:00:00 2001 From: VGPReys Date: Thu, 18 Apr 2024 12:11:24 +0200 Subject: [PATCH 12/36] voro scoring example --- examples/scoring/voroscoring-test.cfg | 30 +++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 examples/scoring/voroscoring-test.cfg diff --git a/examples/scoring/voroscoring-test.cfg b/examples/scoring/voroscoring-test.cfg new file mode 100644 index 0000000000..1075666559 --- /dev/null +++ b/examples/scoring/voroscoring-test.cfg @@ 
-0,0 +1,30 @@ +# ==================================================================== +# Scoring example + +# directory in which the scoring will be done +run_dir = "run1-voroscoring-test" +clean = false + +# execution mode +ncores = 40 +mode = "local" + +# ensemble of different complexes to be scored +molecules = ["data/T161-rescoring-ens.pdb", + "data/HY3.pdb", + "data/protein-dna_1w.pdb", + "data/protein-protein_1w.pdb", + "data/protein-protein_2w.pdb", + "data/protein-trimer_1w.pdb" + ] + +# ==================================================================== +# Parameters for each stage are defined below + +[topoaa] + +[voroscoring] + +[caprieval] + +# ==================================================================== From 042f4841ddf5e77d796690d9ae12bfb618d338d8 Mon Sep 17 00:00:00 2001 From: VGPReys Date: Thu, 18 Apr 2024 12:20:35 +0200 Subject: [PATCH 13/36] output tsv file writing from self.output() --- src/haddock/modules/scoring/__init__.py | 9 +++++++-- .../modules/scoring/voroscoring/__init__.py | 6 +++++- .../modules/scoring/voroscoring/voroscoring.py | 17 ----------------- 3 files changed, 12 insertions(+), 20 deletions(-) diff --git a/src/haddock/modules/scoring/__init__.py b/src/haddock/modules/scoring/__init__.py index 16a1c3b82d..cd57c26200 100644 --- a/src/haddock/modules/scoring/__init__.py +++ b/src/haddock/modules/scoring/__init__.py @@ -9,7 +9,12 @@ class ScoringModule(BaseHaddockModule): """Parent class for Scoring modules.""" - def output(self, output_fname: FilePath, sep: str = "\t") -> None: + def output( + self, + output_fname: FilePath, + sep: str = "\t", + ascending_sort: bool = True, + ) -> None: """Save the output in comprehensive tables.""" # saves scoring data sc_data = [] @@ -19,7 +24,7 @@ def output(self, output_fname: FilePath, sep: str = "\t") -> None: # converts to pandas dataframe and sorts by score df_columns = ["structure", "original_name", "md5", "score"] df_sc = pd.DataFrame(sc_data, columns=df_columns) - df_sc_sorted = df_sc.sort_values(by="score", ascending=True) + df_sc_sorted = df_sc.sort_values(by="score", ascending=ascending_sort) # writes to disk df_sc_sorted.to_csv(output_fname, sep=sep, index=False, na_rep="None") diff --git a/src/haddock/modules/scoring/voroscoring/__init__.py b/src/haddock/modules/scoring/voroscoring/__init__.py index cc98d2e510..89a2f65cf8 100644 --- a/src/haddock/modules/scoring/voroscoring/__init__.py +++ b/src/haddock/modules/scoring/voroscoring/__init__.py @@ -68,9 +68,13 @@ def _run(self) -> None: self.output_models, models_scores = update_models_with_scores( output_fname, models_to_score, + metric=self.params["metric"], ) # Write output file scoring_tsv_fpath = f"{RECIPE_PATH.name}.tsv" - write_models_scores(models_scores, scoring_tsv_fpath) + self.output( + scoring_tsv_fpath, + ascending_sort="_energy" in self.params["metric"], + ) # Export to next module self.export_io_models(faulty_tolerance=self.params["tolerance"]) diff --git a/src/haddock/modules/scoring/voroscoring/voroscoring.py b/src/haddock/modules/scoring/voroscoring/voroscoring.py index 123c43119c..beb9979ea8 100644 --- a/src/haddock/modules/scoring/voroscoring/voroscoring.py +++ b/src/haddock/modules/scoring/voroscoring/voroscoring.py @@ -253,20 +253,3 @@ def update_models_with_scores( model.rank = data_mapper[model.file_name]["rank"] return models, data_mapper - -def write_models_scores( - models_scores: dict[str, dict[str, Union[float, int]]], - filepath: Union[str, Path], - ) -> None: - header = ("structure", "original_name", "md5", "score", 
"rank", ) - with open(filepath, 'w') as filout: - filout.write('\t'.join(header) + os.linesep) - # sort models by keys - sorted_models = sorted( - models_scores, - key=lambda k: models_scores[k]['rank'], - ) - for modelname in sorted_models: - scores = models_scores[modelname] - newline_dt = f"{modelname}\t{modelname}\t-\t{scores['score']}\t{scores['rank']}" # noqa : E501 - filout.write(newline_dt + os.linesep) \ No newline at end of file From 319056d4a726a56fef80e237dee5fe7c3ff5d6a0 Mon Sep 17 00:00:00 2001 From: VGPReys Date: Thu, 18 Apr 2024 12:21:47 +0200 Subject: [PATCH 14/36] remove import --- src/haddock/modules/scoring/voroscoring/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/haddock/modules/scoring/voroscoring/__init__.py b/src/haddock/modules/scoring/voroscoring/__init__.py index 89a2f65cf8..2a882b526c 100644 --- a/src/haddock/modules/scoring/voroscoring/__init__.py +++ b/src/haddock/modules/scoring/voroscoring/__init__.py @@ -8,7 +8,6 @@ from haddock.modules.scoring.voroscoring.voroscoring import ( VoroMQA, update_models_with_scores, - write_models_scores, ) RECIPE_PATH = Path(__file__).resolve().parent From 5c05779ef71a8971973be5790dd70fafaeae727b Mon Sep 17 00:00:00 2001 From: VGPReys Date: Thu, 18 Apr 2024 12:26:33 +0200 Subject: [PATCH 15/36] solve error in recombine arguments --- src/haddock/modules/scoring/voroscoring/voroscoring.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/haddock/modules/scoring/voroscoring/voroscoring.py b/src/haddock/modules/scoring/voroscoring/voroscoring.py index beb9979ea8..155aa1372e 100644 --- a/src/haddock/modules/scoring/voroscoring/voroscoring.py +++ b/src/haddock/modules/scoring/voroscoring/voroscoring.py @@ -93,7 +93,7 @@ def run(self): gpuid=bi % self.params['nb_gpus'], ) # Recombine all batches output files - scores_fpath = self.recombine_batches(self.workdir) + scores_fpath = self.recombine_batches() log.info(f"Generated output file: {scores_fpath}") def run_voro_batch( @@ -181,6 +181,7 @@ def wait_for_termination( wait_time: int = 60, ) -> list[Union[str, Path]]: batches_dirpath = glob.glob(f"{self.workdir}/batch_*/") + log.info(f"Waiting for voro-mqa predictions to finish...") while True: try: output_files: list[Union[str, Path]] = [] @@ -190,9 +191,13 @@ def wait_for_termination( assert expected_outputfile.stat().st_size != 0 output_files.append(expected_outputfile) except AssertionError as _e: - log.info(f"Waiting {wait_time} sec...") + log.info(f"Waiting {wait_time} sec more...") time.sleep(wait_time) else: + log.info( + "VoroMQA results are accessible: " + f"{len(output_files)} batches" + ) return output_files @staticmethod From 0dd02ba60b6b450a34d908053abda3934373f978 Mon Sep 17 00:00:00 2001 From: VGPReys Date: Thu, 18 Apr 2024 14:03:19 +0200 Subject: [PATCH 16/36] finetunings --- .../modules/scoring/voroscoring/__init__.py | 8 +++--- .../scoring/voroscoring/voroscoring.py | 25 ++++++++----------- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/src/haddock/modules/scoring/voroscoring/__init__.py b/src/haddock/modules/scoring/voroscoring/__init__.py index 2a882b526c..4b163b0ea7 100644 --- a/src/haddock/modules/scoring/voroscoring/__init__.py +++ b/src/haddock/modules/scoring/voroscoring/__init__.py @@ -47,6 +47,7 @@ def _run(self) -> None: except Exception as e: self.finish_with_error(e) + # Initiate VoroMQA object output_fname = Path(f"{RECIPE_PATH.name}_voro.tsv") voromqa = VoroMQA( models_to_score, @@ -54,8 +55,9 @@ def _run(self) -> None: self.params, 
output=output_fname, ) - jobs: list[VoroMQA] = [voromqa] + # Launch machinery + jobs: list[VoroMQA] = [voromqa] # Run CNS Jobs self.log(f"Running Voro-mqa scoring") Engine = get_engine(self.params['mode'], self.params) @@ -64,7 +66,7 @@ def _run(self) -> None: self.log("Voro-mqa scoring finished!") # Update score of output models - self.output_models, models_scores = update_models_with_scores( + self.output_models = update_models_with_scores( output_fname, models_to_score, metric=self.params["metric"], @@ -76,4 +78,4 @@ def _run(self) -> None: ascending_sort="_energy" in self.params["metric"], ) # Export to next module - self.export_io_models(faulty_tolerance=self.params["tolerance"]) + self.export_io_models() diff --git a/src/haddock/modules/scoring/voroscoring/voroscoring.py b/src/haddock/modules/scoring/voroscoring/voroscoring.py index 155aa1372e..a56ec2a685 100644 --- a/src/haddock/modules/scoring/voroscoring/voroscoring.py +++ b/src/haddock/modules/scoring/voroscoring/voroscoring.py @@ -7,7 +7,7 @@ from haddock import log from haddock.core.typing import Any, Generator, Path, Union -from haddock.libs.libontology import PDBFile +from haddock.libs.libontology import NaN, PDBFile VOROMQA_CFG_TEMPLATE = """#!/bin/bash @@ -214,7 +214,7 @@ def update_models_with_scores( output_fname: Union[str, Path], models: list[PDBFile], metric: str = "jury_score", - ) -> tuple[list[PDBFile], dict[str, dict[str, float]]]: + ) -> list[PDBFile]: scores_mapper: dict[str, float] = {} # Read output file with open(output_fname, 'r') as filin: @@ -244,17 +244,14 @@ def update_models_with_scores( ) } - data_mapper = { - model_filename: { - "score": scores_mapper[model_filename], - "rank": ranking_mapper[model_filename], - } - } - # Loop over input models for model in models: - # only modify the model score - model.score = data_mapper[model.file_name]["score"] - model.rank = data_mapper[model.file_name]["rank"] - - return models, data_mapper + # Add score and rank as attribute + if model.file_name in scores_mapper.keys(): + model.score = scores_mapper[model.file_name] + model.rank = ranking_mapper[model.file_name] + else: + # Go for cheese nan + model.score = NaN + model.rank = NaN + return models From b431e6a922078c0c01761a72275ce41650d897d8 Mon Sep 17 00:00:00 2001 From: VGPReys Date: Thu, 18 Apr 2024 14:58:15 +0200 Subject: [PATCH 17/36] tidy types and lints --- src/haddock/modules/scoring/__init__.py | 15 ++- .../modules/scoring/emscoring/__init__.py | 3 +- .../modules/scoring/mdscoring/__init__.py | 4 +- .../modules/scoring/voroscoring/__init__.py | 17 ++- .../scoring/voroscoring/voroscoring.py | 111 ++++++++++++++++-- 5 files changed, 128 insertions(+), 22 deletions(-) diff --git a/src/haddock/modules/scoring/__init__.py b/src/haddock/modules/scoring/__init__.py index cd57c26200..cbf1d31dcd 100644 --- a/src/haddock/modules/scoring/__init__.py +++ b/src/haddock/modules/scoring/__init__.py @@ -15,7 +15,17 @@ def output( sep: str = "\t", ascending_sort: bool = True, ) -> None: - """Save the output in comprehensive tables.""" + r"""Save the output in comprehensive tables. + + Parameters + ---------- + output_fname : FilePath + Path to the file where to write scoring data. 
+ sep : str, optional + Charater used as separator in file, by default "\t" + ascending_sort : bool, optional + Should the data be sorted in ascending order, by default True + """ # saves scoring data sc_data = [] for pdb in self.output_models: @@ -30,5 +40,6 @@ def output( return + class CNSScoringModule(BaseCNSModule, ScoringModule): - """Parent class for Scoring modules.""" + """Parent class for CNS Scoring modules.""" diff --git a/src/haddock/modules/scoring/emscoring/__init__.py b/src/haddock/modules/scoring/emscoring/__init__.py index ab92af314d..39aec86919 100644 --- a/src/haddock/modules/scoring/emscoring/__init__.py +++ b/src/haddock/modules/scoring/emscoring/__init__.py @@ -1,7 +1,8 @@ """EM scoring module. This module performs energy minimization and scoring of the models generated -in the previous step of the workflow. No restraints are applied during this step. +in the previous step of the workflow. +Note that no restraints (AIRs) are applied during this step. """ from pathlib import Path diff --git a/src/haddock/modules/scoring/mdscoring/__init__.py b/src/haddock/modules/scoring/mdscoring/__init__.py index cbe2d0b0f7..fff6d1ad94 100644 --- a/src/haddock/modules/scoring/mdscoring/__init__.py +++ b/src/haddock/modules/scoring/mdscoring/__init__.py @@ -1,7 +1,9 @@ """MD scoring module. This module will perform a short MD simulation on the input models and -score them. No restraints are applied during this step.""" +score them. +Note that no restraints (AIRs) are applied during this step. +""" from pathlib import Path from haddock.core.typing import FilePath diff --git a/src/haddock/modules/scoring/voroscoring/__init__.py b/src/haddock/modules/scoring/voroscoring/__init__.py index 4b163b0ea7..55eb8da730 100644 --- a/src/haddock/modules/scoring/voroscoring/__init__.py +++ b/src/haddock/modules/scoring/voroscoring/__init__.py @@ -1,4 +1,10 @@ -""" +"""Voro scoring module. + +This module performs scoring of input pdb models using ftdmp voro-mqa-all tool. +For more information, please check: https://github.com/kliment-olechnovic/ftdmp + +It is a third party module, and requires the appropriate set up and intallation +for it to run without issue. """ from pathlib import Path @@ -30,11 +36,12 @@ def __init__( """Initialize class.""" super().__init__(order, path, init_params) - @classmethod def confirm_installation(cls) -> None: """Confirm module is installed.""" - # FIXME ? Check if conda env is accessible + # FIXME ? Check : + # - if conda env is accessible + # - if ftdmp is accessible return def _run(self) -> None: @@ -58,8 +65,8 @@ def _run(self) -> None: # Launch machinery jobs: list[VoroMQA] = [voromqa] - # Run CNS Jobs - self.log(f"Running Voro-mqa scoring") + # Run Job(s) + self.log("Running Voro-mqa scoring") Engine = get_engine(self.params['mode'], self.params) engine = Engine(jobs) engine.run() diff --git a/src/haddock/modules/scoring/voroscoring/voroscoring.py b/src/haddock/modules/scoring/voroscoring/voroscoring.py index a56ec2a685..ea5750970a 100644 --- a/src/haddock/modules/scoring/voroscoring/voroscoring.py +++ b/src/haddock/modules/scoring/voroscoring/voroscoring.py @@ -1,3 +1,12 @@ +"""Voro scoring class. + +This class holds all the machinery to perform scoring of input pdb models using +ftdmp voro-mqa-all tool. +For more information, please check: https://github.com/kliment-olechnovic/ftdmp + +It is a third party module, and requires the appropriate set up and intallation +for it to run without issue. 
+""" import os import subprocess import glob @@ -60,24 +69,39 @@ # 4. Analyze results # Print final ouput file echo $OUT_MSG -""" +""" # noqa : E501 class VoroMQA(): + """The Haddock3 implementation of voro-mqa-all as a python class.""" def __init__( self, - models: list[Union[str, Path, PDBFile]], + models: list[PDBFile], workdir: Union[str, Path], params: dict[str, Any], - output: Path = Path("voroscoring.tsv"), + output: Union[str, Path] = "voroscoring_voro.tsv", ): + """Init of the VoroMQA class. + + Parameters + ---------- + models : list[PDBFile] + List of input PDB files to be scored. + workdir : Union[str, Path] + Where to do the process. + params : dict[str, Any] + Config file parameters + output : Path, optional + Name of the generated file, by default Path("voroscoring_voro.tsv") + """ self.models = models self.workdir = workdir self.params = params - self.output = output + self.output = Path(output) def run(self): + """Process class logic.""" # Obtain absolute paths self.workdir = Path(self.workdir).resolve() all_pdbs = [ @@ -102,6 +126,17 @@ def run_voro_batch( batch_index: int = 1, gpuid: int = -1, ) -> None: + """Preset and launch predictions on subset of pdb files. + + Parameters + ---------- + pdb_filepaths : list[str] + List of absolute path to the PDBs to score + batch_index : int, optional + Index of the batch, by default 1 + gpuid : int, optional + Index of the GPU to use, by default -1 + """ # Create workdir batch_workdir = Path(self.workdir, f"batch_{batch_index}") batch_workdir.mkdir(parents=True) @@ -136,6 +171,13 @@ def run_voro_batch( os.chdir(initdir) def recombine_batches(self) -> str: + """Recombine batches output file in a single one. + + Returns + ------- + finale_output_fpath : str + Filepath of the recombined scores + """ # Wait for all results to be obtained batches_result_paths = self.wait_for_termination() # Loop over them @@ -173,24 +215,33 @@ def recombine_batches(self) -> str: for h in combined_header ] line = '\t'.join(ordered_data) - filout.write(line+os.linesep) + filout.write(line + os.linesep) return finale_output_fpath - def wait_for_termination( - self, - wait_time: int = 60, - ) -> list[Union[str, Path]]: + def wait_for_termination(self, wait_time: int = 60) -> list[Path]: + """Wait until all results are accessible. + + Parameters + ---------- + wait_time : int, optional + Time in second between every termination checks, by default 60 + + Returns + ------- + output_files : list[Path] + List of voro scores results for every batches. + """ batches_dirpath = glob.glob(f"{self.workdir}/batch_*/") - log.info(f"Waiting for voro-mqa predictions to finish...") + log.info("Waiting for voro-mqa predictions to finish...") while True: try: - output_files: list[Union[str, Path]] = [] + output_files: list[Path] = [] for batch_dir in batches_dirpath: expected_outputfile = Path(batch_dir, "voro_scores.ssv") assert expected_outputfile.exists() assert expected_outputfile.stat().st_size != 0 output_files.append(expected_outputfile) - except AssertionError as _e: + except AssertionError: log.info(f"Waiting {wait_time} sec more...") time.sleep(wait_time) else: @@ -201,7 +252,24 @@ def wait_for_termination( return output_files @staticmethod - def batched(entries: str, size: int = 300) -> Generator[list, None, None]: + def batched( + entries: list[str], + size: int = 300, + ) -> Generator[list[str], None, None]: + """Generate batches of defined size. + + Parameters + ---------- + entries : list[str] + List of pdb files. 
+ size : int, optional + Maximum size in every batch, by default 300 + + Yields + ------ + batch : Generator[list[str], None, None] + List of pdb files <= size. + """ batch = [] for pdb in entries: batch.append(pdb) @@ -210,11 +278,28 @@ def batched(entries: str, size: int = 300) -> Generator[list, None, None]: batch = [] yield batch + def update_models_with_scores( output_fname: Union[str, Path], models: list[PDBFile], metric: str = "jury_score", ) -> list[PDBFile]: + """Update PDBfiles with computed scores. + + Parameters + ---------- + output_fname : Union[str, Path] + Path to the file where to access scoring data. + models : list[PDBFile] + List of PDBFiles to be updated. + metric : str, optional + Name of the metric to be retrieved, by default "jury_score" + + Returns + ------- + models : list[PDBFile] + The updated list of PDBfiles now holding the score and rank attributes. + """ scores_mapper: dict[str, float] = {} # Read output file with open(output_fname, 'r') as filin: From d8a1322d9fe0b835ee204848d3d8d241473033f8 Mon Sep 17 00:00:00 2001 From: VGPReys Date: Thu, 18 Apr 2024 15:26:56 +0200 Subject: [PATCH 18/36] reversing scores for systematic ascenting sorting --- src/haddock/modules/scoring/voroscoring/__init__.py | 5 +---- src/haddock/modules/scoring/voroscoring/voroscoring.py | 8 ++++---- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/src/haddock/modules/scoring/voroscoring/__init__.py b/src/haddock/modules/scoring/voroscoring/__init__.py index 55eb8da730..528d85b5d9 100644 --- a/src/haddock/modules/scoring/voroscoring/__init__.py +++ b/src/haddock/modules/scoring/voroscoring/__init__.py @@ -80,9 +80,6 @@ def _run(self) -> None: ) # Write output file scoring_tsv_fpath = f"{RECIPE_PATH.name}.tsv" - self.output( - scoring_tsv_fpath, - ascending_sort="_energy" in self.params["metric"], - ) + self.output(scoring_tsv_fpath) # Export to next module self.export_io_models() diff --git a/src/haddock/modules/scoring/voroscoring/voroscoring.py b/src/haddock/modules/scoring/voroscoring/voroscoring.py index ea5750970a..e5ab36ba81 100644 --- a/src/haddock/modules/scoring/voroscoring/voroscoring.py +++ b/src/haddock/modules/scoring/voroscoring/voroscoring.py @@ -314,6 +314,9 @@ def update_models_with_scores( score = float(s_[header.index(metric)]) # Only extract model filename model_filename = modelpath.split('/')[-1] + # Reverse score if not an energy + if "_energy" not in metric: + score = -score # Hold score scores_mapper[model_filename] = score @@ -321,10 +324,7 @@ def update_models_with_scores( ranking_mapper = { model_filename: rank for rank, model_filename in enumerate( - sorted( - scores_mapper, - reverse="_energy" not in metric, - ), + sorted(scores_mapper), start=1, ) } From 994bea4ef82fe0a4790430644e2648c1a56d645d Mon Sep 17 00:00:00 2001 From: VGPReys Date: Fri, 19 Apr 2024 09:08:31 +0200 Subject: [PATCH 19/36] adding tests --- .../modules/scoring/voroscoring/__init__.py | 16 +- .../scoring/voroscoring/voroscoring.py | 34 ++-- tests/test_module_voroscoring.py | 163 ++++++++++++++++++ 3 files changed, 193 insertions(+), 20 deletions(-) create mode 100644 tests/test_module_voroscoring.py diff --git a/src/haddock/modules/scoring/voroscoring/__init__.py b/src/haddock/modules/scoring/voroscoring/__init__.py index 528d85b5d9..b2e9991ac6 100644 --- a/src/haddock/modules/scoring/voroscoring/__init__.py +++ b/src/haddock/modules/scoring/voroscoring/__init__.py @@ -55,7 +55,7 @@ def _run(self) -> None: self.finish_with_error(e) # Initiate VoroMQA object - output_fname = 
Path(f"{RECIPE_PATH.name}_voro.tsv") + output_fname = Path("voro_mqa_all.tsv") voromqa = VoroMQA( models_to_score, './', @@ -73,11 +73,15 @@ def _run(self) -> None: self.log("Voro-mqa scoring finished!") # Update score of output models - self.output_models = update_models_with_scores( - output_fname, - models_to_score, - metric=self.params["metric"], - ) + try: + self.output_models = update_models_with_scores( + output_fname, + models_to_score, + metric=self.params["metric"], + ) + except ValueError as e: + self.finish_with_error(e) + # Write output file scoring_tsv_fpath = f"{RECIPE_PATH.name}.tsv" self.output(scoring_tsv_fpath) diff --git a/src/haddock/modules/scoring/voroscoring/voroscoring.py b/src/haddock/modules/scoring/voroscoring/voroscoring.py index e5ab36ba81..d8c3a5f77f 100644 --- a/src/haddock/modules/scoring/voroscoring/voroscoring.py +++ b/src/haddock/modules/scoring/voroscoring/voroscoring.py @@ -218,7 +218,7 @@ def recombine_batches(self) -> str: filout.write(line + os.linesep) return finale_output_fpath - def wait_for_termination(self, wait_time: int = 60) -> list[Path]: + def wait_for_termination(self, wait_time: float = 60) -> list[Path]: """Wait until all results are accessible. Parameters @@ -232,7 +232,10 @@ def wait_for_termination(self, wait_time: int = 60) -> list[Path]: List of voro scores results for every batches. """ batches_dirpath = glob.glob(f"{self.workdir}/batch_*/") - log.info("Waiting for voro-mqa predictions to finish...") + log.info( + f"Waiting for {len(batches_dirpath)} " + "voro-mqa prediction batch(es) to finish..." + ) while True: try: output_files: list[Path] = [] @@ -247,7 +250,7 @@ def wait_for_termination(self, wait_time: int = 60) -> list[Path]: else: log.info( "VoroMQA results are accessible: " - f"{len(output_files)} batches" + f"{len(output_files)} batch(es)" ) return output_files @@ -276,11 +279,12 @@ def batched( if len(batch) == size: yield batch batch = [] - yield batch + if batch: + yield batch def update_models_with_scores( - output_fname: Union[str, Path], + voro_scoring_fname: Union[str, Path], models: list[PDBFile], metric: str = "jury_score", ) -> list[PDBFile]: @@ -301,8 +305,10 @@ def update_models_with_scores( The updated list of PDBfiles now holding the score and rank attributes. 
""" scores_mapper: dict[str, float] = {} + ranking_mapper: dict[str, int] = {} + rank: int = 0 # Read output file - with open(output_fname, 'r') as filin: + with open(voro_scoring_fname, 'r') as filin: for i, line in enumerate(filin): s_ = line.strip().split('\t') # Extract header @@ -319,15 +325,14 @@ def update_models_with_scores( score = -score # Hold score scores_mapper[model_filename] = score + rank += 1 + ranking_mapper[model_filename] = rank # Compute rankings - ranking_mapper = { - model_filename: rank - for rank, model_filename in enumerate( - sorted(scores_mapper), - start=1, - ) - } + #ranking_mapper = { + # model_filename: rank + # for rank, model_filename in enumerate(sorted(scores_mapper), start=1) + # } # Loop over input models for model in models: @@ -335,8 +340,9 @@ def update_models_with_scores( if model.file_name in scores_mapper.keys(): model.score = scores_mapper[model.file_name] model.rank = ranking_mapper[model.file_name] + # In some cases computation may fail else: - # Go for cheese nan + # Go for (garlic) cheese naans model.score = NaN model.rank = NaN return models diff --git a/tests/test_module_voroscoring.py b/tests/test_module_voroscoring.py new file mode 100644 index 0000000000..d17fa9b02c --- /dev/null +++ b/tests/test_module_voroscoring.py @@ -0,0 +1,163 @@ +"""Test the voroscoring module.""" +import os +import pytest +import pytest_mock # noqa : F401 +import tempfile +import subprocess +import shutil + +from numpy import isnan +from pathlib import Path + +from haddock.libs.libontology import NaN, PDBFile +from haddock.modules.scoring.voroscoring import ( + DEFAULT_CONFIG as params, + HaddockModule as VoroScoringModule, + ) +from haddock.modules.scoring.voroscoring.voroscoring import ( + VoroMQA, + update_models_with_scores, + ) + +from . 
import golden_data + + +@pytest.fixture +def output_models(): + """Prot-DNA models using for emscoring output.""" + return [ + PDBFile( + Path(golden_data, "protdna_complex_1.pdb"), + path=golden_data, + score=-0.28, + ), + PDBFile( + Path(golden_data, "protdna_complex_2.pdb"), + path=golden_data, + score=-0.42, + ), + PDBFile( + Path(golden_data, "protdna_complex_3.pdb"), + path=golden_data, + score=NaN, + ), + ] + + +@pytest.fixture +def voromqa(output_models): + with tempfile.TemporaryDirectory(dir=".") as tmpdir: + voromqa_object = VoroMQA( + output_models, + tmpdir, + params, + Path("raw_voromqa_scores.tsv"), + ) + yield voromqa_object + + +def test_voroscoring_output(output_models): + """Test voroscoring expected output.""" + voro_module = VoroScoringModule( + order=1, + path=Path("1_voroscoring"), + initial_params=params + ) + # original names + voro_module.output_models = output_models + for mod in range(len(output_models)): + ori_name = "original_name_" + str(mod) + ".pdb" + voro_module.output_models[mod].ori_name = ori_name + # creating output + output_fname = Path("voroscoring.tsv") + voro_module.output(output_fname) + observed_outf_l = [ + e.split() + for e in open(output_fname).readlines() + if not e.startswith('#') + ] + # expected output + expected_outf_l = [ + ["structure", "original_name", "md5", "score"], + ["protdna_complex_2.pdb", "original_name_1.pdb", "None", "-0.42"], + ["protdna_complex_1.pdb", "original_name_0.pdb", "None", "-0.28"], + ["protdna_complex_3.pdb", "original_name_2.pdb", "None", "None"], + ] + + assert observed_outf_l == expected_outf_l + output_fname.unlink() + + +def test_wait_for_termination(voromqa): + """Test waiting for results function behavior in voromqa.""" + nested_batch_dir = Path(voromqa.workdir, "batch_1") + os.mkdir(nested_batch_dir) + expected_ssv = Path(nested_batch_dir, "voro_scores.ssv") + # Trick to fake the generation of a file + delay_scriptpath = Path(nested_batch_dir, "delay.sh") + delay_scriptpath.write_text( + "\n".join(["sleep 0.1", f'echo "haddock3" > {expected_ssv}']) + ) + assert delay_scriptpath.exists() + os.system(f"chmod u+x {delay_scriptpath}") + os.system(f"./{delay_scriptpath} &") + assert not expected_ssv.exists() + # The actual test of the function + batches_ssv = voromqa.wait_for_termination(wait_time=0.1) + assert expected_ssv.exists() + assert batches_ssv[0] == expected_ssv + shutil.rmtree(nested_batch_dir) + + +def test_batched(voromqa): + """Test batched function behavior in voromqa.""" + for batch in voromqa.batched(list(range(10)), size=2): + assert len(batch) == 2 + batches = list(voromqa.batched(list(range(100)), size=99)) + assert len(batches[0]) == 99 + assert len(batches[1]) == 1 + + +def test_update_models_with_scores(output_models): + """Test to update PDBFiles with scores from voromqa tsv.""" + # Generate fake voro output file + output_fname = Path("fake_voro.tsv") + output_fname.write_text( + """ID\tjury_score\tfake_energy +protdna_complex_2.pdb\t0.5256\t-2 +protdna_complex_1.pdb\t0.1234\t-1 +""" + ) + updated_models = update_models_with_scores( + output_fname, + output_models, + metric="jury_score", + ) + assert updated_models[0].score == -0.1234 + assert updated_models[0].rank == 2 + assert updated_models[1].score == -0.5256 + assert updated_models[1].rank == 1 + assert isnan(updated_models[2].score) + assert isnan(updated_models[2].rank) + + updated_models = update_models_with_scores( + output_fname, + output_models, + metric="fake_energy", + ) + assert updated_models[0].score == -1 + assert 
updated_models[0].rank == 2 + assert updated_models[1].score == -2 + assert updated_models[1].rank == 1 + assert isnan(updated_models[2].score) + assert isnan(updated_models[2].rank) + + # Test error raising + with pytest.raises(ValueError): + updated_models2 = update_models_with_scores( + output_fname, + output_models, + metric="wrong", + ) + assert updated_models2 is None + output_fname.unlink() From f832c0ca77c9f518b9dc4831703429025d04de72 Mon Sep 17 00:00:00 2001 From: VGPReys Date: Mon, 29 Apr 2024 12:33:05 +0200 Subject: [PATCH 20/36] add header information in generated final output file --- src/haddock/modules/scoring/__init__.py | 19 +++++++++++++++---- .../modules/scoring/voroscoring/__init__.py | 11 +++++++++-- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/src/haddock/modules/scoring/__init__.py b/src/haddock/modules/scoring/__init__.py index a0c825893c..4153851087 100644 --- a/src/haddock/modules/scoring/__init__.py +++ b/src/haddock/modules/scoring/__init__.py @@ -1,7 +1,8 @@ """HADDOCK3 modules to score models.""" +from os import linesep import pandas as pd -from haddock.core.typing import FilePath +from haddock.core.typing import FilePath, Optional from haddock.modules.base_cns_module import BaseCNSModule from haddock.modules import BaseHaddockModule @@ -14,6 +15,7 @@ def output( output_fname: FilePath, sep: str = "\t", ascending_sort: bool = True, + header_comments: Optional[str] = None, ) -> None: r"""Save the output in comprehensive tables. @@ -36,9 +38,18 @@ def output( df_sc = pd.DataFrame(sc_data, columns=df_columns) df_sc_sorted = df_sc.sort_values(by="score", ascending=ascending_sort) # writes to disk - df_sc_sorted.to_csv(output_fname, sep=sep, index=False, na_rep="None") - - return + output_file = open(output_fname, 'a') + # Check if some comment in header are here + if header_comments: + output_file.write(header_comments) + # Write the dataframe + df_sc_sorted.to_csv( + output_file, + sep=sep, + index=False, + na_rep="None", + line_terminator=linesep, + ) class CNSScoringModule(BaseCNSModule, ScoringModule): diff --git a/src/haddock/modules/scoring/voroscoring/__init__.py b/src/haddock/modules/scoring/voroscoring/__init__.py index b2e9991ac6..a71ccce31c 100644 --- a/src/haddock/modules/scoring/voroscoring/__init__.py +++ b/src/haddock/modules/scoring/voroscoring/__init__.py @@ -6,6 +6,7 @@ It is a third party module, and requires the appropriate set up and intallation for it to run without issue. 
""" +from os import linesep from pathlib import Path from haddock.core.typing import Any, FilePath @@ -84,6 +85,12 @@ def _run(self) -> None: # Write output file scoring_tsv_fpath = f"{RECIPE_PATH.name}.tsv" - self.output(scoring_tsv_fpath) + self.output( + scoring_tsv_fpath, + header_comments=( + "# Note that negative of the value are reported " + f"in the case of non-energetical predictions{linesep}", + ), + ) # Export to next module - self.export_io_models() + self.export_io_models() \ No newline at end of file From a5a36567ca5af79a6c0673c22e9a46761ba60993 Mon Sep 17 00:00:00 2001 From: VGPReys Date: Tue, 30 Apr 2024 15:27:13 +0200 Subject: [PATCH 21/36] make sure line is terminated --- src/haddock/modules/scoring/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/haddock/modules/scoring/__init__.py b/src/haddock/modules/scoring/__init__.py index 4153851087..423fe0c5c9 100644 --- a/src/haddock/modules/scoring/__init__.py +++ b/src/haddock/modules/scoring/__init__.py @@ -41,6 +41,9 @@ def output( output_file = open(output_fname, 'a') # Check if some comment in header are here if header_comments: + # Make sure the comments is ending by a new line + if header_comments[-1] != linesep: + header_comments += linesep output_file.write(header_comments) # Write the dataframe df_sc_sorted.to_csv( From ac409ac965c3d44c6f8b7f334874a1ac65966bc6 Mon Sep 17 00:00:00 2001 From: VGPReys Date: Fri, 3 May 2024 08:39:25 +0200 Subject: [PATCH 22/36] fix tuple error --- src/haddock/modules/scoring/voroscoring/__init__.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/haddock/modules/scoring/voroscoring/__init__.py b/src/haddock/modules/scoring/voroscoring/__init__.py index a71ccce31c..7f8241c301 100644 --- a/src/haddock/modules/scoring/voroscoring/__init__.py +++ b/src/haddock/modules/scoring/voroscoring/__init__.py @@ -87,10 +87,7 @@ def _run(self) -> None: scoring_tsv_fpath = f"{RECIPE_PATH.name}.tsv" self.output( scoring_tsv_fpath, - header_comments=( - "# Note that negative of the value are reported " - f"in the case of non-energetical predictions{linesep}", - ), + header_comments=f"# Note that negative of the value are reported in the case of non-energetical predictions{linesep}", # noqa : E501 ) # Export to next module self.export_io_models() \ No newline at end of file From 795682a8d8ce2a001c9c4fc874cc83b50979d946 Mon Sep 17 00:00:00 2001 From: Victor Reys Date: Tue, 21 May 2024 15:38:47 +0200 Subject: [PATCH 23/36] tests --- examples/scoring/voroscoring-test.cfg | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/scoring/voroscoring-test.cfg b/examples/scoring/voroscoring-test.cfg index 1075666559..9b0ffaaf80 100644 --- a/examples/scoring/voroscoring-test.cfg +++ b/examples/scoring/voroscoring-test.cfg @@ -25,6 +25,9 @@ molecules = ["data/T161-rescoring-ens.pdb", [voroscoring] +[seletop] +select = 3 + [caprieval] # ==================================================================== From 7e8ac73d8b94272e64c65e68f9d9126a2a63a7fa Mon Sep 17 00:00:00 2001 From: "xiaotong1919@gmail.com" Date: Thu, 30 May 2024 15:26:57 +0200 Subject: [PATCH 24/36] change --- examples/scoring/voroscoring-test.cfg | 362 ++++++++++++++++++++++++-- 1 file changed, 340 insertions(+), 22 deletions(-) diff --git a/examples/scoring/voroscoring-test.cfg b/examples/scoring/voroscoring-test.cfg index 1075666559..d8c3a5f77f 100644 --- a/examples/scoring/voroscoring-test.cfg +++ b/examples/scoring/voroscoring-test.cfg @@ -1,30 +1,348 @@ -# 
==================================================================== -# Scoring example +"""Voro scoring class. -# directory in which the scoring will be done -run_dir = "run1-voroscoring-test" -clean = false +This class holds all the machinery to perform scoring of input pdb models using +ftdmp voro-mqa-all tool. +For more information, please check: https://github.com/kliment-olechnovic/ftdmp -# execution mode -ncores = 40 -mode = "local" +It is a third party module, and requires the appropriate set up and intallation +for it to run without issue. +""" +import os +import subprocess +import glob +import time -# ensemble of different complexes to be scored -molecules = ["data/T161-rescoring-ens.pdb", - "data/HY3.pdb", - "data/protein-dna_1w.pdb", - "data/protein-protein_1w.pdb", - "data/protein-protein_2w.pdb", - "data/protein-trimer_1w.pdb" - ] +from random import randint -# ==================================================================== -# Parameters for each stage are defined below +from haddock import log +from haddock.core.typing import Any, Generator, Path, Union +from haddock.libs.libontology import NaN, PDBFile -[topoaa] -[voroscoring] +VOROMQA_CFG_TEMPLATE = """#!/bin/bash +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=1 +#SBATCH --partition=gpu +#SBATCH --gres=gpu:{GPUID} +#SBATCH --mem-per-gpu=1GB -[caprieval] +# Where to do the work +WORKDIR="{WORKDIR}" -# ==================================================================== +# Name of the outputfile (.ssv for space separated values) +OUTPUT_FNAME="voro_scores.ssv" + +# Define Constants +CONDA_INSTALL_DIR="{CONDA_INSTALL_DIR}" +CONDA_ENV_NAME="{CONDA_ENV_NAME}" +FTDMP_INSTALL_DIR="{FTDMP_INSTALL_DIR}" +VOROMQA_SCRIPT="ftdmp-qa-all" + +# Define workflow variables +OUTPUT_FPATH="$WORKDIR/$OUTPUT_FNAME" +PDB_LIST_PATH="{PDB_LIST_PATH}" +OUT_MSG="Output file is here: $OUTPUT_FPATH" + +# 1. Setup enviroments +# Load the gnu12 module... +# NOTE: specific to tintin users... +module load gnu12 +# Activate conda env +source "$CONDA_INSTALL_DIR/bin/activate" +conda activate $CONDA_ENV_NAME +echo "conda env: $CONDA_PREFIX" + +# 2. Setup run directory +# Create working directory +mkdir -p $WORKDIR + +# 3. Run voro-mqa (model quality assessment) +# Go to ftdmp install directory +cd $FTDMP_INSTALL_DIR +echo "Directory: $PWD" +# run voro-mqa +echo "./$VOROMQA_SCRIPT --conda-path $CONDA_INSTALL_DIR --conda-env $CONDA_ENV_NAME --workdir '$WORKDIR' --rank-names 'protein_protein_voromqa_and_global_and_gnn_no_sr' < $PDB_LIST_PATH > $OUTPUT_FPATH" +./$VOROMQA_SCRIPT --conda-path $CONDA_INSTALL_DIR --conda-env $CONDA_ENV_NAME --workdir $WORKDIR --rank-names 'protein_protein_voromqa_and_global_and_gnn_no_sr' --output-redundancy-threshold 1.0 < $PDB_LIST_PATH > $OUTPUT_FPATH +# Let the magic happen.. + +# 4. Analyze results +# Print final ouput file +echo $OUT_MSG +""" # noqa : E501 + + +class VoroMQA(): + """The Haddock3 implementation of voro-mqa-all as a python class.""" + + def __init__( + self, + models: list[PDBFile], + workdir: Union[str, Path], + params: dict[str, Any], + output: Union[str, Path] = "voroscoring_voro.tsv", + ): + """Init of the VoroMQA class. + + Parameters + ---------- + models : list[PDBFile] + List of input PDB files to be scored. + workdir : Union[str, Path] + Where to do the process. 
+ params : dict[str, Any] + Config file parameters + output : Path, optional + Name of the generated file, by default Path("voroscoring_voro.tsv") + """ + self.models = models + self.workdir = workdir + self.params = params + self.output = Path(output) + + def run(self): + """Process class logic.""" + # Obtain absolute paths + self.workdir = Path(self.workdir).resolve() + all_pdbs = [ + str(Path(mdl.path, mdl.file_name).resolve()) + for mdl in self.models + ] + # Loop over batches + for bi, batch in enumerate(self.batched(all_pdbs, size=300)): + # Run slurm + self.run_voro_batch( + batch, + batch_index=bi + 1, + gpuid=bi % self.params['nb_gpus'], + ) + # Recombine all batches output files + scores_fpath = self.recombine_batches() + log.info(f"Generated output file: {scores_fpath}") + + def run_voro_batch( + self, + pdb_filepaths: list[str], + batch_index: int = 1, + gpuid: int = -1, + ) -> None: + """Preset and launch predictions on subset of pdb files. + + Parameters + ---------- + pdb_filepaths : list[str] + List of absolute path to the PDBs to score + batch_index : int, optional + Index of the batch, by default 1 + gpuid : int, optional + Index of the GPU to use, by default -1 + """ + # Create workdir + batch_workdir = Path(self.workdir, f"batch_{batch_index}") + batch_workdir.mkdir(parents=True) + + # Create list of pdb files + pdb_files_list_path = Path(batch_workdir, "pdbs.list") + pdb_files_list_path.write_text(os.linesep.join(pdb_filepaths)) + + # Get GPU id + if gpuid < 0: + gpuid = randint(0, self.params["nb_gpus"] - 1) + + # Format config file + batch_cfg = VOROMQA_CFG_TEMPLATE.format( + CONDA_INSTALL_DIR=self.params["conda_install_dir"], + CONDA_ENV_NAME=self.params["conda_env_name"], + FTDMP_INSTALL_DIR=self.params["ftdmp_install_dir"], + GPUID=gpuid, + WORKDIR=batch_workdir, + PDB_LIST_PATH=pdb_files_list_path, + ) + + # Write it + batch_cfg_fpath = Path(batch_workdir, "vorobatchcfg.job") + batch_cfg_fpath.write_text(batch_cfg) + + # Launch slurm + initdir = os.getcwd() + os.chdir(batch_workdir) + log.info(f"sbatch {batch_cfg_fpath}") + subprocess.run(f"sbatch {batch_cfg_fpath}", shell=True) + os.chdir(initdir) + + def recombine_batches(self) -> str: + """Recombine batches output file in a single one. 
+ + Returns + ------- + finale_output_fpath : str + Filepath of the recombined scores + """ + # Wait for all results to be obtained + batches_result_paths = self.wait_for_termination() + # Loop over them + all_predictions: list[dict[str, str]] = [] + combined_header: list[str] = [] + for batch_results in batches_result_paths: + # Read voro results + with open(batch_results, 'r') as filin: + header = filin.readline().strip().split(' ') + for head in header: + if head not in combined_header: + combined_header.append(head) + for line in filin: + s_ = line.strip().split(' ') + all_predictions.append({ + head: s_[header.index(head)] + for head in header + }) + + # Sort all batches entries + sorted_entries = sorted( + all_predictions, + key=lambda k: float(k[self.params["metric"]]), + reverse="_energy" not in self.params["metric"], + ) + + # Write final output file + finale_output_fpath = f"{self.workdir}/{self.output}" + with open(finale_output_fpath, "w") as filout: + file_header = '\t'.join(combined_header) + filout.write(file_header + os.linesep) + for entry in sorted_entries: + ordered_data = [ + entry[h] if h in entry.keys() else '-' + for h in combined_header + ] + line = '\t'.join(ordered_data) + filout.write(line + os.linesep) + return finale_output_fpath + + def wait_for_termination(self, wait_time: float = 60) -> list[Path]: + """Wait until all results are accessible. + + Parameters + ---------- + wait_time : int, optional + Time in second between every termination checks, by default 60 + + Returns + ------- + output_files : list[Path] + List of voro scores results for every batches. + """ + batches_dirpath = glob.glob(f"{self.workdir}/batch_*/") + log.info( + f"Waiting for {len(batches_dirpath)} " + "voro-mqa prediction batch(es) to finish..." + ) + while True: + try: + output_files: list[Path] = [] + for batch_dir in batches_dirpath: + expected_outputfile = Path(batch_dir, "voro_scores.ssv") + assert expected_outputfile.exists() + assert expected_outputfile.stat().st_size != 0 + output_files.append(expected_outputfile) + except AssertionError: + log.info(f"Waiting {wait_time} sec more...") + time.sleep(wait_time) + else: + log.info( + "VoroMQA results are accessible: " + f"{len(output_files)} batch(es)" + ) + return output_files + + @staticmethod + def batched( + entries: list[str], + size: int = 300, + ) -> Generator[list[str], None, None]: + """Generate batches of defined size. + + Parameters + ---------- + entries : list[str] + List of pdb files. + size : int, optional + Maximum size in every batch, by default 300 + + Yields + ------ + batch : Generator[list[str], None, None] + List of pdb files <= size. + """ + batch = [] + for pdb in entries: + batch.append(pdb) + if len(batch) == size: + yield batch + batch = [] + if batch: + yield batch + + +def update_models_with_scores( + voro_scoring_fname: Union[str, Path], + models: list[PDBFile], + metric: str = "jury_score", + ) -> list[PDBFile]: + """Update PDBfiles with computed scores. + + Parameters + ---------- + output_fname : Union[str, Path] + Path to the file where to access scoring data. + models : list[PDBFile] + List of PDBFiles to be updated. + metric : str, optional + Name of the metric to be retrieved, by default "jury_score" + + Returns + ------- + models : list[PDBFile] + The updated list of PDBfiles now holding the score and rank attributes. 
+ """ + scores_mapper: dict[str, float] = {} + ranking_mapper: dict[str, int] = {} + rank: int = 0 + # Read output file + with open(voro_scoring_fname, 'r') as filin: + for i, line in enumerate(filin): + s_ = line.strip().split('\t') + # Extract header + if i == 0: + header = s_ + continue + # Extract data + modelpath = str(s_[header.index("ID")]) + score = float(s_[header.index(metric)]) + # Only extract model filename + model_filename = modelpath.split('/')[-1] + # Reverse score if not an energy + if "_energy" not in metric: + score = -score + # Hold score + scores_mapper[model_filename] = score + rank += 1 + ranking_mapper[model_filename] = rank + + # Compute rankings + #ranking_mapper = { + # model_filename: rank + # for rank, model_filename in enumerate(sorted(scores_mapper), start=1) + # } + + # Loop over input models + for model in models: + # Add score and rank as attribute + if model.file_name in scores_mapper.keys(): + model.score = scores_mapper[model.file_name] + model.rank = ranking_mapper[model.file_name] + # In some cases computation may fail + else: + # Go for (garlic) cheese naans + model.score = NaN + model.rank = NaN + return models From 4370dafb7733dca96a22046c9ace8f2e277cc6dc Mon Sep 17 00:00:00 2001 From: Victor Reys Date: Mon, 17 Jun 2024 13:03:42 +0200 Subject: [PATCH 25/36] fixes to adapt to new hardware --- src/haddock/modules/scoring/__init__.py | 2 +- src/haddock/modules/scoring/voroscoring/voroscoring.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/haddock/modules/scoring/__init__.py b/src/haddock/modules/scoring/__init__.py index 423fe0c5c9..c6a8097d96 100644 --- a/src/haddock/modules/scoring/__init__.py +++ b/src/haddock/modules/scoring/__init__.py @@ -51,7 +51,7 @@ def output( sep=sep, index=False, na_rep="None", - line_terminator=linesep, + lineterminator=linesep, ) diff --git a/src/haddock/modules/scoring/voroscoring/voroscoring.py b/src/haddock/modules/scoring/voroscoring/voroscoring.py index d8c3a5f77f..0dc547d76b 100644 --- a/src/haddock/modules/scoring/voroscoring/voroscoring.py +++ b/src/haddock/modules/scoring/voroscoring/voroscoring.py @@ -7,6 +7,8 @@ It is a third party module, and requires the appropriate set up and intallation for it to run without issue. """ + + import os import subprocess import glob @@ -24,8 +26,8 @@ #SBATCH --ntasks-per-node=1 #SBATCH --cpus-per-task=1 #SBATCH --partition=gpu -#SBATCH --gres=gpu:{GPUID} -#SBATCH --mem-per-gpu=1GB +#SBATCH --gres=gpu:1 +#SBATCH -J {JOBNAME} # Where to do the work WORKDIR="{WORKDIR}" @@ -154,6 +156,7 @@ def run_voro_batch( CONDA_INSTALL_DIR=self.params["conda_install_dir"], CONDA_ENV_NAME=self.params["conda_env_name"], FTDMP_INSTALL_DIR=self.params["ftdmp_install_dir"], + JOBNAME=f'hd3_voro_b{batch_index}', GPUID=gpuid, WORKDIR=batch_workdir, PDB_LIST_PATH=pdb_files_list_path, From 4d6253b768f21a1303cbccbe4710fde242b1f1a5 Mon Sep 17 00:00:00 2001 From: "xiaotong1919@gmail.com" Date: Wed, 18 Sep 2024 17:23:55 +0200 Subject: [PATCH 26/36] check --- examples/scoring/voroscoring-test.cfg | 362 ++------------------------ 1 file changed, 22 insertions(+), 340 deletions(-) diff --git a/examples/scoring/voroscoring-test.cfg b/examples/scoring/voroscoring-test.cfg index d8c3a5f77f..7c1d97a684 100644 --- a/examples/scoring/voroscoring-test.cfg +++ b/examples/scoring/voroscoring-test.cfg @@ -1,348 +1,30 @@ -"""Voro scoring class. 
+# ==================================================================== +# Scoring example -This class holds all the machinery to perform scoring of input pdb models using -ftdmp voro-mqa-all tool. -For more information, please check: https://github.com/kliment-olechnovic/ftdmp +# directory in which the scoring will be done +run_dir = "run1-voroscoring-test" +clean = false -It is a third party module, and requires the appropriate set up and intallation -for it to run without issue. -""" -import os -import subprocess -import glob -import time +# execution mode +ncores = 3 +mode = "local" -from random import randint +# ensemble of different complexes to be scored +molecules = ["data/T161-rescoring-ens.pdb", + "data/HY3.pdb", + "data/protein-dna_1w.pdb", + "data/protein-protein_1w.pdb", + "data/protein-protein_2w.pdb", + "data/protein-trimer_1w.pdb" + ] -from haddock import log -from haddock.core.typing import Any, Generator, Path, Union -from haddock.libs.libontology import NaN, PDBFile +# ==================================================================== +# Parameters for each stage are defined below +[topoaa] -VOROMQA_CFG_TEMPLATE = """#!/bin/bash -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=1 -#SBATCH --partition=gpu -#SBATCH --gres=gpu:{GPUID} -#SBATCH --mem-per-gpu=1GB +[voroscoring] -# Where to do the work -WORKDIR="{WORKDIR}" +[caprieval] -# Name of the outputfile (.ssv for space separated values) -OUTPUT_FNAME="voro_scores.ssv" - -# Define Constants -CONDA_INSTALL_DIR="{CONDA_INSTALL_DIR}" -CONDA_ENV_NAME="{CONDA_ENV_NAME}" -FTDMP_INSTALL_DIR="{FTDMP_INSTALL_DIR}" -VOROMQA_SCRIPT="ftdmp-qa-all" - -# Define workflow variables -OUTPUT_FPATH="$WORKDIR/$OUTPUT_FNAME" -PDB_LIST_PATH="{PDB_LIST_PATH}" -OUT_MSG="Output file is here: $OUTPUT_FPATH" - -# 1. Setup enviroments -# Load the gnu12 module... -# NOTE: specific to tintin users... -module load gnu12 -# Activate conda env -source "$CONDA_INSTALL_DIR/bin/activate" -conda activate $CONDA_ENV_NAME -echo "conda env: $CONDA_PREFIX" - -# 2. Setup run directory -# Create working directory -mkdir -p $WORKDIR - -# 3. Run voro-mqa (model quality assessment) -# Go to ftdmp install directory -cd $FTDMP_INSTALL_DIR -echo "Directory: $PWD" -# run voro-mqa -echo "./$VOROMQA_SCRIPT --conda-path $CONDA_INSTALL_DIR --conda-env $CONDA_ENV_NAME --workdir '$WORKDIR' --rank-names 'protein_protein_voromqa_and_global_and_gnn_no_sr' < $PDB_LIST_PATH > $OUTPUT_FPATH" -./$VOROMQA_SCRIPT --conda-path $CONDA_INSTALL_DIR --conda-env $CONDA_ENV_NAME --workdir $WORKDIR --rank-names 'protein_protein_voromqa_and_global_and_gnn_no_sr' --output-redundancy-threshold 1.0 < $PDB_LIST_PATH > $OUTPUT_FPATH -# Let the magic happen.. - -# 4. Analyze results -# Print final ouput file -echo $OUT_MSG -""" # noqa : E501 - - -class VoroMQA(): - """The Haddock3 implementation of voro-mqa-all as a python class.""" - - def __init__( - self, - models: list[PDBFile], - workdir: Union[str, Path], - params: dict[str, Any], - output: Union[str, Path] = "voroscoring_voro.tsv", - ): - """Init of the VoroMQA class. - - Parameters - ---------- - models : list[PDBFile] - List of input PDB files to be scored. - workdir : Union[str, Path] - Where to do the process. 
- params : dict[str, Any] - Config file parameters - output : Path, optional - Name of the generated file, by default Path("voroscoring_voro.tsv") - """ - self.models = models - self.workdir = workdir - self.params = params - self.output = Path(output) - - def run(self): - """Process class logic.""" - # Obtain absolute paths - self.workdir = Path(self.workdir).resolve() - all_pdbs = [ - str(Path(mdl.path, mdl.file_name).resolve()) - for mdl in self.models - ] - # Loop over batches - for bi, batch in enumerate(self.batched(all_pdbs, size=300)): - # Run slurm - self.run_voro_batch( - batch, - batch_index=bi + 1, - gpuid=bi % self.params['nb_gpus'], - ) - # Recombine all batches output files - scores_fpath = self.recombine_batches() - log.info(f"Generated output file: {scores_fpath}") - - def run_voro_batch( - self, - pdb_filepaths: list[str], - batch_index: int = 1, - gpuid: int = -1, - ) -> None: - """Preset and launch predictions on subset of pdb files. - - Parameters - ---------- - pdb_filepaths : list[str] - List of absolute path to the PDBs to score - batch_index : int, optional - Index of the batch, by default 1 - gpuid : int, optional - Index of the GPU to use, by default -1 - """ - # Create workdir - batch_workdir = Path(self.workdir, f"batch_{batch_index}") - batch_workdir.mkdir(parents=True) - - # Create list of pdb files - pdb_files_list_path = Path(batch_workdir, "pdbs.list") - pdb_files_list_path.write_text(os.linesep.join(pdb_filepaths)) - - # Get GPU id - if gpuid < 0: - gpuid = randint(0, self.params["nb_gpus"] - 1) - - # Format config file - batch_cfg = VOROMQA_CFG_TEMPLATE.format( - CONDA_INSTALL_DIR=self.params["conda_install_dir"], - CONDA_ENV_NAME=self.params["conda_env_name"], - FTDMP_INSTALL_DIR=self.params["ftdmp_install_dir"], - GPUID=gpuid, - WORKDIR=batch_workdir, - PDB_LIST_PATH=pdb_files_list_path, - ) - - # Write it - batch_cfg_fpath = Path(batch_workdir, "vorobatchcfg.job") - batch_cfg_fpath.write_text(batch_cfg) - - # Launch slurm - initdir = os.getcwd() - os.chdir(batch_workdir) - log.info(f"sbatch {batch_cfg_fpath}") - subprocess.run(f"sbatch {batch_cfg_fpath}", shell=True) - os.chdir(initdir) - - def recombine_batches(self) -> str: - """Recombine batches output file in a single one. 
- - Returns - ------- - finale_output_fpath : str - Filepath of the recombined scores - """ - # Wait for all results to be obtained - batches_result_paths = self.wait_for_termination() - # Loop over them - all_predictions: list[dict[str, str]] = [] - combined_header: list[str] = [] - for batch_results in batches_result_paths: - # Read voro results - with open(batch_results, 'r') as filin: - header = filin.readline().strip().split(' ') - for head in header: - if head not in combined_header: - combined_header.append(head) - for line in filin: - s_ = line.strip().split(' ') - all_predictions.append({ - head: s_[header.index(head)] - for head in header - }) - - # Sort all batches entries - sorted_entries = sorted( - all_predictions, - key=lambda k: float(k[self.params["metric"]]), - reverse="_energy" not in self.params["metric"], - ) - - # Write final output file - finale_output_fpath = f"{self.workdir}/{self.output}" - with open(finale_output_fpath, "w") as filout: - file_header = '\t'.join(combined_header) - filout.write(file_header + os.linesep) - for entry in sorted_entries: - ordered_data = [ - entry[h] if h in entry.keys() else '-' - for h in combined_header - ] - line = '\t'.join(ordered_data) - filout.write(line + os.linesep) - return finale_output_fpath - - def wait_for_termination(self, wait_time: float = 60) -> list[Path]: - """Wait until all results are accessible. - - Parameters - ---------- - wait_time : int, optional - Time in second between every termination checks, by default 60 - - Returns - ------- - output_files : list[Path] - List of voro scores results for every batches. - """ - batches_dirpath = glob.glob(f"{self.workdir}/batch_*/") - log.info( - f"Waiting for {len(batches_dirpath)} " - "voro-mqa prediction batch(es) to finish..." - ) - while True: - try: - output_files: list[Path] = [] - for batch_dir in batches_dirpath: - expected_outputfile = Path(batch_dir, "voro_scores.ssv") - assert expected_outputfile.exists() - assert expected_outputfile.stat().st_size != 0 - output_files.append(expected_outputfile) - except AssertionError: - log.info(f"Waiting {wait_time} sec more...") - time.sleep(wait_time) - else: - log.info( - "VoroMQA results are accessible: " - f"{len(output_files)} batch(es)" - ) - return output_files - - @staticmethod - def batched( - entries: list[str], - size: int = 300, - ) -> Generator[list[str], None, None]: - """Generate batches of defined size. - - Parameters - ---------- - entries : list[str] - List of pdb files. - size : int, optional - Maximum size in every batch, by default 300 - - Yields - ------ - batch : Generator[list[str], None, None] - List of pdb files <= size. - """ - batch = [] - for pdb in entries: - batch.append(pdb) - if len(batch) == size: - yield batch - batch = [] - if batch: - yield batch - - -def update_models_with_scores( - voro_scoring_fname: Union[str, Path], - models: list[PDBFile], - metric: str = "jury_score", - ) -> list[PDBFile]: - """Update PDBfiles with computed scores. - - Parameters - ---------- - output_fname : Union[str, Path] - Path to the file where to access scoring data. - models : list[PDBFile] - List of PDBFiles to be updated. - metric : str, optional - Name of the metric to be retrieved, by default "jury_score" - - Returns - ------- - models : list[PDBFile] - The updated list of PDBfiles now holding the score and rank attributes. 
- """ - scores_mapper: dict[str, float] = {} - ranking_mapper: dict[str, int] = {} - rank: int = 0 - # Read output file - with open(voro_scoring_fname, 'r') as filin: - for i, line in enumerate(filin): - s_ = line.strip().split('\t') - # Extract header - if i == 0: - header = s_ - continue - # Extract data - modelpath = str(s_[header.index("ID")]) - score = float(s_[header.index(metric)]) - # Only extract model filename - model_filename = modelpath.split('/')[-1] - # Reverse score if not an energy - if "_energy" not in metric: - score = -score - # Hold score - scores_mapper[model_filename] = score - rank += 1 - ranking_mapper[model_filename] = rank - - # Compute rankings - #ranking_mapper = { - # model_filename: rank - # for rank, model_filename in enumerate(sorted(scores_mapper), start=1) - # } - - # Loop over input models - for model in models: - # Add score and rank as attribute - if model.file_name in scores_mapper.keys(): - model.score = scores_mapper[model.file_name] - model.rank = ranking_mapper[model.file_name] - # In some cases computation may fail - else: - # Go for (garlic) cheese naans - model.score = NaN - model.rank = NaN - return models +# ==================================================================== \ No newline at end of file From 7dd1d5f0e1af147e9d2cd9269a23347bd80e149d Mon Sep 17 00:00:00 2001 From: Victor Reys <132575181+VGPReys@users.noreply.github.com> Date: Wed, 19 Mar 2025 16:19:42 +0100 Subject: [PATCH 27/36] Update how to obtain defaults.yaml filename --- src/haddock/modules/scoring/voroscoring/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/haddock/modules/scoring/voroscoring/__init__.py b/src/haddock/modules/scoring/voroscoring/__init__.py index 7f8241c301..57796e2f12 100644 --- a/src/haddock/modules/scoring/voroscoring/__init__.py +++ b/src/haddock/modules/scoring/voroscoring/__init__.py @@ -9,6 +9,7 @@ from os import linesep from pathlib import Path +from haddock.core.defaults import MODULE_DEFAULT_YAML from haddock.core.typing import Any, FilePath from haddock.modules import get_engine from haddock.modules.scoring import ScoringModule @@ -18,7 +19,7 @@ ) RECIPE_PATH = Path(__file__).resolve().parent -DEFAULT_CONFIG = Path(RECIPE_PATH, "defaults.yaml") +DEFAULT_CONFIG = Path(RECIPE_PATH, MODULE_DEFAULT_YAML) class HaddockModule(ScoringModule): From bee270960e800ffdc917bf1d9dc65c93214d261a Mon Sep 17 00:00:00 2001 From: Victor Reys <132575181+VGPReys@users.noreply.github.com> Date: Wed, 19 Mar 2025 16:26:09 +0100 Subject: [PATCH 28/36] Updating the SLURM job --- src/haddock/modules/scoring/voroscoring/voroscoring.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/haddock/modules/scoring/voroscoring/voroscoring.py b/src/haddock/modules/scoring/voroscoring/voroscoring.py index 0dc547d76b..d68fe50a72 100644 --- a/src/haddock/modules/scoring/voroscoring/voroscoring.py +++ b/src/haddock/modules/scoring/voroscoring/voroscoring.py @@ -21,6 +21,8 @@ from haddock.libs.libontology import NaN, PDBFile +# Defines the SLURM job template +# Notes: Please feel free to modify the #SBATCH entries to fit your needs/setup VOROMQA_CFG_TEMPLATE = """#!/bin/bash #SBATCH --nodes=1 #SBATCH --ntasks-per-node=1 @@ -48,8 +50,8 @@ # 1. Setup enviroments # Load the gnu12 module... -# NOTE: specific to tintin users... -module load gnu12 +# NOTE: specific to haddock-team users... 
+# module load gnu12 # Activate conda env source "$CONDA_INSTALL_DIR/bin/activate" conda activate $CONDA_ENV_NAME From 46217f9fd233d5b9fa4850a9a160307e3ab53f3a Mon Sep 17 00:00:00 2001 From: Victor Reys <132575181+VGPReys@users.noreply.github.com> Date: Wed, 19 Mar 2025 16:37:34 +0100 Subject: [PATCH 29/36] Updating tests to reflect the 3 digits outputs --- tests/test_module_voroscoring.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_module_voroscoring.py b/tests/test_module_voroscoring.py index d17fa9b02c..40088c36a2 100644 --- a/tests/test_module_voroscoring.py +++ b/tests/test_module_voroscoring.py @@ -79,8 +79,8 @@ def test_voroscoring_output(output_models): # expected output expected_outf_l = [ ["structure", "original_name", "md5", "score"], - ["protdna_complex_2.pdb", "original_name_1.pdb", "None", "-0.42"], - ["protdna_complex_1.pdb", "original_name_0.pdb", "None", "-0.28"], + ["protdna_complex_2.pdb", "original_name_1.pdb", "None", "-0.420"], + ["protdna_complex_1.pdb", "original_name_0.pdb", "None", "-0.280"], ["protdna_complex_3.pdb", "original_name_2.pdb", "None", "None"], ] From fd08d851e8920baf53861dae8f6489e4dcb4fe19 Mon Sep 17 00:00:00 2001 From: Victor Reys <132575181+VGPReys@users.noreply.github.com> Date: Wed, 19 Mar 2025 17:10:15 +0100 Subject: [PATCH 30/36] Update INSTALL.md for voroscoring module --- docs/INSTALL.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/docs/INSTALL.md b/docs/INSTALL.md index 365221b8eb..0cb17bd181 100644 --- a/docs/INSTALL.md +++ b/docs/INSTALL.md @@ -154,3 +154,24 @@ on your machine. Please refer to the [official page](http://docs.openmm.org/latest/userguide/) of the project for a full description of the installation procedure. + + + +## `voroscoring` + +The use of the `[voroscoring]` module requires: +- A cluster with SLURM installed +- The setup of a conda environement (e.g.: ftdmp), in which you will install FTDMP +- A functional installation of [FTDMP](https://github.com/kliment-olechnovic) + +Once those three conditions are fulfilled, when using the `[voroscoring]` module in haddock3, the configuration file must be tuned to contain parameters describing how to load the appropriate conda env (ftdmp) and where to find FTDMP scripts and executables: + +```TOML +[voroscoring] +# This parameter defines the base directory where conda / miniconda is installed +conda_install_dir = "/absolute/path/to/conda/" +# This parameter defines the name of the conda env that you created and where FTDMP is installled +conda_env_name = "ftdmp" +# This parameter defines where FTDMP scripts / executables can be found +ftdmp_install_dir = "/absolute/path/to/FTDMP/" +``` From e188b7405245b621df97987f6353b2949b8e8806 Mon Sep 17 00:00:00 2001 From: Victor Reys <132575181+VGPReys@users.noreply.github.com> Date: Wed, 19 Mar 2025 17:11:04 +0100 Subject: [PATCH 31/36] Removing the chain contatenation parameter as not implemented --- .../modules/scoring/voroscoring/defaults.yaml | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/src/haddock/modules/scoring/voroscoring/defaults.yaml b/src/haddock/modules/scoring/voroscoring/defaults.yaml index a33f12fdca..4c04fe3302 100644 --- a/src/haddock/modules/scoring/voroscoring/defaults.yaml +++ b/src/haddock/modules/scoring/voroscoring/defaults.yaml @@ -63,15 +63,3 @@ nb_gpus: group: execution explevel: easy -concat_chain_: - default: [] - type: list - minitems: 0 - maxitems: 100 - title: List of residues supposed to be buried - short: List of residues 
supposed to be buried - long: concat_chain_* is an expandable parameter. You can provide concat_chain_1, - concat_chain_2, concat_chain_3, etc. For each selection, enlisted chains will - be concatenated as one prior to scoring. - group: analysis - explevel: expert From 17482fde1cf2ce0c42adbaf5ba06d2d9100a484a Mon Sep 17 00:00:00 2001 From: Victor Reys Date: Tue, 10 Sep 2024 08:57:38 +0200 Subject: [PATCH 32/36] update GPU scheduler --- src/haddock/modules/scoring/voroscoring/__init__.py | 4 +++- src/haddock/modules/scoring/voroscoring/voroscoring.py | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/haddock/modules/scoring/voroscoring/__init__.py b/src/haddock/modules/scoring/voroscoring/__init__.py index 57796e2f12..e9533b6bd9 100644 --- a/src/haddock/modules/scoring/voroscoring/__init__.py +++ b/src/haddock/modules/scoring/voroscoring/__init__.py @@ -6,6 +6,7 @@ It is a third party module, and requires the appropriate set up and intallation for it to run without issue. """ + from os import linesep from pathlib import Path @@ -91,4 +92,5 @@ def _run(self) -> None: header_comments=f"# Note that negative of the value are reported in the case of non-energetical predictions{linesep}", # noqa : E501 ) # Export to next module - self.export_io_models() \ No newline at end of file + self.export_io_models() + diff --git a/src/haddock/modules/scoring/voroscoring/voroscoring.py b/src/haddock/modules/scoring/voroscoring/voroscoring.py index d68fe50a72..599c63c871 100644 --- a/src/haddock/modules/scoring/voroscoring/voroscoring.py +++ b/src/haddock/modules/scoring/voroscoring/voroscoring.py @@ -8,7 +8,6 @@ for it to run without issue. """ - import os import subprocess import glob @@ -347,7 +346,8 @@ def update_models_with_scores( model.rank = ranking_mapper[model.file_name] # In some cases computation may fail else: - # Go for (garlic) cheese naans + # Go for (garlic cheese) naans model.score = NaN model.rank = NaN + model.ori_name = model.file_name return models From bbddcadd5fbb166662a6f3f0aeda985a498bd396 Mon Sep 17 00:00:00 2001 From: Victor Reys Date: Thu, 8 May 2025 11:03:39 +0200 Subject: [PATCH 33/36] update install --- docs/INSTALL.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/INSTALL.md b/docs/INSTALL.md index 0cb17bd181..1fa44e1ae0 100644 --- a/docs/INSTALL.md +++ b/docs/INSTALL.md @@ -155,8 +155,6 @@ on your machine. Please refer to the [official page](http://docs.openmm.org/latest/userguide/) of the project for a full description of the installation procedure. - - ## `voroscoring` The use of the `[voroscoring]` module requires: @@ -175,3 +173,4 @@ conda_env_name = "ftdmp" # This parameter defines where FTDMP scripts / executables can be found ftdmp_install_dir = "/absolute/path/to/FTDMP/" ``` + From fcf5df312a7d1301410cbb0146952bd73db6558e Mon Sep 17 00:00:00 2001 From: Victor Reys Date: Fri, 9 May 2025 17:07:44 +0200 Subject: [PATCH 34/36] removing parameters --- src/haddock/modules/scoring/voroscoring/defaults.yaml | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/haddock/modules/scoring/voroscoring/defaults.yaml b/src/haddock/modules/scoring/voroscoring/defaults.yaml index 4c04fe3302..baabb8b62d 100644 --- a/src/haddock/modules/scoring/voroscoring/defaults.yaml +++ b/src/haddock/modules/scoring/voroscoring/defaults.yaml @@ -52,14 +52,3 @@ ftdmp_install_dir: group: execution explevel: easy -nb_gpus: - default: 1 - type: integer - min: 1 - max: 420 - title: Number of accessible gpu on the device. 
- short: Number of accessible gpu on the device. - long: Number of accessible gpu on the device. - group: execution - explevel: easy - From 0d054fd13c224727624c610f0b064368c3bea846 Mon Sep 17 00:00:00 2001 From: Victor Reys Date: Fri, 9 May 2025 17:08:06 +0200 Subject: [PATCH 35/36] updating module to run in local mode --- .../scoring/voroscoring/voroscoring.py | 67 ++++++++++++------- 1 file changed, 42 insertions(+), 25 deletions(-) diff --git a/src/haddock/modules/scoring/voroscoring/voroscoring.py b/src/haddock/modules/scoring/voroscoring/voroscoring.py index 599c63c871..e6b4626d4f 100644 --- a/src/haddock/modules/scoring/voroscoring/voroscoring.py +++ b/src/haddock/modules/scoring/voroscoring/voroscoring.py @@ -17,17 +17,28 @@ from haddock import log from haddock.core.typing import Any, Generator, Path, Union +from haddock.libs.libio import working_directory from haddock.libs.libontology import NaN, PDBFile # Defines the SLURM job template # Notes: Please feel free to modify the #SBATCH entries to fit your needs/setup -VOROMQA_CFG_TEMPLATE = """#!/bin/bash -#SBATCH --nodes=1 +SLURM_HEADER_GPU = """#SBATCH --nodes=1 #SBATCH --ntasks-per-node=1 #SBATCH --cpus-per-task=1 #SBATCH --partition=gpu #SBATCH --gres=gpu:1 +""" + +SLURM_HEADER_CPU = """#SBATCH -J hd3-voroscoring-cpu +#SBATCH --partition haddock +#SBATCH --nodes=1 +#SBATCH --tasks-per-node=1 +""" + +# Job template +VOROMQA_CFG_TEMPLATE = """#!/bin/bash +{HEADER} #SBATCH -J {JOBNAME} # Where to do the work @@ -48,9 +59,9 @@ OUT_MSG="Output file is here: $OUTPUT_FPATH" # 1. Setup enviroments -# Load the gnu12 module... +# Load the gnu13 module... # NOTE: specific to haddock-team users... -# module load gnu12 +# module load gnu13 # Activate conda env source "$CONDA_INSTALL_DIR/bin/activate" conda activate $CONDA_ENV_NAME @@ -114,11 +125,7 @@ def run(self): # Loop over batches for bi, batch in enumerate(self.batched(all_pdbs, size=300)): # Run slurm - self.run_voro_batch( - batch, - batch_index=bi + 1, - gpuid=bi % self.params['nb_gpus'], - ) + self.run_voro_batch(batch, batch_index=bi + 1) # Recombine all batches output files scores_fpath = self.recombine_batches() log.info(f"Generated output file: {scores_fpath}") @@ -127,7 +134,6 @@ def run_voro_batch( self, pdb_filepaths: list[str], batch_index: int = 1, - gpuid: int = -1, ) -> None: """Preset and launch predictions on subset of pdb files. 
@@ -137,8 +143,6 @@ def run_voro_batch( List of absolute path to the PDBs to score batch_index : int, optional Index of the batch, by default 1 - gpuid : int, optional - Index of the GPU to use, by default -1 """ # Create workdir batch_workdir = Path(self.workdir, f"batch_{batch_index}") @@ -148,17 +152,13 @@ def run_voro_batch( pdb_files_list_path = Path(batch_workdir, "pdbs.list") pdb_files_list_path.write_text(os.linesep.join(pdb_filepaths)) - # Get GPU id - if gpuid < 0: - gpuid = randint(0, self.params["nb_gpus"] - 1) - # Format config file batch_cfg = VOROMQA_CFG_TEMPLATE.format( + HEADER=SLURM_HEADER_CPU, CONDA_INSTALL_DIR=self.params["conda_install_dir"], CONDA_ENV_NAME=self.params["conda_env_name"], FTDMP_INSTALL_DIR=self.params["ftdmp_install_dir"], - JOBNAME=f'hd3_voro_b{batch_index}', - GPUID=gpuid, + JOBNAME=f"hd3_voro_b{batch_index}", WORKDIR=batch_workdir, PDB_LIST_PATH=pdb_files_list_path, ) @@ -167,13 +167,30 @@ def run_voro_batch( batch_cfg_fpath = Path(batch_workdir, "vorobatchcfg.job") batch_cfg_fpath.write_text(batch_cfg) - # Launch slurm - initdir = os.getcwd() - os.chdir(batch_workdir) - log.info(f"sbatch {batch_cfg_fpath}") - subprocess.run(f"sbatch {batch_cfg_fpath}", shell=True) - os.chdir(initdir) - + # Launch script + self._launch_computation(batch_workdir, batch_cfg_fpath) + #initdir = os.getcwd() + #os.chdir(batch_workdir) + #log.info(f"sbatch {batch_cfg_fpath}") + #subprocess.run(f"sbatch {batch_cfg_fpath}", shell=True) + #os.chdir(initdir) + + def _launch_computation(self, batch_workdir: str, batch_cfg_fpath: str) -> None: + """Execute a given script from working directory. + + Parameters + ---------- + batch_workdir : str + Path to working directory + batch_cfg_fpath : str + Script to execute + """ + exec_tool = "sbatch" if self.params["mode"] == "batch" else "bash" + cmd_ = f"{exec_tool} {batch_cfg_fpath}" + with working_directory(batch_workdir): + log.info(cmd_) + subprocess.run(cmd_, shell=True) + def recombine_batches(self) -> str: """Recombine batches output file in a single one. From d039548a052994b67ee7c604872a2f3daefe4a09 Mon Sep 17 00:00:00 2001 From: Victor Reys Date: Mon, 26 May 2025 12:20:07 +0200 Subject: [PATCH 36/36] loading gnu 13 module --- src/haddock/modules/scoring/voroscoring/voroscoring.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/haddock/modules/scoring/voroscoring/voroscoring.py b/src/haddock/modules/scoring/voroscoring/voroscoring.py index e6b4626d4f..931ee8f1a5 100644 --- a/src/haddock/modules/scoring/voroscoring/voroscoring.py +++ b/src/haddock/modules/scoring/voroscoring/voroscoring.py @@ -38,8 +38,8 @@ # Job template VOROMQA_CFG_TEMPLATE = """#!/bin/bash -{HEADER} #SBATCH -J {JOBNAME} +{HEADER} # Where to do the work WORKDIR="{WORKDIR}" @@ -61,7 +61,8 @@ # 1. Setup enviroments # Load the gnu13 module... # NOTE: specific to haddock-team users... -# module load gnu13 +# This is made to get good gcc compiler +module load gnu13 # Activate conda env source "$CONDA_INSTALL_DIR/bin/activate" conda activate $CONDA_ENV_NAME