From 1f8e46bdb6b5722c947b50150bfa3c15545737f2 Mon Sep 17 00:00:00 2001 From: Nicola Bordin Date: Mon, 17 Oct 2022 14:33:49 +0100 Subject: [PATCH 01/44] Add CLI option for convert_cif_to_foldseek_db --- cath_alphaflow/cli.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cath_alphaflow/cli.py b/cath_alphaflow/cli.py index 59de3b9..e478236 100644 --- a/cath_alphaflow/cli.py +++ b/cath_alphaflow/cli.py @@ -8,6 +8,7 @@ from .commands import convert_dssp_to_sse_summary from .commands import convert_cif_to_dssp from .commands import extract_plddt_and_lur +from .commands import convert_cif_to_foldseek_db logging.basicConfig( level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s" @@ -49,3 +50,4 @@ def dump_config(): cli.add_command(convert_dssp_to_sse_summary.convert_dssp_to_sse_summary) cli.add_command(convert_cif_to_dssp.convert_cif_to_dssp) cli.add_command(extract_plddt_and_lur.convert_cif_to_plddt_summary) +cli.add_command(convert_cif_to_foldseek_db.convert_cif_to_foldseek_db) From 776c444ee569944fdc8ff2b7a6db4ca680730496 Mon Sep 17 00:00:00 2001 From: Nicola Bordin Date: Mon, 17 Oct 2022 14:34:21 +0100 Subject: [PATCH 02/44] Add FS paths to config.env.example --- config.env.example | 3 +++ 1 file changed, 3 insertions(+) diff --git a/config.env.example b/config.env.example index 58f91e7..4497780 100644 --- a/config.env.example +++ b/config.env.example @@ -7,3 +7,6 @@ ORACLE_DB_USERNAME="oracle_db_username" ORACLE_DB_PASSWORD="oracle_db_password" DSSP_BINARY_PATH="/share/apps/dssp-4.0.2/bin/mkdssp" DSSP_PDB_DICT="/share/apps/libcifpp-3.0.0/share/libcifpp/mmcif_pdbx_v50.dic" +FS_BINARY_PATH="~/code/foldseek/bin/foldseek" +FS_DB_PATH="~/data/databases/foldseek/cath_s95/cath_s95_db" +FS_TMP_PATH="tmp_fs/" \ No newline at end of file From e09985c7a19d36a7b4939b9a6d9790cd8bbc002e Mon Sep 17 00:00:00 2001 From: Nicola Bordin Date: Mon, 17 Oct 2022 14:34:41 +0100 Subject: [PATCH 03/44] Add defults for Foldseek suffixes --- cath_alphaflow/constants.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cath_alphaflow/constants.py b/cath_alphaflow/constants.py index 46b5a42..5396433 100644 --- a/cath_alphaflow/constants.py +++ b/cath_alphaflow/constants.py @@ -1,5 +1,8 @@ DEFAULT_CIF_SUFFIX = ".cif" DEFAULT_DSSP_SUFFIX = ".dssp" +DEFAULT_FS_INTERMEDIATE_SUFFIX = ".results" +DEFAULT_FS_QUERYDB_SUFFIX = ".db" +DEFAULT_FS_RESULTS_SUFFIX = ".m8" DEFAULT_HELIX_MIN_LENGTH = 3 DEFAULT_STRAND_MIN_LENGTH = 2 MIN_LENGTH_LUR = 5 From a2fb1e5221144f25fa387e22cd2daa65c3692ec0 Mon Sep 17 00:00:00 2001 From: Nicola Bordin Date: Mon, 17 Oct 2022 14:35:10 +0100 Subject: [PATCH 04/44] Add Foldseek paths to settings.py --- cath_alphaflow/settings.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cath_alphaflow/settings.py b/cath_alphaflow/settings.py index ef60611..fc2127a 100644 --- a/cath_alphaflow/settings.py +++ b/cath_alphaflow/settings.py @@ -1,3 +1,4 @@ +from email.policy import default from prettyconf import config DEFAULT_AF_VERSION = 3 @@ -12,6 +13,9 @@ class Settings: ORACLE_DB_PASSWORD = config("ORACLE_DB_PASSWORD", default=None) DSSP_BINARY_PATH = config("DSSP_BINARY_PATH", default=None) DSSP_PDB_DICT = config("DSSP_PDB_DICT", default=None) + FS_BINARY_PATH = config("FS_BINARY_PATH", default=None) + FS_DB_PATH = config("FS_DB_PATH", default=None) + FS_TMP_PATH = config("FS_TMP_PATH", default=None) def to_dict(self): dict = {} From 3d91ade0aa44f36cd8ea91175704a104468bc58c Mon Sep 17 00:00:00 2001 From: Nicola Bordin Date: Mon, 17 Oct 2022 14:35:51 +0100 Subject: [PATCH 05/44] Create module to generate Foldseek query database using symlinks. --- .../commands/convert_cif_to_foldseek_db.py | 87 +++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 cath_alphaflow/commands/convert_cif_to_foldseek_db.py diff --git a/cath_alphaflow/commands/convert_cif_to_foldseek_db.py b/cath_alphaflow/commands/convert_cif_to_foldseek_db.py new file mode 100644 index 0000000..a3a53b0 --- /dev/null +++ b/cath_alphaflow/commands/convert_cif_to_foldseek_db.py @@ -0,0 +1,87 @@ +import logging +from pathlib import Path +import os +import click +import subprocess +from cath_alphaflow.io_utils import yield_first_col + +from cath_alphaflow.constants import ( + DEFAULT_FS_QUERYDB_SUFFIX, + DEFAULT_CIF_SUFFIX, +) +from cath_alphaflow.settings import get_default_settings + +config = get_default_settings() + +FS_BINARY_PATH = config.FS_BINARY_PATH +FS_DB_PATH = config.FS_DB_PATH + +LOG = logging.getLogger() + + +@click.command() +@click.option( + "--cif_dir", + type=click.Path(exists=True, file_okay=False, dir_okay=True, resolve_path=True), + required=True, + help="Input: directory of CIF files", +) +@click.option( + "--id_file", + type=click.File("rt"), + required=True, + help="Input: CSV file containing list of ids to convert from CIF to DSSP", +) +@click.option( + "--cif_suffix", + type=str, + default=DEFAULT_CIF_SUFFIX, + help=f"Input: optional suffix to add to id when looking for cif file (default: {DEFAULT_CIF_SUFFIX})", +) +@click.option( + "--fs_querydb_suffix", + type=str, + default=DEFAULT_FS_QUERYDB_SUFFIX, + help=f"Input: optional suffix to add to Foldseek query database during creation (default: {DEFAULT_FS_QUERYDB_SUFFIX})", +) +@click.option( + "--fs_querydb_dir", + type=click.Path(file_okay=False, dir_okay=True, resolve_path=True), + required=True, + help=f"Input: optional suffix to add to Foldseek query database during creation", +) +@click.option( + "--fs_querydb_suffix", + type=str, + default=DEFAULT_FS_QUERYDB_SUFFIX, + help=f"Input: optional suffix to add to Foldseek query database during creation (default: {DEFAULT_FS_QUERYDB_SUFFIX})", +) +def convert_cif_to_foldseek_db( + cif_dir, fs_querydb_dir, id_file, cif_suffix, fs_querydb_suffix +): + "Create Foldseek query database from mmCIF folder" + fs_querydb_path = Path(fs_querydb_dir) + if not fs_querydb_path.exists(): + os.makedirs(fs_querydb_path) + for file_stub in yield_first_col(id_file): + cif_path = Path(cif_dir) / f"{file_stub}{cif_suffix}" + click.echo(cif_path) + if not cif_path.exists(): + msg = f"failed to locate CIF input file {cif_path}" + LOG.error(msg) + raise FileNotFoundError(msg) + # Create symlinks to querydb_dir + subprocess.call( + ["ln", "-s", cif_path, f"{fs_querydb_path}/{cif_path.name}"], + ) + subprocess.call( + [ + FS_BINARY_PATH, + "createdb", + f"{fs_querydb_path}/", + f"{fs_querydb_dir}{fs_querydb_suffix}", + ], + stderr=subprocess.DEVNULL, + ) + click.echo("DONE") + return Path.exists(Path(f"{fs_querydb_dir}{fs_querydb_suffix}")) From 012785346da6c54a8ab256802c37d5ba8281a5bb Mon Sep 17 00:00:00 2001 From: Nicola Bordin Date: Wed, 26 Oct 2022 17:03:10 +0100 Subject: [PATCH 06/44] create run foldseek module --- cath_alphaflow/commands/run_foldseek.py | 80 +++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 cath_alphaflow/commands/run_foldseek.py diff --git a/cath_alphaflow/commands/run_foldseek.py b/cath_alphaflow/commands/run_foldseek.py new file mode 100644 index 0000000..2fb407e --- /dev/null +++ b/cath_alphaflow/commands/run_foldseek.py @@ -0,0 +1,80 @@ +from email.policy import default +import logging +from pathlib import Path +import os +import click +import subprocess + +from cath_alphaflow.settings import get_default_settings + +config = get_default_settings() + +FS_BINARY_PATH = config.FS_BINARY_PATH +FS_DB_PATH = config.FS_DB_PATH +FS_TMP_PATH = config.FS_TMP_PATH + +LOG = logging.getLogger() + + +@click.command() +@click.option( + "--fs_querydb", + type=click.Path(exists=True, file_okay=True, resolve_path=True), + default="fs_query_structures.db", + help=f"Input: Foldseek query database)", +) +@click.option( + "--fs_targetdb", + type=click.Path(exists=True, file_okay=True, resolve_path=True), + default=FS_DB_PATH, + help=f"Target Database for Foldseek. default:{FS_DB_PATH}", +) +@click.option( + "--fs_rawdata", + type=click.Path(resolve_path=True), + default="fs_query_structures.raw", + help=f"Raw output of Foldseek (before convertalis). default: fs_query_structures.raw", +) +@click.option( + "--fs_results", + type=click.Path(resolve_path=True), + default="fs_query_results.m8", + help=f"Foldseek tabular output", +) +@click.option( + "--tmp_dir", + type=click.Path(file_okay=False, dir_okay=True, resolve_path=True), + required=True, + default=FS_TMP_PATH, + help=f"Output: Foldseek temp folder (default:{FS_TMP_PATH})", +) +def run_foldseek(fs_querydb, fs_targetdb, fs_rawdata, fs_results, tmp_dir): + "Run Foldseek Query DB against Target DB" + subprocess.call( + [ + FS_BINARY_PATH, + "search", + fs_querydb, + fs_targetdb, + fs_rawdata, + tmp_dir, + "-s", + "9", + ], + stderr=subprocess.DEVNULL, + ) + subprocess.call( + [ + FS_BINARY_PATH, + "convertalis", + fs_querydb, + fs_targetdb, + fs_rawdata, + fs_results, + "--format-output", + "query,target,qstart,qend,qlen,tstart,tend,tlen,qcov,tcov,bits,evalue", + ], + stderr=subprocess.DEVNULL, + ) + + click.echo("DONE") From 117344b3aae4c4cccc1ee1e010d8b87fabe43aec Mon Sep 17 00:00:00 2001 From: Nicola Bordin Date: Wed, 26 Oct 2022 17:04:25 +0100 Subject: [PATCH 07/44] added settings.json to gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index e892a85..bf2c085 100644 --- a/.gitignore +++ b/.gitignore @@ -144,3 +144,4 @@ config.env # VSCode .vscode/settings.json +.vscode/settings.json From ba0883dc63a842f3dde58e3f2ff8f8576613c641 Mon Sep 17 00:00:00 2001 From: Nicola Bordin Date: Wed, 26 Oct 2022 17:05:15 +0100 Subject: [PATCH 08/44] add run foldseek, convert cif to db and convert fs to summary. --- cath_alphaflow/cli.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cath_alphaflow/cli.py b/cath_alphaflow/cli.py index e478236..408504d 100644 --- a/cath_alphaflow/cli.py +++ b/cath_alphaflow/cli.py @@ -1,6 +1,8 @@ import logging import click +from cath_alphaflow.commands import convert_foldseek_output_to_summary + from .settings import get_default_settings from .commands import create_dataset_uniprot_ids from .commands import create_dataset_cath_files @@ -9,6 +11,8 @@ from .commands import convert_cif_to_dssp from .commands import extract_plddt_and_lur from .commands import convert_cif_to_foldseek_db +from .commands import run_foldseek +from .commands import convert_foldseek_output_to_summary logging.basicConfig( level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s" @@ -51,3 +55,5 @@ def dump_config(): cli.add_command(convert_cif_to_dssp.convert_cif_to_dssp) cli.add_command(extract_plddt_and_lur.convert_cif_to_plddt_summary) cli.add_command(convert_cif_to_foldseek_db.convert_cif_to_foldseek_db) +cli.add_command(run_foldseek.run_foldseek) +cli.add_command(convert_foldseek_output_to_summary.convert_foldseek_output_to_summary) From 1c794a9271d82807bf4b3a894816da67201884c7 Mon Sep 17 00:00:00 2001 From: Nicola Bordin Date: Wed, 26 Oct 2022 17:05:30 +0100 Subject: [PATCH 09/44] add foldseek constants --- cath_alphaflow/constants.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cath_alphaflow/constants.py b/cath_alphaflow/constants.py index 5396433..a8a8e94 100644 --- a/cath_alphaflow/constants.py +++ b/cath_alphaflow/constants.py @@ -1,8 +1,9 @@ DEFAULT_CIF_SUFFIX = ".cif" DEFAULT_DSSP_SUFFIX = ".dssp" -DEFAULT_FS_INTERMEDIATE_SUFFIX = ".results" DEFAULT_FS_QUERYDB_SUFFIX = ".db" DEFAULT_FS_RESULTS_SUFFIX = ".m8" +DEFAULT_FS_OVERLAP = 0.6 +DEFAULT_FS_BITS_CUTOFF = 160 DEFAULT_HELIX_MIN_LENGTH = 3 DEFAULT_STRAND_MIN_LENGTH = 2 MIN_LENGTH_LUR = 5 From 8987027019c759de2a9975b3d5719c30b5ee6310 Mon Sep 17 00:00:00 2001 From: Nicola Bordin Date: Wed, 26 Oct 2022 17:06:02 +0100 Subject: [PATCH 10/44] remove default for intermediate suffix --- cath_alphaflow/commands/convert_cif_to_foldseek_db.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/cath_alphaflow/commands/convert_cif_to_foldseek_db.py b/cath_alphaflow/commands/convert_cif_to_foldseek_db.py index a3a53b0..60adbb2 100644 --- a/cath_alphaflow/commands/convert_cif_to_foldseek_db.py +++ b/cath_alphaflow/commands/convert_cif_to_foldseek_db.py @@ -14,7 +14,6 @@ config = get_default_settings() FS_BINARY_PATH = config.FS_BINARY_PATH -FS_DB_PATH = config.FS_DB_PATH LOG = logging.getLogger() @@ -50,12 +49,6 @@ required=True, help=f"Input: optional suffix to add to Foldseek query database during creation", ) -@click.option( - "--fs_querydb_suffix", - type=str, - default=DEFAULT_FS_QUERYDB_SUFFIX, - help=f"Input: optional suffix to add to Foldseek query database during creation (default: {DEFAULT_FS_QUERYDB_SUFFIX})", -) def convert_cif_to_foldseek_db( cif_dir, fs_querydb_dir, id_file, cif_suffix, fs_querydb_suffix ): From f373ed024160c143e9a5dfa41f2d069a96354e6e Mon Sep 17 00:00:00 2001 From: Nicola Bordin Date: Wed, 26 Oct 2022 17:06:25 +0100 Subject: [PATCH 11/44] add module to convert foldseek output to summary --- .../convert_foldseek_output_to_summary.py | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 cath_alphaflow/commands/convert_foldseek_output_to_summary.py diff --git a/cath_alphaflow/commands/convert_foldseek_output_to_summary.py b/cath_alphaflow/commands/convert_foldseek_output_to_summary.py new file mode 100644 index 0000000..3147653 --- /dev/null +++ b/cath_alphaflow/commands/convert_foldseek_output_to_summary.py @@ -0,0 +1,63 @@ +import logging +from nis import match +from pathlib import Path +import click +from cath_alphaflow.io_utils import yield_first_col +from cath_alphaflow.settings import get_default_settings +from cath_alphaflow.constants import DEFAULT_FS_BITS_CUTOFF, DEFAULT_FS_OVERLAP + +config = get_default_settings() + +LOG = logging.getLogger() + + +@click.command() +@click.option( + "--id_file", + type=click.File("rt"), + required=True, + help="Input: CSV file containing list of ids to convert from CIF to DSSP", +) +@click.option( + "--fs_input_file", + type=click.Path(exists=True, file_okay=True, resolve_path=True), + default="fs_query_results.m8", + help=f"Foldseek tabular output as input", +) +@click.option( + "--fs_results", + type=click.Path(resolve_path=True), + default="fs_hits.tsv", + help=f"Foldseek hits file", +) +def convert_foldseek_output_to_summary(id_file, fs_input_file, fs_results): + "Convert Foldseek tabular output to summary of best hits" + seen_ids = set() + with open(fs_input_file, "rt") as fs_fh: + for line in fs_fh: + ( + query, + target, + qstart, + qend, + qlen, + tstart, + tend, + tlen, + qcov, + tcov, + bits, + evalue, + ) = line.split() + if query.endswith(".cif"): + query = query[:-4] + if query in seen_ids: + continue + if ( + float(tcov) >= DEFAULT_FS_OVERLAP + and int(bits) >= DEFAULT_FS_BITS_CUTOFF + ): + seen_ids.add(query) + + for file_stub in yield_first_col(id_file): + click.echo(file_stub) From 7b4e0a3fb37715a3895b007757a395d2403732ed Mon Sep 17 00:00:00 2001 From: Nicola Bordin Date: Thu, 27 Oct 2022 13:13:07 +0100 Subject: [PATCH 12/44] Add Foldseek Summary --- cath_alphaflow/models.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/cath_alphaflow/models.py b/cath_alphaflow/models.py index 43bf794..4950289 100644 --- a/cath_alphaflow/models.py +++ b/cath_alphaflow/models.py @@ -131,6 +131,21 @@ class LURSummary: residues_total: int +@dataclass +class FoldseekSummary: + target: str + qstart: int + qend: int + qlen: int + tstart: int + tend: int + tlen: int + qcov: float + tcov: float + bits: int + evalue: float + + @dataclass class SecStrSummary: af_domain_id: str From c317c11ed2e74793072d9050c3c93d5c4143d19d Mon Sep 17 00:00:00 2001 From: Nicola Bordin Date: Thu, 27 Oct 2022 13:13:22 +0100 Subject: [PATCH 13/44] Add Foldseek Summary Writer --- cath_alphaflow/io_utils.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/cath_alphaflow/io_utils.py b/cath_alphaflow/io_utils.py index 62caa1e..f6c6a97 100644 --- a/cath_alphaflow/io_utils.py +++ b/cath_alphaflow/io_utils.py @@ -36,6 +36,28 @@ def get_sse_summary_reader(csvfile): return reader +def get_foldseek_summary_writer(csvfile): + writer = get_csv_dictwriter( + csvfile, + fieldnames=[ + "query", + "target", + "qstart", + "qend", + "qlen", + "tstart", + "tend", + "tlen", + "qcov", + "tcov", + "bits", + "evalue", + ], + ) + writer.writeheader() + return writer + + def get_sse_summary_writer(csvfile): writer = get_csv_dictwriter( csvfile, From 0bfb1a80cdd3a0383cf01045bbf8ecb481da9a28 Mon Sep 17 00:00:00 2001 From: Nicola Bordin Date: Thu, 27 Oct 2022 13:13:35 +0100 Subject: [PATCH 14/44] remove duplicate declaration --- cath_alphaflow/cli.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cath_alphaflow/cli.py b/cath_alphaflow/cli.py index 408504d..13cfcc3 100644 --- a/cath_alphaflow/cli.py +++ b/cath_alphaflow/cli.py @@ -1,7 +1,6 @@ import logging import click -from cath_alphaflow.commands import convert_foldseek_output_to_summary from .settings import get_default_settings from .commands import create_dataset_uniprot_ids From ec8318947e01748848cb5dc351f86339206b8157 Mon Sep 17 00:00:00 2001 From: Nicola Bordin Date: Thu, 27 Oct 2022 13:13:58 +0100 Subject: [PATCH 15/44] Revisited foldseek parser --- .../convert_foldseek_output_to_summary.py | 70 +++++++++++++++---- 1 file changed, 57 insertions(+), 13 deletions(-) diff --git a/cath_alphaflow/commands/convert_foldseek_output_to_summary.py b/cath_alphaflow/commands/convert_foldseek_output_to_summary.py index 3147653..018af14 100644 --- a/cath_alphaflow/commands/convert_foldseek_output_to_summary.py +++ b/cath_alphaflow/commands/convert_foldseek_output_to_summary.py @@ -1,10 +1,9 @@ import logging -from nis import match -from pathlib import Path import click -from cath_alphaflow.io_utils import yield_first_col +from cath_alphaflow.io_utils import yield_first_col, get_foldseek_summary_writer from cath_alphaflow.settings import get_default_settings from cath_alphaflow.constants import DEFAULT_FS_BITS_CUTOFF, DEFAULT_FS_OVERLAP +from cath_alphaflow.models import FoldseekSummary config = get_default_settings() @@ -26,15 +25,23 @@ ) @click.option( "--fs_results", - type=click.Path(resolve_path=True), + type=click.File("wt"), default="fs_hits.tsv", help=f"Foldseek hits file", ) def convert_foldseek_output_to_summary(id_file, fs_input_file, fs_results): + fs_results_writer = get_foldseek_summary_writer(fs_results) + best_hits_dict = extract_best_hits_foldseek(fs_input_file) + for file_stub in yield_first_col(id_file): + extract_hits_from_hits_dict_foldseek(file_stub, best_hits_dict) + + +def extract_best_hits_foldseek(fs_input_file): "Convert Foldseek tabular output to summary of best hits" - seen_ids = set() + results_dict = {} with open(fs_input_file, "rt") as fs_fh: for line in fs_fh: + line = line.rstrip() ( query, target, @@ -51,13 +58,50 @@ def convert_foldseek_output_to_summary(id_file, fs_input_file, fs_results): ) = line.split() if query.endswith(".cif"): query = query[:-4] - if query in seen_ids: + if query in results_dict: continue - if ( - float(tcov) >= DEFAULT_FS_OVERLAP - and int(bits) >= DEFAULT_FS_BITS_CUTOFF - ): - seen_ids.add(query) + results_dict[query] = [ + target, + qstart, + qend, + qlen, + tstart, + tend, + tlen, + qcov, + tcov, + bits, + evalue, + ] + return results_dict - for file_stub in yield_first_col(id_file): - click.echo(file_stub) + +def extract_hits_from_hits_dict_foldseek(query_id, best_hits_dict): + no_hits_placeholder = "" + if query_id in best_hits_dict: + ( + target, + qstart, + qend, + qlen, + tstart, + tend, + tlen, + qcov, + tcov, + bits, + evalue, + ) = best_hits_dict[query_id] + return FoldseekSummary( + target, + qstart, + qend, + qlen, + tstart, + tend, + tlen, + qcov, + tcov, + bits, + evalue=no_hits_placeholder, + ) From 8dbc269b0d230a5d16424a1d45bab7336110a323 Mon Sep 17 00:00:00 2001 From: Nicola Bordin Date: Thu, 27 Oct 2022 14:59:53 +0100 Subject: [PATCH 16/44] Include filters for overlap and bits Use iterators --- .../convert_foldseek_output_to_summary.py | 101 +++++------------- 1 file changed, 25 insertions(+), 76 deletions(-) diff --git a/cath_alphaflow/commands/convert_foldseek_output_to_summary.py b/cath_alphaflow/commands/convert_foldseek_output_to_summary.py index 018af14..cd9bb85 100644 --- a/cath_alphaflow/commands/convert_foldseek_output_to_summary.py +++ b/cath_alphaflow/commands/convert_foldseek_output_to_summary.py @@ -1,6 +1,11 @@ import logging +from unittest import result import click -from cath_alphaflow.io_utils import yield_first_col, get_foldseek_summary_writer +from cath_alphaflow.io_utils import ( + yield_first_col, + get_foldseek_reader, + get_foldseek_summary_writer, +) from cath_alphaflow.settings import get_default_settings from cath_alphaflow.constants import DEFAULT_FS_BITS_CUTOFF, DEFAULT_FS_OVERLAP from cath_alphaflow.models import FoldseekSummary @@ -19,7 +24,7 @@ ) @click.option( "--fs_input_file", - type=click.Path(exists=True, file_okay=True, resolve_path=True), + type=click.File("rt"), default="fs_query_results.m8", help=f"Foldseek tabular output as input", ) @@ -30,78 +35,22 @@ help=f"Foldseek hits file", ) def convert_foldseek_output_to_summary(id_file, fs_input_file, fs_results): - fs_results_writer = get_foldseek_summary_writer(fs_results) - best_hits_dict = extract_best_hits_foldseek(fs_input_file) + unique_af_ids = set() + unique_af_ids.add("NOHIT") + best_hits = set() + foldseek_results_writer = get_foldseek_summary_writer(fs_results) for file_stub in yield_first_col(id_file): - extract_hits_from_hits_dict_foldseek(file_stub, best_hits_dict) - - -def extract_best_hits_foldseek(fs_input_file): - "Convert Foldseek tabular output to summary of best hits" - results_dict = {} - with open(fs_input_file, "rt") as fs_fh: - for line in fs_fh: - line = line.rstrip() - ( - query, - target, - qstart, - qend, - qlen, - tstart, - tend, - tlen, - qcov, - tcov, - bits, - evalue, - ) = line.split() - if query.endswith(".cif"): - query = query[:-4] - if query in results_dict: - continue - results_dict[query] = [ - target, - qstart, - qend, - qlen, - tstart, - tend, - tlen, - qcov, - tcov, - bits, - evalue, - ] - return results_dict - - -def extract_hits_from_hits_dict_foldseek(query_id, best_hits_dict): - no_hits_placeholder = "" - if query_id in best_hits_dict: - ( - target, - qstart, - qend, - qlen, - tstart, - tend, - tlen, - qcov, - tcov, - bits, - evalue, - ) = best_hits_dict[query_id] - return FoldseekSummary( - target, - qstart, - qend, - qlen, - tstart, - tend, - tlen, - qcov, - tcov, - bits, - evalue=no_hits_placeholder, - ) + unique_af_ids.add(file_stub) + foldseek_reader = get_foldseek_reader(fs_input_file) + for foldseek_result_as_dict in foldseek_reader: + result = FoldseekSummary(**foldseek_result_as_dict) + if result.query.endswith(".cif"): + result.query = result.query[:-4] + if ( + result.query not in best_hits + and float(result.tcov) >= DEFAULT_FS_OVERLAP + and int(result.bits) >= DEFAULT_FS_BITS_CUTOFF + and result.query in unique_af_ids + ): + best_hits.add(result.query) + foldseek_results_writer.writerow(result.__dict__) From 009e0670ad865baae460395315f940c9aa962960 Mon Sep 17 00:00:00 2001 From: Nicola Bordin Date: Thu, 27 Oct 2022 15:00:12 +0100 Subject: [PATCH 17/44] Create new FoldseekReader --- cath_alphaflow/io_utils.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/cath_alphaflow/io_utils.py b/cath_alphaflow/io_utils.py index f6c6a97..6bf0141 100644 --- a/cath_alphaflow/io_utils.py +++ b/cath_alphaflow/io_utils.py @@ -4,6 +4,7 @@ from .models import AFChainID from .models import AFDomainID +from .models import FoldseekSummary LOG = logging.getLogger(__name__) @@ -36,6 +37,25 @@ def get_sse_summary_reader(csvfile): return reader +def get_foldseek_reader(csvfile): + foldseek_fieldnames = [ + "query", + "target", + "qstart", + "qend", + "qlen", + "tstart", + "tend", + "tlen", + "qcov", + "tcov", + "bits", + "evalue", + ] + foldseek_reader = get_csv_dictreader(csvfile, fieldnames=foldseek_fieldnames) + return foldseek_reader + + def get_foldseek_summary_writer(csvfile): writer = get_csv_dictwriter( csvfile, From c92572e91ab28196917f8fb41af3b29afe201a91 Mon Sep 17 00:00:00 2001 From: Nicola Bordin Date: Thu, 27 Oct 2022 15:00:42 +0100 Subject: [PATCH 18/44] Add query to FoldseekSummary --- cath_alphaflow/models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cath_alphaflow/models.py b/cath_alphaflow/models.py index 4950289..d17d7b1 100644 --- a/cath_alphaflow/models.py +++ b/cath_alphaflow/models.py @@ -133,6 +133,7 @@ class LURSummary: @dataclass class FoldseekSummary: + query: str target: str qstart: int qend: int From 6ada0ef1419bff22ea9f1d19b2659950e91dbc4e Mon Sep 17 00:00:00 2001 From: Nicola Bordin Date: Wed, 2 Nov 2022 11:06:34 +0000 Subject: [PATCH 19/44] Replace subprocess.call with subprocess.run Use native os.symlink Check if symlink exists before creating it. --- cath_alphaflow/commands/convert_cif_to_foldseek_db.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cath_alphaflow/commands/convert_cif_to_foldseek_db.py b/cath_alphaflow/commands/convert_cif_to_foldseek_db.py index 60adbb2..76352ec 100644 --- a/cath_alphaflow/commands/convert_cif_to_foldseek_db.py +++ b/cath_alphaflow/commands/convert_cif_to_foldseek_db.py @@ -64,10 +64,10 @@ def convert_cif_to_foldseek_db( LOG.error(msg) raise FileNotFoundError(msg) # Create symlinks to querydb_dir - subprocess.call( - ["ln", "-s", cif_path, f"{fs_querydb_path}/{cif_path.name}"], - ) - subprocess.call( + if os.path.exists(f"{fs_querydb_path}/{cif_path.name}") == False: + os.symlink(cif_path, f"{fs_querydb_path}/{cif_path.name}") + + subprocess.run( [ FS_BINARY_PATH, "createdb", @@ -75,6 +75,7 @@ def convert_cif_to_foldseek_db( f"{fs_querydb_dir}{fs_querydb_suffix}", ], stderr=subprocess.DEVNULL, + check=True, ) click.echo("DONE") return Path.exists(Path(f"{fs_querydb_dir}{fs_querydb_suffix}")) From 59508d9c1d147c6d81fb4920bef0432a811e6890 Mon Sep 17 00:00:00 2001 From: Nicola Bordin Date: Wed, 2 Nov 2022 11:11:35 +0000 Subject: [PATCH 20/44] Clean to return fs_querydb_path.exists --- cath_alphaflow/commands/convert_cif_to_foldseek_db.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cath_alphaflow/commands/convert_cif_to_foldseek_db.py b/cath_alphaflow/commands/convert_cif_to_foldseek_db.py index 76352ec..60b23ac 100644 --- a/cath_alphaflow/commands/convert_cif_to_foldseek_db.py +++ b/cath_alphaflow/commands/convert_cif_to_foldseek_db.py @@ -78,4 +78,5 @@ def convert_cif_to_foldseek_db( check=True, ) click.echo("DONE") - return Path.exists(Path(f"{fs_querydb_dir}{fs_querydb_suffix}")) + fs_querydb_path = Path(f"{fs_querydb_dir}{fs_querydb_suffix}") + return fs_querydb_path.exists() From 3dfbdeda87706b6e052990427f1a5f4516057cda Mon Sep 17 00:00:00 2001 From: Nicola Bordin Date: Wed, 2 Nov 2022 11:26:43 +0000 Subject: [PATCH 21/44] Change subprocess.call to subproces.run with check=True --- cath_alphaflow/commands/run_foldseek.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cath_alphaflow/commands/run_foldseek.py b/cath_alphaflow/commands/run_foldseek.py index 2fb407e..2b27bbb 100644 --- a/cath_alphaflow/commands/run_foldseek.py +++ b/cath_alphaflow/commands/run_foldseek.py @@ -63,7 +63,7 @@ def run_foldseek(fs_querydb, fs_targetdb, fs_rawdata, fs_results, tmp_dir): ], stderr=subprocess.DEVNULL, ) - subprocess.call( + subprocess.run( [ FS_BINARY_PATH, "convertalis", @@ -75,6 +75,7 @@ def run_foldseek(fs_querydb, fs_targetdb, fs_rawdata, fs_results, tmp_dir): "query,target,qstart,qend,qlen,tstart,tend,tlen,qcov,tcov,bits,evalue", ], stderr=subprocess.DEVNULL, + check=True, ) click.echo("DONE") From 53d87e6aa1929d377d9aeaef1224cd5f8ddd4dc1 Mon Sep 17 00:00:00 2001 From: Ian Sillitoe Date: Wed, 2 Nov 2022 17:32:15 +0000 Subject: [PATCH 22/44] update description --- cath_alphaflow/commands/convert_cif_to_foldseek_db.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cath_alphaflow/commands/convert_cif_to_foldseek_db.py b/cath_alphaflow/commands/convert_cif_to_foldseek_db.py index 60adbb2..16e3763 100644 --- a/cath_alphaflow/commands/convert_cif_to_foldseek_db.py +++ b/cath_alphaflow/commands/convert_cif_to_foldseek_db.py @@ -41,13 +41,13 @@ "--fs_querydb_suffix", type=str, default=DEFAULT_FS_QUERYDB_SUFFIX, - help=f"Input: optional suffix to add to Foldseek query database during creation (default: {DEFAULT_FS_QUERYDB_SUFFIX})", + help=f"Output: optional suffix to add to Foldseek query database during creation (default: {DEFAULT_FS_QUERYDB_SUFFIX})", ) @click.option( "--fs_querydb_dir", type=click.Path(file_okay=False, dir_okay=True, resolve_path=True), required=True, - help=f"Input: optional suffix to add to Foldseek query database during creation", + help=f"Output: directory to use for Foldseek query database during creation", ) def convert_cif_to_foldseek_db( cif_dir, fs_querydb_dir, id_file, cif_suffix, fs_querydb_suffix From 783e3c72591722e1cbbdc8d0dfe05619dc6fc67c Mon Sep 17 00:00:00 2001 From: Ian Sillitoe Date: Wed, 2 Nov 2022 17:35:54 +0000 Subject: [PATCH 23/44] tidy up path calculations --- cath_alphaflow/commands/convert_cif_to_foldseek_db.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cath_alphaflow/commands/convert_cif_to_foldseek_db.py b/cath_alphaflow/commands/convert_cif_to_foldseek_db.py index 98c6fba..add32b9 100644 --- a/cath_alphaflow/commands/convert_cif_to_foldseek_db.py +++ b/cath_alphaflow/commands/convert_cif_to_foldseek_db.py @@ -53,7 +53,7 @@ def convert_cif_to_foldseek_db( cif_dir, fs_querydb_dir, id_file, cif_suffix, fs_querydb_suffix ): "Create Foldseek query database from mmCIF folder" - fs_querydb_path = Path(fs_querydb_dir) + fs_querydb_path = Path(fs_querydb_dir).resolve() if not fs_querydb_path.exists(): os.makedirs(fs_querydb_path) for file_stub in yield_first_col(id_file): @@ -64,8 +64,9 @@ def convert_cif_to_foldseek_db( LOG.error(msg) raise FileNotFoundError(msg) # Create symlinks to querydb_dir - if os.path.exists(f"{fs_querydb_path}/{cif_path.name}") == False: - os.symlink(cif_path, f"{fs_querydb_path}/{cif_path.name}") + dest_cif_path = fs_querydb_path / cif_path.name + if not dest_cif_path.exists(): + os.symlink(str(cif_path), str(dest_cif_path)) subprocess.run( [ From 94aa5a9e8a3e1372005846138c8d81e17eba7d61 Mon Sep 17 00:00:00 2001 From: Ian Sillitoe Date: Wed, 2 Nov 2022 17:40:38 +0000 Subject: [PATCH 24/44] add check for expected output file --- .../commands/convert_cif_to_foldseek_db.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/cath_alphaflow/commands/convert_cif_to_foldseek_db.py b/cath_alphaflow/commands/convert_cif_to_foldseek_db.py index add32b9..046c2d1 100644 --- a/cath_alphaflow/commands/convert_cif_to_foldseek_db.py +++ b/cath_alphaflow/commands/convert_cif_to_foldseek_db.py @@ -68,16 +68,22 @@ def convert_cif_to_foldseek_db( if not dest_cif_path.exists(): os.symlink(str(cif_path), str(dest_cif_path)) + fs_querydb_db_path = Path(f"{fs_querydb_dir}{fs_querydb_suffix}") + subprocess.run( [ FS_BINARY_PATH, "createdb", f"{fs_querydb_path}/", - f"{fs_querydb_dir}{fs_querydb_suffix}", + str(fs_querydb_db_path), ], stderr=subprocess.DEVNULL, check=True, ) + + if not fs_querydb_db_path.exists(): + msg = f"failed to create expected foldseek database file: {fs_querydb_db_path}" + raise FileNotFoundError(msg) + click.echo("DONE") - fs_querydb_path = Path(f"{fs_querydb_dir}{fs_querydb_suffix}") - return fs_querydb_path.exists() + return From d0193e08fe52c20a474bd0d1992f3bf73fd68a8b Mon Sep 17 00:00:00 2001 From: Ian Sillitoe Date: Mon, 14 Nov 2022 16:49:03 +0000 Subject: [PATCH 25/44] clarify usage of foldseek path --- cath_alphaflow/commands/convert_cif_to_foldseek_db.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/cath_alphaflow/commands/convert_cif_to_foldseek_db.py b/cath_alphaflow/commands/convert_cif_to_foldseek_db.py index 046c2d1..f5fec98 100644 --- a/cath_alphaflow/commands/convert_cif_to_foldseek_db.py +++ b/cath_alphaflow/commands/convert_cif_to_foldseek_db.py @@ -53,6 +53,11 @@ def convert_cif_to_foldseek_db( cif_dir, fs_querydb_dir, id_file, cif_suffix, fs_querydb_suffix ): "Create Foldseek query database from mmCIF folder" + + if FS_BINARY_PATH is None: + msg = "expected foldseek binary path (FS_BINARY_PATH) to be set" + raise RuntimeError(msg) + fs_querydb_path = Path(fs_querydb_dir).resolve() if not fs_querydb_path.exists(): os.makedirs(fs_querydb_path) @@ -72,9 +77,9 @@ def convert_cif_to_foldseek_db( subprocess.run( [ - FS_BINARY_PATH, + str(FS_BINARY_PATH), "createdb", - f"{fs_querydb_path}/", + str(fs_querydb_path) + "/", str(fs_querydb_db_path), ], stderr=subprocess.DEVNULL, From d18955ec81f935a01e52621dc35f194a337a86c7 Mon Sep 17 00:00:00 2001 From: Ian Sillitoe Date: Mon, 14 Nov 2022 16:49:21 +0000 Subject: [PATCH 26/44] provide defaults for foldseek settings --- cath_alphaflow/settings.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/cath_alphaflow/settings.py b/cath_alphaflow/settings.py index fc2127a..5bd5fde 100644 --- a/cath_alphaflow/settings.py +++ b/cath_alphaflow/settings.py @@ -1,9 +1,16 @@ -from email.policy import default +from pathlib import Path from prettyconf import config DEFAULT_AF_VERSION = 3 DEFAULT_AF_FRAGMENT = 1 +PROJECT_ROOT_DIR = Path(__file__).parent.parent +DEFAULT_FS_BINARY_PATH = str(PROJECT_ROOT_DIR / "foldseek" / "bin" / "foldseek") + + +def resolve_path(raw_path_str): + return str(Path(raw_path_str).resolve()) + class Settings: ORACLE_DB_HOST = config("ORACLE_DB_HOST", default=None) @@ -13,9 +20,13 @@ class Settings: ORACLE_DB_PASSWORD = config("ORACLE_DB_PASSWORD", default=None) DSSP_BINARY_PATH = config("DSSP_BINARY_PATH", default=None) DSSP_PDB_DICT = config("DSSP_PDB_DICT", default=None) - FS_BINARY_PATH = config("FS_BINARY_PATH", default=None) - FS_DB_PATH = config("FS_DB_PATH", default=None) - FS_TMP_PATH = config("FS_TMP_PATH", default=None) + FS_BINARY_PATH = config( + "FS_BINARY_PATH", + default=DEFAULT_FS_BINARY_PATH, + cast=resolve_path, + ) + FS_DB_PATH = config("FS_DB_PATH", default="foldseek_db", cast=resolve_path) + FS_TMP_PATH = config("FS_TMP_PATH", default="foldseek_tmp", cast=resolve_path) def to_dict(self): dict = {} From 1aabd7c13280479f59fef01c1f8ad9bb61f97cf8 Mon Sep 17 00:00:00 2001 From: Ian Sillitoe Date: Mon, 14 Nov 2022 16:49:44 +0000 Subject: [PATCH 27/44] add foldseek tests --- tests/test_cif_to_foldseek.py | 75 +++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 tests/test_cif_to_foldseek.py diff --git a/tests/test_cif_to_foldseek.py b/tests/test_cif_to_foldseek.py new file mode 100644 index 0000000..374d076 --- /dev/null +++ b/tests/test_cif_to_foldseek.py @@ -0,0 +1,75 @@ +import os +from pathlib import Path +import csv +import logging + +from cath_alphaflow.cli import cli + + +UNIPROT_IDS = ["P00520"] +FIXTURE_PATH = Path(__file__).parent / "fixtures" +EXAMPLE_CIF_FILE = FIXTURE_PATH / "cif" / "AF-P00520-F1-model_v3.cif.gz" + +FS_BINARY_PATH = Path(__file__).parent.parent / "foldseek" / "bin" / "foldseek" + +SUBCOMMAND = "convert-cif-to-foldseek-db" + + +if not FS_BINARY_PATH.exists(): + msg = f"cannot run tests as foldseek is not installed: {FS_BINARY_PATH}" + + +def test_cli_usage(create_cli_runner): + runner = create_cli_runner() + with runner.isolated_filesystem(): + result = runner.invoke(cli, [SUBCOMMAND, "--help"]) + assert result.exit_code == 0 + assert "Usage:" in result.output + + +def write_ids_to_file(fh, headers, ids): + writer = csv.writer(fh, delimiter="\t") + writer.writerow(headers) + for _id in ids: + writer.writerow([_id]) + fh.flush() + + +def create_fake_cif_dir(dirname, ids, cif_src=EXAMPLE_CIF_FILE): + dir_path = Path(dirname) + dir_path.mkdir() + for _id in ids: + path_dest = dir_path / f"{_id}.cif" + os.symlink(cif_src, f"{path_dest}") + return dir_path + + +def test_convert_cif_to_foldseek_db(tmp_path, create_cli_runner): + + headers = ["header"] + ids = ["id1", "id2"] + + runner = create_cli_runner(extra_settings={"FS_BINARY_PATH": "foldseek-fake-path"}) + with runner.isolated_filesystem(temp_dir=tmp_path): + + cwd_path = Path.cwd() + + tmp_dssp_path = create_fake_cif_dir("cif", ids) + tmp_id_path = cwd_path / "ids.csv" + with tmp_id_path.open("wt") as fh: + write_ids_to_file(fh, headers, ids) + tmp_foldseek_db_path = cwd_path / "tmp_foldseek_db" + tmp_foldseek_db_path.mkdir() + + args = ( + SUBCOMMAND, + "--cif_dir", + f"{tmp_dssp_path}", + "--id_file", + f"{tmp_id_path}", + "--fs_querydb_dir", + f"{tmp_foldseek_db_path}", + ) + result = runner.invoke(cli, args) + assert result.exit_code == 0 + assert "DONE" in result.output From 81bbe1cbdf06d225f69aca45c6b8fb7dfb2e1f6d Mon Sep 17 00:00:00 2001 From: Ian Sillitoe Date: Mon, 14 Nov 2022 21:54:19 +0000 Subject: [PATCH 28/44] add create_cli_runner --- tests/conftest.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index db6420a..205f8dc 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,7 +1,13 @@ from cath_alphaflow import settings +import logging import pytest import cx_Oracle +from click.testing import CliRunner + + +LOG = logging.getLogger(__name__) + class MockCursorIterator: """Iterator class""" @@ -116,3 +122,24 @@ def mock_get_default_settings(*args, **kwargs): return settings.TestSettings() monkeypatch.setattr(settings, "get_default_settings", mock_get_default_settings) + + +@pytest.fixture +def create_cli_runner(monkeypatch): + def _create_cli_runner(**kwargs): + def mock_get_default_settings(): + _settings = settings.TestSettings() + for key, val in kwargs.items(): + LOG.info(f"overriding local tests setting: {key}={val}") + print(f"overriding local tests setting: {key}={val}") + setattr(_settings, key, val) + return _settings + + monkeypatch.setattr(settings, "get_default_settings", mock_get_default_settings) + + cli_runner = CliRunner() + return cli_runner + + yield _create_cli_runner + + # cleanup From 80888e08f4eab38e076635d61e6f093ba584378e Mon Sep 17 00:00:00 2001 From: Ian Sillitoe Date: Mon, 14 Nov 2022 22:13:05 +0000 Subject: [PATCH 29/44] install foldseek --- .github/workflows/python-package.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index e8e9647..03e0b71 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -30,6 +30,10 @@ jobs: python -m pip install flake8 pytest if [ -f requirements.txt ]; then pip install -r requirements.txt; fi pip install -e . + - name: Install Foldseek + run: | + wget https://mmseqs.com/foldseek/foldseek-linux-sse41.tar.gz; tar xvzf foldseek-linux-sse41.tar.gz + echo "::add-path::${GITHUB_WORKSPACE}/foldseek/bin" - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names From 8bd009c7d11e13f844a64238b2ec3123b45c663b Mon Sep 17 00:00:00 2001 From: Ian Sillitoe Date: Mon, 14 Nov 2022 22:17:32 +0000 Subject: [PATCH 30/44] correct path --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 03e0b71..54d4bb3 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -33,7 +33,7 @@ jobs: - name: Install Foldseek run: | wget https://mmseqs.com/foldseek/foldseek-linux-sse41.tar.gz; tar xvzf foldseek-linux-sse41.tar.gz - echo "::add-path::${GITHUB_WORKSPACE}/foldseek/bin" + echo "${GITHUB_WORKSPACE}/foldseek/bin:$PATH" > $GITHUB_PATH - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names From 6ee75c26658166d667e6871d9efc36be2cb12523 Mon Sep 17 00:00:00 2001 From: Ian Sillitoe Date: Mon, 14 Nov 2022 22:18:30 +0000 Subject: [PATCH 31/44] simplify path --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 54d4bb3..d5456c6 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -33,7 +33,7 @@ jobs: - name: Install Foldseek run: | wget https://mmseqs.com/foldseek/foldseek-linux-sse41.tar.gz; tar xvzf foldseek-linux-sse41.tar.gz - echo "${GITHUB_WORKSPACE}/foldseek/bin:$PATH" > $GITHUB_PATH + echo "${HOME}/foldseek/bin" > $GITHUB_PATH - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names From 45cbd3e8631804aec5a26ba7eb0573e463078cc7 Mon Sep 17 00:00:00 2001 From: Nicola Bordin Date: Fri, 10 Mar 2023 17:33:49 +0000 Subject: [PATCH 32/44] Add DEFAULT_FS_QUERYDB_NAME to constants --- cath_alphaflow/constants.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cath_alphaflow/constants.py b/cath_alphaflow/constants.py index bd209fc..9f21ec9 100644 --- a/cath_alphaflow/constants.py +++ b/cath_alphaflow/constants.py @@ -1,5 +1,6 @@ DEFAULT_CIF_SUFFIX = ".cif" DEFAULT_DSSP_SUFFIX = ".dssp" +DEFAULT_FS_QUERYDB_NAME = "af_query_foldseek" DEFAULT_FS_QUERYDB_SUFFIX = ".db" DEFAULT_FS_RESULTS_SUFFIX = ".m8" DEFAULT_FS_OVERLAP = 0.6 From c8aacbe5e59b46707e64a97789a9fc9f80794693 Mon Sep 17 00:00:00 2001 From: Nicola Bordin Date: Mon, 13 Mar 2023 11:09:38 +0000 Subject: [PATCH 33/44] Add option to generate query db for set of files Add possibility to do symlinking for a querydb --- .../commands/convert_cif_to_foldseek_db.py | 115 +++++++++++++----- 1 file changed, 86 insertions(+), 29 deletions(-) diff --git a/cath_alphaflow/commands/convert_cif_to_foldseek_db.py b/cath_alphaflow/commands/convert_cif_to_foldseek_db.py index f5fec98..2f88fe9 100644 --- a/cath_alphaflow/commands/convert_cif_to_foldseek_db.py +++ b/cath_alphaflow/commands/convert_cif_to_foldseek_db.py @@ -4,12 +4,16 @@ import click import subprocess from cath_alphaflow.io_utils import yield_first_col - +from cath_alphaflow.models.domains import AFDomainID from cath_alphaflow.constants import ( DEFAULT_FS_QUERYDB_SUFFIX, + DEFAULT_FS_QUERYDB_NAME, DEFAULT_CIF_SUFFIX, + ID_TYPE_AF_DOMAIN, + ID_TYPE_UNIPROT_DOMAIN, ) from cath_alphaflow.settings import get_default_settings +from cath_alphaflow.errors import ArgumentError config = get_default_settings() @@ -25,23 +29,36 @@ required=True, help="Input: directory of CIF files", ) -@click.option( - "--id_file", - type=click.File("rt"), - required=True, - help="Input: CSV file containing list of ids to convert from CIF to DSSP", -) @click.option( "--cif_suffix", type=str, default=DEFAULT_CIF_SUFFIX, help=f"Input: optional suffix to add to id when looking for cif file (default: {DEFAULT_CIF_SUFFIX})", ) +@click.option( + "--fs_querydb_name", + type=str, + default=DEFAULT_FS_QUERYDB_NAME, + help=f"Output: Foldseek Query Database Name (default: {DEFAULT_FS_QUERYDB_NAME})", +) @click.option( "--fs_querydb_suffix", type=str, default=DEFAULT_FS_QUERYDB_SUFFIX, - help=f"Output: optional suffix to add to Foldseek query database during creation (default: {DEFAULT_FS_QUERYDB_SUFFIX})", + help=f"Option: optional suffix to add to Foldseek query database during creation (default: {DEFAULT_FS_QUERYDB_SUFFIX})", +) +@click.option( + "--id_file", + type=click.File("rt"), + default=None, + required=False, + help='Optional id list file if generating a subset of a larger folder. (default: False)' +) +@click.option( + "--id_type", + type=click.Choice([ID_TYPE_AF_DOMAIN, ID_TYPE_UNIPROT_DOMAIN]), + default=ID_TYPE_AF_DOMAIN, + help=f"Option: specify the type of ID to specify the chopping [{ID_TYPE_AF_DOMAIN}]", ) @click.option( "--fs_querydb_dir", @@ -49,8 +66,20 @@ required=True, help=f"Output: directory to use for Foldseek query database during creation", ) +@click.option( + "--fs_bin_path", + type=click.Path(file_okay=True, resolve_path=True), + default=FS_BINARY_PATH, + help=f"Option: directory containing the Foldseek executable. (default: {FS_BINARY_PATH})" +) +@click.option( + "--af_version", + type=int, + default=4, + help=f"Option: specify the AF version when parsing uniprot ids", +) def convert_cif_to_foldseek_db( - cif_dir, fs_querydb_dir, id_file, cif_suffix, fs_querydb_suffix + cif_dir, fs_querydb_dir, fs_querydb_name, id_file, id_type, cif_suffix, fs_querydb_suffix, fs_bin_path, af_version ): "Create Foldseek query database from mmCIF folder" @@ -61,34 +90,62 @@ def convert_cif_to_foldseek_db( fs_querydb_path = Path(fs_querydb_dir).resolve() if not fs_querydb_path.exists(): os.makedirs(fs_querydb_path) - for file_stub in yield_first_col(id_file): - cif_path = Path(cif_dir) / f"{file_stub}{cif_suffix}" - click.echo(cif_path) - if not cif_path.exists(): - msg = f"failed to locate CIF input file {cif_path}" - LOG.error(msg) - raise FileNotFoundError(msg) - # Create symlinks to querydb_dir - dest_cif_path = fs_querydb_path / cif_path.name - if not dest_cif_path.exists(): - os.symlink(str(cif_path), str(dest_cif_path)) - - fs_querydb_db_path = Path(f"{fs_querydb_dir}{fs_querydb_suffix}") + + if id_file is not None: + af_tmp_dir = 'af_tmp_dir' + if Path(af_tmp_dir).is_dir==False: + os.mkdir(af_tmp_dir) + for af_domain_id_str in yield_first_col(id_file): + if id_type == ID_TYPE_UNIPROT_DOMAIN: + af_domain_id = AFDomainID.from_uniprot_str( + af_domain_id_str,version=af_version + ) + elif id_type == ID_TYPE_AF_DOMAIN: + af_domain_id = AFDomainID.from_str(af_domain_id_str) + else: + msg = f"failed to understand id_type '${id_type}'" + raise ArgumentError(msg) + file_stub = af_domain_id.to_file_stub() + cif_path = Path(cif_dir) / f"{file_stub}{cif_suffix}" + if not cif_path.exists(): + msg = f"failed to locate CIF input file {cif_path}" + LOG.error(msg) + raise FileNotFoundError(msg) + # Create symlinks to querydb_dir + dest_cif_path = Path(af_tmp_dir) / cif_path.name + if not dest_cif_path.exists(): + os.symlink(str(cif_path), str(dest_cif_path)) + cif_input_dir = af_tmp_dir + fs_querydb = Path(f"{fs_querydb_dir}/{fs_querydb_name}{fs_querydb_suffix}") + else: + cif_input_dir = cif_dir + fs_querydb = Path(f"{fs_querydb_dir}/{fs_querydb_name}{fs_querydb_suffix}") + LOG.info(f'{cif_input_dir} {fs_querydb_dir}') subprocess.run( [ - str(FS_BINARY_PATH), + f"{fs_bin_path}", "createdb", - str(fs_querydb_path) + "/", - str(fs_querydb_db_path), + f"{cif_input_dir}", + f"{fs_querydb}", ], stderr=subprocess.DEVNULL, check=True, ) - - if not fs_querydb_db_path.exists(): - msg = f"failed to create expected foldseek database file: {fs_querydb_db_path}" - raise FileNotFoundError(msg) + if id_file is not None: + for root, dirs, files in os.walk(af_tmp_dir, topdown=False): + for name in files: + file_path = os.path.join(root, name) + if os.path.islink(file_path): + os.remove(file_path) + for name in dirs: + dir_path = os.path.join(root, name) + if os.path.islink(dir_path): + os.remove(dir_path) + if not fs_querydb.exists(): + msg = f"failed to create expected foldseek database file: {fs_querydb_db_path}" + raise FileNotFoundError(msg) + os.rmdir(af_tmp_dir) click.echo("DONE") return From 8d40098d699f45b4ceef90f154d85cf36ae56f0b Mon Sep 17 00:00:00 2001 From: Nicola Bordin Date: Mon, 13 Mar 2023 11:10:26 +0000 Subject: [PATCH 34/44] Switch from cath_alphaflow.domains to cath_alphaflow.models.domains --- cath_alphaflow/commands/convert_foldseek_output_to_summary.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cath_alphaflow/commands/convert_foldseek_output_to_summary.py b/cath_alphaflow/commands/convert_foldseek_output_to_summary.py index cd9bb85..4ed706c 100644 --- a/cath_alphaflow/commands/convert_foldseek_output_to_summary.py +++ b/cath_alphaflow/commands/convert_foldseek_output_to_summary.py @@ -8,7 +8,7 @@ ) from cath_alphaflow.settings import get_default_settings from cath_alphaflow.constants import DEFAULT_FS_BITS_CUTOFF, DEFAULT_FS_OVERLAP -from cath_alphaflow.models import FoldseekSummary +from cath_alphaflow.models.domains import FoldseekSummary config = get_default_settings() From 89077db561ac5ae1b87a4b6acdb96a3e8c622e49 Mon Sep 17 00:00:00 2001 From: Nicola Bordin Date: Mon, 13 Mar 2023 11:59:30 +0000 Subject: [PATCH 35/44] Fix typo --- cath_alphaflow/commands/convert_cif_to_foldseek_db.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cath_alphaflow/commands/convert_cif_to_foldseek_db.py b/cath_alphaflow/commands/convert_cif_to_foldseek_db.py index 2f88fe9..dd606bb 100644 --- a/cath_alphaflow/commands/convert_cif_to_foldseek_db.py +++ b/cath_alphaflow/commands/convert_cif_to_foldseek_db.py @@ -143,7 +143,7 @@ def convert_cif_to_foldseek_db( if os.path.islink(dir_path): os.remove(dir_path) if not fs_querydb.exists(): - msg = f"failed to create expected foldseek database file: {fs_querydb_db_path}" + msg = f"failed to create expected foldseek database file: {fs_querydb_path}" raise FileNotFoundError(msg) os.rmdir(af_tmp_dir) From 6dd1795f4943f8e9e6e77dc8956b9e66950d2126 Mon Sep 17 00:00:00 2001 From: Nicola Bordin Date: Tue, 14 Mar 2023 16:19:16 +0000 Subject: [PATCH 36/44] Add Foldseek overlap to settings --- cath_alphaflow/settings.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cath_alphaflow/settings.py b/cath_alphaflow/settings.py index 63b77b4..3f10a52 100644 --- a/cath_alphaflow/settings.py +++ b/cath_alphaflow/settings.py @@ -6,7 +6,7 @@ PROJECT_ROOT_DIR = Path(__file__).parent.parent DEFAULT_FS_BINARY_PATH = str(PROJECT_ROOT_DIR / "foldseek" / "bin" / "foldseek") - +DEFAULT_FS_OVERLAP = 0.6 def resolve_path(raw_path_str): return str(Path(raw_path_str).resolve()) @@ -27,6 +27,8 @@ class Settings: ) FS_DB_PATH = config("FS_DB_PATH", default="foldseek_db", cast=resolve_path) FS_TMP_PATH = config("FS_TMP_PATH", default="foldseek_tmp", cast=resolve_path) + FS_OVERLAP = config("FS_OVERLAP", default=DEFAULT_FS_OVERLAP) + MONGO_USERNAME = config("MONGO_USERNAME", default=None) MONGO_PASSWORD = config("MONGO_PASSWORD", default=None) From 21d301b6a129523ef0797e1c58dfcd60f675667e Mon Sep 17 00:00:00 2001 From: Nicola Bordin Date: Tue, 14 Mar 2023 16:23:40 +0000 Subject: [PATCH 37/44] Replace tmp_dir with TemporaryDirectory Remove temp files using shutil, removed unecessary loop Cleaned leftover labels from chopping in click Introduced default for AF_version Introduce safeguard to delete af_tmp_dir only if present, defaults to none. --- .../commands/convert_cif_to_foldseek_db.py | 34 ++++++------------- 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/cath_alphaflow/commands/convert_cif_to_foldseek_db.py b/cath_alphaflow/commands/convert_cif_to_foldseek_db.py index dd606bb..6b185c4 100644 --- a/cath_alphaflow/commands/convert_cif_to_foldseek_db.py +++ b/cath_alphaflow/commands/convert_cif_to_foldseek_db.py @@ -2,6 +2,7 @@ from pathlib import Path import os import click +import shutil import subprocess from cath_alphaflow.io_utils import yield_first_col from cath_alphaflow.models.domains import AFDomainID @@ -12,7 +13,8 @@ ID_TYPE_AF_DOMAIN, ID_TYPE_UNIPROT_DOMAIN, ) -from cath_alphaflow.settings import get_default_settings +from tempfile import TemporaryDirectory +from cath_alphaflow.settings import get_default_settings,DEFAULT_AF_VERSION from cath_alphaflow.errors import ArgumentError config = get_default_settings() @@ -58,7 +60,7 @@ "--id_type", type=click.Choice([ID_TYPE_AF_DOMAIN, ID_TYPE_UNIPROT_DOMAIN]), default=ID_TYPE_AF_DOMAIN, - help=f"Option: specify the type of ID to specify the chopping [{ID_TYPE_AF_DOMAIN}]", + help=f"Option: specify the type of ID in id_file [{ID_TYPE_AF_DOMAIN}]", ) @click.option( "--fs_querydb_dir", @@ -75,8 +77,8 @@ @click.option( "--af_version", type=int, - default=4, - help=f"Option: specify the AF version when parsing uniprot ids", + default=DEFAULT_AF_VERSION, + help=f"Option: specify the AF version when parsing uniprot ids. (default: {DEFAULT_AF_VERSION}", ) def convert_cif_to_foldseek_db( cif_dir, fs_querydb_dir, fs_querydb_name, id_file, id_type, cif_suffix, fs_querydb_suffix, fs_bin_path, af_version @@ -91,10 +93,9 @@ def convert_cif_to_foldseek_db( if not fs_querydb_path.exists(): os.makedirs(fs_querydb_path) + af_tmp_dir = None if id_file is not None: - af_tmp_dir = 'af_tmp_dir' - if Path(af_tmp_dir).is_dir==False: - os.mkdir(af_tmp_dir) + af_tmp_dir = TemporaryDirectory(prefix='af_fs_tmp_dir_') for af_domain_id_str in yield_first_col(id_file): if id_type == ID_TYPE_UNIPROT_DOMAIN: af_domain_id = AFDomainID.from_uniprot_str( @@ -113,10 +114,10 @@ def convert_cif_to_foldseek_db( LOG.error(msg) raise FileNotFoundError(msg) # Create symlinks to querydb_dir - dest_cif_path = Path(af_tmp_dir) / cif_path.name + dest_cif_path = Path(af_tmp_dir.name) / cif_path.name if not dest_cif_path.exists(): os.symlink(str(cif_path), str(dest_cif_path)) - cif_input_dir = af_tmp_dir + cif_input_dir = af_tmp_dir.name fs_querydb = Path(f"{fs_querydb_dir}/{fs_querydb_name}{fs_querydb_suffix}") else: cif_input_dir = cif_dir @@ -132,20 +133,5 @@ def convert_cif_to_foldseek_db( stderr=subprocess.DEVNULL, check=True, ) - if id_file is not None: - for root, dirs, files in os.walk(af_tmp_dir, topdown=False): - for name in files: - file_path = os.path.join(root, name) - if os.path.islink(file_path): - os.remove(file_path) - for name in dirs: - dir_path = os.path.join(root, name) - if os.path.islink(dir_path): - os.remove(dir_path) - if not fs_querydb.exists(): - msg = f"failed to create expected foldseek database file: {fs_querydb_path}" - raise FileNotFoundError(msg) - os.rmdir(af_tmp_dir) - click.echo("DONE") return From 53dd10c31230bc85f61e1606f59d4038f0aa44d2 Mon Sep 17 00:00:00 2001 From: Nicola Bordin Date: Tue, 14 Mar 2023 16:24:44 +0000 Subject: [PATCH 38/44] Switch from file_stub to id_type based reader Add comments Introduce filters --- .../convert_foldseek_output_to_summary.py | 93 ++++++++++++++----- 1 file changed, 72 insertions(+), 21 deletions(-) diff --git a/cath_alphaflow/commands/convert_foldseek_output_to_summary.py b/cath_alphaflow/commands/convert_foldseek_output_to_summary.py index 4ed706c..98b96ae 100644 --- a/cath_alphaflow/commands/convert_foldseek_output_to_summary.py +++ b/cath_alphaflow/commands/convert_foldseek_output_to_summary.py @@ -6,9 +6,10 @@ get_foldseek_reader, get_foldseek_summary_writer, ) -from cath_alphaflow.settings import get_default_settings -from cath_alphaflow.constants import DEFAULT_FS_BITS_CUTOFF, DEFAULT_FS_OVERLAP -from cath_alphaflow.models.domains import FoldseekSummary +from cath_alphaflow.settings import get_default_settings, DEFAULT_AF_VERSION +from cath_alphaflow.constants import DEFAULT_FS_BITS_CUTOFF, DEFAULT_FS_OVERLAP,ID_TYPE_AF_DOMAIN,ID_TYPE_UNIPROT_DOMAIN +from cath_alphaflow.models.domains import FoldseekSummary,AFDomainID +from cath_alphaflow.errors import ArgumentError config = get_default_settings() @@ -20,37 +21,87 @@ "--id_file", type=click.File("rt"), required=True, - help="Input: CSV file containing list of ids to convert from CIF to DSSP", + help="Input: CSV file containing list of ids to process.", +) +@click.option( + "--id_type", + type=click.Choice([ID_TYPE_AF_DOMAIN, ID_TYPE_UNIPROT_DOMAIN]), + default=ID_TYPE_AF_DOMAIN, + help=f"Option: specify the type of ID in id_file [{ID_TYPE_AF_DOMAIN}]", ) @click.option( "--fs_input_file", type=click.File("rt"), - default="fs_query_results.m8", + required=True, help=f"Foldseek tabular output as input", ) @click.option( "--fs_results", type=click.File("wt"), - default="fs_hits.tsv", - help=f"Foldseek hits file", + required=True, + help=f"Foldseek results summary file", +) +@click.option( + "--af_version", + type=int, + default=DEFAULT_AF_VERSION, + help=f"Option: specify the AF version when parsing uniprot ids. (default:{DEFAULT_AF_VERSION})", ) -def convert_foldseek_output_to_summary(id_file, fs_input_file, fs_results): + +def convert_foldseek_output_to_summary(id_file, fs_input_file, fs_results, id_type, af_version): unique_af_ids = set() unique_af_ids.add("NOHIT") - best_hits = set() + best_hit_by_query = {} foldseek_results_writer = get_foldseek_summary_writer(fs_results) - for file_stub in yield_first_col(id_file): - unique_af_ids.add(file_stub) + # Build set of unique AF IDs + for af_domain_id_str in yield_first_col(id_file): + if id_type == ID_TYPE_UNIPROT_DOMAIN: + af_domain_id = AFDomainID.from_uniprot_str( + af_domain_id_str,version=af_version + ) + af_domain_id_str = af_domain_id.to_file_stub() + elif id_type == ID_TYPE_AF_DOMAIN: + af_domain_id = AFDomainID.from_str(af_domain_id_str) + af_domain_id_str = af_domain_id.to_file_stub() + else: + msg = f"failed to understand id_type '${id_type}'" + raise ArgumentError(msg) + unique_af_ids.add(af_domain_id_str) + + # Build Foldseek Reader foldseek_reader = get_foldseek_reader(fs_input_file) + + # Extract best hit per query for filtered ids for foldseek_result_as_dict in foldseek_reader: result = FoldseekSummary(**foldseek_result_as_dict) - if result.query.endswith(".cif"): - result.query = result.query[:-4] - if ( - result.query not in best_hits - and float(result.tcov) >= DEFAULT_FS_OVERLAP - and int(result.bits) >= DEFAULT_FS_BITS_CUTOFF - and result.query in unique_af_ids - ): - best_hits.add(result.query) - foldseek_results_writer.writerow(result.__dict__) + + """ + result: + {'query': 'AF-A0A059CHW2-F1-model_v4-22-322.cif', 'target': '1xhlA00', 'qstart': '7', 'qend': '271', + 'qlen': '301', 'tstart': '3', 'tend': '252', 'tlen': '274', 'qcov': '0.880', 'tcov': '0.912', + 'bits': '509', 'evalue': '1.305E-11'} + """ + + af_query_id = AFDomainID.from_foldseek_query(result.query) + + + af_query_id_str = af_query_id.from_str(str(af_query_id)).to_file_stub() + + if af_query_id_str not in unique_af_ids: + continue + + if float(result.tcov) <= DEFAULT_FS_OVERLAP or int(result.bits) <= DEFAULT_FS_BITS_CUTOFF: + continue + + if af_query_id_str not in best_hit_by_query: + best_hit_by_query[af_query_id_str] = result + + if int(result.bits) > int(best_hit_by_query[af_query_id_str].bits): + best_hit_by_query[af_query_id_str] = result + + + + + # Write results to summary file + for af_query_id, result in best_hit_by_query.items(): + foldseek_results_writer.writerow(result.__dict__) From 7b426a0fd120db86fc71d64de50a0f1dd72c4a52 Mon Sep 17 00:00:00 2001 From: Nicola Bordin Date: Tue, 14 Mar 2023 16:26:05 +0000 Subject: [PATCH 39/44] Introduce defaults for coverage and aligner Require target databases and raw files Expand click options on coverage and aligner Remove temporary files --- cath_alphaflow/commands/run_foldseek.py | 60 ++++++++++++++++++++----- 1 file changed, 49 insertions(+), 11 deletions(-) diff --git a/cath_alphaflow/commands/run_foldseek.py b/cath_alphaflow/commands/run_foldseek.py index 2b27bbb..e85ea2e 100644 --- a/cath_alphaflow/commands/run_foldseek.py +++ b/cath_alphaflow/commands/run_foldseek.py @@ -2,6 +2,7 @@ import logging from pathlib import Path import os +import glob import click import subprocess @@ -10,8 +11,11 @@ config = get_default_settings() FS_BINARY_PATH = config.FS_BINARY_PATH -FS_DB_PATH = config.FS_DB_PATH FS_TMP_PATH = config.FS_TMP_PATH +FS_OVERLAP = config.FS_OVERLAP +DEFAULT_FS_COV_MODE = "0" # overlap over query and target +DEFAULT_FS_ALIGNER = "2" # 3di+AA (fast, accurate) +DEFAULT_FS_FORMAT_OUTPUT = "query,target,qstart,qend,qlen,tstart,tend,tlen,qcov,tcov,bits,evalue" LOG = logging.getLogger() @@ -20,25 +24,25 @@ @click.option( "--fs_querydb", type=click.Path(exists=True, file_okay=True, resolve_path=True), - default="fs_query_structures.db", + required=True, help=f"Input: Foldseek query database)", ) @click.option( "--fs_targetdb", type=click.Path(exists=True, file_okay=True, resolve_path=True), - default=FS_DB_PATH, - help=f"Target Database for Foldseek. default:{FS_DB_PATH}", + required=True, + help=f"Target Database for Foldseek", ) @click.option( "--fs_rawdata", type=click.Path(resolve_path=True), - default="fs_query_structures.raw", + default="./fs_query_structures.raw", help=f"Raw output of Foldseek (before convertalis). default: fs_query_structures.raw", ) @click.option( "--fs_results", type=click.Path(resolve_path=True), - default="fs_query_results.m8", + default="./fs_query_results.m8", help=f"Foldseek tabular output", ) @click.option( @@ -48,11 +52,36 @@ default=FS_TMP_PATH, help=f"Output: Foldseek temp folder (default:{FS_TMP_PATH})", ) -def run_foldseek(fs_querydb, fs_targetdb, fs_rawdata, fs_results, tmp_dir): +@click.option( + "--cov_mode", + type=click.Choice(["0","1","2","3","4","5"]), + default=DEFAULT_FS_COV_MODE, + help=f"Foldseek coverage mode: 0: % of query and target 1: % target 2: % query 3: target seqlen is %of query 4: query seqlen is % of target 5: shortest seq is % of longest (default: 0)", +) +@click.option( + "--coverage", + type=float, + default=FS_OVERLAP, + help=f'Foldseek overlap (default:{FS_OVERLAP})' +) +@click.option( + "--fs_bin_path", + type=click.Path(file_okay=True, resolve_path=True), + default=FS_BINARY_PATH, + help=f"Option: directory containing the Foldseek executable. (default: {FS_BINARY_PATH})" +) +@click.option( + "--alignment-type", + type=click.Choice(["0","1","2"]), + default=DEFAULT_FS_ALIGNER, + help=f"Option: Foldseek alignment engine: 0: 3di alignment 1: TMalign 2: 3di+AA. (default: {DEFAULT_FS_ALIGNER})", +) +def run_foldseek(fs_querydb, fs_targetdb, fs_rawdata, fs_results, tmp_dir, cov_mode, coverage, alignment_type, fs_bin_path): "Run Foldseek Query DB against Target DB" + assert str(fs_rawdata) != '' subprocess.call( [ - FS_BINARY_PATH, + fs_bin_path, "search", fs_querydb, fs_targetdb, @@ -60,22 +89,31 @@ def run_foldseek(fs_querydb, fs_targetdb, fs_rawdata, fs_results, tmp_dir): tmp_dir, "-s", "9", + "--cov-mode", + str(cov_mode), + "-c", + str(coverage), + "--alignment-type", + str(alignment_type) ], stderr=subprocess.DEVNULL, ) subprocess.run( [ - FS_BINARY_PATH, + fs_bin_path, "convertalis", fs_querydb, fs_targetdb, fs_rawdata, fs_results, "--format-output", - "query,target,qstart,qend,qlen,tstart,tend,tlen,qcov,tcov,bits,evalue", + DEFAULT_FS_FORMAT_OUTPUT, ], stderr=subprocess.DEVNULL, check=True, ) - + files_to_remove = glob.glob(f"{fs_rawdata}*") + if files_to_remove: + for file in files_to_remove: + os.unlink(file) click.echo("DONE") From 030ff329798fff9477859d03617e4861235655a9 Mon Sep 17 00:00:00 2001 From: Nicola Bordin Date: Tue, 14 Mar 2023 16:26:23 +0000 Subject: [PATCH 40/44] Add classmethod from_foldseek_query --- cath_alphaflow/models/domains.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cath_alphaflow/models/domains.py b/cath_alphaflow/models/domains.py index 96dde23..9a03e28 100644 --- a/cath_alphaflow/models/domains.py +++ b/cath_alphaflow/models/domains.py @@ -240,6 +240,12 @@ def from_str(cls, raw_domid: str): raise ParseError(msg) return domid + + @classmethod + def from_foldseek_query(cls, raw_query_id: str): + if raw_query_id.endswith(".cif"): + raw_query_id = raw_query_id.replace('.cif','') + return cls.from_str(raw_query_id) @property def af_domain_id(self): From ad5f61aadede450dcd4d85715f41afca599606ce Mon Sep 17 00:00:00 2001 From: Nicola Bordin Date: Tue, 14 Mar 2023 16:27:10 +0000 Subject: [PATCH 41/44] Fix test Unlink tmp file if present Introduce FS_BINARY from config or env --- tests/test_cif_to_foldseek.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tests/test_cif_to_foldseek.py b/tests/test_cif_to_foldseek.py index 374d076..638867d 100644 --- a/tests/test_cif_to_foldseek.py +++ b/tests/test_cif_to_foldseek.py @@ -2,16 +2,15 @@ from pathlib import Path import csv import logging - +from click.testing import CliRunner from cath_alphaflow.cli import cli +from cath_alphaflow.settings import get_default_settings +config = get_default_settings() -UNIPROT_IDS = ["P00520"] +FS_BINARY_PATH = Path(config.FS_BINARY_PATH) FIXTURE_PATH = Path(__file__).parent / "fixtures" EXAMPLE_CIF_FILE = FIXTURE_PATH / "cif" / "AF-P00520-F1-model_v3.cif.gz" - -FS_BINARY_PATH = Path(__file__).parent.parent / "foldseek" / "bin" / "foldseek" - SUBCOMMAND = "convert-cif-to-foldseek-db" @@ -19,8 +18,8 @@ msg = f"cannot run tests as foldseek is not installed: {FS_BINARY_PATH}" -def test_cli_usage(create_cli_runner): - runner = create_cli_runner() +def test_cli_usage(): + runner = CliRunner() with runner.isolated_filesystem(): result = runner.invoke(cli, [SUBCOMMAND, "--help"]) assert result.exit_code == 0 @@ -39,7 +38,9 @@ def create_fake_cif_dir(dirname, ids, cif_src=EXAMPLE_CIF_FILE): dir_path = Path(dirname) dir_path.mkdir() for _id in ids: - path_dest = dir_path / f"{_id}.cif" + path_dest = dir_path / f"{_id}.cif.gz" + if path_dest.is_symlink(): + path_dest.unlink() os.symlink(cif_src, f"{path_dest}") return dir_path @@ -47,14 +48,14 @@ def create_fake_cif_dir(dirname, ids, cif_src=EXAMPLE_CIF_FILE): def test_convert_cif_to_foldseek_db(tmp_path, create_cli_runner): headers = ["header"] - ids = ["id1", "id2"] + ids = ["AF-P00520-F1-model_v3.cif.gz"] runner = create_cli_runner(extra_settings={"FS_BINARY_PATH": "foldseek-fake-path"}) with runner.isolated_filesystem(temp_dir=tmp_path): cwd_path = Path.cwd() - tmp_dssp_path = create_fake_cif_dir("cif", ids) + tmp_fs_path = create_fake_cif_dir("cif", ids) tmp_id_path = cwd_path / "ids.csv" with tmp_id_path.open("wt") as fh: write_ids_to_file(fh, headers, ids) @@ -64,12 +65,11 @@ def test_convert_cif_to_foldseek_db(tmp_path, create_cli_runner): args = ( SUBCOMMAND, "--cif_dir", - f"{tmp_dssp_path}", - "--id_file", - f"{tmp_id_path}", + f"{tmp_fs_path}", "--fs_querydb_dir", f"{tmp_foldseek_db_path}", ) + print(args) result = runner.invoke(cli, args) assert result.exit_code == 0 assert "DONE" in result.output From ab351094c7a48b6d36202ec9771b005754b9c5c4 Mon Sep 17 00:00:00 2001 From: Nicola Bordin Date: Wed, 15 Mar 2023 10:09:57 +0000 Subject: [PATCH 42/44] Switch from three_to_one to protein_letters_3to1 to avoid warning about deprecation warning --- tests/test_chopping.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_chopping.py b/tests/test_chopping.py index 006e51d..346ebc7 100644 --- a/tests/test_chopping.py +++ b/tests/test_chopping.py @@ -7,6 +7,7 @@ from Bio import SeqIO from Bio.PDB import MMCIFParser from Bio.PDB.Polypeptide import three_to_one +from Bio.PDB.Polypeptide import protein_letters_3to1 from Bio.PDB.mmcifio import MMCIFIO from Bio.PDB.MMCIF2Dict import MMCIF2Dict @@ -143,7 +144,7 @@ def test_chop_multi_fragment(): structure = parser.get_structure(uniprot_with_chopping, fp) sequence_from_chopped_cif = "".join( - [three_to_one(res.get_resname()) for res in structure.get_residues()] + [protein_letters_3to1[res.get_resname()] for res in structure.get_residues()] ) assert sequence_from_chopped_fasta == sequence_from_chopped_cif From c0629733fe15e3577468257fcb1099803a905eec Mon Sep 17 00:00:00 2001 From: Nicola Bordin Date: Wed, 15 Mar 2023 10:15:08 +0000 Subject: [PATCH 43/44] Remove three_to_one --- tests/test_chopping.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_chopping.py b/tests/test_chopping.py index 346ebc7..a40940a 100644 --- a/tests/test_chopping.py +++ b/tests/test_chopping.py @@ -6,7 +6,6 @@ from Bio import SeqIO from Bio.PDB import MMCIFParser -from Bio.PDB.Polypeptide import three_to_one from Bio.PDB.Polypeptide import protein_letters_3to1 from Bio.PDB.mmcifio import MMCIFIO from Bio.PDB.MMCIF2Dict import MMCIF2Dict From f3fbc33ef1c6e40844581633086e566c54a670d8 Mon Sep 17 00:00:00 2001 From: Nicola Bordin Date: Wed, 15 Mar 2023 17:14:53 +0000 Subject: [PATCH 44/44] Replace subprocess call with subprocess run --- cath_alphaflow/commands/run_foldseek.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cath_alphaflow/commands/run_foldseek.py b/cath_alphaflow/commands/run_foldseek.py index e85ea2e..c5bf88f 100644 --- a/cath_alphaflow/commands/run_foldseek.py +++ b/cath_alphaflow/commands/run_foldseek.py @@ -79,7 +79,7 @@ def run_foldseek(fs_querydb, fs_targetdb, fs_rawdata, fs_results, tmp_dir, cov_mode, coverage, alignment_type, fs_bin_path): "Run Foldseek Query DB against Target DB" assert str(fs_rawdata) != '' - subprocess.call( + subprocess.run( [ fs_bin_path, "search", @@ -97,6 +97,7 @@ def run_foldseek(fs_querydb, fs_targetdb, fs_rawdata, fs_results, tmp_dir, cov_m str(alignment_type) ], stderr=subprocess.DEVNULL, + check=True, ) subprocess.run( [