Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Foldseek #15

Merged
merged 48 commits into from
Mar 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
1f8e46b
Add CLI option for convert_cif_to_foldseek_db
bordin89 Oct 17, 2022
776c444
Add FS paths to config.env.example
bordin89 Oct 17, 2022
e09985c
Add defults for Foldseek suffixes
bordin89 Oct 17, 2022
a2fb1e5
Add Foldseek paths to settings.py
bordin89 Oct 17, 2022
3d91ade
Create module to generate Foldseek query database
bordin89 Oct 17, 2022
0127853
create run foldseek module
bordin89 Oct 26, 2022
117344b
added settings.json to gitignore
bordin89 Oct 26, 2022
ba0883d
add run foldseek, convert cif to db
bordin89 Oct 26, 2022
1c794a9
add foldseek constants
bordin89 Oct 26, 2022
8987027
remove default for intermediate suffix
bordin89 Oct 26, 2022
f373ed0
add module to convert foldseek output
bordin89 Oct 26, 2022
7b4e0a3
Add Foldseek Summary
bordin89 Oct 27, 2022
c317c11
Add Foldseek Summary Writer
bordin89 Oct 27, 2022
0bfb1a8
remove duplicate declaration
bordin89 Oct 27, 2022
ec83189
Revisited foldseek parser
bordin89 Oct 27, 2022
8dbc269
Include filters for overlap and bits
bordin89 Oct 27, 2022
009e067
Create new FoldseekReader
bordin89 Oct 27, 2022
c92572e
Add query to FoldseekSummary
bordin89 Oct 27, 2022
6ada0ef
Replace subprocess.call with subprocess.run
bordin89 Nov 2, 2022
59508d9
Clean to return fs_querydb_path.exists
bordin89 Nov 2, 2022
3dfbded
Change subprocess.call to subproces.run
bordin89 Nov 2, 2022
53d87e6
update description
sillitoe Nov 2, 2022
e556909
Merge branch 'foldseek' of github.com:UCLOrengoGroup/cath-alphaflow i…
sillitoe Nov 2, 2022
783e3c7
tidy up path calculations
sillitoe Nov 2, 2022
94aa5a9
add check for expected output file
sillitoe Nov 2, 2022
d0193e0
clarify usage of foldseek path
sillitoe Nov 14, 2022
d18955e
provide defaults for foldseek settings
sillitoe Nov 14, 2022
1aabd7c
add foldseek tests
sillitoe Nov 14, 2022
81bbe1c
add create_cli_runner
sillitoe Nov 14, 2022
80888e0
install foldseek
sillitoe Nov 14, 2022
8bd009c
correct path
sillitoe Nov 14, 2022
6ee75c2
simplify path
sillitoe Nov 14, 2022
478b9e9
Merge branch 'main' into foldseek
bordin89 Mar 10, 2023
45cbd3e
Add DEFAULT_FS_QUERYDB_NAME to constants
bordin89 Mar 10, 2023
c8aacbe
Add option to generate query db for set of files
bordin89 Mar 13, 2023
8d40098
Switch from cath_alphaflow.domains
bordin89 Mar 13, 2023
89077db
Fix typo
bordin89 Mar 13, 2023
b4467e2
Merge branch 'main' into foldseek
bordin89 Mar 13, 2023
6dd1795
Add Foldseek overlap to settings
bordin89 Mar 14, 2023
21d301b
Replace tmp_dir with TemporaryDirectory
bordin89 Mar 14, 2023
53dd10c
Switch from file_stub to id_type based reader
bordin89 Mar 14, 2023
7b426a0
Introduce defaults for coverage and aligner
bordin89 Mar 14, 2023
030ff32
Add classmethod from_foldseek_query
bordin89 Mar 14, 2023
ad5f61a
Fix test
bordin89 Mar 14, 2023
2acb3e7
Update main branch for files untouched by this repo
bordin89 Mar 14, 2023
ab35109
Switch from three_to_one to
bordin89 Mar 15, 2023
c062973
Remove three_to_one
bordin89 Mar 15, 2023
f3fbc33
Replace subprocess call with subprocess run
bordin89 Mar 15, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@ jobs:
python -m pip install flake8 pytest
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
pip install -e .
- name: Install Foldseek
run: |
wget https://mmseqs.com/foldseek/foldseek-linux-sse41.tar.gz; tar xvzf foldseek-linux-sse41.tar.gz
echo "${HOME}/foldseek/bin" > $GITHUB_PATH
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
Expand Down
7 changes: 7 additions & 0 deletions cath_alphaflow/cli.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
import logging
import click


from .settings import get_default_settings
from .commands import create_dataset_uniprot_ids
from .commands import create_dataset_cath_files
from .commands import optimise_domain_boundaries
from .commands import convert_dssp_to_sse_summary
from .commands import convert_cif_to_dssp
from .commands import extract_plddt_and_lur
from .commands import convert_cif_to_foldseek_db
from .commands import run_foldseek
from .commands import convert_foldseek_output_to_summary
from .commands import chop_domain
from .commands import create_md5
from .commands import convert_cif_to_fasta
Expand Down Expand Up @@ -54,6 +58,9 @@ def dump_config():
cli.add_command(convert_dssp_to_sse_summary.convert_dssp_to_sse_summary)
cli.add_command(convert_cif_to_dssp.convert_cif_to_dssp)
cli.add_command(extract_plddt_and_lur.convert_cif_to_plddt_summary)
cli.add_command(convert_cif_to_foldseek_db.convert_cif_to_foldseek_db)
cli.add_command(run_foldseek.run_foldseek)
cli.add_command(convert_foldseek_output_to_summary.convert_foldseek_output_to_summary)
cli.add_command(chop_domain.chop_cif_command)
cli.add_command(create_md5.create_md5)
cli.add_command(convert_cif_to_fasta.convert_cif_to_fasta)
Expand Down
137 changes: 137 additions & 0 deletions cath_alphaflow/commands/convert_cif_to_foldseek_db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
import logging
from pathlib import Path
import os
import click
import shutil
import subprocess
from cath_alphaflow.io_utils import yield_first_col
from cath_alphaflow.models.domains import AFDomainID
from cath_alphaflow.constants import (
DEFAULT_FS_QUERYDB_SUFFIX,
DEFAULT_FS_QUERYDB_NAME,
DEFAULT_CIF_SUFFIX,
ID_TYPE_AF_DOMAIN,
ID_TYPE_UNIPROT_DOMAIN,
)
from tempfile import TemporaryDirectory
from cath_alphaflow.settings import get_default_settings,DEFAULT_AF_VERSION
from cath_alphaflow.errors import ArgumentError

config = get_default_settings()

FS_BINARY_PATH = config.FS_BINARY_PATH

LOG = logging.getLogger()


@click.command()
@click.option(
"--cif_dir",
type=click.Path(exists=True, file_okay=False, dir_okay=True, resolve_path=True),
required=True,
help="Input: directory of CIF files",
)
@click.option(
"--cif_suffix",
type=str,
default=DEFAULT_CIF_SUFFIX,
help=f"Input: optional suffix to add to id when looking for cif file (default: {DEFAULT_CIF_SUFFIX})",
)
@click.option(
"--fs_querydb_name",
type=str,
default=DEFAULT_FS_QUERYDB_NAME,
help=f"Output: Foldseek Query Database Name (default: {DEFAULT_FS_QUERYDB_NAME})",
)
@click.option(
"--fs_querydb_suffix",
type=str,
default=DEFAULT_FS_QUERYDB_SUFFIX,
help=f"Option: optional suffix to add to Foldseek query database during creation (default: {DEFAULT_FS_QUERYDB_SUFFIX})",
)
@click.option(
"--id_file",
type=click.File("rt"),
default=None,
required=False,
help='Optional id list file if generating a subset of a larger folder. (default: False)'
)
@click.option(
"--id_type",
type=click.Choice([ID_TYPE_AF_DOMAIN, ID_TYPE_UNIPROT_DOMAIN]),
default=ID_TYPE_AF_DOMAIN,
help=f"Option: specify the type of ID in id_file [{ID_TYPE_AF_DOMAIN}]",
)
@click.option(
"--fs_querydb_dir",
type=click.Path(file_okay=False, dir_okay=True, resolve_path=True),
required=True,
help=f"Output: directory to use for Foldseek query database during creation",
)
@click.option(
"--fs_bin_path",
type=click.Path(file_okay=True, resolve_path=True),
default=FS_BINARY_PATH,
help=f"Option: directory containing the Foldseek executable. (default: {FS_BINARY_PATH})"
)
@click.option(
"--af_version",
type=int,
default=DEFAULT_AF_VERSION,
help=f"Option: specify the AF version when parsing uniprot ids. (default: {DEFAULT_AF_VERSION}",
)
def convert_cif_to_foldseek_db(
cif_dir, fs_querydb_dir, fs_querydb_name, id_file, id_type, cif_suffix, fs_querydb_suffix, fs_bin_path, af_version
):
"Create Foldseek query database from mmCIF folder"

if FS_BINARY_PATH is None:
msg = "expected foldseek binary path (FS_BINARY_PATH) to be set"
raise RuntimeError(msg)

fs_querydb_path = Path(fs_querydb_dir).resolve()
if not fs_querydb_path.exists():
os.makedirs(fs_querydb_path)

af_tmp_dir = None
if id_file is not None:
af_tmp_dir = TemporaryDirectory(prefix='af_fs_tmp_dir_')
for af_domain_id_str in yield_first_col(id_file):
if id_type == ID_TYPE_UNIPROT_DOMAIN:
af_domain_id = AFDomainID.from_uniprot_str(
af_domain_id_str,version=af_version
)
elif id_type == ID_TYPE_AF_DOMAIN:
af_domain_id = AFDomainID.from_str(af_domain_id_str)
else:
msg = f"failed to understand id_type '${id_type}'"
raise ArgumentError(msg)

file_stub = af_domain_id.to_file_stub()
cif_path = Path(cif_dir) / f"{file_stub}{cif_suffix}"
if not cif_path.exists():
msg = f"failed to locate CIF input file {cif_path}"
LOG.error(msg)
raise FileNotFoundError(msg)
# Create symlinks to querydb_dir
dest_cif_path = Path(af_tmp_dir.name) / cif_path.name
if not dest_cif_path.exists():
os.symlink(str(cif_path), str(dest_cif_path))
cif_input_dir = af_tmp_dir.name
fs_querydb = Path(f"{fs_querydb_dir}/{fs_querydb_name}{fs_querydb_suffix}")
else:
cif_input_dir = cif_dir
fs_querydb = Path(f"{fs_querydb_dir}/{fs_querydb_name}{fs_querydb_suffix}")
LOG.info(f'{cif_input_dir} {fs_querydb_dir}')
subprocess.run(
[
f"{fs_bin_path}",
"createdb",
f"{cif_input_dir}",
f"{fs_querydb}",
],
stderr=subprocess.DEVNULL,
check=True,
)
click.echo("DONE")
return
107 changes: 107 additions & 0 deletions cath_alphaflow/commands/convert_foldseek_output_to_summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
import logging
from unittest import result
import click
from cath_alphaflow.io_utils import (
yield_first_col,
get_foldseek_reader,
get_foldseek_summary_writer,
)
from cath_alphaflow.settings import get_default_settings, DEFAULT_AF_VERSION
from cath_alphaflow.constants import DEFAULT_FS_BITS_CUTOFF, DEFAULT_FS_OVERLAP,ID_TYPE_AF_DOMAIN,ID_TYPE_UNIPROT_DOMAIN
from cath_alphaflow.models.domains import FoldseekSummary,AFDomainID
from cath_alphaflow.errors import ArgumentError

config = get_default_settings()

LOG = logging.getLogger()


@click.command()
@click.option(
"--id_file",
type=click.File("rt"),
required=True,
help="Input: CSV file containing list of ids to process.",
)
@click.option(
"--id_type",
type=click.Choice([ID_TYPE_AF_DOMAIN, ID_TYPE_UNIPROT_DOMAIN]),
default=ID_TYPE_AF_DOMAIN,
help=f"Option: specify the type of ID in id_file [{ID_TYPE_AF_DOMAIN}]",
)
@click.option(
"--fs_input_file",
type=click.File("rt"),
required=True,
help=f"Foldseek tabular output as input",
)
@click.option(
"--fs_results",
type=click.File("wt"),
required=True,
help=f"Foldseek results summary file",
)
@click.option(
"--af_version",
type=int,
default=DEFAULT_AF_VERSION,
help=f"Option: specify the AF version when parsing uniprot ids. (default:{DEFAULT_AF_VERSION})",
)

def convert_foldseek_output_to_summary(id_file, fs_input_file, fs_results, id_type, af_version):
unique_af_ids = set()
unique_af_ids.add("NOHIT")
best_hit_by_query = {}
foldseek_results_writer = get_foldseek_summary_writer(fs_results)
# Build set of unique AF IDs
for af_domain_id_str in yield_first_col(id_file):
if id_type == ID_TYPE_UNIPROT_DOMAIN:
af_domain_id = AFDomainID.from_uniprot_str(
af_domain_id_str,version=af_version
)
af_domain_id_str = af_domain_id.to_file_stub()
elif id_type == ID_TYPE_AF_DOMAIN:
af_domain_id = AFDomainID.from_str(af_domain_id_str)
af_domain_id_str = af_domain_id.to_file_stub()
else:
msg = f"failed to understand id_type '${id_type}'"
raise ArgumentError(msg)
unique_af_ids.add(af_domain_id_str)

# Build Foldseek Reader
foldseek_reader = get_foldseek_reader(fs_input_file)

# Extract best hit per query for filtered ids
for foldseek_result_as_dict in foldseek_reader:
result = FoldseekSummary(**foldseek_result_as_dict)
Copy link
Contributor

@sillitoe sillitoe Oct 31, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You could move this link into foldseek_reader (so that it yields a FoldseekSummary object rather than dict)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tried changing that, but it breaks the code. I am still haven't understood how to pass from DictReader to a mapping.


"""
result:
{'query': 'AF-A0A059CHW2-F1-model_v4-22-322.cif', 'target': '1xhlA00', 'qstart': '7', 'qend': '271',
'qlen': '301', 'tstart': '3', 'tend': '252', 'tlen': '274', 'qcov': '0.880', 'tcov': '0.912',
'bits': '509', 'evalue': '1.305E-11'}
"""

af_query_id = AFDomainID.from_foldseek_query(result.query)


af_query_id_str = af_query_id.from_str(str(af_query_id)).to_file_stub()

if af_query_id_str not in unique_af_ids:
continue

if float(result.tcov) <= DEFAULT_FS_OVERLAP or int(result.bits) <= DEFAULT_FS_BITS_CUTOFF:
continue

if af_query_id_str not in best_hit_by_query:
best_hit_by_query[af_query_id_str] = result

if int(result.bits) > int(best_hit_by_query[af_query_id_str].bits):
best_hit_by_query[af_query_id_str] = result




# Write results to summary file
for af_query_id, result in best_hit_by_query.items():
foldseek_results_writer.writerow(result.__dict__)
Loading