Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

First version of pLDDT and LUR module #11

Merged
merged 12 commits into from
Oct 13, 2022
2 changes: 2 additions & 0 deletions cath_alphaflow/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from .commands import optimise_domain_boundaries
from .commands import convert_dssp_to_sse_summary
from .commands import convert_cif_to_dssp
from .commands import extract_plddt_and_lur

logging.basicConfig(
level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s"
Expand Down Expand Up @@ -47,3 +48,4 @@ def dump_config():
# Attach each imported sub-command to `cli` (presumably the top-level click
# group defined earlier in this file; its definition is not visible here).
cli.add_command(optimise_domain_boundaries.optimise_domain_boundaries)
cli.add_command(convert_dssp_to_sse_summary.convert_dssp_to_sse_summary)
cli.add_command(convert_cif_to_dssp.convert_cif_to_dssp)
cli.add_command(extract_plddt_and_lur.convert_cif_to_plddt_summary)
8 changes: 1 addition & 7 deletions cath_alphaflow/commands/convert_dssp_to_sse_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,7 @@
get_sse_summary_writer,
)
from cath_alphaflow.models import SecStrSummary
from cath_alphaflow.constants import (
DEFAULT_DSSP_SUFFIX,
DEFAULT_HELIX_MIN_LENGTH,
DEFAULT_STRAND_MIN_LENGTH,
)
from cath_alphaflow.constants import DEFAULT_DSSP_SUFFIX


@click.command()
Expand Down Expand Up @@ -60,8 +56,6 @@ def get_sse_summary_from_dssp(

dssp_string = []
read_headers = False
domain_length = 0
ss_total = 0
if acc_id is None:
acc_id = dssp_path.stem

Expand Down
122 changes: 122 additions & 0 deletions cath_alphaflow/commands/extract_plddt_and_lur.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
from pathlib import Path
from Bio.PDB import MMCIF2Dict
import logging
import click
from cath_alphaflow.io_utils import (
yield_first_col,
get_plddt_summary_writer,
)
from cath_alphaflow.models import pLDDTSummary
from cath_alphaflow.constants import MIN_LENGTH_LUR

LOG = logging.getLogger()


# CLI entry point: for every id in the id file, locate the matching mmCIF file,
# compute its average pLDDT and LUR percentage, and write one summary row.
# The one-line docstring below is kept short because click uses it as the
# command's help text.
@click.command()
@click.option(
    "--cif_in_dir",
    type=click.Path(exists=True, file_okay=False, dir_okay=True, resolve_path=True),
    required=True,
    help="Input: directory of CIF files",
)
@click.option(
    "--id_file",
    type=click.File("rt"),
    required=True,
    help="Input: CSV file containing list of ids to process from CIF to pLDDT",
)
@click.option(
    "--plddt_stats_file",
    type=click.File("wt"),
    required=True,
    help="Output: pLDDT and LUR output file",
)
@click.option(
    "--cif_suffix",
    type=str,
    default=".cif",
    help="Option: suffix to use for mmCIF files (default: .cif)",
)
def convert_cif_to_plddt_summary(
    cif_in_dir,
    id_file,
    plddt_stats_file,
    cif_suffix,
):
    "Creates summary of average pLDDT and LUR percentage from mmCIF files"

    # The writer emits the header row as soon as it is created.
    plddt_out_writer = get_plddt_summary_writer(plddt_stats_file)

    for file_stub in yield_first_col(id_file):
        cif_path = Path(cif_in_dir) / f"{file_stub}{cif_suffix}"
        # Fail fast: a missing input aborts the whole run rather than
        # silently producing an incomplete summary file.
        if not cif_path.exists():
            msg = f"failed to locate CIF input file {cif_path}"
            LOG.error(msg)
            raise FileNotFoundError(msg)

        # No chopping is passed, so both statistics cover the whole chain.
        avg_plddt = get_average_plddt_from_plddt_string(cif_path)
        perc_LUR = get_LUR_residues_percentage(cif_path)
        plddt_stats = pLDDTSummary(
            af_domain_id=file_stub, avg_plddt=avg_plddt, perc_LUR=perc_LUR
        )
        plddt_out_writer.writerow(plddt_stats.__dict__)

    click.echo("DONE")


def get_average_plddt_from_plddt_string(
    cif_path: Path, *, chopping=None, acc_id=None
) -> float:
    """Return the average pLDDT for a whole chain or for a chopped region.

    Args:
        cif_path: mmCIF file to read.
        chopping: optional object exposing ``segments``; each segment's
            ``start``/``end`` are used as 1-based inclusive positions into
            the per-residue pLDDT list (assumes that list is ordered by
            residue position starting at 1 -- TODO confirm).
        acc_id: currently unused; accepted for interface compatibility.

    Returns:
        With no chopping, the first ``_ma_qa_metric_global.metric_value``
        entry of the file as a float. With a chopping, the mean of the
        selected ``_ma_qa_metric_local.metric_value`` entries, rounded to
        two decimals.

    Raises:
        ValueError: if the chopping selects no residues.
    """
    mmcif_dict = MMCIF2Dict.MMCIF2Dict(cif_path)

    if not chopping:
        # The parsed value is text; convert so both branches return a float.
        return float(mmcif_dict["_ma_qa_metric_global.metric_value"][0])

    plddt_values = mmcif_dict["_ma_qa_metric_local.metric_value"]

    # Collect the per-residue values in a list of floats: the parsed values
    # are strings and cannot be summed (or concatenated onto a str) directly.
    segment_plddt = []
    for segment in chopping.segments:
        segment_plddt.extend(
            float(value) for value in plddt_values[(segment.start - 1) : segment.end]
        )

    if not segment_plddt:
        raise ValueError(f"chopping selects no residues in {cif_path}")

    # No rescaling: the same per-residue values are compared against 90 by
    # the LUR calculation in this module, and the whole-chain value above is
    # returned as-is, so both branches stay on the same scale.
    return round(sum(segment_plddt) / len(segment_plddt), 2)


def get_LUR_residues_percentage(
    cif_path: Path, *, chopping=None, acc_id=None
) -> float:
    """Return the percentage of residues that fall inside LUR stretches.

    A residue counts towards the LUR total when it belongs to a run of at
    least ``MIN_LENGTH_LUR`` consecutive residues whose pLDDT is below 90.

    Args:
        cif_path: mmCIF file to read.
        chopping: optional object exposing ``segments``; each segment's
            ``start``/``end`` are used as 1-based inclusive positions into
            the per-residue pLDDT list (assumes that list is ordered by
            residue position starting at 1 -- TODO confirm). The selected
            segments are concatenated and scanned as one sequence.
        acc_id: currently unused; accepted for interface compatibility.

    Returns:
        ``100 * LUR residues / selected residues``, rounded to two decimals.

    Raises:
        ValueError: if no residues are selected.
    """
    mmcif_dict = MMCIF2Dict.MMCIF2Dict(cif_path)
    plddt_values = mmcif_dict["_ma_qa_metric_local.metric_value"]

    if chopping:
        # Build a list (not a str) so the per-segment slices can be joined.
        segment_plddt = []
        for segment in chopping.segments:
            segment_plddt.extend(plddt_values[(segment.start - 1) : segment.end])
    else:
        segment_plddt = plddt_values

    if not segment_plddt:
        raise ValueError(f"no residues selected for LUR calculation in {cif_path}")

    low_plddt_cutoff = 90
    min_res_lur = MIN_LENGTH_LUR

    lur_total = 0
    run_length = 0  # length of the current run of below-cutoff residues
    for residue in segment_plddt:
        if float(residue) < low_plddt_cutoff:
            run_length += 1
            if run_length == min_res_lur:
                # The run has just become long enough: count all of it.
                lur_total += min_res_lur
            elif run_length > min_res_lur:
                # Already inside a counted stretch: count this residue only.
                lur_total += 1
        else:
            run_length = 0

    return round(lur_total / len(segment_plddt) * 100, 2)
1 change: 1 addition & 0 deletions cath_alphaflow/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
# Default file suffix for DSSP files.
DEFAULT_DSSP_SUFFIX = ".dssp"
# Minimum helix / strand lengths (presumably in residues, used when
# summarising secondary structure -- confirm against the code that reads them).
DEFAULT_HELIX_MIN_LENGTH = 3
DEFAULT_STRAND_MIN_LENGTH = 2
# Minimum number of consecutive residues with pLDDT < 90 that
# get_LUR_residues_percentage counts as an LUR stretch.
MIN_LENGTH_LUR = 5
13 changes: 13 additions & 0 deletions cath_alphaflow/io_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,19 @@ def get_sse_summary_writer(csvfile):
return writer


def get_plddt_summary_writer(csvfile):
    """Build the dict writer for the pLDDT summary file and emit its header.

    Args:
        csvfile: open, writable file handle passed on to get_csv_dictwriter.

    Returns:
        The writer returned by get_csv_dictwriter, after the header row
        (af_domain_id, avg_plddt, perc_LUR) has been written.
    """
    columns = ["af_domain_id", "avg_plddt", "perc_LUR"]
    summary_writer = get_csv_dictwriter(csvfile, fieldnames=columns)
    summary_writer.writeheader()
    return summary_writer


class AFDomainIDReader(csv.DictReader):
def __init__(self, *args):
self._seen_header = False
Expand Down
7 changes: 7 additions & 0 deletions cath_alphaflow/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,3 +194,10 @@ def new_from_dssp_str(
)

return ss_sum


@dataclass
class pLDDTSummary:
    """pLDDT statistics for one entry; written as one row of the pLDDT summary file."""

    # Identifier of the entry (the id read from the id file by the
    # convert_cif_to_plddt_summary command).
    af_domain_id: str
    # Average pLDDT as returned by get_average_plddt_from_plddt_string.
    avg_plddt: float
    # Percentage of residues in LUR stretches, as returned by
    # get_LUR_residues_percentage.
    perc_LUR: float