Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Motif search #1549

Draft
wants to merge 10 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ __pycache__

.pypirc
.gitignore
.python-version

chauthorinfo.sh
pre-commit.sh
Expand All @@ -48,3 +49,4 @@ Documentation/*
*log
*BAK
*.sublime*
*.code-workspace
16 changes: 15 additions & 1 deletion prody/sequence/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,27 +46,41 @@
* :func:`.showShannonEntropy` - plot Shannon entropy
* :func:`.showMSAOccupancy` - plot row (sequence) or column occupancy
* :func:`.showMutinfoMatrix` - show mutual information matrix


Searching
========
* :func:`.getPdbCode` - get PDB code from MOTIF
mkonstanty marked this conversation as resolved.
Show resolved Hide resolved
"""

__all__ = []

from . import motif
from .motif import *

__all__.extend(motif.__all__)

from . import msa
from .msa import *

__all__.extend(msa.__all__)

from . import msafile
from .msafile import *

__all__.extend(msafile.__all__)

from . import analysis
from .analysis import *

__all__.extend(analysis.__all__)

from . import plotting
from .plotting import *

__all__.extend(plotting.__all__)

from . import sequence
from .sequence import *
__all__.extend(sequence.__all__)

__all__.extend(sequence.__all__)
125 changes: 125 additions & 0 deletions prody/sequence/motif.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""This module gets PDB code from MOTIF."""

import argparse
import re

import requests
from prody import LOGGER

__author__ = "Mariusz Konstanty"

__all__ = ["getPdbFromMotif", "expasySearchMotif", "_expasyTextToStructure"]

DATABASES = {
"sp": "Swiss-Prot",
# "rs": "RefSeq",
"pdb": "PDB",
"tr": "TrEMBL",
"local": "local database",
}
MOTIF_PATTERN = (
r"^([ARDNCEQGHILKMFPSTWYVx]*"
r"(\[[ARDNCEQGHILKMFPSTWYV]+\])*"
r"({{[{{ARDNCEQGHILKMFPSTWYV}}]+}})*"
r"([ARDNCEQGHILKMFPSTWYVx]\(\d(,\d)?\))*)+$"
)
MOTIF_MATCHER = re.compile(MOTIF_PATTERN)


def getPdbFromMotif(motif: str, database: str) -> str:
if database not in DATABASES:
raise ValueError(f"Database must be one of: {DATABASES.keys()}.")
mkonstanty marked this conversation as resolved.
Show resolved Hide resolved
if not re.match(MOTIF_MATCHER, motif.replace("-", "")):
raise ValueError(f"{motif} is not valid PROSITE motif.")
result = expasySearchMotif(motif, database)
pdb_code = ""
return pdb_code


def _expasyTextToStructure(response: str) -> list:
RESULT_PATTERN = (
r">(sp|tr|pdb)\|"
r"(\w+)\|(\w+).*\n"
r"(.*)\n"
r"([ARDNCEQGHILKMFPSTWYVx\n]+)\W+"
r"(\d{,3}) - (\d{,3}):\W+(\w+)"
)
motifs = re.compile(RESULT_PATTERN, re.M)
results = re.findall(motifs, response)
structure = []
names = ("database", "accession", "id", "description", "sequence", "start", "end", "match")
for result in results:
result = map(lambda x: x.replace("\n", ""), result)
structure.append(dict(zip(names, result)))
return structure


def expasySearchMotif(motif, database):
"""Search motif in remote Swiss-Prot database.

https://prosite.expasy.org/scanprosite/scanprosite_doc.html#rest_intro

Args:
motif (str): Motif in PROSITE format.
database (str): 'sp' (UniProtKB/Swiss-Prot) or 'tr' (UniProtKB/TrEMBL) or 'pdb' (PDB).

Returns:
str: Expasy response from motif search.
"""
headers = {"User-Agent": "Python pattern search agent", "Contact": "mkonstanty@gmail.com"}
url = "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi"
payload = {"sig": motif, "db": database}
try:
result = requests.get(url, params=payload, headers=headers)
except Exception as exception:
print(f"Remote search for motif {motif} in Swiss-Prot database failed: {exception}")
return []
else:
return _expasyTextToStructure(result.text)


def _argMotif(motif: str) -> str:
"""Create motif type for argparse.ArgumentParser.

Args:
value (str): motif to be validated

Returns:
str: valid Motif
"""
new_motif = str(motif).replace("-", "")
if not re.match(MOTIF_MATCHER, new_motif):
msg = f"{motif} is not a valid PROSITE MOTIF."
raise argparse.ArgumentTypeError(msg)
return new_motif


def _parseArgs():
"""Parse arguments from the command line.

Returns:
argparse.Namespace: parsed script arguments
"""
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"-m",
"--motif",
type=_argMotif,
required=True,
help="motif to search in a databases",
)
parser.add_argument(
"-d",
"--databases",
choices=DATABASES,
action="append",
required=True,
help="choose the databases to search from",
)
return parser.parse_args()


if __name__ == "__main__":
args = _parseArgs()
102 changes: 102 additions & 0 deletions prody/tests/sequence/test_motif.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
from prody import LOGGER, _expasyTextToStructure, expasySearchMotif
from prody.tests import TestCase

LOGGER.verbosity = None


EXPASY_SCAN_PROSITE_TOOL_RESPONSE = """<pre>include splice variants (Swiss-Prot)
Output format: Text

Hits for USERPAT1'P-x(2)-G-E-S-G(2)-[AS]' motif on UniProtKB/Swiss-Prot sequences:
UniProtKB/Swiss-Prot (Release 2022_01 of 23-Feb-22) contains 566,996 entries.

found: 11 hits in 11 sequences

Graphical view (graphical view with feature detection)
shaded alignment of hits

:
>USERPAT1 (user pattern) :
Pattern: P-x(2)-G-E-S-G(2)-[AS]
Approximate number of expected random matches in ~ 100'000 sequences (50'000'000 residues): 0.44


>sp|Q3SYW2|CO2_BOVIN (750 aa)
Complement C2 (EC 3.4.21.43) (C3/C5 convertase) [Cleaved into: Complement C2b fragment; Complement C2a fragment]. [Bos taurus (Bovine)]
MDPLMAVLCLLPLYPGLATAALSCPKNVNISGGSFTLSNGWNPGSILTYSCPLGHYPYPVVTRLCKSNGQWQIPRSTRST
KAICKPVRCPAPVSFENGVYIPRLGSHPVGGNLSFECEDGFTLRGSAVRQCRPNGMWDGETAVCDNGASHCPNPGISVGA
VRTGSRFGLGDKVRYRCSSNLVLTGSAERECQDDGVWSGTEAICRQPYSYDFPEDVAPALGTSFSHLLATTNPIQQKKKQ
NLGRKIQIQRSGHLNLYLLLDASQSVSKDDFEIFKDSASRMVDRIFSFEIKVSVAIITFASKPKIIMSVLEDRSRDVTEV
ENSLRNINYKDHENGTGTNIYEALHAVYIMMNNQMNRPHMNPGAWQEIRHAIILLTDGKSNMGGSPKVAVDNIKEVLNIN
QKRKDYLDIYAIGVGSLHVDWKELNNLGSKKDGERHAFILKDVQALSQVFEHMLDVSQLTDPICGVGNMSANASAQERTP
WHVTIKPKSQETCRGALISDQWVLTAAHCFRNAEDRTLWRVSVGDPNFQGSKEFQIEEAVISPGFNVFSKKSQGIPEFYG
DDIALLKLTQKVKMSTHARPICLPCTVGANLALRKLPGSTCRDHEKELLNQVSIPAHFVALNGDKLNINLKTGSEWTNCV
KVVLKDKTTFPNLTDVREVVTDQFLCSGTQGDDSPCKGESGGAVFLERRLRFFQVGLVSWGLYNPCGGSSKNSRKPAPHG
KVPRDFHINLFRLQPWLRQHLEGILNFVPL
675 - 683: PckGESGGA


>sp|Q863A0|CO2_GORGO (752 aa)
Complement C2 (EC 3.4.21.43) (C3/C5 convertase) [Cleaved into: Complement C2b fragment; Complement C2a fragment]. [Gorilla gorilla gorilla (Western lowland gorilla)]
MGPLMVLFCLLFVYTGLADSAPSCPQNVNISGGTFTLSHGWAPGSLLTYSCPQGLYPSPASRLCKSSGQWQTPGATRSLS
KAVCKPVRCPAPVSFENGIYTPRLGSYPVGGNVSFECEDGFILRGSPVRQCRPNGMWDGETAVCDNGAGHCPNPGISLGA
VRTGFRFGHGDKVRYRCSSNLVLTGSSERECQGNGVWSGTEPICRQPYSYDFPEDVAPALGTSFSHMLGATNPTQKTKES
LGRKIQIQRSGHLNLYLLLDCSQSVSENDFLIFKESASLMVDRIFSFEINVSVAIITFASKPKVLMSVLNDNSRDMTEVI
SSLENANYKDHENGTGTNTYAALNSVYLMMNNQMRILGMETMAWQEIRHAIILLTDGKSNMGGSPKTAVDRIREILNINQ
KRNDYLDIYAIGVGKLDVDWRELNELGSKKDGERHAFILQDTKALHQVFEHMLDVSKLTDTICGVGNMSANASDQERTPW
HVTIKPKSQETCRGALISDQWVLTAAHCFRDGNDHSLWRVNVGDPKSQWGKEFLIEKAVISPGFDVFAKKNQGILEFYGD
DIALLKLAQKVKMSTHARPICLPCTMEANLALRRPQGSTCRDHENELLNKQSVPAHFVALNGSKLNINLKMGVEWTSCAE
VVSQEKTMFPNLTDVREVVTDQFLCSGTQEDESPCKGESGGAVFLERRFRFFQVGLVSWGLYNPCLGSADKNSRKRAPRS
KVPPPRDFHINLFRMQPWLRQHLGDVLNFLPL
674 - 682: PckGESGGA</pre>"""

EXPASY_SCAN_PROSITE_TOOL_RESULT = [
{
"database": "sp",
"accession": "Q3SYW2",
"id": "CO2_BOVIN",
"description": "Complement C2 (EC 3.4.21.43) (C3/C5 convertase) [Cleaved into: Complement C2b fragment; Complement C2a fragment]. [Bos taurus (Bovine)]",
"sequence": "MDPLMAVLCLLPLYPGLATAALSCPKNVNISGGSFTLSNGWNPGSILTYSCPLGHYPYPVVTRLCKSNGQWQIPRSTRST"
"KAICKPVRCPAPVSFENGVYIPRLGSHPVGGNLSFECEDGFTLRGSAVRQCRPNGMWDGETAVCDNGASHCPNPGISVGA"
"VRTGSRFGLGDKVRYRCSSNLVLTGSAERECQDDGVWSGTEAICRQPYSYDFPEDVAPALGTSFSHLLATTNPIQQKKKQ"
"NLGRKIQIQRSGHLNLYLLLDASQSVSKDDFEIFKDSASRMVDRIFSFEIKVSVAIITFASKPKIIMSVLEDRSRDVTEV"
"ENSLRNINYKDHENGTGTNIYEALHAVYIMMNNQMNRPHMNPGAWQEIRHAIILLTDGKSNMGGSPKVAVDNIKEVLNIN"
"QKRKDYLDIYAIGVGSLHVDWKELNNLGSKKDGERHAFILKDVQALSQVFEHMLDVSQLTDPICGVGNMSANASAQERTP"
"WHVTIKPKSQETCRGALISDQWVLTAAHCFRNAEDRTLWRVSVGDPNFQGSKEFQIEEAVISPGFNVFSKKSQGIPEFYG"
"DDIALLKLTQKVKMSTHARPICLPCTVGANLALRKLPGSTCRDHEKELLNQVSIPAHFVALNGDKLNINLKTGSEWTNCV"
"KVVLKDKTTFPNLTDVREVVTDQFLCSGTQGDDSPCKGESGGAVFLERRLRFFQVGLVSWGLYNPCGGSSKNSRKPAPHG"
"KVPRDFHINLFRLQPWLRQHLEGILNFVPL",
"start": "675",
"end": "683",
"match": "PckGESGGA",
},
{
"database": "sp",
"accession": "Q863A0",
"id": "CO2_GORGO",
"description": "Complement C2 (EC 3.4.21.43) (C3/C5 convertase) [Cleaved into: Complement C2b fragment; Complement C2a fragment]. [Gorilla gorilla gorilla (Western lowland gorilla)]",
"sequence": "MGPLMVLFCLLFVYTGLADSAPSCPQNVNISGGTFTLSHGWAPGSLLTYSCPQGLYPSPASRLCKSSGQWQTPGATRSLS"
"KAVCKPVRCPAPVSFENGIYTPRLGSYPVGGNVSFECEDGFILRGSPVRQCRPNGMWDGETAVCDNGAGHCPNPGISLGA"
"VRTGFRFGHGDKVRYRCSSNLVLTGSSERECQGNGVWSGTEPICRQPYSYDFPEDVAPALGTSFSHMLGATNPTQKTKES"
"LGRKIQIQRSGHLNLYLLLDCSQSVSENDFLIFKESASLMVDRIFSFEINVSVAIITFASKPKVLMSVLNDNSRDMTEVI"
"SSLENANYKDHENGTGTNTYAALNSVYLMMNNQMRILGMETMAWQEIRHAIILLTDGKSNMGGSPKTAVDRIREILNINQ"
"KRNDYLDIYAIGVGKLDVDWRELNELGSKKDGERHAFILQDTKALHQVFEHMLDVSKLTDTICGVGNMSANASDQERTPW"
"HVTIKPKSQETCRGALISDQWVLTAAHCFRDGNDHSLWRVNVGDPKSQWGKEFLIEKAVISPGFDVFAKKNQGILEFYGD"
"DIALLKLAQKVKMSTHARPICLPCTMEANLALRRPQGSTCRDHENELLNKQSVPAHFVALNGSKLNINLKMGVEWTSCAE"
"VVSQEKTMFPNLTDVREVVTDQFLCSGTQEDESPCKGESGGAVFLERRFRFFQVGLVSWGLYNPCLGSADKNSRKRAPRS"
"KVPPPRDFHINLFRMQPWLRQHLGDVLNFLPL",
"start": "674",
"end": "682",
"match": "PckGESGGA",
},
]

EXPASY_ERROR = '<p>ERROR: Invalid characters in pattern "sdfsdf".</p>'


class TestMotif(TestCase):
def testExpasyTextToStructure(self):
self.assertEqual(_expasyTextToStructure(EXPASY_SCAN_PROSITE_TOOL_RESPONSE), EXPASY_SCAN_PROSITE_TOOL_RESULT)

def testEmptyExpasyStructure(self):
self.assertEqual(_expasyTextToStructure(EXPASY_ERROR), [])
26 changes: 26 additions & 0 deletions prody/utilities/motiftools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-
"""This module defines functions and constants related to MOTIF."""

__all__ = ["prositeToRegEx"]


def prositeToRegEx(pattern: str) -> str:
"""Change PROSITE Motif to python regular expression.

Args:
pattern (str): PROSITE Motif

Returns:
str: regular expression
"""
pattern = (
pattern.replace("{", "[^")
.replace("}", "]")
.replace("(", "{")
.replace(")", "}")
.replace("-", "")
.replace("x", ".")
.replace(">", "$")
.replace("<", "*")
)
return pattern