ELELAB · elenapapaleo · Apr 22, 2024 · Apr 18, 2024 · Apr 18, 2024 · Apr 18, 2024
diff --git a/cancermuts/datasources.py b/cancermuts/datasources.py
@@ -51,6 +51,7 @@
 from bravado.client import SwaggerClient
 from bravado.requests_client import RequestsClient
 from requests.adapters import HTTPAdapter
+import gget
 
 
 import sys
@@ -1215,6 +1216,87 @@ def add_sequence_properties(self, sequence, exclude_elm_classes=r'{.*}', use_ali
             property_obj.metadata['ref']      = self.description
             sequence.add_property(property_obj)
 
+class ggetELMPredictions(StaticSource, object):
+    @logger_init
+    def __init__(self):
+        """
+        Data source for ELM which uses the gget Python package, rather than
+        interrogating the ELM webserver
+        """
+
+        description = "ELM Prediction with gget"
+        super(ggetELMPredictions, self).__init__(name='ggetELM', version='1.0', description=description)
+
+    def _get_prediction(self, sequence):
+        """
+        Gets predicted SLIMs using regexp mode only, using the gget Python
+        package
+
+        Parameters
+        ----------
+        sequence : :obj:`str`
+            Protein sequence, as a single string
+
+        Returns
+        ----------
+        slims : :obj:`pandas.DataFrame`
+            data frame containing SLIM definitions in form of:
+              - ELM identifier
+              - Name
+              - Description
+              - Start position
+              - End position
+        """
+        # On FileNotFoundError run gget.setup('elm') and retry once (presumably the local ELM data was missing)
+        try:
+            ortho_slims, regex_slims = gget.elm(sequence, uniprot=False)
+        except FileNotFoundError:
+            gget.setup('elm')
+            ortho_slims, regex_slims = gget.elm(sequence, uniprot=False)
+        # Only the regex-based matches are returned; ortho_slims is discarded
+        return regex_slims[['ELMIdentifier',
+                            'FunctionalSiteName',
+                            'Description',
+                            'motif_start_in_query',
+                            'motif_end_in_query']].drop_duplicates()
+
+    def add_sequence_properties(self, sequence, exclude_elm_classes=r'{.*}'):
+        """
+        Adds sequence properties to a sequence object
+
+        Parameters
+        ----------
+        sequence : :obj:`cancermuts.core.Sequence`
+            Sequence object with the protein to be annotated
+
+        exclude_elm_classes : :obj:`str`
+            Regular expression matching ELM classes to be excluded from the output
+        """
+
+        self.log.info("adding gget ELM predictions to sequence ...")
+
+        data = self._get_prediction(sequence.sequence)
+
+        for _, r in data.iterrows():
+            # re.match anchors at the start of the identifier; the default presumably excludes nothing - TODO confirm
+            if re.match(exclude_elm_classes, r['ELMIdentifier']):
+                self.log.info("%s was filtered out as requested" % r['ELMIdentifier'])
+                continue
+            # motif_end_in_query is treated as inclusive, hence the +1 on the range stop
+            this_positions = []
+            for p in range(r['motif_start_in_query'], r['motif_end_in_query']+1):
+                this_positions.append(sequence.positions[sequence.seq2index(p)])
+
+            property_obj = sequence_properties_classes['linear_motif']  (sources=[self],
+                                                                         positions=this_positions,
+                                                                         name=r['FunctionalSiteName'],
+                                                                         id=r['ELMIdentifier'])
+            # 'function' is stored as a one-element list; 'ref' is this data source's description string
+            property_obj.metadata['function'] = [r['Description']]
+            property_obj.metadata['ref']      = self.description
+            sequence.add_property(property_obj)
+
+
 class gnomAD(DynamicSource, object):
 
     description = "gnomAD"

diff --git a/docs/tutorial.md b/docs/tutorial.md
@@ -617,6 +617,23 @@ and their specifics:
  <SequencePosition, residue T at position 12>]
 ```
 
+Alternatively, and in a very similar fashion, it is possible to use the [gget Python
+package](https://github.com/pachterlab/gget) to obtain short linear motif definitions.
+The current implementation only considers "regexp"-type predictions computed from the
+full protein sequence. It should be noted that, unlike when using the ELM webserver
+as detailed above to obtain these data, no filtering is applied. For instance:
+
+```py
+>>> elm = ggetELMPredictions()
+>>> elm.add_sequence_properties(seq,
+	                            exclude_elm_classes="MOD_.")
+```
+
+This can be useful when, e.g., the ELM webserver is not available or when
+many calls in a row are necessary. The ELM webserver requires a minimum 3-minute
+interval between queries, which has been baked into the current implementation of
+the ELM data source.
+
 ### Custom annotations
 
 We can further add annotations manually to our dataset. This is for data that

diff --git a/setup.py b/setup.py
@@ -9,8 +9,9 @@
       packages=['cancermuts'],
       install_requires=['requests',
                         'bioservices>=1.10.0',
-      		        'myvariant',
-      		        'pyliftover',
+                        'gget',
+                        'myvariant',
+                        'pyliftover',
                         'Bio',
                         'bravado',
                         'matplotlib',