gyorilab · bgyori · Apr 26, 2022 · Apr 24, 2022 · Apr 25, 2022 · Apr 25, 2022
diff --git a/gilda/api.py b/gilda/api.py
@@ -1,6 +1,8 @@
-__all__ = ['ground', 'get_models', 'get_names']
+__all__ = ['ground', 'get_models', 'get_names', 'get_grounder', 'make_grounder']
 
-from gilda.grounder import Grounder
+from typing import List, Mapping, Union, Optional
+
+from gilda.grounder import Grounder, Term
 
 
 class GrounderInstance(object):
@@ -85,3 +87,40 @@ def get_names(db, id, status=None, source=None):
         are returned.
     """
     return grounder.get_names(db, id, status=status, source=source)
+
+
+def get_grounder() -> Grounder:
+    """Initialize and return the default Grounder instance.
+
+    Returns
+    -------
+    :
+        The default Grounder; its attributes and methods can be used directly.
+    """
+    default_grounder = grounder.get_grounder()
+    return default_grounder
+
+
+def make_grounder(
+        terms: Union[str, List[Term], Mapping[str, List[Term]]]) -> Grounder:
+    """Create a custom grounder from a given set of grounding terms.
+
+    Parameters
+    ----------
+    terms :
+        Specifies the grounding terms that should be loaded in the Grounder.
+        If str, it is interpreted as a path to a grounding
+        terms gzipped TSV file which is then loaded. If list, it is assumed to
+        be a flat list of Terms. If dict, it is assumed to be a grounding terms
+        dict with normalized entity strings as keys and lists of Term objects
+        as values.
+        This argument has no default and must be given explicitly.
+
+    Returns
+    -------
+    :
+        A new Grounder instance initialized with the given terms, which
+        are passed to the Grounder constructor unchanged. For the default
+        Grounder instance, use get_grounder() instead.
+    """
+    return Grounder(terms=terms)
diff --git a/gilda/grounder.py b/gilda/grounder.py
@@ -5,7 +5,7 @@
 import logging
 import itertools
 from collections import defaultdict
-from typing import Mapping, Set, Tuple
+from typing import List, Mapping, Set, Tuple
 from adeft.disambiguate import load_disambiguator
 from adeft.modeling.classify import load_model_info
 from adeft import available_shortforms as available_adeft_models
@@ -25,25 +25,31 @@ class Grounder(object):
 
     Parameters
     ----------
-    terms : str or dict or None
+    terms : str or dict or list or None
         Specifies the grounding terms that should be loaded in the Grounder.
         If None, the default grounding terms are loaded from the versioned
         resource folder. If str, it is interpreted as a path to a grounding
-        terms TSV file which is then loaded. If dict, it is assumed to be
-        a grounding terms dict with normalized entity strings as keys
-        and Term objects as values. Default: None
+        terms gzipped TSV file which is then loaded. If list, it is assumed to
+        be a flat list of Terms. If dict, it is assumed to be a grounding terms
+        dict with normalized entity strings as keys and lists of Term objects
+        as values. Default: None
     """
     def __init__(self, terms=None):
         if terms is None:
             terms = get_grounding_terms()
 
         if isinstance(terms, str):
             self.entries = load_terms_file(terms)
+        elif isinstance(terms, list):
+            # Index the flat list of Terms by their normalized text
+            self.entries = {}
+            for term in terms:
+                self.entries.setdefault(term.norm_text, []).append(term)
         elif isinstance(terms, dict):
             self.entries = terms
         else:
-            raise TypeError('terms is neither a path nor a normalized'
-                            ' entry name to term dictionary')
+            raise TypeError('terms is neither a path nor a list of terms,'
+                            ' nor a normalized entry name to term dictionary')
 
         self.adeft_disambiguators = load_adeft_models()
         self.gilda_disambiguators = load_gilda_models()
@@ -304,6 +310,78 @@ def get_names(self, db, id, status=None, source=None):
                     names.add(entry.text)
         return sorted(names)
 
+    def get_ambiguities(self,
+                        skip_names: bool = True,
+                        skip_curated: bool = True,
+                        skip_name_matches: bool = True,
+                        skip_species_ambigs: bool = True) -> List[List[Term]]:
+        """Return a list of ambiguous term groups in the grounder.
+
+        A group consists of terms sharing the same text but not the same ID.
+
+        Parameters
+        ----------
+        skip_names :
+            If True, groups of terms where one has the "name" status are
+            skipped. This makes sense usually since these are prioritized over
+            synonyms anyway.
+        skip_curated :
+            If True, groups of terms where one has the "curated" status
+            are skipped. This makes sense usually since these are prioritized
+            over synonyms anyway.
+        skip_name_matches :
+            If True, groups of terms that all share the same standard name
+            are skipped. This is effective at eliminating spurious ambiguities
+            due to unresolved cross-references between equivalent terms
+            in different namespaces.
+        skip_species_ambigs :
+            If True, groups of terms that are all genes or proteins, and are
+            all from different species (one term from each species) are skipped.
+            This is effective at eliminating ambiguities between orthologous
+            genes in different species that are usually resolved using the
+            organism priority list.
+
+        Returns
+        -------
+        :
+            A list of ambiguous term groups, each given as a list of Terms.
+            Groups containing a term from the "adeft" source are left out.
+        """
+        # Group terms by text: we consider it an ambiguity if the same
+        # text entry appears multiple times
+        by_text = defaultdict(list)
+        for terms in self.entries.values():
+            for term in terms:
+                by_text[term.text].append(term)
+
+        ambigs = []
+        for entries in by_text.values():
+            # It's only an ambiguity if there are two entries at least
+            # and they do not all point to the same ID
+            if len(entries) < 2 or \
+                    len({(e.db, e.id) for e in entries}) <= 1:
+                continue
+            statuses = {e.status for e in entries}
+            # If there is a name in statuses, we skip it: it's prioritized
+            if skip_names and 'name' in statuses:
+                continue
+            # We skip curated terms because they are prioritized anyway
+            if skip_curated and 'curated' in statuses:
+                continue
+            # If there is an adeft model already, we skip it
+            if any(e.source == 'adeft' for e in entries):
+                continue
+            if skip_name_matches and \
+                    len({e.entry_name.lower() for e in entries}) == 1:
+                continue
+            if skip_species_ambigs and \
+                    {e.db for e in entries} <= {'HGNC', 'UP'} and \
+                    len({e.organism for e in entries}) == len(entries):
+                continue
+            # Everything else is an ambiguity
+            ambigs.append(entries)
+        return ambigs
+
 
 class ScoredMatch(object):
     """Class representing a scored match to a grounding term.

diff --git a/gilda/tests/test_api.py b/gilda/tests/test_api.py
@@ -1,5 +1,6 @@
 from gilda.tests import appreq
 from gilda.api import *
+from gilda.term import Term
 
 
 def test_api_ground():
@@ -48,3 +49,12 @@ def test_organisms():
     assert len(matches5) == 1, matches5
     assert matches5[0].term.db == 'HGNC', matches5
     assert matches5[0].term.id == '11117', matches5
+
+
+def test_make_grounder():
+    """Check that a Grounder built from a flat list of Terms can ground."""
+    custom = make_grounder([Term('a', 'A', 'X', '1', 'A', 'name', 'test'),
+                            Term('b', 'B', 'X', '2', 'B', 'name', 'test')])
+    # Only text present in the custom terms should yield matches
+    assert custom.ground('a')
+    assert not custom.ground('x')