Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extend grounder API #91

Merged
merged 4 commits into from
Apr 26, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 41 additions & 2 deletions gilda/api.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
__all__ = ['ground', 'get_models', 'get_names']
__all__ = ['ground', 'get_models', 'get_names', 'get_grounder', 'make_grounder']

from gilda.grounder import Grounder
from typing import List, Mapping, Union, Optional

from gilda.grounder import Grounder, Term


class GrounderInstance(object):
Expand Down Expand Up @@ -85,3 +87,40 @@ def get_names(db, id, status=None, source=None):
are returned.
"""
return grounder.get_names(db, id, status=status, source=source)


def get_grounder() -> Grounder:
    """Obtain the default Grounder via the module-level ``grounder`` object.

    Returns
    -------
    :
        The Grounder instance handed back by ``grounder.get_grounder()``;
        its attributes and methods can be used directly.
    """
    default_grounder = grounder.get_grounder()
    return default_grounder


def make_grounder(
        terms: Union[str, List[Term], Mapping[str, List[Term]]]) -> Grounder:
    """Build a custom Grounder from user-supplied grounding terms.

    Parameters
    ----------
    terms :
        Specifies the grounding terms that should be loaded in the Grounder.
        If str, it is interpreted as a path to a grounding terms gzipped
        TSV file which is then loaded. If list, it is assumed to be a flat
        list of Terms. If dict, it is assumed to be a grounding terms dict
        with normalized entity strings as keys and lists of Term objects
        as values.

    Returns
    -------
    :
        A Grounder instance constructed from the given terms.
    """
    custom_grounder = Grounder(terms=terms)
    return custom_grounder
92 changes: 85 additions & 7 deletions gilda/grounder.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import logging
import itertools
from collections import defaultdict
from typing import Mapping, Set, Tuple
from typing import List, Mapping, Set, Tuple
from adeft.disambiguate import load_disambiguator
from adeft.modeling.classify import load_model_info
from adeft import available_shortforms as available_adeft_models
Expand All @@ -25,25 +25,31 @@ class Grounder(object):

Parameters
----------
terms : str or dict or None
terms : str or dict or list or None
Specifies the grounding terms that should be loaded in the Grounder.
If None, the default grounding terms are loaded from the versioned
resource folder. If str, it is interpreted as a path to a grounding
terms TSV file which is then loaded. If dict, it is assumed to be
a grounding terms dict with normalized entity strings as keys
and Term objects as values. Default: None
terms gzipped TSV file which is then loaded. If list, it is assumed to
be a flat list of Terms. If dict, it is assumed to be a grounding terms
dict with normalized entity strings as keys and lists of Term objects as values.
Default: None
"""
def __init__(self, terms=None):
if terms is None:
terms = get_grounding_terms()

if isinstance(terms, str):
self.entries = load_terms_file(terms)
elif isinstance(terms, list):
self.entries = defaultdict(list)
for term in terms:
self.entries[term.norm_text].append(term)
self.entries = dict(self.entries)
elif isinstance(terms, dict):
self.entries = terms
else:
raise TypeError('terms is neither a path nor a normalized'
' entry name to term dictionary')
raise TypeError('terms is neither a path nor a list of terms,'
'nor a normalized entry name to term dictionary')

self.adeft_disambiguators = load_adeft_models()
self.gilda_disambiguators = load_gilda_models()
Expand Down Expand Up @@ -304,6 +310,78 @@ def get_names(self, db, id, status=None, source=None):
names.add(entry.text)
return sorted(names)

def get_ambiguities(self,
                    skip_names: bool = True,
                    skip_curated: bool = True,
                    skip_name_matches: bool = True,
                    skip_species_ambigs: bool = True) -> List[List[Term]]:
    """Return a list of ambiguous term groups in the grounder.

    Terms are grouped by their exact ``text``. A group is reported as
    an ambiguity when it contains at least two terms that point to more
    than one distinct (db, id) pair and it is not excluded by one of
    the filters below. Groups in which any term has the "adeft" source
    are always excluded, since an Adeft model already covers them.

    Parameters
    ----------
    skip_names :
        If True, groups of terms where one has the "name" status are
        skipped. This makes sense usually since these are prioritized over
        synonyms anyway.
    skip_curated :
        If True, groups of terms where one has the "curated" status
        are skipped. This makes sense usually since these are prioritized
        over synonyms anyway.
    skip_name_matches :
        If True, groups of terms that all share the same standard name
        (compared case-insensitively) are skipped. This is effective at
        eliminating spurious ambiguities due to unresolved
        cross-references between equivalent terms in different namespaces.
    skip_species_ambigs :
        If True, groups of terms that are all genes or proteins (i.e.,
        all in the HGNC or UP namespaces), and are all from different
        species (one term from each species) are skipped. This is
        effective at eliminating ambiguities between orthologous genes in
        different species that are usually resolved using the organism
        priority list.

    Returns
    -------
    :
        A list of term groups, each group being the list of Terms that
        share the same text and were not filtered out.
    """
    # We consider it an ambiguity if the same text entry appears
    # multiple times, so group all terms by their text
    terms_by_text = defaultdict(list)
    for terms in self.entries.values():
        for term in terms:
            terms_by_text[term.text].append(term)

    ambigs = []
    for entries in terms_by_text.values():
        # It's only an ambiguity if there are two entries at least
        if len(entries) < 2:
            continue
        # If the entries all point to the same ID, we skip it
        if len({(e.db, e.id) for e in entries}) <= 1:
            continue
        statuses = {e.status for e in entries}
        # If there is a name in statuses, we skip it because it's
        # prioritized
        if skip_names and 'name' in statuses:
            continue
        # We skip curated terms because they are prioritized anyway
        if skip_curated and 'curated' in statuses:
            continue
        # If there is an adeft model already, we skip it
        if any(e.source == 'adeft' for e in entries):
            continue
        # All entries share the same standard name, ignoring case
        if skip_name_matches and \
                len({e.entry_name.lower() for e in entries}) == 1:
            continue
        # Only genes/proteins, with exactly one entry per organism
        if skip_species_ambigs and \
                {e.db for e in entries} <= {'HGNC', 'UP'} and \
                len({e.organism for e in entries}) == len(entries):
            continue
        # Everything else is an ambiguity
        ambigs.append(entries)
    return ambigs


class ScoredMatch(object):
"""Class representing a scored match to a grounding term.
Expand Down
10 changes: 10 additions & 0 deletions gilda/tests/test_api.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from gilda.tests import appreq
from gilda.api import *
from gilda.term import Term


def test_api_ground():
Expand Down Expand Up @@ -48,3 +49,12 @@ def test_organisms():
assert len(matches5) == 1, matches5
assert matches5[0].term.db == 'HGNC', matches5
assert matches5[0].term.id == '11117', matches5


def test_make_grounder():
    # A grounder built from a flat list of Terms should ground text that
    # matches one of those terms and return nothing for unknown text
    custom_terms = [
        Term('a', 'A', 'X', '1', 'A', 'name', 'test'),
        Term('b', 'B', 'X', '2', 'B', 'name', 'test'),
    ]
    custom_grounder = make_grounder(custom_terms)
    assert custom_grounder.ground('a')
    assert not custom_grounder.ground('x')