From 67ff8ecfe1a5995227ec692d6b5daaa9bd523916 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Sun, 24 Apr 2022 12:48:31 -0400 Subject: [PATCH 1/4] Start implementing getting ambiguities --- gilda/grounder.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/gilda/grounder.py b/gilda/grounder.py index 2b4cc01..bef5030 100644 --- a/gilda/grounder.py +++ b/gilda/grounder.py @@ -304,6 +304,12 @@ def get_names(self, db, id, status=None, source=None): names.add(entry.text) return sorted(names) + def get_ambiguities(self, use_exact_name=False, use_status=False): + def is_ambiguous(use_exact_name, use_status): + pass + + return {norm_text: terms for norm_text, terms in self.entries.items()} + class ScoredMatch(object): """Class representing a scored match to a grounding term. From f6ab45ce11d474d5c661f3f5939762e270b5a857 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Mon, 25 Apr 2022 16:36:08 -0400 Subject: [PATCH 2/4] Add function to get ambiguities --- gilda/grounder.py | 76 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 71 insertions(+), 5 deletions(-) diff --git a/gilda/grounder.py b/gilda/grounder.py index bef5030..d75b2d4 100644 --- a/gilda/grounder.py +++ b/gilda/grounder.py @@ -5,7 +5,7 @@ import logging import itertools from collections import defaultdict -from typing import Mapping, Set, Tuple +from typing import List, Mapping, Set, Tuple from adeft.disambiguate import load_disambiguator from adeft.modeling.classify import load_model_info from adeft import available_shortforms as available_adeft_models @@ -304,11 +304,77 @@ def get_names(self, db, id, status=None, source=None): names.add(entry.text) return sorted(names) - def get_ambiguities(self, use_exact_name=False, use_status=False): - def is_ambiguous(use_exact_name, use_status): - pass + def get_ambiguities(self, + skip_names: bool = True, + skip_curated: bool = True, + skip_name_matches: bool = True, + skip_species_ambigs: bool = True) -> List[List[Term]]: + """Return a list of 
ambiguous term groups in the grounder. - return {norm_text: terms for norm_text, terms in self.entries.items()} + Parameters + ---------- + skip_names : + If True, groups of terms where one has the "name" status are + skipped. This makes sense usually since these are prioritized over + synonyms anyway. + skip_curated : + If True, groups of terms where one has the "curated" status + are skipped. This makes sense usually since these are prioritized + over synonyms anyway. + skip_name_matches : + If True, groups of terms that all share the same standard name + are skipped. This is effective at eliminating spurious ambiguities + due to unresolved cross-references between equivalent terms + in different namespaces. + skip_species_ambigs : + If True, groups of terms that are all genes or proteins, and are + all from different species (one term from each species) are skipped. + This is effective at eliminating ambiguities between orthologous + genes in different species that are usually resolved using the + organism priority list. 
+ """ + ambig_entries = defaultdict(list) + for terms in self.entries.values(): + for term in terms: + # We consider it an ambiguity if the same text entry appears + # multiple times + key = term.text + ambig_entries[key].append(term) + + # It's only an ambiguity if there are two entries at least + ambig_entries = {k: v for k, v in ambig_entries.items() + if len(v) >= 2} + + ambigs = [] + for text, entries in ambig_entries.items(): + dbs = {e.db for e in entries} + db_ids = {(e.db, e.id) for e in entries} + statuses = {e.status for e in entries} + sources = {e.source for e in entries} + names = {e.entry_name for e in entries} + # If the entries all point to the same ID, we skip it + if len(db_ids) <= 1: + continue + # If there is a name in statuses, we skip it because it's + # prioritized + if skip_names and 'name' in statuses: + continue + # We skip curated terms because they are prioritized anyway + if skip_curated and 'curated' in statuses: + continue + # If there is an adeft model already, we skip it + if 'adeft' in sources: + continue + if skip_name_matches: + if len({e.entry_name.lower() for e in entries}) == 1: + continue + if skip_species_ambigs: + if dbs <= {'HGNC', 'UP'} and \ + len({e.organism for e in entries}) == len(entries): + continue + # Everything else is an ambiguity + ambigs.append(entries) + return ambigs class ScoredMatch(object): From b909c1a299f7453f1441d033d974477e725bd9cd Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Mon, 25 Apr 2022 19:19:27 -0400 Subject: [PATCH 3/4] Implement more ways to make a grounder --- gilda/api.py | 43 +++++++++++++++++++++++++++++++++++++++++-- gilda/grounder.py | 18 ++++++++++++------ 2 files changed, 53 insertions(+), 8 deletions(-) diff --git a/gilda/api.py b/gilda/api.py index 0251f37..5ca2344 100644 --- a/gilda/api.py +++ b/gilda/api.py @@ -1,6 +1,8 @@ -__all__ = ['ground', 'get_models', 'get_names'] +__all__ = ['ground', 'get_models', 'get_names', 'get_grounder', 'make_grounder'] -from gilda.grounder 
import Grounder +from typing import List, Mapping, Union, Optional + +from gilda.grounder import Grounder, Term class GrounderInstance(object): @@ -85,3 +87,40 @@ def get_names(db, id, status=None, source=None): are returned. """ return grounder.get_names(db, id, status=status, source=source) + + +def get_grounder() -> Grounder: + """Initialize and return the default Grounder instance. + + Returns + ------- + : + A Grounder instance whose attributes and methods can be used + directly. + """ + return grounder.get_grounder() + + +def make_grounder( + terms: Union[str, List[Term], Mapping[str, List[Term]]]) -> Grounder: + """Create a custom grounder from a given set of grounding terms. + + Parameters + ---------- + terms : + Specifies the grounding terms that should be loaded in the Grounder. + If str, it is interpreted as a path to a grounding + terms gzipped TSV file which is then loaded. If list, it is assumed to + be a flat list of Terms. If dict, it is assumed to be a grounding terms + dict with normalized entity strings as keys and lists of Term objects + as values. + This argument is required. + + Returns + ------- + : + A Grounder instance, initialized with the custom set of terms + specified in the terms argument rather than the default terms + loaded from the resource file. + """ + return Grounder(terms=terms) diff --git a/gilda/grounder.py b/gilda/grounder.py index d75b2d4..9ac7b0e 100644 --- a/gilda/grounder.py +++ b/gilda/grounder.py @@ -25,13 +25,14 @@ class Grounder(object): Parameters ---------- - terms : str or dict or None + terms : str or dict or list or None Specifies the grounding terms that should be loaded in the Grounder. If None, the default grounding terms are loaded from the versioned resource folder. If str, it is interpreted as a path to a grounding - terms TSV file which is then loaded. If dict, it is assumed to be - a grounding terms dict with normalized entity strings as keys - and Term objects as values. 
Default: None + terms gzipped TSV file which is then loaded. If list, it is assumed to + be a flat list of Terms. If dict, it is assumed to be a grounding terms + dict with normalized entity strings as keys and lists of Term objects + as values. Default: None """ def __init__(self, terms=None): if terms is None: @@ -39,11 +40,16 @@ def __init__(self, terms=None): if isinstance(terms, str): self.entries = load_terms_file(terms) + elif isinstance(terms, list): + self.entries = defaultdict(list) + for term in terms: + self.entries[term.norm_text].append(term) + self.entries = dict(self.entries) elif isinstance(terms, dict): self.entries = terms else: - raise TypeError('terms is neither a path nor a normalized' - ' entry name to term dictionary') + raise TypeError('terms is neither a path nor a list of terms,' + 'nor a normalized entry name to term dictionary') self.adeft_disambiguators = load_adeft_models() self.gilda_disambiguators = load_gilda_models() From ff555544807506853f7e3d8b99b4a9d1ab6b6166 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Mon, 25 Apr 2022 19:50:19 -0400 Subject: [PATCH 4/4] Test making custom grounder --- gilda/tests/test_api.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/gilda/tests/test_api.py b/gilda/tests/test_api.py index 5676a98..0d9caea 100644 --- a/gilda/tests/test_api.py +++ b/gilda/tests/test_api.py @@ -1,5 +1,6 @@ from gilda.tests import appreq from gilda.api import * +from gilda.term import Term def test_api_ground(): @@ -48,3 +49,12 @@ def test_organisms(): assert len(matches5) == 1, matches5 assert matches5[0].term.db == 'HGNC', matches5 assert matches5[0].term.id == '11117', matches5 + + +def test_make_grounder(): + grounder = make_grounder([ + Term('a', 'A', 'X', '1', 'A', 'name', 'test'), + Term('b', 'B', 'X', '2', 'B', 'name', 'test') + ]) + assert grounder.ground('a') + assert not grounder.ground('x')