bcgsc · dustinbleile · Dec 8, 2022 · Oct 19, 2022 · Oct 19, 2022 · Oct 19, 2022
diff --git a/graphkb/constants.py b/graphkb/constants.py
@@ -3,12 +3,12 @@
 from .types import CategoryBaseTermMapping
 
 DEFAULT_LIMIT = 1000
-
 GKB_BASE_URL = "https://graphkb-api.bcgsc.ca/api"
 GKB_STAGING_URL = "https://graphkbstaging-api.bcgsc.ca/api"
 GKB_DEV_URL = "https://graphkbdev-api.bcgsc.ca/api"
 DEFAULT_URL = GKB_BASE_URL
 
+PREFERRED_GENE_SOURCE = "#39:5"  # HGNC
 
 BASE_RETURN_PROPERTIES = ['@rid', '@class']
 
@@ -61,8 +61,10 @@
 ONCOKB_SOURCE_NAME = 'oncokb'
 ONCOGENE = 'oncogenic'
 TUMOUR_SUPPRESSIVE = 'tumour suppressive'
-
 FUSION_NAMES = ['structural variant', 'fusion']
+
+PHARMACOGENOMIC_SOURCE_EXCLUDE_LIST = ["cancer genome interpreter", "civic"]
+
 BASE_THERAPEUTIC_TERMS = ['therapeutic efficacy', 'eligibility']
 # the order here is the order these are applied, the first category matched is returned
 RELEVANCE_BASE_TERMS: CategoryBaseTermMapping = [
@@ -74,6 +76,10 @@
     ('biological', ['functional effect', 'tumourigenesis', 'predisposing']),
 ]
 
+CHROMOSOMES_HG38 = [f"chr{i}" for i in range(1, 23)] + ['chrX', 'chrY', 'chrM']  # 'chr'-prefixed: chr1..chr22, chrX, chrY, chrM
+CHROMOSOMES_HG19 = [str(i) for i in range(1, 23)] + ['x', 'y', 'mt']  # bare names: 1..22 plus lowercase x, y, mt
+CHROMOSOMES = CHROMOSOMES_HG38 + CHROMOSOMES_HG19  # both naming styles; NOTE: casing differs between the two lists
+
 AMBIGUOUS_AA = ['x', '?', 'X']
 AA_3to1_MAPPING = {
     'Ala': 'A',

diff --git a/graphkb/genes.py b/graphkb/genes.py
@@ -1,17 +1,22 @@
 """
 Methods for retrieving gene annotation lists from GraphKB
 """
-from typing import Any, Dict, List, cast
+from typing import Any, Dict, List, Tuple, cast
 
 from . import GraphKBConnection
 from .constants import (
     BASE_THERAPEUTIC_TERMS,
+    CHROMOSOMES,
     GENE_RETURN_PROPERTIES,
     ONCOGENE,
     ONCOKB_SOURCE_NAME,
+    PHARMACOGENOMIC_SOURCE_EXCLUDE_LIST,
+    PREFERRED_GENE_SOURCE,
     TUMOUR_SUPPRESSIVE,
 )
+from .match import get_equivalent_features
 from .types import Ontology, Statement, Variant
+from .util import get_rid, logger
 from .vocab import get_terms_set
 
 
@@ -45,8 +50,7 @@ def _get_oncokb_gene_list(
 
 
 def get_oncokb_oncogenes(conn: GraphKBConnection) -> List[Ontology]:
-    """
-    Gets the list of oncogenes stored in GraphKB derived from OncoKB
+    """Gets the list of oncogenes stored in GraphKB derived from OncoKB.
 
     Args:
         conn: the graphkb connection object
@@ -58,8 +62,7 @@ def get_oncokb_oncogenes(conn: GraphKBConnection) -> List[Ontology]:
 
 
 def get_oncokb_tumour_supressors(conn: GraphKBConnection) -> List[Ontology]:
-    """
-    Gets the list of tumour supressor genes stored in GraphKB derived from OncoKB
+    """Gets the list of tumour suppressor genes stored in GraphKB derived from OncoKB.
 
     Args:
         conn: the graphkb connection object
@@ -161,3 +164,201 @@ def get_genes_from_variant_types(
         ),
     )
     return result
+
+
+def get_preferred_gene_name(
+    conn: GraphKBConnection, gene_name: str, source: str = PREFERRED_GENE_SOURCE
+) -> str:
+    """Return the preferred gene symbol (displayName) for a gene or transcript name.
+
+    Args:
+        conn: the graphkb connection object
+        gene_name: the gene name to search features by
+        source: id of the preferred gene symbol source
+    Returns:
+        preferred displayName symbol, or '' for a chromosome name or when no gene is found.
+
+    Example:
+        return KRAS for get_preferred_gene_name(conn, 'NM_033360')
+        return KRAS for get_preferred_gene_name(conn, 'ENSG00000133703.11')
+    """
+    # CHROMOSOMES mixes cases ('chrX' vs 'x', 'mt'), so compare case-insensitively
+    if gene_name.lower() in {chrom.lower() for chrom in CHROMOSOMES}:
+        logger.error(f"{gene_name} assumed to be a chromosome, not gene")
+        return ''
+    eq = get_equivalent_features(conn=conn, gene_name=gene_name)
+    genes = [m for m in eq if m.get('biotype') == 'gene' and not m.get('deprecated')]
+    if not genes:
+        logger.error(f"No genes found for: {gene_name}")
+        return ''
+    if source:
+        source_filtered_genes = [m for m in genes if m.get('source') == source]
+        if not source_filtered_genes:  # keep the unfiltered list as a fallback
+            logger.error(f"No data from source {source} for {gene_name}")
+        else:
+            genes = source_filtered_genes
+    gene_names = [g['displayName'] for g in genes if g]
+    if len(gene_names) > 1:
+        logger.error(
+            f"Multiple gene names found for: {gene_name} - using {gene_names[0]}, ignoring {gene_names[1:]}"
+        )
+    return gene_names[0]
+
+
+def get_cancer_predisposition_info(conn: GraphKBConnection) -> Tuple[List[str], Dict[str, str]]:
+    """
+    Return cancer predisposition genes (list) and associated variants (dict) from GraphKB.
+
+    GERO-272 - criteria for what counts as a "cancer predisposition" variant
+
+    In short:
+    * Statement 'evidence' is the 'CGL' Source record (this is the field the query filters on)
+    * Statement 'relevance' is in the 'cancer predisposition' term set (e.g. 'pathogenic' - TODO confirm)
+    * gene is taken from any associated 'PositionalVariant' records
+
+    Example: https://graphkb.bcgsc.ca/view/Statement/155:11616
+
+    Returns:
+        genes: sorted list of cancer predisposition gene names
+        variants: dictionary mapping cancer predisposition variant IDs (@rid) to variant display names
+    """
+    genes = set()  # display names of references whose biotype is already 'gene'
+    non_genes = set()  # (name, biotype) pairs for which no preferred gene name was found
+    infer_genes = set()  # (gene, name, biotype) triples resolved via get_preferred_gene_name
+    variants = {}  # PositionalVariant @rid -> displayName
+
+    relevance_rids = list(get_terms_set(conn, "cancer predisposition"))
+
+    for record in conn.query(
+        {
+            "target": "Statement",
+            "filters": [
+                {
+                    "evidence": {
+                        "target": "Source",
+                        "filters": {"@rid": get_rid(conn, "Source", "CGL")},
+                    },
+                    "relevance": {
+                        "target": "Vocabulary",
+                        "filters": {"@rid": relevance_rids},
+                    },
+                }
+            ],
+            "returnProperties": [
+                "conditions.@class",
+                "conditions.@rid",
+                "conditions.displayName",
+                "conditions.reference1.biotype",
+                "conditions.reference1.displayName",
+                "conditions.reference2.biotype",
+                "conditions.reference2.displayName",
+            ],
+        },
+        ignore_cache=False,
+    ):
+        for condition in record["conditions"]:  # type: ignore
+            if condition["@class"] == "PositionalVariant":
+                variants[condition["@rid"]] = condition["displayName"]
+                for reference in ["reference1", "reference2"]:
+                    name = (condition.get(reference) or {}).get("displayName", "")
+                    biotype = (condition.get(reference) or {}).get("biotype", "")
+                    if name and biotype == "gene":
+                        genes.add(name)
+                    elif name:  # named reference that is not a gene: try to map it to one
+                        gene = get_preferred_gene_name(conn, name)
+                        if gene:
+                            infer_genes.add((gene, name, biotype))
+                        else:
+                            non_genes.add((name, biotype))
+                            logger.error(
+                                f"Non-gene cancer predisposition {biotype}: {name} for {condition['displayName']}"
+                            )
+
+    for gene, name, biotype in infer_genes:
+        logger.debug(f"Found gene '{gene}' for '{name}' ({biotype})")
+        genes.add(gene)
+
+    for name, biotype in non_genes:
+        logger.error(f"Unable to find gene for '{name}' ({biotype})")
+
+    return sorted(genes), variants
+
+
+def get_pharmacogenomic_info(conn: GraphKBConnection) -> Tuple[List[str], Dict[str, str]]:
+    """
+    Return pharmacogenomic genes (list) and associated variants (dict) from GraphKB.
+
+    SDEV-2733 - criteria for what counts as a "pharmacogenomic" variant
+
+    In short:
+    * Statement 'source' name is not in PHARMACOGENOMIC_SOURCE_EXCLUDE_LIST (CGI, CIViC)
+    * Statement 'relevance' is in the 'pharmacogenomic' term set (e.g. 'increased toxicity' or 'decreased toxicity' - TODO confirm)
+    * gene is taken from any associated 'PositionalVariant' records
+
+    Example: https://graphkb.bcgsc.ca/view/Statement/154:9574
+
+    Returns:
+        genes: sorted list of pharmacogenomic gene names
+        variants: dictionary mapping pharmacogenomic variant IDs (@rid) to variant display names
+    """
+    genes = set()  # display names of references whose biotype is already 'gene'
+    non_genes = set()  # (name, biotype) pairs for which no preferred gene name was found
+    infer_genes = set()  # (gene, name, biotype) triples resolved via get_preferred_gene_name
+    variants = {}  # PositionalVariant @rid -> displayName
+
+    relevance_rids = list(get_terms_set(conn, "pharmacogenomic"))
+
+    for record in conn.query(
+        {
+            "target": "Statement",
+            "filters": [
+                {
+                    "relevance": {
+                        "target": "Vocabulary",
+                        "filters": {"@rid": relevance_rids},
+                    },
+                }
+            ],
+            "returnProperties": [
+                "conditions.@class",
+                "conditions.@rid",
+                "conditions.displayName",
+                "conditions.reference1.biotype",
+                "conditions.reference1.displayName",
+                "conditions.reference2.biotype",
+                "conditions.reference2.displayName",
+                "source.name",
+            ],
+        },
+        ignore_cache=False,
+    ):
+        if record["source"]:  # type: ignore
+            if record["source"]["name"].lower() in PHARMACOGENOMIC_SOURCE_EXCLUDE_LIST:  # type: ignore
+                continue  # excluded source: skip every condition of this statement
+
+        for condition in record["conditions"]:  # type: ignore
+            if condition["@class"] == "PositionalVariant":
+                variants[condition["@rid"]] = condition["displayName"]
+                for reference in ["reference1", "reference2"]:
+                    name = (condition.get(reference) or {}).get("displayName", "")
+                    biotype = (condition.get(reference) or {}).get("biotype", "")
+                    if name and biotype == "gene":
+                        genes.add(name)
+                    elif name:  # named reference that is not a gene: try to map it to one
+                        gene = get_preferred_gene_name(conn, name)
+                        if gene:
+                            infer_genes.add((gene, name, biotype))
+                        else:
+                            non_genes.add((name, biotype))
+                            logger.error(
+                                f"Non-gene pharmacogenomic {biotype}: {name} for {condition['displayName']}"
+                            )
+
+    for gene, name, biotype in infer_genes:
+        logger.debug(f"Found gene '{gene}' for '{name}' ({biotype})")
+        genes.add(gene)
+
+    for name, biotype in non_genes:
+        logger.error(f"Unable to find gene for '{name}' ({biotype})")
+
+    return sorted(genes), variants
diff --git a/graphkb/match.py b/graphkb/match.py
@@ -12,7 +12,7 @@
     VARIANT_RETURN_PROPERTIES,
 )
 from .types import BasicPosition, Ontology, ParsedVariant, PositionalVariant, Record, Variant
-from .util import FeatureNotFoundError, convert_to_rid_list, looks_like_rid
+from .util import FeatureNotFoundError, convert_to_rid_list, logger, looks_like_rid
 from .vocab import get_term_tree
 
 FEATURES_CACHE: Set[str] = set()
@@ -26,8 +26,7 @@ def get_equivalent_features(
     source: str = '',
     source_id_version: str = '',
 ) -> List[Ontology]:
-    """
-    Match an equivalent list of features given some input feature name (or ID)
+    """Match an equivalent list of features given some input feature name (or ID).
 
     Args:
         gene_name: the gene name to search features by
@@ -62,14 +61,19 @@ def get_equivalent_features(
     if source:
         filters.append({'source': {'target': 'Source', 'filters': {'name': source}}})
 
+    if gene_name.count('.') == 1 and gene_name.split('.')[-1].isnumeric():
+        # eg. ENSG00000133703.11 or NM_033360.4
+        logger.debug(
+            f"Assuming {gene_name} has a .version_format - ignoring the version for equivalent features"
+        )
+        gene_name = gene_name.split('.')[0]
+
     if is_source_id or source_id_version:
         filters.append({'sourceId': gene_name})
-
         if source_id_version:
             filters.append(
                 {'OR': [{'sourceIdVersion': source_id_version}, {'sourceIdVersion': None}]}
             )
-
     elif FEATURES_CACHE and gene_name.lower() not in FEATURES_CACHE and not ignore_cache:
         return []
     else:

diff --git a/graphkb/util.py b/graphkb/util.py
@@ -248,3 +248,27 @@ def get_source(self, name: str) -> Record:
         if len(source) != 1:
             raise AssertionError(f'Unable to unqiuely identify source with name {name}')
         return source[0]
+
+
+def get_rid(conn: GraphKBConnection, target: str, name: str) -> str:
+    """
+    Retrieve the @rid of the single record matching a name and target
+
+    Args:
+        conn: GraphKBConnection
+        target: record type to query
+        name: the name of the record to retrieve
+
+    Returns:
+        str: @rid of the record
+
+    Raises:
+        AssertionError: if the term was not found or more than 1 match was found (expected to be unique)
+    """
+    result = conn.query(
+        {"target": target, "filters": {"name": name}, "returnProperties": ["@rid"]},
+        ignore_cache=False,
+    )
+    if len(result) != 1:  # explicit raise: a bare `assert` is stripped under `python -O`
+        raise AssertionError(f"unable to find unique '{target}' ID for '{name}'")
+    return result[0]["@rid"]
diff --git a/graphkb/vocab.py b/graphkb/vocab.py
@@ -187,10 +187,8 @@ def get_term_by_name(
 def get_terms_set(
     graphkb_conn: GraphKBConnection, base_terms: Iterable[str], ignore_cache: bool = False
 ) -> Set[str]:
-    """
-    Get a set of terms of vocabulary given some base/parent term names. Returns the record
-    IDs for the resulting terms
-    """
+    """Get a set of vocabulary rids given some base/parent term names."""
+    base_terms = [base_terms] if isinstance(base_terms, str) else base_terms
     cache_key = tuple(sorted(base_terms))
     if graphkb_conn.cache.get(cache_key, None) and not ignore_cache:
         return graphkb_conn.cache[cache_key]

diff --git a/setup.cfg b/setup.cfg
@@ -10,7 +10,7 @@ include_trailing_comma = true
 [metadata]
 name = graphkb
 url = https://github.com/bcgsc/pori_graphkb_python
-version = 1.7.0
+version = 1.8.0
 author_email = graphkb@bcgsc.ca
 description = python adapter for interacting with the GraphKB API
 long_description = file: README.md