Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/sdev 3256 gene list updates pharma cancer predisp #72

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions graphkb/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@
from .types import CategoryBaseTermMapping

DEFAULT_LIMIT = 1000

# GraphKB API roots; DEFAULT_URL aliases the base (non-staging, non-dev) server.
GKB_BASE_URL = "https://graphkb-api.bcgsc.ca/api"
GKB_STAGING_URL = "https://graphkbstaging-api.bcgsc.ca/api"
GKB_DEV_URL = "https://graphkbdev-api.bcgsc.ca/api"
DEFAULT_URL = GKB_BASE_URL

# Default `source` argument of get_preferred_gene_name, where it is compared
# for equality against each candidate feature's 'source' value.
# NOTE(review): this is a hard-coded record ID - presumably only valid for a
# specific GraphKB database instance; confirm before pointing at another server.
PREFERRED_GENE_SOURCE = "#39:5" # HGNC

# Minimal properties identifying a record: its ID and its class.
BASE_RETURN_PROPERTIES = ['@rid', '@class']

Expand Down Expand Up @@ -61,8 +61,10 @@
ONCOKB_SOURCE_NAME = 'oncokb'
ONCOGENE = 'oncogenic'
TUMOUR_SUPPRESSIVE = 'tumour suppressive'

FUSION_NAMES = ['structural variant', 'fusion']

# Source names whose statements get_pharmacogenomic_info skips. The statement's
# source name is lower-cased before the membership test, so entries here must
# be lower case.
PHARMACOGENOMIC_SOURCE_EXCLUDE_LIST = ["cancer genome interpreter", "civic"]

BASE_THERAPEUTIC_TERMS = ['therapeutic efficacy', 'eligibility']
# the order here is the order these are applied, the first category matched is returned
RELEVANCE_BASE_TERMS: CategoryBaseTermMapping = [
Expand All @@ -74,6 +76,10 @@
('biological', ['functional effect', 'tumourigenesis', 'predisposing']),
]

# 'chr'-prefixed names: chr1..chr22 followed by the sex and mitochondrial chromosomes.
CHROMOSOMES_HG38 = [*(f"chr{i}" for i in range(1, 23)), 'chrX', 'chrY', 'chrM']
# Unprefixed names: '1'..'22' followed by lower-case 'x', 'y' and 'mt'.
CHROMOSOMES_HG19 = [*map(str, range(1, 23)), 'x', 'y', 'mt']
# Every recognised chromosome name, HG38-style names first.
CHROMOSOMES = [*CHROMOSOMES_HG38, *CHROMOSOMES_HG19]

AMBIGUOUS_AA = ['x', '?', 'X']
AA_3to1_MAPPING = {
'Ala': 'A',
Expand Down
211 changes: 206 additions & 5 deletions graphkb/genes.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,22 @@
"""
Methods for retrieving gene annotation lists from GraphKB
"""
from typing import Any, Dict, List, cast
from typing import Any, Dict, List, Tuple, cast

from . import GraphKBConnection
from .constants import (
BASE_THERAPEUTIC_TERMS,
CHROMOSOMES,
GENE_RETURN_PROPERTIES,
ONCOGENE,
ONCOKB_SOURCE_NAME,
PHARMACOGENOMIC_SOURCE_EXCLUDE_LIST,
PREFERRED_GENE_SOURCE,
TUMOUR_SUPPRESSIVE,
)
from .match import get_equivalent_features
from .types import Ontology, Statement, Variant
from .util import get_rid, logger
from .vocab import get_terms_set


Expand Down Expand Up @@ -45,8 +50,7 @@ def _get_oncokb_gene_list(


def get_oncokb_oncogenes(conn: GraphKBConnection) -> List[Ontology]:
"""
Gets the list of oncogenes stored in GraphKB derived from OncoKB
"""Gets the list of oncogenes stored in GraphKB derived from OncoKB.

Args:
conn: the graphkb connection object
Expand All @@ -58,8 +62,7 @@ def get_oncokb_oncogenes(conn: GraphKBConnection) -> List[Ontology]:


def get_oncokb_tumour_supressors(conn: GraphKBConnection) -> List[Ontology]:
"""
Gets the list of tumour supressor genes stored in GraphKB derived from OncoKB
"""Gets the list of tumour supressor genes stored in GraphKB derived from OncoKB.

Args:
conn: the graphkb connection object
Expand Down Expand Up @@ -161,3 +164,201 @@ def get_genes_from_variant_types(
),
)
return result


def get_preferred_gene_name(
    conn: GraphKBConnection, gene_name: str, source: str = PREFERRED_GENE_SOURCE
) -> str:
    """Preferred gene symbol of a gene or transcript.

    Args:
        conn: the graphkb connection object
        gene_name: the gene name to search features by
        source: id of the preferred gene symbol source; a falsy value disables
            filtering by source
    Returns:
        preferred displayName symbol, or '' when gene_name is a chromosome name
        or no non-deprecated gene record is found.

    Example:
        return KRAS for get_preferred_gene_name(conn, 'NM_033360')
        return KRAS for get_preferred_gene_name(conn, 'ENSG00000133703.11')
    """
    # CHROMOSOMES mixes cases ('chrX' but 'x', 'mt'), so compare case-insensitively;
    # otherwise inputs such as 'X', 'MT' or 'chrx' would be searched as genes.
    chromosome_names = {chrom.lower() for chrom in CHROMOSOMES}
    if gene_name.lower() in chromosome_names:
        logger.error(f"{gene_name} assumed to be a chromosome, not gene")
        return ''

    eq = get_equivalent_features(conn=conn, gene_name=gene_name)
    genes = [m for m in eq if m.get('biotype') == 'gene' and not m.get('deprecated')]
    if not genes:
        logger.error(f"No genes found for: {gene_name}")
        return ''

    if source:
        source_filtered_genes = [m for m in genes if m.get('source') == source]
        if source_filtered_genes:
            genes = source_filtered_genes
        else:
            # best effort: fall back to genes from any source rather than failing
            logger.error(f"No data from source {source} for {gene_name}")

    gene_names = [g['displayName'] for g in genes]
    if len(gene_names) > 1:
        logger.error(
            f"Multiple gene names found for: {gene_name} - using {gene_names[0]}, ignoring {gene_names[1:]}"
        )
    return gene_names[0]


def get_cancer_predisposition_info(conn: GraphKBConnection) -> Tuple[List[str], Dict[str, str]]:
    """
    Return cancer predisposition genes and their associated variants from GraphKB.

    GERO-272 - criteria for what counts as a "cancer predisposition" variant

    In short:
    * Statement 'evidence' is the 'CGL' source
    * Statement 'relevance' is one of the vocabulary terms returned for the
      base term "cancer predisposition"
    * gene is gotten from any associated 'PositionalVariant' records

    Example: https://graphkb.bcgsc.ca/view/Statement/155:11616

    Args:
        conn: the graphkb connection object

    Returns:
        genes: sorted list of cancer predisposition gene names
        variants: dictionary mapping cancer predisposition variant IDs to variant display names
    """
    genes = set()
    non_genes = set()
    infer_genes = set()
    variants: Dict[str, str] = {}

    # get_terms_set is typed to take an iterable of base term names, so pass a
    # list rather than a bare string.
    relevance_rids = list(get_terms_set(conn, ["cancer predisposition"]))
    source_rid = get_rid(conn, "Source", "CGL")

    statements = conn.query(
        {
            "target": "Statement",
            "filters": [
                {
                    "evidence": {
                        "target": "Source",
                        "filters": {"@rid": source_rid},
                    },
                    "relevance": {
                        "target": "Vocabulary",
                        "filters": {"@rid": relevance_rids},
                    },
                }
            ],
            "returnProperties": [
                "conditions.@class",
                "conditions.@rid",
                "conditions.displayName",
                "conditions.reference1.biotype",
                "conditions.reference1.displayName",
                "conditions.reference2.biotype",
                "conditions.reference2.displayName",
            ],
        },
        ignore_cache=False,
    )

    for record in statements:
        for condition in record["conditions"]:  # type: ignore
            # only positional variants carry the gene references of interest
            if condition["@class"] != "PositionalVariant":
                continue
            variants[condition["@rid"]] = condition["displayName"]
            for reference in ["reference1", "reference2"]:
                # reference2 is commonly absent/null; treat that as an empty record
                feature = condition.get(reference) or {}
                name = feature.get("displayName", "")
                biotype = feature.get("biotype", "")
                if not name:
                    continue
                if biotype == "gene":
                    genes.add(name)
                    continue
                # non-gene feature (e.g. a transcript): try to resolve it to a gene
                gene = get_preferred_gene_name(conn, name)
                if gene:
                    infer_genes.add((gene, name, biotype))
                else:
                    non_genes.add((name, biotype))
                    logger.error(
                        f"Non-gene cancer predisposition {biotype}: {name} for {condition['displayName']}"
                    )

    for gene, name, biotype in infer_genes:
        logger.debug(f"Found gene '{gene}' for '{name}' ({biotype})")
        genes.add(gene)

    for name, biotype in non_genes:
        logger.error(f"Unable to find gene for '{name}' ({biotype})")

    return sorted(genes), variants


def get_pharmacogenomic_info(conn: GraphKBConnection) -> Tuple[List[str], Dict[str, str]]:
    """
    Return pharmacogenomic genes and their associated variants from GraphKB.

    SDEV-2733 - criteria for what counts as a "pharmacogenomic" variant

    In short:
    * Statement 'source' name is not in PHARMACOGENOMIC_SOURCE_EXCLUDE_LIST
      (compared in lower case)
    * Statement 'relevance' is one of the vocabulary terms returned for the
      base term "pharmacogenomic"
    * gene is gotten from any associated 'PositionalVariant' records

    Example: https://graphkb.bcgsc.ca/view/Statement/154:9574

    Args:
        conn: the graphkb connection object

    Returns:
        genes: sorted list of pharmacogenomic gene names
        variants: dictionary mapping pharmacogenomic variant IDs to variant display names
    """
    genes = set()
    non_genes = set()
    infer_genes = set()
    variants: Dict[str, str] = {}

    # get_terms_set is typed to take an iterable of base term names, so pass a
    # list rather than a bare string.
    relevance_rids = list(get_terms_set(conn, ["pharmacogenomic"]))

    statements = conn.query(
        {
            "target": "Statement",
            "filters": [
                {
                    "relevance": {
                        "target": "Vocabulary",
                        "filters": {"@rid": relevance_rids},
                    },
                }
            ],
            "returnProperties": [
                "conditions.@class",
                "conditions.@rid",
                "conditions.displayName",
                "conditions.reference1.biotype",
                "conditions.reference1.displayName",
                "conditions.reference2.biotype",
                "conditions.reference2.displayName",
                "source.name",
            ],
        },
        ignore_cache=False,
    )

    for record in statements:
        # A statement may have no source, or a source without a name; neither
        # can match the exclude list, so such statements are kept.
        source_name = ((record.get("source") or {}).get("name") or "").lower()  # type: ignore
        if source_name in PHARMACOGENOMIC_SOURCE_EXCLUDE_LIST:
            continue

        for condition in record["conditions"]:  # type: ignore
            # only positional variants carry the gene references of interest
            if condition["@class"] != "PositionalVariant":
                continue
            variants[condition["@rid"]] = condition["displayName"]
            for reference in ["reference1", "reference2"]:
                # reference2 is commonly absent/null; treat that as an empty record
                feature = condition.get(reference) or {}
                name = feature.get("displayName", "")
                biotype = feature.get("biotype", "")
                if not name:
                    continue
                if biotype == "gene":
                    genes.add(name)
                    continue
                # non-gene feature (e.g. a transcript): try to resolve it to a gene
                gene = get_preferred_gene_name(conn, name)
                if gene:
                    infer_genes.add((gene, name, biotype))
                else:
                    non_genes.add((name, biotype))
                    logger.error(
                        f"Non-gene pharmacogenomic {biotype}: {name} for {condition['displayName']}"
                    )

    for gene, name, biotype in infer_genes:
        logger.debug(f"Found gene '{gene}' for '{name}' ({biotype})")
        genes.add(gene)

    for name, biotype in non_genes:
        logger.error(f"Unable to find gene for '{name}' ({biotype})")

    return sorted(genes), variants
14 changes: 9 additions & 5 deletions graphkb/match.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
VARIANT_RETURN_PROPERTIES,
)
from .types import BasicPosition, Ontology, ParsedVariant, PositionalVariant, Record, Variant
from .util import FeatureNotFoundError, convert_to_rid_list, looks_like_rid
from .util import FeatureNotFoundError, convert_to_rid_list, logger, looks_like_rid
from .vocab import get_term_tree

FEATURES_CACHE: Set[str] = set()
Expand All @@ -26,8 +26,7 @@ def get_equivalent_features(
source: str = '',
source_id_version: str = '',
) -> List[Ontology]:
"""
Match an equivalent list of features given some input feature name (or ID)
"""Match an equivalent list of features given some input feature name (or ID).

Args:
gene_name: the gene name to search features by
Expand Down Expand Up @@ -62,14 +61,19 @@ def get_equivalent_features(
if source:
filters.append({'source': {'target': 'Source', 'filters': {'name': source}}})

if gene_name.count('.') == 1 and gene_name.split('.')[-1].isnumeric():
# eg. ENSG00000133703.11 or NM_033360.4
logger.debug(
f"Assuming {gene_name} has a .version_format - ignoring the version for equivalent features"
)
gene_name = gene_name.split('.')[0]

if is_source_id or source_id_version:
filters.append({'sourceId': gene_name})

if source_id_version:
filters.append(
{'OR': [{'sourceIdVersion': source_id_version}, {'sourceIdVersion': None}]}
)

elif FEATURES_CACHE and gene_name.lower() not in FEATURES_CACHE and not ignore_cache:
return []
else:
Expand Down
24 changes: 24 additions & 0 deletions graphkb/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,3 +248,27 @@ def get_source(self, name: str) -> Record:
if len(source) != 1:
raise AssertionError(f'Unable to unqiuely identify source with name {name}')
return source[0]


def get_rid(conn: GraphKBConnection, target: str, name: str) -> str:
    """
    Retrieve the record ID (@rid) of the single record with a given name and target

    Args:
        conn: GraphKBConnection
        target: record type to query
        name: the name of the record to retrieve

    Returns:
        str: @rid of the record

    Raises:
        AssertionError: if the term was not found or more than 1 match was found (expected to be unique)
    """
    result = conn.query(
        {"target": target, "filters": {"name": name}, "returnProperties": ["@rid"]},
        ignore_cache=False,
    )
    # Raise explicitly instead of using `assert`: assert statements are removed
    # when Python runs with -O, which would silently skip the uniqueness check.
    if len(result) != 1:
        raise AssertionError(f"unable to find unique '{target}' ID for '{name}'")

    return result[0]["@rid"]
6 changes: 2 additions & 4 deletions graphkb/vocab.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,10 +187,8 @@ def get_term_by_name(
def get_terms_set(
graphkb_conn: GraphKBConnection, base_terms: Iterable[str], ignore_cache: bool = False
) -> Set[str]:
"""
Get a set of terms of vocabulary given some base/parent term names. Returns the record
IDs for the resulting terms
"""
"""Get a set of vocabulary rids given some base/parent term names."""
base_terms = [base_terms] if isinstance(base_terms, str) else base_terms
cache_key = tuple(sorted(base_terms))
if graphkb_conn.cache.get(cache_key, None) and not ignore_cache:
return graphkb_conn.cache[cache_key]
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ include_trailing_comma = true
[metadata]
name = graphkb
url = https://github.com/bcgsc/pori_graphkb_python
version = 1.7.0
version = 1.8.0
author_email = graphkb@bcgsc.ca
description = python adapter for interacting with the GraphKB API
long_description = file: README.md
Expand Down
Loading