Merge branch 'main' into annotator-dataproxy-interface
jsstevenson committed Feb 7, 2025
2 parents 1d275d4 + 0b8702c commit 3610969
Showing 5 changed files with 120 additions and 83 deletions.
2 changes: 2 additions & 0 deletions src/ga4gh/vrs/dataproxy.py
@@ -353,6 +353,8 @@ def create_dataproxy(uri: str | None = None) -> _DataProxy:
* seqrepo+http://localhost:5000/seqrepo
* seqrepo+https://somewhere:5000/seqrepo
:raise ValueError: if URI doesn't match recognized schemes, e.g. is missing provider
prefix (`"seqrepo+"`)
"""
uri = uri or os.environ.get("GA4GH_VRS_DATAPROXY_URI", None)
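A minimal usage sketch for create_dataproxy, assuming a local SeqRepo REST service at the address shown in the docstring (host, port, and paths are illustrative):

    import os

    from ga4gh.vrs.dataproxy import create_dataproxy

    # explicit URI; the scheme must carry the "seqrepo+" provider prefix
    dp = create_dataproxy("seqrepo+http://localhost:5000/seqrepo")

    # with no argument, the GA4GH_VRS_DATAPROXY_URI environment variable is used
    os.environ["GA4GH_VRS_DATAPROXY_URI"] = "seqrepo+http://localhost:5000/seqrepo"
    dp = create_dataproxy()

    # a URI without the provider prefix raises ValueError
    # create_dataproxy("file:///path/to/seqrepo/root")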

112 changes: 51 additions & 61 deletions src/ga4gh/vrs/extras/annotator/vcf.py
@@ -2,16 +2,16 @@

import logging
import pickle
from enum import Enum
from pathlib import Path

import pysam
from pydantic import ValidationError

from ga4gh.core.identifiers import (
VrsObjectIdentifierIs,
use_ga4gh_compute_identifier_when,
)
from ga4gh.vrs.dataproxy import DataProxyValidationError, _DataProxy
from ga4gh.vrs.dataproxy import _DataProxy
from ga4gh.vrs.extras.translator import AlleleTranslator

_logger = logging.getLogger(__name__)
@@ -21,29 +21,35 @@ class VCFAnnotatorError(Exception):
"""Custom exceptions for VCF Annotator tool"""


class FieldName(str, Enum):
"""Define VCF field names for VRS annotations"""

IDS_FIELD = "VRS_Allele_IDs"
STARTS_FIELD = "VRS_Starts"
ENDS_FIELD = "VRS_Ends"
STATES_FIELD = "VRS_States"
ERROR_FIELD = "VRS_Error"


# VCF character escape map
VCF_ESCAPE_MAP = str.maketrans(
{
"%": "%25",
";": "%3B",
",": "%2C",
"\r": "%0D",
"\n": "%0A",
}
)
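A brief sketch of how these two module-level definitions behave (the error message is illustrative): FieldName members compare equal to their string values, and VCF_ESCAPE_MAP is a str.maketrans table applied via str.translate to percent-encode characters that are reserved in VCF INFO fields.

    assert FieldName.IDS_FIELD == "VRS_Allele_IDs"
    assert FieldName.IDS_FIELD.value == "VRS_Allele_IDs"

    # illustrative error message containing reserved VCF characters
    msg = "Unable to translate; check alias NC_000001.11,\nthen retry"
    assert msg.translate(VCF_ESCAPE_MAP) == (
        "Unable to translate%3B check alias NC_000001.11%2C%0Athen retry"
    )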


class VCFAnnotator:
"""Annotate VCFs with VRS allele IDs.
Uses pysam to read, store, and (optionally) output VCFs. Alleles are translated
into VRS IDs using the VRS-Python translator class.
"""

# Field names for VCF
VRS_ALLELE_IDS_FIELD = "VRS_Allele_IDs"
VRS_STARTS_FIELD = "VRS_Starts"
VRS_ENDS_FIELD = "VRS_Ends"
VRS_STATES_FIELD = "VRS_States"
VRS_ERROR_FIELD = "VRS_Error"
# VCF character escape map
VCF_ESCAPE_MAP = [ # noqa: RUF012
("%", "%25"),
(";", "%3B"),
(",", "%2C"),
("\r", "%0D"),
("\n", "%0A"),
("\t", "%09"),
]

def __init__(self, data_proxy: _DataProxy) -> None:
"""Initialize the VCFAnnotator class.
@@ -90,7 +96,7 @@ def annotate(

vcf = pysam.VariantFile(filename=str(input_vcf_path.absolute()))
vcf.header.info.add(
self.VRS_ALLELE_IDS_FIELD,
FieldName.IDS_FIELD.value,
info_field_num,
"String",
(
@@ -99,15 +105,15 @@
),
)
vcf.header.info.add(
self.VRS_ERROR_FIELD,
FieldName.ERROR_FIELD.value,
".",
"String",
("If an error occurred computing a VRS Identifier, the error message"),
)

if vrs_attributes:
vcf.header.info.add(
self.VRS_STARTS_FIELD,
FieldName.STARTS_FIELD.value,
info_field_num,
"String",
(
@@ -116,7 +122,7 @@
),
)
vcf.header.info.add(
self.VRS_ENDS_FIELD,
FieldName.ENDS_FIELD.value,
info_field_num,
"String",
(
Expand All @@ -125,7 +131,7 @@ def annotate(
),
)
vcf.header.info.add(
self.VRS_STATES_FIELD,
FieldName.STATES_FIELD.value,
info_field_num,
"String",
(
@@ -144,12 +150,12 @@
vrs_data = {} if output_pkl_path else None
for record in vcf:
if vcf_out:
additional_info_fields = [self.VRS_ALLELE_IDS_FIELD]
additional_info_fields = [FieldName.IDS_FIELD]
if vrs_attributes:
additional_info_fields += [
self.VRS_STARTS_FIELD,
self.VRS_ENDS_FIELD,
self.VRS_STATES_FIELD,
FieldName.STARTS_FIELD,
FieldName.ENDS_FIELD,
FieldName.STATES_FIELD,
]
else:
# no INFO field names need to be designated if not producing an annotated VCF
@@ -167,10 +173,9 @@
except Exception as ex:
_logger.exception("VRS error on %s-%s", record.chrom, record.pos)
err_msg = f"{ex}" or f"{type(ex)}"
for search_repl in VCFAnnotator.VCF_ESCAPE_MAP:
err_msg = err_msg.replace(search_repl[0], search_repl[1])
additional_info_fields = [self.VRS_ERROR_FIELD]
vrs_field_data = {self.VRS_ERROR_FIELD: [err_msg]}
err_msg = err_msg.translate(VCF_ESCAPE_MAP)
additional_info_fields = [FieldName.ERROR_FIELD]
vrs_field_data = {FieldName.ERROR_FIELD.value: [err_msg]}

_logger.debug(
"VCF record %s-%s generated vrs_field_data %s",
@@ -181,7 +186,9 @@

if output_vcf_path and vcf_out:
for k in additional_info_fields:
record.info[k] = [value or "." for value in vrs_field_data[k]]
record.info[k.value] = [
value or "." for value in vrs_field_data[k.value]
]
vcf_out.write(record)

vcf.close()
@@ -227,41 +234,24 @@ def _get_vrs_object(
assembly_name=assembly,
require_validation=require_validation,
)
except (ValidationError, DataProxyValidationError):
vrs_obj = None
_logger.exception(
"ValidationError when translating %s from gnomad", vcf_coords
)
raise
except KeyError:
vrs_obj = None
_logger.exception("KeyError when translating %s from gnomad", vcf_coords)
raise
except AssertionError:
vrs_obj = None
_logger.exception(
"AssertionError when translating %s from gnomad", vcf_coords
)
raise
except Exception:
vrs_obj = None
_logger.exception(
"Unhandled Exception when translating %s from gnomad", vcf_coords
"Exception encountered during translation of variation: %s", vcf_coords
)
raise
else:
if not vrs_obj:
_logger.debug(
"None was returned when translating %s from gnomad", vcf_coords
)
if vrs_obj is None:
_logger.debug(
"None was returned when translating %s from gnomad", vcf_coords
)

if vrs_data and vrs_obj:
key = vrs_data_key if vrs_data_key else vcf_coords
vrs_data[key] = str(vrs_obj.model_dump(exclude_none=True))

if vrs_field_data:
allele_id = vrs_obj.id if vrs_obj else ""
vrs_field_data[self.VRS_ALLELE_IDS_FIELD].append(allele_id)
vrs_field_data[FieldName.IDS_FIELD].append(allele_id)

if vrs_attributes:
if vrs_obj:
@@ -275,16 +265,16 @@
else:
start = end = alt = ""

vrs_field_data[self.VRS_STARTS_FIELD].append(start)
vrs_field_data[self.VRS_ENDS_FIELD].append(end)
vrs_field_data[self.VRS_STATES_FIELD].append(alt)
vrs_field_data[FieldName.STARTS_FIELD].append(start)
vrs_field_data[FieldName.ENDS_FIELD].append(end)
vrs_field_data[FieldName.STATES_FIELD].append(alt)

def _get_vrs_data(
self,
record: pysam.VariantRecord,
vrs_data: dict | None,
assembly: str,
additional_info_fields: list[str],
additional_info_fields: list[FieldName],
vrs_attributes: bool = False,
compute_for_ref: bool = True,
require_validation: bool = True,
@@ -307,7 +297,7 @@
:return: A dictionary mapping VRS-related INFO fields to lists of associated
values. Will be empty if `create_annotated_vcf` is false.
"""
vrs_field_data = {field: [] for field in additional_info_fields}
vrs_field_data = {field.value: [] for field in additional_info_fields}

# Get VRS data for reference allele
gnomad_loc = f"{record.chrom}-{record.pos}"
@@ -330,7 +320,7 @@
if "*" in allele:
_logger.debug("Star allele found: %s", allele)
for field in additional_info_fields:
vrs_field_data[field].append("")
vrs_field_data[field.value].append("")
else:
self._get_vrs_object(
allele,
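An end-to-end sketch tying the data proxy and annotator together. The __init__ keyword (data_proxy) is visible in this diff; the annotate() keyword names below (input_vcf_path, output_vcf_path, vrs_attributes) follow variables referenced inside the method and are assumptions, not the confirmed signature:

    from pathlib import Path

    from ga4gh.vrs.dataproxy import create_dataproxy
    from ga4gh.vrs.extras.annotator.vcf import VCFAnnotator

    # assumed local SeqRepo REST service; any supported dataproxy URI works
    dp = create_dataproxy("seqrepo+http://localhost:5000/seqrepo")
    annotator = VCFAnnotator(data_proxy=dp)

    # keyword names are assumptions based on this diff
    annotator.annotate(
        input_vcf_path=Path("input.vcf"),
        output_vcf_path=Path("annotated.vcf"),
        vrs_attributes=True,
    )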
22 changes: 0 additions & 22 deletions tests/extras/test_dataproxy.py

This file was deleted.

67 changes: 67 additions & 0 deletions tests/test_dataproxy.py
@@ -0,0 +1,67 @@
import os
import re

import pytest

from ga4gh.vrs.dataproxy import create_dataproxy


@pytest.mark.parametrize("dp", ["rest_dataproxy", "dataproxy"])
@pytest.mark.vcr
def test_data_proxies(dp, request):
dataproxy = request.getfixturevalue(dp)
r = dataproxy.get_metadata("NM_000551.3")
assert r["length"] == 4560
assert "ga4gh:SQ.v_QTc1p-MUYdgrRv4LMT6ByXIOsdw3C_" in r["aliases"]

r = dataproxy.get_metadata("NC_000013.11")
assert r["length"] == 114364328
assert "ga4gh:SQ._0wi-qoDrvram155UmcSC-zA5ZK4fpLT" in r["aliases"]

seq = dataproxy.get_sequence("ga4gh:SQ.v_QTc1p-MUYdgrRv4LMT6ByXIOsdw3C_")
assert seq.startswith("CCTCGCCTCCGTTACAACGGCCTACGGTGCTGGAGGATCCTTCTGCGCAC")

seq = dataproxy.get_sequence(
"ga4gh:SQ.v_QTc1p-MUYdgrRv4LMT6ByXIOsdw3C_", start=0, end=50
)
assert seq == "CCTCGCCTCCGTTACAACGGCCTACGGTGCTGGAGGATCCTTCTGCGCAC"


def test_data_proxy_configs():
with pytest.raises(
ValueError,
match=re.escape(
"create_dataproxy scheme must include provider (e.g., `seqrepo+http:...`)"
),
):
create_dataproxy("file:///path/to/seqrepo/root")

with pytest.raises(
ValueError,
match=re.escape("SeqRepo URI scheme seqrepo+fake-scheme not implemented"),
):
create_dataproxy("seqrepo+fake-scheme://localhost:5000")

with pytest.raises(
ValueError, match="DataProxy provider fake-dataprovider not implemented"
):
create_dataproxy("fake-dataprovider+http://localhost:5000")

with pytest.raises(
ValueError,
match="No data proxy URI provided or found in GA4GH_VRS_DATAPROXY_URI",
):
create_dataproxy(None)

# check that fallback on env var works
os.environ["GA4GH_VRS_DATAPROXY_URI"] = "seqrepo+:tests/data/seqrepo/latest"
create_dataproxy(None)

# check that method arg takes precedence by passing an invalid arg
with pytest.raises(
ValueError,
match=re.escape(
"create_dataproxy scheme must include provider (e.g., `seqrepo+http:...`)"
),
):
create_dataproxy("file:///path/to/seqrepo/root")
