diff --git a/docs/extras/vcf_annotator.md b/docs/extras/vcf_annotator.md index 81d25c1c..131f609b 100644 --- a/docs/extras/vcf_annotator.md +++ b/docs/extras/vcf_annotator.md @@ -13,44 +13,26 @@ To see the help page: vrs-annotate vcf --help ``` -### Use local SeqRepo Data Proxy with default root directory +### Configuring the sequence data proxy -The tool uses a SeqRepo data proxy. By default, the local instance at `/usr/local/share/seqrepo/latest` is used. +Like other VRS-Python tools, the VCF annotator requires access to [sequence and identifier data services](https://vrs.ga4gh.org/en/stable/impl-guide/required_data.html#data-services), as implemented in libraries like [SeqRepo](https://github.com/biocommons/biocommons.seqrepo). By default, the CLI will attempt to connect to a [SeqRepo REST instance](https://github.com/biocommons/seqrepo-rest-service) at `http://localhost:5000/seqrepo`, but a URI can be passed with the `--dataproxy_uri` option or set with the `GA4GH_VRS_DATAPROXY_URI` environment variable (the former takes priority over the latter). -Example of how to run: +For example, to use a local set of SeqRepo data, you can use an absolute file path: ```commandline -vrs-annotate vcf input.vcf.gz --vcf_out output.vcf.gz --vrs_pickle_out vrs_objects.pkl +vrs-annotate vcf --dataproxy_uri="seqrepo+file:///usr/local/share/seqrepo/2024-02-20/" --vcf_out=out.vcf.gz input.vcf.gz ``` -Pass the path of the input VCF file as the argument to the script. Use either `--vcf_out` to specify the path of the output annotated VCF file, or `--vrs_pickle_out` to specify the path of the output pickle file containing VRS data (both `vcf_out` and `vrs_pickle_out` are optional, but at least one __must__ be provided). - -### Use local SeqRepo Data Proxy with different - -You can change the root directory of SeqRepo by using `seqrepo_root_dir`. 
- -To use the local SeqRepo data proxy with SeqRepo root directory at `vrs-python/seqrepo/latest`: - -```commandline -vrs-annotate vcf input.vcf.gz --vcf_out output.vcf.gz --vrs_pickle_out vrs_objects.pkl --seqrepo_root_dir vrs-python/seqrepo/latest -``` - -### Use the REST SeqRepo Data Proxy with default base url - -You can change the data proxy type by using: `--seqrepo_dp_type` (options are `local` or `rest`). - -To use the REST SeqRepo data proxy at default url: `http://localhost:5000/seqrepo`: +Alternatively, a relative file path: ```commandline -vrs-annotate vcf input.vcf.gz --vcf_out output.vcf.gz --vrs_pickle_out vrs_objects.pkl --seqrepo_dp_type rest +vrs-annotate vcf --dataproxy_uri="seqrepo+:../seqrepo/2024-02-20/" --vcf_out=out.vcf.gz input.vcf.gz ``` -### Use the REST SeqRepo Data Proxy with different base url -You can change the SeqRepo REST base url by using: `--seqrepo_base_url`. +Or an alternate REST path: -To use the REST SeqRepo data proxy, at custom url: `http://custom.url:5000/seqrepo`: ```commandline -vrs-annotate vcf input.vcf.gz --vcf_out output.vcf.gz --vrs_pickle_out vrs_objects.pkl --seqrepo_dp_type rest --seqrepo_base_url http://custom.url:5000/seqrepo +vrs-annotate vcf --dataproxy_uri="seqrepo+http://mylabwebsite.org/seqrepo" --vcf_out=out.vcf.gz input.vcf.gz ``` ### Other Options diff --git a/src/ga4gh/vrs/extras/annotator/cli.py b/src/ga4gh/vrs/extras/annotator/cli.py index 80c80c8b..aaf2cad0 100644 --- a/src/ga4gh/vrs/extras/annotator/cli.py +++ b/src/ga4gh/vrs/extras/annotator/cli.py @@ -11,8 +11,10 @@ from timeit import default_timer as timer import click +import requests -from ga4gh.vrs.extras.annotator.vcf import SeqRepoProxyType, VCFAnnotator +from ga4gh.vrs.dataproxy import create_dataproxy +from ga4gh.vrs.extras.annotator.vcf import VCFAnnotator _logger = logging.getLogger(__name__) @@ -98,29 +100,10 @@ def _set_log_level(ctx: dict, param: str, value: _LogLevel) -> None: # noqa: AR help="Include VRS_Start, VRS_End, and VRS_State 
fields in the VCF output INFO field.", ) @click.option( - "--seqrepo_dp_type", + "--dataproxy_uri", required=False, - default=SeqRepoProxyType.LOCAL, - type=click.Choice( - [v.value for v in SeqRepoProxyType.__members__.values()], case_sensitive=True - ), - help="Specify type of SeqRepo dataproxy to use.", - show_default=True, - show_choices=True, -) -@click.option( - "--seqrepo_root_dir", - required=False, - default=Path("/usr/local/share/seqrepo/latest"), - type=click.Path(path_type=Path), - help="Define root directory for local SeqRepo instance, if --seqrepo_dp_type=local.", - show_default=True, -) -@click.option( - "--seqrepo_base_url", - required=False, - default="http://localhost:5000/seqrepo", - help="Specify base URL for SeqRepo REST API, if --seqrepo_dp_type=rest.", + default="seqrepo+http://localhost:5000/seqrepo", + help="URI declaring source of sequence data. See subcommand description for more information.", show_default=True, ) @click.option( @@ -155,9 +138,7 @@ def _annotate_vcf_cli( vcf_out: Path | None, vrs_pickle_out: Path | None, vrs_attributes: bool, - seqrepo_dp_type: SeqRepoProxyType, - seqrepo_root_dir: Path, - seqrepo_base_url: str, + dataproxy_uri: str, assembly: str, skip_ref: bool, require_validation: bool, @@ -168,10 +149,29 @@ def _annotate_vcf_cli( $ vrs-annotate vcf input.vcf.gz --vcf_out output.vcf.gz --vrs_pickle_out vrs_objects.pkl Note that at least one of --vcf_out or --vrs_pickle_out must be selected and defined. - """ - annotator = VCFAnnotator( - seqrepo_dp_type, seqrepo_base_url, str(seqrepo_root_dir.absolute()) - ) + + Sequence data from a provider such as SeqRepo is required. Use the `--dataproxy_uri` + option or the environment variable `GA4GH_VRS_DATAPROXY_URI` to define its location + (the former will take priority over the latter when both are set). 
+ + Currently accepted URI schemes: + + \b + * seqrepo+http://localhost:5000/seqrepo + * seqrepo+https://somewhere:5000/seqrepo + * seqrepo+file:///path/to/seqrepo/root + * seqrepo+:../relative/path/to/seqrepo/root + """ # noqa: D301 + data_proxy = create_dataproxy(dataproxy_uri) + try: + data_proxy.get_metadata("GRCh38:1") + except requests.exceptions.ConnectionError: + msg = f"Connection to SeqRepo dataproxy at {dataproxy_uri} failed. Is the REST service running?" + _logger.exception(msg) + if not silent: + click.echo(msg, err=True) + exit(1) + annotator = VCFAnnotator(data_proxy) start = timer() msg = f"Annotating {vcf_in} with the VCF Annotator..." _logger.info(msg) diff --git a/src/ga4gh/vrs/extras/annotator/vcf.py b/src/ga4gh/vrs/extras/annotator/vcf.py index c884307a..2755225a 100644 --- a/src/ga4gh/vrs/extras/annotator/vcf.py +++ b/src/ga4gh/vrs/extras/annotator/vcf.py @@ -6,16 +6,12 @@ from pathlib import Path import pysam -from biocommons.seqrepo import SeqRepo from ga4gh.core.identifiers import ( VrsObjectIdentifierIs, use_ga4gh_compute_identifier_when, ) -from ga4gh.vrs.dataproxy import ( - SeqRepoDataProxy, - SeqRepoRESTDataProxy, -) +from ga4gh.vrs.dataproxy import _DataProxy from ga4gh.vrs.extras.translator import AlleleTranslator _logger = logging.getLogger(__name__) @@ -25,13 +21,6 @@ class VCFAnnotatorError(Exception): """Custom exceptions for VCF Annotator tool""" -class SeqRepoProxyType(str, Enum): - """Define constraints for SeqRepo Data Proxy types""" - - LOCAL = "local" - REST = "rest" - - class FieldName(str, Enum): """Define VCF field names for VRS annotations""" @@ -61,24 +50,13 @@ class VCFAnnotator: into VRS IDs using the VRS-Python translator class. 
""" - def __init__( - self, - seqrepo_dp_type: SeqRepoProxyType = SeqRepoProxyType.LOCAL, - seqrepo_base_url: str = "http://localhost:5000/seqrepo", - seqrepo_root_dir: str = "/usr/local/share/seqrepo/latest", - ) -> None: + def __init__(self, data_proxy: _DataProxy) -> None: """Initialize the VCFAnnotator class. - :param seqrepo_dp_type: The type of SeqRepo Data Proxy to use - (i.e., local vs REST) - :param seqrepo_base_url: The base url for SeqRepo REST API - :param seqrepo_root_dir: The root directory for the local SeqRepo instance + :param data_proxy: GA4GH sequence dataproxy instance. """ - if seqrepo_dp_type == SeqRepoProxyType.LOCAL: - self.dp = SeqRepoDataProxy(SeqRepo(seqrepo_root_dir)) - else: - self.dp = SeqRepoRESTDataProxy(seqrepo_base_url) - self.tlr = AlleleTranslator(self.dp) + self.data_proxy = data_proxy + self.tlr = AlleleTranslator(self.data_proxy) def _update_vcf_header( self, vcf: pysam.VariantFile, incl_ref_allele: bool, incl_vrs_attrs: bool diff --git a/tests/extras/test_annotate_vcf.py b/tests/extras/test_annotate_vcf.py index efceae74..2c6c80be 100644 --- a/tests/extras/test_annotate_vcf.py +++ b/tests/extras/test_annotate_vcf.py @@ -2,20 +2,29 @@ import gzip import logging +import os import re from pathlib import Path import pytest -from ga4gh.vrs.dataproxy import DataProxyValidationError +from ga4gh.vrs.dataproxy import DataProxyValidationError, SeqRepoRESTDataProxy from ga4gh.vrs.extras.annotator.vcf import VCFAnnotator, VCFAnnotatorError TEST_DATA_DIR = Path("tests/extras/data") @pytest.fixture -def vcf_annotator(): - return VCFAnnotator("rest") +def rest_dataproxy_fn_scope(): + """REST dataproxy scoped to individual test functions, rather than the entire session""" + return SeqRepoRESTDataProxy( + base_url=os.environ.get("SEQREPO_REST_URL", "http://localhost:5000/seqrepo") + ) + + +@pytest.fixture +def vcf_annotator(rest_dataproxy_fn_scope: SeqRepoRESTDataProxy): + return VCFAnnotator(rest_dataproxy_fn_scope) 
@pytest.fixture(scope="session")