Skip to content

Commit

Permalink
refactor: move VCF header update to separate method
Browse files Browse the repository at this point in the history
  • Loading branch information
jsstevenson committed Feb 6, 2025
1 parent f44d184 commit 49d2e47
Showing 1 changed file with 51 additions and 39 deletions.
90 changes: 51 additions & 39 deletions src/ga4gh/vrs/extras/annotator/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,43 +82,19 @@ def __init__(
self.dp = SeqRepoRESTDataProxy(seqrepo_base_url)
self.tlr = AlleleTranslator(self.dp)

@use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.MISSING)
def annotate(
self,
input_vcf_path: Path,
output_vcf_path: Path | None = None,
output_pkl_path: Path | None = None,
vrs_attributes: bool = False,
assembly: str = "GRCh38",
compute_for_ref: bool = True,
require_validation: bool = True,
def _update_vcf_header(
self, vcf: pysam.VariantFile, incl_ref_allele: bool, incl_vrs_attrs: bool
) -> None:
"""Given a VCF, produce an output VCF annotated with VRS allele IDs, and/or
a pickle file containing the full VRS objects.
"""Add new fields to VCF header
:param input_vcf_path: Location of input VCF
:param output_vcf_path: The path for the output VCF file
:param output_pkl_path: The path for the output VCF pickle file
:param vrs_attributes: If `True`, include VRS_Start, VRS_End, VRS_State
properties in the VCF INFO field. If `False` will not include these
properties. Only used if `vcf_out` is defined.
:param assembly: The assembly used in `vcf_in` data
:param compute_for_ref: If true, compute VRS IDs for the reference allele
:param require_validation: If `True`, validation checks (i.e., REF value
matches expected REF at given location) must pass in order to return a VRS
object for a record. If `False` then VRS object will be returned even if
validation checks fail, although all instances of failed validation are
logged as warnings regardless.
:raise VCFAnnotatorError: if no output formats are selected
:param vcf: pysam VCF object to annotate
:param incl_ref_allele: whether VRS alleles will be calculated for REFs
:param incl_vrs_attrs: whether INFO properties should be defined for VRS attributes
(normalized coordinates/state)
"""
if not any((output_vcf_path, output_pkl_path)):
msg = "Must provide one of: `vcf_out` or `vrs_pickle_out`"
raise VCFAnnotatorError(msg)

info_field_num = "R" if compute_for_ref else "A"
info_field_desc = "REF and ALT" if compute_for_ref else "ALT"
info_field_num = "R" if incl_ref_allele else "A"
info_field_desc = "REF and ALT" if incl_ref_allele else "ALT"

vcf = pysam.VariantFile(filename=str(input_vcf_path.absolute()))
vcf.header.info.add(
FieldName.IDS_FIELD.value,
info_field_num,
Expand All @@ -135,7 +111,7 @@ def annotate(
("If an error occurred computing a VRS Identifier, the error message"),
)

if vrs_attributes:
if incl_vrs_attrs:
vcf.header.info.add(
FieldName.STARTS_FIELD.value,
info_field_num,
Expand Down Expand Up @@ -164,11 +140,47 @@ def annotate(
),
)

vcf_out = (
pysam.VariantFile(str(output_vcf_path.absolute()), "w", header=vcf.header)
if output_vcf_path
else None
)
@use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.MISSING)
def annotate(
self,
input_vcf_path: Path,
output_vcf_path: Path | None = None,
output_pkl_path: Path | None = None,
vrs_attributes: bool = False,
assembly: str = "GRCh38",
compute_for_ref: bool = True,
require_validation: bool = True,
) -> None:
"""Given a VCF, produce an output VCF annotated with VRS allele IDs, and/or
a pickle file containing the full VRS objects.
:param input_vcf_path: Location of input VCF
:param output_vcf_path: The path for the output VCF file
:param output_pkl_path: The path for the output VCF pickle file
:param vrs_attributes: If `True`, include VRS_Start, VRS_End, VRS_State
properties in the VCF INFO field. If `False` will not include these
properties. Only used if `vcf_out` is defined.
:param assembly: The assembly used in `vcf_in` data
:param compute_for_ref: If true, compute VRS IDs for the reference allele
:param require_validation: If `True`, validation checks (i.e., REF value
matches expected REF at given location) must pass in order to return a VRS
object for a record. If `False` then VRS object will be returned even if
validation checks fail, although all instances of failed validation are
logged as warnings regardless.
:raise VCFAnnotatorError: if no output formats are selected
"""
if not any((output_vcf_path, output_pkl_path)):
msg = "Must provide one of: `vcf_out` or `vrs_pickle_out`"
raise VCFAnnotatorError(msg)

vcf = pysam.VariantFile(filename=str(input_vcf_path.absolute()))
if output_vcf_path:
self._update_vcf_header(vcf, compute_for_ref, vrs_attributes)
vcf_out = pysam.VariantFile(
str(output_vcf_path.absolute()), "w", header=vcf.header
)
else:
vcf_out = None

# only retain raw data if dumping to pkl
vrs_data = {} if output_pkl_path else None
Expand Down

0 comments on commit 49d2e47

Please sign in to comment.