diff --git a/src/ga4gh/vrs/extras/annotator/vcf.py b/src/ga4gh/vrs/extras/annotator/vcf.py index ed5e4a92..d2bd12d6 100644 --- a/src/ga4gh/vrs/extras/annotator/vcf.py +++ b/src/ga4gh/vrs/extras/annotator/vcf.py @@ -82,43 +82,19 @@ def __init__( self.dp = SeqRepoRESTDataProxy(seqrepo_base_url) self.tlr = AlleleTranslator(self.dp) - @use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.MISSING) - def annotate( - self, - input_vcf_path: Path, - output_vcf_path: Path | None = None, - output_pkl_path: Path | None = None, - vrs_attributes: bool = False, - assembly: str = "GRCh38", - compute_for_ref: bool = True, - require_validation: bool = True, + def _update_vcf_header( + self, vcf: pysam.VariantFile, incl_ref_allele: bool, incl_vrs_attrs: bool ) -> None: - """Given a VCF, produce an output VCF annotated with VRS allele IDs, and/or - a pickle file containing the full VRS objects. + """Add new fields to VCF header - :param input_vcf_path: Location of input VCF - :param output_vcf_path: The path for the output VCF file - :param output_pkl_path: The path for the output VCF pickle file - :param vrs_attributes: If `True`, include VRS_Start, VRS_End, VRS_State - properties in the VCF INFO field. If `False` will not include these - properties. Only used if `vcf_out` is defined. - :param assembly: The assembly used in `vcf_in` data - :param compute_for_ref: If true, compute VRS IDs for the reference allele - :param require_validation: If `True`, validation checks (i.e., REF value - matches expected REF at given location) must pass in order to return a VRS - object for a record. If `False` then VRS object will be returned even if - validation checks fail, although all instances of failed validation are - logged as warnings regardless. - :raise VCFAnnotatorError: if no output formats are selected + :param vcf: pysam VCF object to annotate + :param incl_ref_allele: whether VRS alleles will be calculated for REFs + :param incl_vrs_attrs: whether INFO properties should be defined for VRS attributes + (normalized coordinates/state) """ - if not any((output_vcf_path, output_pkl_path)): - msg = "Must provide one of: `vcf_out` or `vrs_pickle_out`" - raise VCFAnnotatorError(msg) - - info_field_num = "R" if compute_for_ref else "A" - info_field_desc = "REF and ALT" if compute_for_ref else "ALT" + info_field_num = "R" if incl_ref_allele else "A" + info_field_desc = "REF and ALT" if incl_ref_allele else "ALT" - vcf = pysam.VariantFile(filename=str(input_vcf_path.absolute())) vcf.header.info.add( FieldName.IDS_FIELD.value, info_field_num, @@ -135,7 +111,7 @@ def annotate( ("If an error occurred computing a VRS Identifier, the error message"), ) - if vrs_attributes: + if incl_vrs_attrs: vcf.header.info.add( FieldName.STARTS_FIELD.value, info_field_num, @@ -164,11 +140,47 @@ def annotate( ), ) - vcf_out = ( - pysam.VariantFile(str(output_vcf_path.absolute()), "w", header=vcf.header) - if output_vcf_path - else None - ) + @use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.MISSING) + def annotate( + self, + input_vcf_path: Path, + output_vcf_path: Path | None = None, + output_pkl_path: Path | None = None, + vrs_attributes: bool = False, + assembly: str = "GRCh38", + compute_for_ref: bool = True, + require_validation: bool = True, + ) -> None: + """Given a VCF, produce an output VCF annotated with VRS allele IDs, and/or + a pickle file containing the full VRS objects. + + :param input_vcf_path: Location of input VCF + :param output_vcf_path: The path for the output VCF file + :param output_pkl_path: The path for the output VCF pickle file + :param vrs_attributes: If `True`, include VRS_Start, VRS_End, VRS_State + properties in the VCF INFO field. If `False` will not include these + properties. Only used if `vcf_out` is defined. + :param assembly: The assembly used in `vcf_in` data + :param compute_for_ref: If true, compute VRS IDs for the reference allele + :param require_validation: If `True`, validation checks (i.e., REF value + matches expected REF at given location) must pass in order to return a VRS + object for a record. If `False` then VRS object will be returned even if + validation checks fail, although all instances of failed validation are + logged as warnings regardless. + :raise VCFAnnotatorError: if no output formats are selected + """ + if not any((output_vcf_path, output_pkl_path)): + msg = "Must provide one of: `vcf_out` or `vrs_pickle_out`" + raise VCFAnnotatorError(msg) + + vcf = pysam.VariantFile(filename=str(input_vcf_path.absolute())) + if output_vcf_path: + self._update_vcf_header(vcf, compute_for_ref, vrs_attributes) + vcf_out = pysam.VariantFile( + str(output_vcf_path.absolute()), "w", header=vcf.header + ) + else: + vcf_out = None # only retain raw data if dumping to pkl vrs_data = {} if output_pkl_path else None