From 3a3ded40ca45d23faac3aa56226e6078f67a3902 Mon Sep 17 00:00:00 2001 From: Adam English Date: Wed, 13 Nov 2024 19:25:38 -0500 Subject: [PATCH] Beginning seq context deprecation And replacing `--reference` functionality to fill in symbolic alts --- docs/api/truvari.rst | 8 ----- truvari/__init__.py | 4 --- truvari/bench.py | 5 +-- truvari/collapse.py | 2 +- truvari/comparisons.py | 77 +--------------------------------------- truvari/matching.py | 79 ++++++++++++++++++++++++++++++++---------- 6 files changed, 64 insertions(+), 111 deletions(-) diff --git a/docs/api/truvari.rst b/docs/api/truvari.rst index cf24217e..44271929 100644 --- a/docs/api/truvari.rst +++ b/docs/api/truvari.rst @@ -50,10 +50,6 @@ entry_size_similarity ^^^^^^^^^^^^^^^^^^^^^ .. autofunction:: entry_size_similarity -entry_shared_ref_context -^^^^^^^^^^^^^^^^^^^^^^^^ -.. autofunction:: entry_shared_ref_context - entry_to_hash ^^^^^^^^^^^^^ .. autofunction:: entry_to_hash @@ -100,10 +96,6 @@ compress_index_vcf ^^^^^^^^^^^^^^^^^^ .. autofunction:: compress_index_vcf -create_pos_haplotype -^^^^^^^^^^^^^^^^^^^^ -.. autofunction:: create_pos_haplotype - get_gt ^^^^^^ .. autofunction:: get_gt diff --git a/truvari/__init__.py b/truvari/__init__.py index 4a94883e..04b7d90e 100644 --- a/truvari/__init__.py +++ b/truvari/__init__.py @@ -13,7 +13,6 @@ :meth:`entry_is_present` :meth:`entry_reciprocal_overlap` :meth:`entry_same_variant_type` -:meth:`entry_shared_ref_context` :meth:`entry_seq_similarity` :meth:`entry_size` :meth:`entry_size_similarity` @@ -30,7 +29,6 @@ :meth:`calc_af` :meth:`calc_hwe` :meth:`compress_index_vcf` -:meth:`create_pos_haplotype` :meth:`get_gt` :meth:`get_scalebin` :meth:`get_sizebin` @@ -105,7 +103,6 @@ from truvari.comparisons import ( coords_within, - create_pos_haplotype, entry_boundaries, entry_distance, entry_gt_comp, @@ -113,7 +110,6 @@ entry_is_present, entry_reciprocal_overlap, entry_same_variant_type, - entry_shared_ref_context, entry_seq_similarity, entry_size, entry_size_similarity, diff --git a/truvari/bench.py b/truvari/bench.py index c53d84dc..684d8d33 100644 --- a/truvari/bench.py +++ b/truvari/bench.py @@ -36,7 +36,7 @@ def parse_args(args): parser.add_argument("-o", "--output", type=str, required=True, help="Output directory") parser.add_argument("-f", "--reference", type=str, default=None, - help="Fasta used to call variants. Turns on reference context sequence comparison") + help="Fasta used to call variants. Only needed with symbolic variants.") parser.add_argument("--short", action="store_true", help="Short circuit comparisions. Faster, but fewer annotations") parser.add_argument("--debug", action="store_true", default=False, @@ -59,8 +59,6 @@ def parse_args(args): help="Assume DUP svtypes are INS (%(default)s)") thresg.add_argument("-C", "--chunksize", type=truvari.restricted_int, default=defaults.chunksize, help="Max reference distance to compare calls (%(default)s)") - thresg.add_argument("-B", "--minhaplen", type=truvari.restricted_int, default=defaults.minhaplen, - help="Min haplotype sequence length to create (%(default)s)") genoty = parser.add_argument_group("Genotype Comparison Arguments") genoty.add_argument("--bSample", type=str, default=None, @@ -144,7 +142,6 @@ def check_params(args): logging.error("Include bed %s does not exist", args.includebed) check_fail = True if args.reference: - sys.stderr.write("WARN: `--reference` is no longer recommended and will be deprecated by v5\n") if not os.path.exists(args.reference): logging.error("Reference %s does not exist", args.reference) check_fail = True diff --git a/truvari/collapse.py b/truvari/collapse.py index a41ab49e..041879e3 100644 --- a/truvari/collapse.py +++ b/truvari/collapse.py @@ -489,7 +489,7 @@ def parse_args(args): parser.add_argument("-c", "--collapsed-output", type=str, default="collapsed.vcf", help="Where collapsed variants are written (collapsed.vcf)") parser.add_argument("-f", "--reference", type=str, default=None, - help="Indexed fasta used to call variants") + help="Indexed fasta used to call variants. Only needed with symbolic variants.") parser.add_argument("-k", "--keep", choices=["first", "maxqual", "common"], default="first", help="When collapsing calls, which one to keep (%(default)s)") parser.add_argument("--bed", type=str, default=None, diff --git a/truvari/comparisons.py b/truvari/comparisons.py index 4e064b58..be94cbd4 100644 --- a/truvari/comparisons.py +++ b/truvari/comparisons.py @@ -40,42 +40,6 @@ def coords_within(qstart, qend, rstart, rend, end_within): return qstart >= rstart and ending -def create_pos_haplotype(a1, a2, ref, min_len=0): - """ - Create haplotypes of two allele's regions that are assumed to be overlapping - - :param `a1`: tuple of chrom, start, end, seq - :type `a1`: tuple - :param `a2`: tuple of chrom, start, end, seq - :type `a2`: tuple - :param `ref`: Reference genome - :type `ref`: :class:`pysam.FastaFile` - :param `min_len`: Minimum length of the haplotype sequence to create - :type `min_len`: int, optional - - :return: allele haplotype sequences created - :rtupe: tuple (string, string) - """ - chrom, a1_start, a1_end, a1_seq = a1 - chrom, a2_start, a2_end, a2_seq = a2 - start = min(a1_start, a2_start) - end = max(a1_end, a2_end) - - hap_len1 = (abs(a1_start - start) + len(a1_seq) + abs(a1_end - end)) - hap_len2 = (abs(a2_start - start) + len(a2_seq) + abs(a2_end - end)) - min_size = min(hap_len1, hap_len2) - if min_size < min_len: - start -= (min_len - min_size) // 2 - end += (min_len + min_size) // 2 - # no negative fetch - start = max(0, start) - hap1_seq = ref.fetch(chrom, start, a1_start) + \ - a1_seq + ref.fetch(chrom, a1_end, end) - hap2_seq = ref.fetch(chrom, start, a2_start) + \ - a2_seq + ref.fetch(chrom, a2_end, end) - return str(hap1_seq), str(hap2_seq) - - def entry_boundaries(entry, ins_inflate=False): """ Get the start and end of an entry @@ -204,7 +168,7 @@ def entry_is_present(entry, sample=None, allow_missing=True): truvari.GT.HET, truvari.GT.HOM] -def entry_seq_similarity(entryA, entryB, ref=None, min_len=0): +def entry_seq_similarity(entryA, entryB): """ Calculate sequence similarity of two entries. If reference is not None, compare their shared reference context. Otherwise, use the unroll technique. @@ -213,10 +177,6 @@ def entry_seq_similarity(entryA, entryB, ref=None, min_len=0): :type `entryA`: :class:`pysam.VariantRecord` :param `entryB`: second entry :type `entryB`: :class:`pysam.VariantRecord` - :param `ref`: Reference genome - :type `ref`: :class:`pysam.FastaFile` - :param `min_len`: Minimum length of reference context to generate - :type `min_len`: float, optional :return: sequence similarity :rtype: float @@ -242,11 +202,6 @@ def entry_seq_similarity(entryA, entryB, ref=None, min_len=0): if st_dist == 0 or ed_dist == 0: return seqsim(a_seq, b_seq) - if ref is not None: - allele1, allele2 = entry_shared_ref_context( - entryA, entryB, ref, min_len=min_len) - return seqsim(allele1, allele2) - # Directionality of rolling makes a difference if st_dist < 0: st_dist *= -1 @@ -286,36 +241,6 @@ def entry_reciprocal_overlap(entry1, entry2, ins_inflate=True): return reciprocal_overlap(astart, aend, bstart, bend) -def entry_shared_ref_context(entryA, entryB, ref, use_ref_seq=False, min_len=0): - """ - Get the shared reference context of two entries and create the haplotype - - :param `entryA`: first entry - :type `entryA`: :class:`pysam.VariantRecord` - :param `entryB`: second entry - :type `entryB`: :class:`pysam.VariantRecord` - :param `ref`: Reference genome - :type `ref`: :class:`pysam.FastaFile` - :param `use_ref_seq`: If True, use the reference genome to get the variant sequence instead the entries - :type `use_ref_seq`: bool, optional - :param `min_len`: Minimum length of the reference context to create - :type `min_len`: int, optional - - :return: sequences created - :rtype: tuple : (string, string) - """ - def get_props(entry): - """ - We compare the longer of the ref/alt sequence to increase comparability - """ - if use_ref_seq and (entry.alts[0] == "" or len(entry.alts[0]) < len(entry.ref)): - return entry.chrom, entry.start, entry.stop, ref.fetch(entry.chrom, entry.start, entry.stop) - return entry.chrom, entry.start, entry.stop, entry.alts[0] - a1 = get_props(entryA) - a2 = get_props(entryB) - return create_pos_haplotype(a1, a2, ref, min_len=min_len) - - def entry_same_variant_type(entryA, entryB, dup_to_ins=False): """ Check if entryA svtype == entryB svtype diff --git a/truvari/matching.py b/truvari/matching.py index 5ff891e3..960d1f2d 100644 --- a/truvari/matching.py +++ b/truvari/matching.py @@ -100,7 +100,6 @@ def make_match_params(): params.reference = None params.refdist = 500 params.pctseq = 0.70 - params.minhaplen = 50 params.pctsize = 0.70 params.pctovl = 0.0 params.typeignore = False @@ -128,7 +127,6 @@ def make_match_params_from_args(args): ret.reference = args.reference ret.refdist = args.refdist ret.pctseq = args.pctseq - ret.minhaplen = args.minhaplen ret.pctsize = args.pctsize ret.pctovl = args.pctovl ret.typeignore = args.typeignore @@ -156,7 +154,12 @@ def filter_call(self, entry, base=False): return True if self.params.check_multi and len(entry.alts) > 1: - raise ValueError(f"Cannot compare multi-allelic records. Please split\nline {str(entry)}") + raise ValueError( + f"Cannot compare multi-allelic records. Please split\nline {str(entry)}") + + # Don't compare BNDs, ever + if entry.alleles_variant_types[1] == 'BND': + return True if self.params.passonly and truvari.entry_is_filtered(entry): return True @@ -175,7 +178,6 @@ def filter_call(self, entry, base=False): return False - def build_match(self, base, comp, matid=None, skip_gt=False, short_circuit=False): """ Build a MatchResult @@ -231,8 +233,7 @@ def build_match(self, base, comp, matid=None, skip_gt=False, short_circuit=False return ret if self.params.pctseq > 0: - ret.seqsim = truvari.entry_seq_similarity( - base, comp, self.reference, self.params.minhaplen) + ret.seqsim = truvari.entry_seq_similarity(base, comp) if ret.seqsim < self.params.pctseq: logging.debug("%s and %s sequence similarity is too low (%.3ff)", str(base), str(comp), ret.seqsim) @@ -305,14 +306,24 @@ def chunker(matcher, *files): cur_chunk = defaultdict(list) unresolved_warned = False for key, entry in file_zipper(*files): - if matcher.params.pctseq != 0 and (entry.alts and entry.alts[0].startswith('<')): - if not unresolved_warned: - logging.warning( - "Unresolved SVs (e.g. ALT=) are filtered when `--pctseq != 0`") - unresolved_warned = True + if matcher.filter_call(entry, key == 'base'): cur_chunk['__filtered'].append(entry) call_counts['__filtered'] += 1 continue + + # check symbolic, resolve if needed/possible + if matcher.params.pctseq != 0 and entry.alts[0].startswith('<') and matcher.params.reference: + was_resolved = resolve_sv(entry, + matcher.params.reference, + matcher.params.dup_to_ins) + if not was_resolved: + if not unresolved_warned: + logging.warning("Some symbolic SVs couldn't be resolved") + unresolved_warned = True + cur_chunk['__filtered'].append(entry) + call_counts['__filtered'] += 1 + continue + new_chrom = cur_chrom and entry.chrom != cur_chrom new_chunk = cur_end and cur_end + matcher.params.chunksize < entry.start if new_chunk or new_chrom: @@ -324,14 +335,46 @@ def chunker(matcher, *files): cur_chunk = defaultdict(list) cur_chrom = entry.chrom - if not matcher.filter_call(entry, key == 'base'): - cur_end = max(entry.stop, cur_end) - cur_chunk[key].append(entry) - call_counts[key] += 1 - else: - cur_chunk['__filtered'].append(entry) - call_counts['__filtered'] += 1 + cur_end = max(entry.stop, cur_end) + cur_chunk[key].append(entry) + call_counts[key] += 1 + chunk_count += 1 logging.info("%d chunks of %d variants %s", chunk_count, sum(call_counts.values()), call_counts) yield cur_chunk, chunk_count + + +RC = str.maketrans("ATCG", "TAGC") + + +def do_rc(s): + """ + Reverse complement a sequence + """ + return s.translate(RC)[::-1] + + +def resolve_sv(entry, ref, dup_to_ins=False): + """ + Attempts to resolve an SV's REF/ALT sequences + """ + if ref is None or entry.alts[0] in ['', ''] or entry.start > ref.get_reference_length(entry.chrom): + return False + + seq = ref.fetch(entry.chrom, entry.start, entry.stop) + if entry.alts[0] == '': + entry.ref = seq + entry.alts = [seq[0]] + elif entry.alts[0] == '': + entry.ref = seq + entry.alts = [do_rc(seq)] + elif entry.alts[0] == '' and dup_to_ins: + entry.info['SVTYPE'] = 'INS' + entry.ref = seq[0] + entry.alts = [seq] + entry.stop = entry.start + 1 + else: + return False + + return True