bcgsc · zhemingfan · Nov 9, 2022 · Oct 18, 2022 · Oct 21, 2022 · Oct 21, 2022
diff --git a/docs/outputs/columns.md b/docs/outputs/columns.md
@@ -610,3 +610,15 @@ non-specific events.
 Flag to indicate if the
 current event was a supplementary call, meaning a call that was
 found as a result of validating another event.
+
+## dgv
+
+**type**: `str`
+
+ID(s) of SVs from dgv database matched to a SV call from the summary step
+
+## known\_sv\_count
+
+**type**: `int`
+
+Number of known SVs matched to a call in the summary step
diff --git a/src/mavis/annotate/file_io.py b/src/mavis/annotate/file_io.py
@@ -4,7 +4,7 @@
 import json
 import os
 import re
-from typing import Callable, Dict, List, Optional
+from typing import Callable, Dict, List, Optional, TYPE_CHECKING
 
 import pandas as pd
 from Bio import SeqIO
@@ -13,14 +13,20 @@
 from ..constants import CODON_SIZE, GIEMSA_STAIN, START_AA, STOP_AA, STRAND, translate
 from ..interval import Interval
 from ..types import ReferenceAnnotations, ReferenceGenome
-from ..util import logger
+from ..util import logger, read_bpp_from_input_file
 from .base import BioInterval, ReferenceName
 from .genomic import Exon, Gene, PreTranscript, Template, Transcript
 from .protein import Domain, Translation
 
+if TYPE_CHECKING:
+    from ..breakpoint import Breakpoint, BreakpointPair
 
-def load_masking_regions(*filepaths: str) -> Dict[str, List[BioInterval]]:
+
+def load_known_sv(*filepaths: str) -> Dict[str, List["BreakpointPair"]]:
     """
+    loads a standard MAVIS or BED file input to a ist of known breakpoints.
+
+    Standard BED file requirements:
     reads a file of regions. The expect input format for the file is tab-delimited and
     the header should contain the following columns
 
@@ -35,26 +41,42 @@ def load_masking_regions(*filepaths: str) -> Dict[str, List[BioInterval]]:
 
         #chr    start       end         name
         chr20   25600000    27500000    centromere
-
     Args:
-        filepath: path to the input tab-delimited file
+        filepath: path to standard MAVIS format file
     Returns:
-        a dictionary keyed by chromosome name with values of lists of regions on the chromosome
+        a dictionary with {(bp1_chr,bp2_chr):{BreakpointPair}}
     """
-    regions: Dict[str, List[BioInterval]] = {}
+    regions = {}
     for filepath in filepaths:
-        df = pd.read_csv(
-            filepath, sep='\t', dtype={'chr': str, 'start': int, 'end': int, 'name': str}
-        )
-        for col in ['chr', 'start', 'end', 'name']:
-            if col not in df:
-                raise KeyError(f'missing required column ({col})')
-        df['chr'] = df['chr'].apply(lambda c: ReferenceName(c))
-        for row in df.to_dict('records'):
-            mask_region = BioInterval(
-                reference_object=row['chr'], start=row['start'], end=row['end'], name=row['name']
+        header = set(pd.read_csv(filepath, nrows=1, sep='\t').columns)
+        mavis_header = {'break1_chromosome', 'break2_chromosome'}
+        bed_header = {'chr', 'start', 'end', 'name'}
+        if mavis_header.issubset(header):
+            bpps = read_bpp_from_input_file(filepath, expand_orient=True, expand_svtype=True)
+            for bpp in bpps:
+                chr_list = [bpp.break1.chr, bpp.break2.chr]
+                regions.setdefault(tuple(chr_list), []).append(bpp)
+
+        elif bed_header.issubset(header):
+            logger.warning('BED file support will be deprecated in future versions')
+            df = pd.read_csv(
+                filepath, sep='\t', dtype={'chr': str, 'start': int, 'end': int, 'name': str}
             )
-            regions.setdefault(mask_region.reference_object, []).append(mask_region)
+            for col in bed_header:
+                if col not in df:
+                    raise KeyError(f'missing required column ({col})')
+            df['chr'] = df['chr'].apply(lambda c: ReferenceName(c))
+            for row in df.to_dict('records'):
+                known_sv_region = BioInterval(
+                    reference_object=row['chr'],
+                    start=row['start'],
+                    end=row['end'],
+                    name=row['name'],
+                )
+                regions.setdefault(known_sv_region.reference_object, []).append(known_sv_region)
+        else:
+            logger.warning('No known SV inputs were loaded.')
+            return {}
     return regions
 
 
@@ -344,9 +366,9 @@ class ReferenceFile:
     LOAD_FUNCTIONS: Dict[str, Optional[Callable]] = {
         'annotations': load_annotations,
         'reference_genome': load_reference_genome,
-        'masking': load_masking_regions,
+        'masking': load_known_sv,
         'template_metadata': load_templates,
-        'dgv_annotation': load_masking_regions,
+        'dgv_annotation': load_known_sv,
         'aligner_reference': None,
     }
     """dict: Mapping of file types (based on ENV name) to load functions"""

diff --git a/src/mavis/constants.py b/src/mavis/constants.py
@@ -386,6 +386,7 @@ class COLUMNS(MavisNamespace):
     library: str = 'library'
     cluster_id: str = 'cluster_id'
     cluster_size: str = 'cluster_size'
+    dgv: str = 'dgv'
     validation_id: str = 'validation_id'
     annotation_id: str = 'annotation_id'
     product_id: str = 'product_id'
@@ -463,6 +464,7 @@ class COLUMNS(MavisNamespace):
     contig_strand_specific: str = 'contig_strand_specific'
     contigs_assembled: str = 'contigs_assembled'
     call_sequence_complexity: str = 'call_sequence_complexity'
+    known_sv_count: str = 'known_sv_count'
     spanning_reads: str = 'spanning_reads'
     spanning_read_names: str = 'spanning_read_names'
     flanking_median_fragment_size: str = 'flanking_median_fragment_size'
@@ -555,4 +557,6 @@ def sort_columns(input_columns):
     COLUMNS.tools,
     COLUMNS.tools,
     COLUMNS.tracking_id,
+    COLUMNS.dgv,
+    COLUMNS.known_sv_count,
 }
diff --git a/src/mavis/pairing/pairing.py b/src/mavis/pairing/pairing.py
@@ -154,9 +154,12 @@ def comparison_distance(
     return max_distance
 
 
-def equivalent(event1: BreakpointPair, event2: BreakpointPair, distances=None) -> bool:
+def equivalent(
+    event1: BreakpointPair, event2: BreakpointPair, distances=None, matching_event_type=True
+) -> bool:
     """
     compares two events by breakpoint position to see if they are equivalent
+    matching_event_type specificies whether event type must match
     """
 
     max_distance = comparison_distance(event1, event2, distances)
@@ -175,10 +178,11 @@ def equivalent(event1: BreakpointPair, event2: BreakpointPair, distances=None) -
         [
             abs(Interval.dist(event1.break1, event2.break1)) > max_distance,
             abs(Interval.dist(event1.break2, event2.break2)) > max_distance,
-            event1.data[COLUMNS.event_type] != event2.data[COLUMNS.event_type],
         ]
     ):
         return False
+    if matching_event_type == True and event1.event_type != event2.event_type:
+        return False
     return True
 
 

diff --git a/src/mavis/summary/main.py b/src/mavis/summary/main.py
@@ -276,16 +276,17 @@ def main(inputs: List[str], output: str, config: Dict, start_time=int(time.time(
         COLUMNS.cdna_synon,
         COLUMNS.net_size,
         COLUMNS.assumed_untemplated,
-        'dgv',
+        COLUMNS.dgv,
+        COLUMNS.known_sv_count,
     }
 
     rows = []
     for lib in bpps_by_library:
         logger.info(f'annotating dgv for {lib}')
         if not dgv_annotation.is_empty():
             annotate_dgv(
-                bpps_by_library[lib], dgv_annotation.content, distance=10
-            )  # TODO make distance a parameter
+                bpps_by_library[lib], dgv_annotation.content, config['summary.cluster_radius']
+            )
         logger.info(f'adding pairing states for {lib}')
         for row in bpps_by_library[lib]:
             # in case no pairing was done, add default (applicable to single library summaries)

diff --git a/src/mavis/summary/summary.py b/src/mavis/summary/summary.py
@@ -1,12 +1,14 @@
-from typing import Dict, List
+from typing import Mapping, Dict, List, Tuple
+from itertools import chain
 
-from ..annotate.genomic import Transcript
+from ..annotate.genomic import BioInterval, Transcript
 from ..breakpoint import Breakpoint, BreakpointPair
 from ..constants import CALL_METHOD, COLUMNS, DISEASE_STATUS, PROTOCOL, SVTYPE
 from ..interval import Interval
-from ..pairing.pairing import pair_by_distance, product_key
+from ..pairing.pairing import equivalent, pair_by_distance, product_key
 from ..util import get_connected_components
 from .constants import PAIRING_STATE
+from ..cluster.cluster import merge_breakpoint_pairs
 
 
 def filter_by_annotations(bpp_list: List[BreakpointPair], best_transcripts: Dict[str, Transcript]):
@@ -190,45 +192,120 @@ def group_by_distance(calls, distances):
     return grouped_calls, removed_calls
 
 
-def annotate_dgv(bpps, dgv_regions_by_reference_name, distance=0):
+def annotate_with_dgv_bpp(
+    bpps: List[BreakpointPair],
+    dgv_regions_list: List[BreakpointPair],
+    input_cluster_radius,
+):
     """
-    given a list of bpps and a dgv reference, annotate the events that are within the set distance of both breakpoints
+    Given a list of breakpoint pairs (bpps) and bpps from another reference set), annotate the events that are within the set distance of both breakpoints
 
     Args:
         bpps (list) : the list of BreakpointPair objects
-        dgv_regions_by_reference_name (dict) : the dgv reference regions file loaded by load_masking_regions
-        distance (int) : the minimum distance required to match a dgv event with a breakpoint
+        dgv_regions_list (list) : list of BreakpointPair objects specified by the MAVIS input file
+        cluster_radius (int) : Distance used in matching input SVs to reference SVs through clusterind, defined by summary.cluster_radius in the configuration file
+    """
+    bpps_copy = bpps.copy()
+    for variant in bpps:
+        variant.data[COLUMNS.dgv] = False
+        variant.data[COLUMNS.known_sv_count] = 0
+    for variant in dgv_regions_list:
+        variant.data[COLUMNS.dgv] = True
+        variant.data[COLUMNS.known_sv_count] = 0
+    bpps_copy.extend(dgv_regions_list)
+
+    clusters = merge_breakpoint_pairs(
+        bpps_copy, cluster_radius=input_cluster_radius
+    )  # all breakpoint pairs (bpps + dgv)
+
+    flag_col = '_is_variant'
+    for variant in bpps:
+        variant.data[flag_col] = True
+        variant.data[COLUMNS.dgv] = ''
+        variant.data[COLUMNS.known_sv_count] = 0
+
+    for clustered_bpp_key, clustered_bpp_val in clusters.items():
+        dgv_tracking_ids = [
+            bpp.tracking_id for bpp in clustered_bpp_val if bpp.data.get(flag_col) != True
+        ]
+        dgv_field = ';'.join(sorted(dgv_tracking_ids))
+        variant_bpp = [bpp for bpp in clustered_bpp_val if bpp.data.get(flag_col) == True]
+        for variant in variant_bpp:
+            variant.data[COLUMNS.dgv] = dgv_field
+            variant.data[COLUMNS.known_sv_count] = len(dgv_tracking_ids)
+
+    for bpp in bpps:
+        if flag_col in bpp.data:
+            del bpp.data[flag_col]
+
+
+def annotate_with_dgv_bed(
+    bpps: List[BreakpointPair],
+    dgv_regions_by_reference_name: Dict[Tuple[str], List[BreakpointPair]],
+    input_cluster_radius,
+):
+    """
+    Given a list of breakpoint pairs (bpps) and bpps from another reference set), annotate the events that are within the set distance of both breakpoints
+
+    Args:
+        bpps (list) : the list of BreakpointPair objects
+        dgv_regions_by_reference_name (dict) : tuple of (break1_chr,break2_chr) and its associated list of BioInterval objects specified by the MAVIS input file
+        cluster_radius (int) : Distance used in matching input SVs to reference SVs through clusterind, defined by summary.cluster_radius in the configuration file
     """
     for chrom in dgv_regions_by_reference_name:
         dgv_regions_by_reference_name[chrom] = sorted(
             dgv_regions_by_reference_name[chrom], key=lambda x: x.start
         )
-
     lowest_resolution = max([len(b.break1) for b in bpps])  # only need start res
-
+    for bpp in bpps:
+        bpp.data[COLUMNS.dgv] = []
+        bpp.data[COLUMNS.known_sv_count] = 0
     # only look at the bpps that dgv events could pair to, Intrachromosomal
     for bpp in [
         b for b in bpps if not b.interchromosomal and b.break1.chr in dgv_regions_by_reference_name
     ]:
-        bpp.data['dgv'] = []
         for dgv_region in dgv_regions_by_reference_name[bpp.break1.chr]:
             dist = abs(Interval.dist(Interval(dgv_region.start), bpp.break1))
-            if dist > lowest_resolution + distance:
+            if dist > lowest_resolution + input_cluster_radius:
                 continue
             elif (
-                dist > distance
-                or abs(Interval.dist(Interval(dgv_region.end), bpp.break2)) > distance
+                dist > input_cluster_radius
+                or abs(Interval.dist(Interval(dgv_region.end), bpp.break2)) > input_cluster_radius
             ):
                 continue
             refname = dgv_region.reference_object
             try:
                 refname = dgv_region.reference_object.name
             except AttributeError:
                 pass
-            bpp.data['dgv'].append(
+            bpp.data[COLUMNS.dgv].append(
                 '{}({}:{}-{})'.format(dgv_region.name, refname, dgv_region.start, dgv_region.end)
             )
-        bpp.data['dgv'] = ';'.join(bpp.data['dgv'])
+            bpp.data[COLUMNS.known_sv_count] = len(bpp.data[COLUMNS.dgv])
+        bpp.data[COLUMNS.dgv] = ';'.join(bpp.data[COLUMNS.dgv])
+
+
+def annotate_dgv(
+    bpps: List[BreakpointPair],
+    dgv_regions_by_reference_name: Dict[Tuple[str], List[BreakpointPair]],
+    input_cluster_radius,
+):
+    """
+    Given a list of breakpoint pairs (bpps) and bpps from another reference set), annotate the events that are within the set distance of both breakpoints
+
+    Args:
+        bpps (list) : the list of BreakpointPair objects
+        dgv_regions_by_reference_name (dict) : tuple of (break1_chr,break2_chr) and its associated list of BreakpointPair/BioInterval objects specified by the MAVIS input file
+        cluster_radius (int) : Distance used in matching input SVs to reference SVs through clusterind, defined by summary.cluster_radius in the configuration file
+    """
+
+    if isinstance(list(dgv_regions_by_reference_name.values())[0][0], BreakpointPair):
+        annotate_with_dgv_bpp(
+            bpps, list(chain(*dgv_regions_by_reference_name.values())), input_cluster_radius
+        )
+
+    elif isinstance(list(dgv_regions_by_reference_name.values())[0][0], BioInterval):
+        annotate_with_dgv_bed(bpps, dgv_regions_by_reference_name, input_cluster_radius)
 
 
 def get_pairing_state(