Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change support for MAVIS input in summary module #336

Merged
merged 18 commits into from
Nov 9, 2022
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ jobs:
run: |
python -m pip install --upgrade pip setuptools wheel
pip install mavis_config pandas snakemake
pip install tabulate==0.8.9
- uses: eWaterCycle/setup-singularity@v6
with:
singularity-version: 3.6.4
Expand Down
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ install_requires =
Shapely>=1.6.4.post1
shortuuid>=0.5.0
svgwrite
tabulate==0.8.9
typing_extensions>=4
setup_requires =
pip>=9.0.0
Expand Down
26 changes: 23 additions & 3 deletions src/mavis/annotate/file_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import json
import os
import re
from typing import Callable, Dict, List, Optional
from typing import Callable, Dict, List, Optional, TYPE_CHECKING

import pandas as pd
from Bio import SeqIO
Expand All @@ -13,11 +13,14 @@
from ..constants import CODON_SIZE, GIEMSA_STAIN, START_AA, STOP_AA, STRAND, translate
from ..interval import Interval
from ..types import ReferenceAnnotations, ReferenceGenome
from ..util import logger
from ..util import logger, read_bpp_from_input_file
from .base import BioInterval, ReferenceName
from .genomic import Exon, Gene, PreTranscript, Template, Transcript
from .protein import Domain, Translation

if TYPE_CHECKING:
from ..breakpoint import Breakpoint, BreakpointPair


def load_masking_regions(*filepaths: str) -> Dict[str, List[BioInterval]]:
"""
Expand Down Expand Up @@ -58,6 +61,23 @@ def load_masking_regions(*filepaths: str) -> Dict[str, List[BioInterval]]:
return regions


def load_known_sv(*filepaths: str) -> Dict[str, List["BreakpointPair"]]:
creisle marked this conversation as resolved.
Show resolved Hide resolved
"""
loads a standard MAVIS file input in
Args:
filepath: path to standard MAVIS format file
Returns:
a dictionary with {(bp1_chr,bp2_chr):{BreakpointPair}}
creisle marked this conversation as resolved.
Show resolved Hide resolved
"""
regions: Dict[(str, str), List[BreakpointPair]] = {}
for filepath in filepaths:
bpps = read_bpp_from_input_file(filepath, expand_orient=True, expand_svtype=True)
for bpp in bpps:
chr_list = [bpp.break1.chr, bpp.break2.chr]
regions.setdefault(tuple(chr_list), []).append(bpp)
return regions


def load_annotations(
*filepaths: str,
reference_genome: Optional[ReferenceGenome] = None,
Expand Down Expand Up @@ -346,7 +366,7 @@ class ReferenceFile:
'reference_genome': load_reference_genome,
'masking': load_masking_regions,
'template_metadata': load_templates,
'dgv_annotation': load_masking_regions,
'dgv_annotation': load_known_sv,
'aligner_reference': None,
}
"""dict: Mapping of file types (based on ENV name) to load functions"""
Expand Down
11 changes: 9 additions & 2 deletions src/mavis/pairing/pairing.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,9 +154,12 @@ def comparison_distance(
return max_distance


def equivalent(event1: BreakpointPair, event2: BreakpointPair, distances=None) -> bool:
def equivalent(
event1: BreakpointPair, event2: BreakpointPair, distances=None, matching_event_type=True
) -> bool:
"""
compares two events by breakpoint position to see if they are equivalent
creisle marked this conversation as resolved.
Show resolved Hide resolved
matching_event_type specificies whether event type must match
"""

max_distance = comparison_distance(event1, event2, distances)
Expand All @@ -175,10 +178,14 @@ def equivalent(event1: BreakpointPair, event2: BreakpointPair, distances=None) -
[
abs(Interval.dist(event1.break1, event2.break1)) > max_distance,
abs(Interval.dist(event1.break2, event2.break2)) > max_distance,
event1.data[COLUMNS.event_type] != event2.data[COLUMNS.event_type],
]
):
return False
if (
matching_event_type == True
and event1.data[COLUMNS.event_type] != event2.data[COLUMNS.event_type]
creisle marked this conversation as resolved.
Show resolved Hide resolved
):
return False
return True


Expand Down
5 changes: 3 additions & 2 deletions src/mavis/summary/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,15 +277,16 @@ def main(inputs: List[str], output: str, config: Dict, start_time=int(time.time(
COLUMNS.net_size,
COLUMNS.assumed_untemplated,
'dgv',
'known_sv_count',
}

rows = []
for lib in bpps_by_library:
logger.info(f'annotating dgv for {lib}')
if not dgv_annotation.is_empty():
annotate_dgv(
bpps_by_library[lib], dgv_annotation.content, distance=10
) # TODO make distance a parameter
bpps_by_library[lib], dgv_annotation.content, config['summary.cluster_radius']
)
logger.info(f'adding pairing states for {lib}')
for row in bpps_by_library[lib]:
# in case no pairing was done, add default (applicable to single library summaries)
Expand Down
66 changes: 33 additions & 33 deletions src/mavis/summary/summary.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Dict, List
from typing import Mapping, Dict, List, Tuple

from ..annotate.genomic import Transcript
from ..breakpoint import Breakpoint, BreakpointPair
Expand All @@ -7,6 +7,9 @@
from ..pairing.pairing import pair_by_distance, product_key
from ..util import get_connected_components
from .constants import PAIRING_STATE
from ..pairing.pairing import equivalent
zhemingfan marked this conversation as resolved.
Show resolved Hide resolved
from ..cluster.cluster import merge_breakpoint_pairs
creisle marked this conversation as resolved.
Show resolved Hide resolved
from itertools import chain
creisle marked this conversation as resolved.
Show resolved Hide resolved


def filter_by_annotations(bpp_list: List[BreakpointPair], best_transcripts: Dict[str, Transcript]):
Expand Down Expand Up @@ -190,43 +193,40 @@ def group_by_distance(calls, distances):
return grouped_calls, removed_calls


def annotate_dgv(bpps, dgv_regions_by_reference_name, distance=0):
def annotate_dgv(
bpps: List[BreakpointPair],
dgv_regions_by_reference_name: Dict[Tuple[str], List[BreakpointPair]],
input_cluster_radius,
):
"""
given a list of bpps and a dgv reference, annotate the events that are within the set distance of both breakpoints
Given a list of breakpoint pairs (bpps) and bpps from another reference set), annotate the events that are within the set distance of both breakpoints
creisle marked this conversation as resolved.
Show resolved Hide resolved

Args:
bpps (list) : the list of BreakpointPair objects
dgv_regions_by_reference_name (dict) : the dgv reference regions file loaded by load_masking_regions
distance (int) : the minimum distance required to match a dgv event with a breakpoint
dgv_regions_by_reference_name (dict) : tuple of (break1_chr,break2_chr) and its associated list of BreakpointPair objects specified by the MAVIS input file
cluster_radius (int) : Distance used in matching input SVs to reference SVs through clusterind, defined by summary.cluster_radius in the configuration file
creisle marked this conversation as resolved.
Show resolved Hide resolved
"""
for chrom in dgv_regions_by_reference_name:
dgv_regions_by_reference_name[chrom] = sorted(
dgv_regions_by_reference_name[chrom], key=lambda x: x.start
)

lowest_resolution = max([len(b.break1) for b in bpps]) # only need start res

# only look at the bpps that dgv events could pair to, Intrachromosomal
for bpp in [
b for b in bpps if not b.interchromosomal and b.break1.chr in dgv_regions_by_reference_name
]:
for dgv_region in dgv_regions_by_reference_name[bpp.break1.chr]:
dist = abs(Interval.dist(Interval(dgv_region.start), bpp.break1))
if dist > lowest_resolution + distance:
break
elif (
dist > distance
or abs(Interval.dist(Interval(dgv_region.end), bpp.break2)) > distance
):
continue
refname = dgv_region.reference_object
try:
refname = dgv_region.reference_object.name
except AttributeError:
pass
bpp.data['dgv'] = '{}({}:{}-{})'.format(
dgv_region.name, refname, dgv_region.start, dgv_region.end
)
bpps_copy = bpps.copy()
for variant in bpps:
variant.data['dgv'] = False
variant.data['known_sv_count'] = 0
dgv_flattened = list(chain(*dgv_regions_by_reference_name.values()))
creisle marked this conversation as resolved.
Show resolved Hide resolved
for variant in dgv_flattened:
zhemingfan marked this conversation as resolved.
Show resolved Hide resolved
variant.data['dgv'] = True
variant.data['known_sv_count'] = 0
bpps_copy.extend(dgv_flattened)

clusters = merge_breakpoint_pairs(
bpps_copy, cluster_radius=input_cluster_radius
) # all breakpoint pairs (bpps + dgv)

for clustered_bpp_key, clustered_bpp_val in clusters.items():
dgv_tracking_ids = [bpp.tracking_id for bpp in clustered_bpp_val if bpp.data['dgv'] == True]
dgv_field = ','.join(sorted(dgv_tracking_ids))
variant_bpp = [bpp for bpp in clustered_bpp_val if bpp.data['dgv'] == False]
for variant in variant_bpp:
variant.data['dgv'] = dgv_field
zhemingfan marked this conversation as resolved.
Show resolved Hide resolved
variant.data['known_sv_count'] = len(dgv_tracking_ids)


def get_pairing_state(
Expand Down
Loading