Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change support for MAVIS input in summary module #336

Merged
merged 18 commits into from
Nov 9, 2022
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 42 additions & 20 deletions src/mavis/annotate/file_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import json
import os
import re
from typing import Callable, Dict, List, Optional
from typing import Callable, Dict, List, Optional, TYPE_CHECKING

import pandas as pd
from Bio import SeqIO
Expand All @@ -13,14 +13,20 @@
from ..constants import CODON_SIZE, GIEMSA_STAIN, START_AA, STOP_AA, STRAND, translate
from ..interval import Interval
from ..types import ReferenceAnnotations, ReferenceGenome
from ..util import logger
from ..util import logger, read_bpp_from_input_file
from .base import BioInterval, ReferenceName
from .genomic import Exon, Gene, PreTranscript, Template, Transcript
from .protein import Domain, Translation

if TYPE_CHECKING:
from ..breakpoint import Breakpoint, BreakpointPair

def load_masking_regions(*filepaths: str) -> Dict[str, List[BioInterval]]:

def load_known_sv(*filepaths: str) -> Dict[str, List["BreakpointPair"]]:
"""
loads a standard MAVIS or BED file input to a ist of known breakpoints.
creisle marked this conversation as resolved.
Show resolved Hide resolved

Standard BED file requirements:
reads a file of regions. The expect input format for the file is tab-delimited and
the header should contain the following columns

Expand All @@ -35,26 +41,42 @@ def load_masking_regions(*filepaths: str) -> Dict[str, List[BioInterval]]:

#chr start end name
chr20 25600000 27500000 centromere

Args:
filepath: path to the input tab-delimited file
filepath: path to standard MAVIS format file
Returns:
a dictionary keyed by chromosome name with values of lists of regions on the chromosome
a dictionary with {(bp1_chr,bp2_chr):{BreakpointPair}}
creisle marked this conversation as resolved.
Show resolved Hide resolved
"""
regions: Dict[str, List[BioInterval]] = {}
regions = {}
for filepath in filepaths:
df = pd.read_csv(
filepath, sep='\t', dtype={'chr': str, 'start': int, 'end': int, 'name': str}
)
for col in ['chr', 'start', 'end', 'name']:
if col not in df:
raise KeyError(f'missing required column ({col})')
df['chr'] = df['chr'].apply(lambda c: ReferenceName(c))
for row in df.to_dict('records'):
mask_region = BioInterval(
reference_object=row['chr'], start=row['start'], end=row['end'], name=row['name']
header = set(pd.read_csv(filepath, nrows=1, sep='\t').columns)
mavis_header = {'break1_chromosome', 'break2_chromosome'}
bed_header = {'chr', 'start', 'end', 'name'}
if mavis_header.issubset(header):
bpps = read_bpp_from_input_file(filepath, expand_orient=True, expand_svtype=True)
for bpp in bpps:
chr_list = [bpp.break1.chr, bpp.break2.chr]
regions.setdefault(tuple(chr_list), []).append(bpp)

elif bed_header.issubset(header):
logger.warning('BED file support will be deprecated in future versions')
creisle marked this conversation as resolved.
Show resolved Hide resolved
df = pd.read_csv(
filepath, sep='\t', dtype={'chr': str, 'start': int, 'end': int, 'name': str}
)
regions.setdefault(mask_region.reference_object, []).append(mask_region)
for col in bed_header:
if col not in df:
creisle marked this conversation as resolved.
Show resolved Hide resolved
raise KeyError(f'missing required column ({col})')
df['chr'] = df['chr'].apply(lambda c: ReferenceName(c))
for row in df.to_dict('records'):
known_sv_region = BioInterval(
reference_object=row['chr'],
start=row['start'],
end=row['end'],
name=row['name'],
)
regions.setdefault(known_sv_region.reference_object, []).append(known_sv_region)
else:
logger.warning('No known SV inputs were loaded.')
return {}
return regions


Expand Down Expand Up @@ -344,9 +366,9 @@ class ReferenceFile:
LOAD_FUNCTIONS: Dict[str, Optional[Callable]] = {
'annotations': load_annotations,
'reference_genome': load_reference_genome,
'masking': load_masking_regions,
'masking': load_known_sv,
creisle marked this conversation as resolved.
Show resolved Hide resolved
'template_metadata': load_templates,
'dgv_annotation': load_masking_regions,
'dgv_annotation': load_known_sv,
'aligner_reference': None,
}
"""dict: Mapping of file types (based on ENV name) to load functions"""
Expand Down
4 changes: 4 additions & 0 deletions src/mavis/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,7 @@ class COLUMNS(MavisNamespace):
library: str = 'library'
cluster_id: str = 'cluster_id'
cluster_size: str = 'cluster_size'
dgv: str = 'dgv'
validation_id: str = 'validation_id'
annotation_id: str = 'annotation_id'
product_id: str = 'product_id'
Expand Down Expand Up @@ -463,6 +464,7 @@ class COLUMNS(MavisNamespace):
contig_strand_specific: str = 'contig_strand_specific'
contigs_assembled: str = 'contigs_assembled'
call_sequence_complexity: str = 'call_sequence_complexity'
known_sv_count: str = 'known_sv_count'
spanning_reads: str = 'spanning_reads'
spanning_read_names: str = 'spanning_read_names'
flanking_median_fragment_size: str = 'flanking_median_fragment_size'
Expand Down Expand Up @@ -555,4 +557,6 @@ def sort_columns(input_columns):
COLUMNS.tools,
COLUMNS.tools,
COLUMNS.tracking_id,
COLUMNS.dgv,
COLUMNS.known_sv_count,
}
11 changes: 9 additions & 2 deletions src/mavis/pairing/pairing.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,9 +154,12 @@ def comparison_distance(
return max_distance


def equivalent(event1: BreakpointPair, event2: BreakpointPair, distances=None) -> bool:
def equivalent(
event1: BreakpointPair, event2: BreakpointPair, distances=None, matching_event_type=True
) -> bool:
"""
compares two events by breakpoint position to see if they are equivalent
creisle marked this conversation as resolved.
Show resolved Hide resolved
matching_event_type specificies whether event type must match
"""

max_distance = comparison_distance(event1, event2, distances)
Expand All @@ -175,10 +178,14 @@ def equivalent(event1: BreakpointPair, event2: BreakpointPair, distances=None) -
[
abs(Interval.dist(event1.break1, event2.break1)) > max_distance,
abs(Interval.dist(event1.break2, event2.break2)) > max_distance,
event1.data[COLUMNS.event_type] != event2.data[COLUMNS.event_type],
]
):
return False
if (
matching_event_type == True
and event1.data[COLUMNS.event_type] != event2.data[COLUMNS.event_type]
creisle marked this conversation as resolved.
Show resolved Hide resolved
):
return False
return True


Expand Down
7 changes: 4 additions & 3 deletions src/mavis/summary/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,16 +276,17 @@ def main(inputs: List[str], output: str, config: Dict, start_time=int(time.time(
COLUMNS.cdna_synon,
COLUMNS.net_size,
COLUMNS.assumed_untemplated,
'dgv',
COLUMNS.dgv,
COLUMNS.known_sv_count,
}

rows = []
for lib in bpps_by_library:
logger.info(f'annotating dgv for {lib}')
if not dgv_annotation.is_empty():
annotate_dgv(
bpps_by_library[lib], dgv_annotation.content, distance=10
) # TODO make distance a parameter
bpps_by_library[lib], dgv_annotation.content, config['summary.cluster_radius']
)
logger.info(f'adding pairing states for {lib}')
for row in bpps_by_library[lib]:
# in case no pairing was done, add default (applicable to single library summaries)
Expand Down
119 changes: 84 additions & 35 deletions src/mavis/summary/summary.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
from typing import Dict, List
from typing import Mapping, Dict, List, Tuple

from ..annotate.genomic import Transcript
from ..annotate.genomic import BioInterval, Transcript
from ..breakpoint import Breakpoint, BreakpointPair
from ..constants import CALL_METHOD, COLUMNS, DISEASE_STATUS, PROTOCOL, SVTYPE
from ..interval import Interval
from ..pairing.pairing import pair_by_distance, product_key
from ..pairing.pairing import equivalent, pair_by_distance, product_key
from ..util import get_connected_components
from .constants import PAIRING_STATE
from ..cluster.cluster import merge_breakpoint_pairs
creisle marked this conversation as resolved.
Show resolved Hide resolved
from itertools import chain
creisle marked this conversation as resolved.
Show resolved Hide resolved


def filter_by_annotations(bpp_list: List[BreakpointPair], best_transcripts: Dict[str, Transcript]):
Expand Down Expand Up @@ -190,45 +192,92 @@ def group_by_distance(calls, distances):
return grouped_calls, removed_calls


def annotate_dgv(bpps, dgv_regions_by_reference_name, distance=0):
def annotate_dgv(
creisle marked this conversation as resolved.
Show resolved Hide resolved
bpps: List[BreakpointPair],
dgv_regions_by_reference_name: Dict[Tuple[str], List[BreakpointPair]],
input_cluster_radius,
):
"""
given a list of bpps and a dgv reference, annotate the events that are within the set distance of both breakpoints
Given a list of breakpoint pairs (bpps) and bpps from another reference set), annotate the events that are within the set distance of both breakpoints

Args:
bpps (list) : the list of BreakpointPair objects
dgv_regions_by_reference_name (dict) : the dgv reference regions file loaded by load_masking_regions
distance (int) : the minimum distance required to match a dgv event with a breakpoint
dgv_regions_by_reference_name (dict) : tuple of (break1_chr,break2_chr) and its associated list of BreakpointPair/BioInterval objects specified by the MAVIS input file
cluster_radius (int) : Distance used in matching input SVs to reference SVs through clusterind, defined by summary.cluster_radius in the configuration file
"""
for chrom in dgv_regions_by_reference_name:
dgv_regions_by_reference_name[chrom] = sorted(
dgv_regions_by_reference_name[chrom], key=lambda x: x.start
)

lowest_resolution = max([len(b.break1) for b in bpps]) # only need start res

# only look at the bpps that dgv events could pair to, Intrachromosomal
for bpp in [
b for b in bpps if not b.interchromosomal and b.break1.chr in dgv_regions_by_reference_name
]:
bpp.data['dgv'] = []
for dgv_region in dgv_regions_by_reference_name[bpp.break1.chr]:
dist = abs(Interval.dist(Interval(dgv_region.start), bpp.break1))
if dist > lowest_resolution + distance:
continue
elif (
dist > distance
or abs(Interval.dist(Interval(dgv_region.end), bpp.break2)) > distance
):
continue
refname = dgv_region.reference_object
try:
refname = dgv_region.reference_object.name
except AttributeError:
pass
bpp.data['dgv'].append(
'{}({}:{}-{})'.format(dgv_region.name, refname, dgv_region.start, dgv_region.end)
if isinstance(list(dgv_regions_by_reference_name.values())[0][0], BreakpointPair):
bpps_copy = bpps.copy()
creisle marked this conversation as resolved.
Show resolved Hide resolved
for variant in bpps:
variant.data['dgv'] = False
creisle marked this conversation as resolved.
Show resolved Hide resolved
variant.data['known_sv_count'] = 0
dgv_flattened = list(chain(*dgv_regions_by_reference_name.values()))
for variant in dgv_flattened:
variant.data['dgv'] = True
variant.data['known_sv_count'] = 0
bpps_copy.extend(dgv_flattened)

clusters = merge_breakpoint_pairs(
bpps_copy, cluster_radius=input_cluster_radius
) # all breakpoint pairs (bpps + dgv)

flag_col = '_is_variant'
for variant in bpps:
variant.data[flag_col] = True
variant.data[COLUMNS.dgv] = ''
variant.data[COLUMNS.known_sv_count] = 0

for clustered_bpp_key, clustered_bpp_val in clusters.items():
dgv_tracking_ids = [
bpp.tracking_id for bpp in clustered_bpp_val if bpp.data.get(flag_col) != True
]
dgv_field = ';'.join(sorted(dgv_tracking_ids))
variant_bpp = [bpp for bpp in clustered_bpp_val if bpp.data.get(flag_col) == True]
for variant in variant_bpp:
variant.data[COLUMNS.dgv] = dgv_field
variant.data[COLUMNS.known_sv_count] = len(dgv_tracking_ids)

for bpp in bpps:
if flag_col in bpp.data:
del bpp.data[flag_col]

elif isinstance(list(dgv_regions_by_reference_name.values())[0][0], BioInterval):
creisle marked this conversation as resolved.
Show resolved Hide resolved
for chrom in dgv_regions_by_reference_name:
dgv_regions_by_reference_name[chrom] = sorted(
dgv_regions_by_reference_name[chrom], key=lambda x: x.start
)
bpp.data['dgv'] = ';'.join(bpp.data['dgv'])
lowest_resolution = max([len(b.break1) for b in bpps]) # only need start res
for bpp in bpps:
bpp.data[COLUMNS.dgv] = []
bpp.data[COLUMNS.known_sv_count] = 0
# only look at the bpps that dgv events could pair to, Intrachromosomal
for bpp in [
b
for b in bpps
if not b.interchromosomal and b.break1.chr in dgv_regions_by_reference_name
]:
for dgv_region in dgv_regions_by_reference_name[bpp.break1.chr]:
dist = abs(Interval.dist(Interval(dgv_region.start), bpp.break1))
if dist > lowest_resolution + input_cluster_radius:
continue
elif (
dist > input_cluster_radius
or abs(Interval.dist(Interval(dgv_region.end), bpp.break2))
> input_cluster_radius
):
continue
refname = dgv_region.reference_object
try:
refname = dgv_region.reference_object.name
except AttributeError:
pass
bpp.data[COLUMNS.dgv].append(
'{}({}:{}-{})'.format(
dgv_region.name, refname, dgv_region.start, dgv_region.end
)
)
bpp.data[COLUMNS.known_sv_count] = len(bpp.data[COLUMNS.dgv])
bpp.data[COLUMNS.dgv] = ';'.join(bpp.data[COLUMNS.dgv])


def get_pairing_state(
Expand Down
Loading