Skip to content

Commit

Permalink
Merge pull request #52 from yhoogstrate/breakpoint_entropy
Browse files Browse the repository at this point in the history
Breakpoint entropy
  • Loading branch information
yhoogstrate authored Apr 12, 2017
2 parents 6e06567 + e343f27 commit f746ff1
Show file tree
Hide file tree
Showing 27 changed files with 279 additions and 131 deletions.
4 changes: 4 additions & 0 deletions Changelog
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
2017-04-12 Youri Hoogstrate v0.6.0
* Changed behavior of pruning: less post-merging steps necessary
* Added two extra variables that can be used for classification

2017-04-05 Youri Hoogstrate v0.5.0
* Large improvements in parsing junctions from reads and much better
performance on circRNAs
Expand Down
105 changes: 98 additions & 7 deletions drdisco/IntronDecomposition.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,15 @@ def merge_frequency_tables(frequency_tables):
return new_frequency_table


def list_to_freq_table(as_list):
table = {}
for val in as_list:
if val not in table:
table[val] = 0
table[val] += 1
return table


def bam_parse_alignment_offset(cigartuple):
pos = 0
for chunk in cigartuple:
Expand Down Expand Up @@ -384,7 +393,11 @@ def __init__(self, _origin, _target):
self._types = {}
self._unique_alignment_hashes = {} # Used for determining entropy

def get_entropy(self):
# Used for determining entropy / rmsqd:
self._unique_breakpoints = {True: ({}, {}), # discordant mates: (origin , target)
False: ({}, {})} # other types: (origin, target)

def get_entropy_of_alignments(self):
"""Entropy is an important metric / propery of an edge
The assumption is that all reads should be 'different', i.e.
have a different start position, different end position,
Expand Down Expand Up @@ -438,6 +451,35 @@ def get_entropy(self):

return entropy(self._unique_alignment_hashes)

def get_distances_to_breakpoints(self, distances_of_discordant_mates):
"""
In the following case all (non-disco) breakpoints are nearly identical:
[==========> |
[========> |
[=========> |
[==========> |
[=========> |
and here quite variable:
[=========> |
[=======> |
[=========> |
[===========> |
[==========> |
Here we return the dist to the median of each of the values (0,0,0,0,0) for top example, (-1, -1, 0, 2, 2) for bottom
"""

medians = (self._origin.position.pos, self._target.position.pos)
dists = []
for i in [0, 1]:
for key in self._unique_breakpoints[distances_of_discordant_mates][i]:
dists += self._unique_breakpoints[distances_of_discordant_mates][i][key] * [medians[i] - key]

return dists

def get_complement(self):
try:
return self._target.edges[self._origin.position._hash]
Expand All @@ -450,6 +492,16 @@ def merge_edge(self, edge):
for alignment_key in edge._unique_alignment_hashes:
self.add_alignment_key(alignment_key)

def update_pos(pos, i, is_discordant_mates, n):
if pos not in self._unique_breakpoints[is_discordant_mates][i]:
self._unique_breakpoints[is_discordant_mates][i][pos] = 0
self._unique_breakpoints[is_discordant_mates][i][pos] += n

for key in [True, False]:
for i in [0, 1]:
for breakpoint in edge._unique_breakpoints[key][i]:
update_pos(breakpoint, i, key, edge._unique_breakpoints[key][i][breakpoint])

for _type in edge._types:
if _type != JunctionTypes.silent_mate:
self.add_type(_type, edge._types[_type])
Expand All @@ -466,6 +518,16 @@ def add_alignment_key(self, alignment_key):
else:
self._unique_alignment_hashes[alignment_key] = 1

def add_break_pos(self, pos1, pos2, is_discordant_mates):
def update_pos(pos, i):
if pos not in self._unique_breakpoints[is_discordant_mates][i]:
self._unique_breakpoints[is_discordant_mates][i][pos] = 1
else:
self._unique_breakpoints[is_discordant_mates][i][pos] += 1

update_pos(pos1.pos, 0)
update_pos(pos2.pos, 1)

def get_count(self, _type):
if _type in self._types:
return self._types[_type]
Expand Down Expand Up @@ -576,8 +638,9 @@ def insert_edge(self, pos1, pos2, _type, cigarstrs):
node2.insert_edge(edge)

edge.add_type(_type, 1)
if node1 == edge._origin: # Avoid double insertion of all keys :) only do it if the positions don't get swapped
if node1 == edge._origin and _type != JunctionTypes.cigar_splice_junction: # Avoid double insertion of all keys :) only do it if the positions don't get swapped
edge.add_alignment_key(cigarstrs)
edge.add_break_pos(pos1, pos2, (_type == JunctionTypes.discordant_mates))

def reinsert_edges(self, edges):
"""Only works for Edges of which the _origin and _target Node
Expand Down Expand Up @@ -657,7 +720,7 @@ def prune(self):
return candidates

def prune_edge(self, edge):
# # @ todo double check if this is strand specific
# @todo double check if this is strand specific

for edge_m in self.search_edges_between(edge):
d1 = edge._origin.position.get_dist(edge_m._origin.position, True)
Expand All @@ -674,8 +737,10 @@ def search_edges_between(self, edge_to_prune):
"""searches for other junctions in-between edge+insert size:"""
def pos_to_range(pos):
if pos.strand == STRAND_REVERSE:
return (pos.pos - MAX_ACCEPTABLE_INSERT_SIZE) - 1, pos.pos + MAX_ACCEPTABLE_ALIGNMENT_ERROR
# return (pos.pos - MAX_ACCEPTABLE_INSERT_SIZE) - 1, pos.pos + MAX_ACCEPTABLE_ALIGNMENT_ERROR
return (pos.pos - MAX_ACCEPTABLE_INSERT_SIZE) - 1, pos.pos + MAX_ACCEPTABLE_INSERT_SIZE
else:
# return pos.pos - MAX_ACCEPTABLE_ALIGNMENT_ERROR, (pos.pos + MAX_ACCEPTABLE_INSERT_SIZE) + 1
return pos.pos - MAX_ACCEPTABLE_ALIGNMENT_ERROR, (pos.pos + MAX_ACCEPTABLE_INSERT_SIZE) + 1

pos1, pos2 = edge_to_prune._origin.position, edge_to_prune._target.position
Expand Down Expand Up @@ -876,6 +941,29 @@ def classify_intronic_exonic(self, splice_junctions):
return self.xonic
self.xonic = 2

def get_breakpoint_stddev(self):
""" Calculates the variance of the breakpoints post merge - low values indicate consistent results"""
sq_dists = 0.0
n = 0
for edge in self.edges:
for dist in edge.get_distances_to_breakpoints(False):
sq_dists += dist ** 2
# dist * dist ? math.pow(dist,2)?
n += 1
if n > 0:
return math.sqrt(sq_dists / float(n))
else:
return 0.0

def get_breakpoint_disco_entropy(self):
""" Calculates the variance of the breakpoints post merge - low values indicate consistent results"""
as_list = []

for edge in self.edges:
as_list += edge.get_distances_to_breakpoints(True)

return entropy(list_to_freq_table(as_list))

def __str__(self):
"""Makes tabular output"""
node_a, node_b = self.edges[0]._origin, self.edges[0]._target
Expand All @@ -891,14 +979,16 @@ def __str__(self):
"%i\t%i\t%i\t%i\t"
"%i\t%i\t%i\t"
"%i\t%i\t"
"%s\t%s\t" # %.2f\t%.2f\t
"%.4f\t%.4f\t"
"%.4f\t%.4f\t"
"%s\n" % (node_a.position._chr, node_a.position.pos, strand_tt[self.edges[0]._origin.position.strand], # Pos-A
node_b.position._chr, node_b.position.pos, strand_tt[self.edges[0]._target.position.strand], # Pos-B
dist, ("valid" if self.discarded == [] else ','.join(self.discarded)), ("circular" if self.edges[0].is_circular() else "linear"), x_onic_tt[self.xonic], # Classification status
self.total_score / 2, self.total_clips, self.get_n_split_reads() / 2, self.get_n_discordant_reads() / 2, # Evidence stats
len(self.edges), nodes_a, nodes_b, # Edges and nodes stats
len(self.left_splice_junctions), len(self.right_splice_junctions),
self.edges[0].get_entropy(), self.get_overall_entropy(), # Entropy stats
self.edges[0].get_entropy_of_alignments(), self.get_overall_entropy(), # Entropy stats
self.get_breakpoint_stddev(), self.get_breakpoint_disco_entropy(),
"&".join([str(edge) for edge in self.edges]))) # Data structure

def get_overall_entropy(self):
Expand Down Expand Up @@ -1542,6 +1632,7 @@ def __str__(self):
"n-edges" "\t" "n-nodes-A" "\t" "n-nodes-B" "\t"
"n-splice-junc-A" "\t" "n-splice-junc-B" "\t"
"entropy-bp-edge" "\t" "entropy-all-edges" "\t"
"bp-pos-stddev" "\t" "entropy-disco-bps" "\t"
"data-structure" "\n"
"%s" % (''.join([str(subnet) for subnet in ordered])))

Expand Down Expand Up @@ -1710,7 +1801,7 @@ def filter_subnets(self, subnets):
"""Total of 8 reads is minimum, of which 2 must be
discordant and the entropy must be above 0.55"""

entropy = subnet.edges[0].get_entropy()
entropy = subnet.edges[0].get_entropy_of_alignments()
if entropy < MIN_SUBNET_ENTROPY:
subnet.discarded.append("entropy=" + str(entropy) + '<' + str(MIN_SUBNET_ENTROPY))

Expand Down
2 changes: 1 addition & 1 deletion drdisco/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
gmail dot com
"""

__version_info__ = ('0', '5', '0')
__version_info__ = ('0', '6', '0')
__version__ = '.'.join(__version_info__) if (len(__version_info__) == 3) else '.'.join(__version_info__[0:3]) + "-" + __version_info__[3]
__author__ = 'Youri Hoogstrate'
__homepage__ = 'https://github.com/yhoogstrate/dr-disco'
Expand Down
4 changes: 2 additions & 2 deletions tests/detect-intronic/test_01.out.dbed
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
chr-A pos-A direction-A chr-B pos-B direction-B genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges data-structure
chr21 39877811 - chr21 42873374 + 2995563 n_discordant_reads=0<1,n_support=3<8 linear intronic 9 6 3 0 1 1 1 0 0 1.0 1.0 chr21:39877811/39877812(-)->chr21:42873374/42873375(+):(spanning_paired_1_t:3,spanning_paired_2_t:3)
chr-A pos-A direction-A chr-B pos-B direction-B genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps data-structure
chr21 39877811 - chr21 42873374 + 2995563 n_discordant_reads=0<1,n_support=3<8 linear intronic 9 6 3 0 1 1 1 0 0 1.0000 1.0000 0.0000 0.0000 chr21:39877811/39877812(-)->chr21:42873374/42873375(+):(spanning_paired_1_t:3,spanning_paired_2_t:3)
4 changes: 2 additions & 2 deletions tests/detect-intronic/test_02.out.dbed
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
chr-A pos-A direction-A chr-B pos-B direction-B genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges data-structure
chr21 39877811 - chr21 42873374 + 2995563 valid linear intronic 21 12 6 3 1 1 1 0 0 0.929896694048 0.929896694048 chr21:39877811/39877812(-)->chr21:42873374/42873375(+):(discordant_mates:6,spanning_paired_1:2,spanning_paired_1_t:4,spanning_paired_2:2,spanning_paired_2_t:4)
chr-A pos-A direction-A chr-B pos-B direction-B genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps data-structure
chr21 39877811 - chr21 42873374 + 2995563 valid linear intronic 21 12 6 3 1 1 1 0 0 0.9299 0.9299 0.0000 1.0000 chr21:39877811/39877812(-)->chr21:42873374/42873375(+):(discordant_mates:6,spanning_paired_1:2,spanning_paired_1_t:4,spanning_paired_2:2,spanning_paired_2_t:4)
4 changes: 2 additions & 2 deletions tests/detect-intronic/test_03.out.dbed
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
chr-A pos-A direction-A chr-B pos-B direction-B genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges data-structure
chr21 39817544 - chr21 42880007 + 3062463 n_discordant_reads=0<1,n_support=5<8 linear intronic 15 10 5 0 1 1 1 0 0 0.827729376771 0.827729376771 chr21:39817544/39817545(-)->chr21:42880007/42880008(+):(spanning_paired_1:5,spanning_paired_2:5)
chr-A pos-A direction-A chr-B pos-B direction-B genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps data-structure
chr21 39817544 - chr21 42880007 + 3062463 n_discordant_reads=0<1,n_support=5<8 linear intronic 15 10 5 0 1 1 1 0 0 0.8277 0.8277 0.0000 0.0000 chr21:39817544/39817545(-)->chr21:42880007/42880008(+):(spanning_paired_1:5,spanning_paired_2:5)
4 changes: 2 additions & 2 deletions tests/detect-intronic/test_04.out.dbed
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
chr-A pos-A direction-A chr-B pos-B direction-B genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges data-structure
chr21 39817544 - chr21 42880007 + 3062463 n_discordant_reads=0<2,n_support=7<12 linear intronic 19 14 7 0 2 1 2 0 0 0.827729376771 0.796453035938 chr21:39817544/39817545(-)->chr21:42880007/42880008(+):(spanning_paired_1:5,spanning_paired_2:5)&chr21:39817544/39817545(-)->chr21:42879876/42879877(+):(spanning_singleton_1_r:2,spanning_singleton_2_r:2)
chr-A pos-A direction-A chr-B pos-B direction-B genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps data-structure
chr21 39817544 - chr21 42880007 + 3062463 n_discordant_reads=0<2,n_support=7<12 linear intronic 19 14 7 0 2 1 2 0 0 0.8277 0.7965 0.0000 0.0000 chr21:39817544/39817545(-)->chr21:42880007/42880008(+):(spanning_paired_1:5,spanning_paired_2:5)&chr21:39817544/39817545(-)->chr21:42879876/42879877(+):(spanning_singleton_1_r:2,spanning_singleton_2_r:2)
4 changes: 2 additions & 2 deletions tests/detect-intronic/test_05.out.dbed
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
chr-A pos-A direction-A chr-B pos-B direction-B genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges data-structure
chr21 39817544 - chr21 42880007 + 3062463 n_discordant_reads=0<2 linear exonic 247 170 85 0 5 2 4 1 0 0.811127845003 0.836571490862 chr21:39817544/39817545(-)->chr21:42880007/42880008(+):(spanning_paired_1:65,spanning_paired_2:65,spanning_singleton_1:2,spanning_singleton_1_r:3,spanning_singleton_2:2,spanning_singleton_2_r:3)&chr21:39817544/39817545(-)->chr21:42879876/42879877(+):(spanning_paired_1:6,spanning_paired_1_t:1,spanning_paired_2:6,spanning_paired_2_t:1,spanning_singleton_1_r:2,spanning_singleton_2_r:2)&chr21:39817544/39817545(-)->chr21:42878371/42878372(+):(spanning_paired_1:1,spanning_paired_1_t:1,spanning_paired_2:1,spanning_paired_2_t:1,spanning_singleton_1_r:1,spanning_singleton_2_r:1)&chr21:39846044/39846045(-)->chr21:42879876/42879877(+):(spanning_paired_1:2,spanning_paired_2:2)&chr21:39817544/39817545(-)->chr21:42876293/42876294(+):(spanning_paired_1:1,spanning_paired_2:1)
chr-A pos-A direction-A chr-B pos-B direction-B genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps data-structure
chr21 39817544 - chr21 42880007 + 3062463 n_discordant_reads=0<2 linear exonic 247 170 85 0 5 2 4 1 0 0.8111 0.8366 0.0000 0.0000 chr21:39817544/39817545(-)->chr21:42880007/42880008(+):(spanning_paired_1:65,spanning_paired_2:65,spanning_singleton_1:2,spanning_singleton_1_r:3,spanning_singleton_2:2,spanning_singleton_2_r:3)&chr21:39817544/39817545(-)->chr21:42879876/42879877(+):(spanning_paired_1:6,spanning_paired_1_t:1,spanning_paired_2:6,spanning_paired_2_t:1,spanning_singleton_1_r:2,spanning_singleton_2_r:2)&chr21:39817544/39817545(-)->chr21:42878371/42878372(+):(spanning_paired_1:1,spanning_paired_1_t:1,spanning_paired_2:1,spanning_paired_2_t:1,spanning_singleton_1_r:1,spanning_singleton_2_r:1)&chr21:39846044/39846045(-)->chr21:42879876/42879877(+):(spanning_paired_1:2,spanning_paired_2:2)&chr21:39817544/39817545(-)->chr21:42876293/42876294(+):(spanning_paired_1:1,spanning_paired_2:1)
4 changes: 2 additions & 2 deletions tests/detect-intronic/test_08.out.dbed
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
chr-A pos-A direction-A chr-B pos-B direction-B genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges data-structure
chr21 39817544 - chr21 42880007 + 3062463 valid linear intronic 24 14 7 5 2 1 2 0 0 0.827729376771 0.907019018116 chr21:39817544/39817545(-)->chr21:42880007/42880008(+):(spanning_paired_1:5,spanning_paired_2:5)&chr21:39817544/39817545(-)->chr21:42879876/42879877(+):(discordant_mates:10,spanning_singleton_1_r:2,spanning_singleton_2_r:2)
chr-A pos-A direction-A chr-B pos-B direction-B genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps data-structure
chr21 39817544 - chr21 42880007 + 3062463 valid linear intronic 24 14 7 5 2 1 2 0 0 0.8277 0.9070 0.0000 0.9398 chr21:39817544/39817545(-)->chr21:42880007/42880008(+):(spanning_paired_1:5,spanning_paired_2:5)&chr21:39817544/39817545(-)->chr21:42879876/42879877(+):(discordant_mates:10,spanning_singleton_1_r:2,spanning_singleton_2_r:2)
2 changes: 1 addition & 1 deletion tests/detect-intronic/test_09.out.dbed
Original file line number Diff line number Diff line change
@@ -1 +1 @@
chr-A pos-A direction-A chr-B pos-B direction-B genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges data-structure
chr-A pos-A direction-A chr-B pos-B direction-B genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps data-structure
2 changes: 1 addition & 1 deletion tests/detect-intronic/test_10.out.dbed
Original file line number Diff line number Diff line change
@@ -1 +1 @@
chr-A pos-A direction-A chr-B pos-B direction-B genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges data-structure
chr-A pos-A direction-A chr-B pos-B direction-B genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps data-structure
4 changes: 2 additions & 2 deletions tests/detect-intronic/test_11.out.dbed
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
chr-A pos-A direction-A chr-B pos-B direction-B genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges data-structure
chr7 151970252 + chr21 39777764 - inf entropy=0.0<0.8,n_discordant_reads=0<1,n_support=1<8 linear intronic 3 2 1 0 1 1 1 0 0 0.0 0.0 chr7:151970252/151970253(+)->chr21:39777764/39777765(-):(spanning_paired_1_t:1,spanning_paired_2_t:1)
chr-A pos-A direction-A chr-B pos-B direction-B genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps data-structure
chr7 151970252 + chr21 39777764 - inf entropy=0.0<0.8,n_discordant_reads=0<1,n_support=1<8 linear intronic 3 2 1 0 1 1 1 0 0 0.0000 0.0000 0.0000 0.0000 chr7:151970252/151970253(+)->chr21:39777764/39777765(-):(spanning_paired_1_t:1,spanning_paired_2_t:1)
Loading

0 comments on commit f746ff1

Please sign in to comment.