Skip to content

Commit

Permalink
Merge pull request #348 from CGATOxford/KB_extra_plots_IDR
Browse files Browse the repository at this point in the history
pep8
  • Loading branch information
sebastian-luna-valero authored Jul 27, 2017
2 parents 87a4362 + 26f7d8c commit f927b1c
Showing 1 changed file with 83 additions and 1 deletion.
84 changes: 83 additions & 1 deletion CGATPipelines/pipeline_peakcalling.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,10 @@
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec


#########################################################################
Expand Down Expand Up @@ -891,7 +895,7 @@ def estimateInsertSize(infile, outfile):
Output is stored in insert_size.tsv
'''

PipelinePeakcalling.estimateInsertSize(infile,
PipelinePeakcalling.estimateInsertSize(infile,
outfile,
PARAMS['paired_end'],
PARAMS['insert_alignments'],
Expand Down Expand Up @@ -1502,13 +1506,91 @@ def findOptimalPeaks(infile, outfiles):
i += 1


@merge(runIDR, ['IDR.dir/self_consistency.png',
'IDR.dir/replicate_consistency.png',
'IDR.dir/pooled_consistency.png'])
def plotIDR(infiles, outfiles):
'''
Generates plots showing the correlation between the ranking in the
two subsets for self consistency, replicate consistency and
pooled consistency datasets.
Ideally the plots should show a strong correlation for highly ranked
peaks and the blue section (which represents the peaks which pass IDR)
should represent the part of the graph where the correlation is strong.
'''
sns.set_style('ticks')

infiles = np.array(infiles)
self_c = infiles[['pseudo' in p and 'pooled' not in p for p in infiles]]
pooled_c = infiles[['pseudo' in p and 'pooled' in p for p in infiles]]
replicate_c = infiles[['pseudo' not in p and 'pooled' not in p
for p in infiles]]
cols = ['chrom', 'chromStart', 'chromEnd', 'name',
'score', 'strand', 'signalValue', 'p-value',
'q-value', 'summit', 'localIDR', 'globalIDR',
'rep1_chromStart', 'rep1_chromEnd', 'rep1_signalValue',
'rep1_summit', 'rep2_chromStart', 'rep2_chromEnd',
'rep2_signalValue', 'rep2_summit']

i = 0

for L in self_c, replicate_c, pooled_c:
if i == 0:
thresh = PARAMS['IDR_softthresh_selfconsistency']
label = 'Self'
elif i == 1:
thresh = PARAMS['IDR_softthresh_replicateconsistency']
label = 'Replicate'
elif i == 2:
thresh = PARAMS['IDR_softthresh_pooledconsistency']
label = 'Pooled'

idr_score_threshold = -math.log(thresh, 10)
nrows = math.ceil(len(L) / 4)
ncols = 4
fig = plt.figure(figsize=(ncols*3, nrows*3))
f = gridspec.GridSpec(nrows, ncols)
j = 0
row = 0
col = 0
for item in L:
a = plt.subplot(f[row, col])
tab = pd.read_csv(item, sep="\t", names=cols)
tab['sig'] = tab['globalIDR'] >= idr_score_threshold
colours = ['#0571b0' if x else '#404040' for x in tab['sig']]
tab1 = tab.sort_values('rep1_signalValue', ascending=False)
tab2 = tab.sort_values('rep2_signalValue', ascending=False)
tab1['rank1'] = range(len(tab1))
tab2['rank2'] = range(len(tab2))
tab = tab.merge(tab1).merge(tab2)
a.scatter(tab['rank1'], tab['rank2'],
s=0.1, alpha=1, color=colours)
sns.despine()
title = item.replace("_v_", "\n")
title = title.replace(".macs2", "")
title = title.replace("_filtered", "").replace("IDR.dir/", "")
a.set_title(title, fontsize=8)
a.set_xlabel('rank subset 1', fontsize=8)
a.set_ylabel('rank subset 2', fontsize=8)
col += 1
if col == ncols:
col = 0
row += 1
fig.suptitle('''Correlation Between Ranks in Pairs of Subsets: %(label)s Consistency\nblue = peak passing IDR, grey = peak not passing IDR''' % locals())
fig.tight_layout(rect=[0, 0.03, 1, 0.95])
fig.savefig(outfiles[i])
i += 1


@active_if(IDR_ON)
@follows(loadIDRPairs)
@follows(loadIDRsummary)
@follows(loadIDRQC)
@follows(findConservativePeaks)
@follows(findOptimalPeaks)
@follows(runIDRQC)
@follows(plotIDR)
def IDR():
pass

Expand Down

0 comments on commit f927b1c

Please sign in to comment.