Skip to content

Commit

Permalink
added flag to remove global transcriptome filtering, global-cutoff
Browse files Browse the repository at this point in the history
  • Loading branch information
Gabriel Pratt committed Nov 2, 2012
1 parent 7d8180a commit 4412085
Show file tree
Hide file tree
Showing 4 changed files with 245 additions and 90 deletions.
1 change: 1 addition & 0 deletions clipper/src/CLIP_Analysis_Display.py
Original file line number Diff line number Diff line change
Expand Up @@ -695,6 +695,7 @@ def motif_boxplots(kmerloc, filename, klengths, highlight_motifs, subplot=None):
def get_motif_distance(clusters, motif, slop=500):

"""
TODO: This shouldn't be in the visualization side of things, need to factor out
Compares two bed files and computes distance from center of first (indicated by bed12)
Expand Down
38 changes: 20 additions & 18 deletions clipper/src/CLIP_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import pysam
import CLIP_Analysis_Display
import pylab
from kmerdiff import kmerdiff

def intersection(A, B=None):

Expand Down Expand Up @@ -88,7 +89,7 @@ def build_AS_STRUCTURE_dict(species, dir):
Important return values:
PArses out all important AS structure - see constructed dict in function
Parses out all important AS structure - see constructed dict in function
for information on what is needed...
GTypes - number of each exon type (used for getting background number
Expand Down Expand Up @@ -811,25 +812,26 @@ def main(options):
Gtype_count = [Gtypes["CE:"], Gtypes["SE:"], Gtypes["MXE:"], Gtypes["A5E:"], Gtypes["A3E:"]]

### write fasta files and run homer and/or kmer analysis if at least one analysis is requested
#runs kmer and homer analysis
#runs kmer and homer analysis

kmer_results = {}
if options.reMotif is True:

for region in all_regions:
try:
#reads nicely named files
real_fa = fa_file(clusters, region=region, fd = fastadir, type="real")
rand_fa = fa_file(clusters, region=region, fd = fastadir, type="random")
if options.k is not None:
if options.homer is True:
region_homer_out = os.path.join(homerout, region)
run_homer(real_fa, rand_fa, options.k, outloc=region_homer_out)
for k in options.k:
kmerfile = clusters + ".k" + str(k) + "." + region + ".kmerdiff"
kmerfile = os.path.join(kmerout, kmerfile)
kmer_sorted_output = run_kmerdiff(real_fa, rand_fa, outfile=kmerfile, k=k)
except:
continue

#reads nicely named files
real_fa = fa_file(clusters, region=region, fd = fastadir, type="real")
rand_fa = fa_file(clusters, region=region, fd = fastadir, type="random")
if options.k is not None:
if options.homer is True:
region_homer_out = os.path.join(homerout, region)
run_homer(real_fa, rand_fa, options.k, outloc=region_homer_out)
for k in options.k:
kmer_results[k] = {}
kmer_results[k][region] = kmerdiff(real_fa, rand_fa, k)
kmerfile = clusters + ".k" + str(k) + "." + region + ".kmerdiff"
kmerfile = os.path.join(kmerout, kmerfile)
kmer_sorted_output = run_kmerdiff(real_fa, rand_fa, outfile=kmerfile, k=k)


#all the different motifs that the user specifices
motifs = list(options.motif)
Expand Down
204 changes: 134 additions & 70 deletions clipper/src/peakfinder.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import time
import pybedtools
import gzip
#import pp
import math
import multiprocessing
import clipper
Expand All @@ -20,6 +19,7 @@
#define verbose printing here for test cases
#get rid of this and switch to logging
global varboseprint

def verboseprint(*args):

"""
Expand Down Expand Up @@ -67,7 +67,6 @@ def check_for_index(bamfile):
bamfile - a path to a bam file
Returns 1
TODO make it so a failaure returns 0
"""

Expand Down Expand Up @@ -231,7 +230,7 @@ def build_transcript_data(species, gene_bed, gene_mrna, gene_pre_mrna, pre_mrna)
acceptable_species = get_acceptable_species()
if (species is None and
gene_bed is None and
(gene_mrna is None or gene_premrna is None)):
(gene_mrna is None or gene_pre_mrna is None)):

raise ValueError("You must set either \"species\" or \"geneBed\"+\"geneMRNA\"+\"genePREMRNA\"")

Expand Down Expand Up @@ -265,6 +264,126 @@ def build_transcript_data(species, gene_bed, gene_mrna, gene_pre_mrna, pre_mrna)

return genes, lengths



def transcriptome_filter(poisson_cutoff, transcriptome_size, transcriptome_reads, cluster):

"""
filters each cluster by if it passes a transciptome wide cutoff or not, returns true if it passes
transcriptome cutoff, false if not
poisson_cutoff - float,user set cutoff
transcriptome_size - int number of genes in transcriptome
transcritpmoe_reads - int total number of reads analized
cluster - dict, stats about the cluster we are analizing {'Nreads' : int, 'size' : int}
"""

transcriptome_p = poissonP(transcriptome_reads,
cluster['Nreads'],
transcriptome_size,
cluster['size'])

if math.isnan(transcriptome_p):
verboseprint("""Transcriptome P is NaN, transcriptome_reads = %d,
cluster reads = %d, transcriptome_size = %d,
cluster_size = %d""" % (transcriptome_reads, cluster['Nreads'], transcriptome_size, cluster['size']))
return False

if transcriptome_p > poisson_cutoff:
print """%s\n Failed Transcriptome cutoff with %s reads,
pval: %s""" % (cluster,
cluster['Nreads'],
transcriptome_p)

return False

return True


def count_transcriptome_reads(results):

"""
Counts number of reads in the entire transcriptome
results -- the result returned back by call_peaks
returns int, the number of reads in the transcriptome
"""
#count total number of reads in transcriptiome
transcriptome_reads = 0
#print >> sys.stderr, results
for gene_result in results:
if gene_result is not None:
verboseprint("nreads", gene_result['nreads'])
transcriptome_reads += gene_result['nreads']


return transcriptome_reads

def filter_results(results, poisson_cutoff, transcriptome_size, transcriptome_reads, global_cutoff):

"""
Takes a list of results, filters them based off of various argunments and returns only the filtered
reads
options - the options object from the initial parsing
poisson_cutoff - user defined possion cutoff (also from options) that filters reads
results - list of results generated by call_peaks
transcriptome_size - number of genes there are in the transcriptome
"""

#combine results
allpeaks = set([])

for gene_result in results:

#alert user that there aren't any clusters for specific gene
if gene_result['clusters'] is None:
print >> sys.stderr, gene_result, "no clusters"


for cluster_id, cluster in gene_result['clusters'].items():
meets_cutoff = True
try:

if global_cutoff:
meets_cutoff = meets_cutoff and transcriptome_filter(poisson_cutoff,
transcriptome_size,
transcriptome_reads,
cluster)

#should factor out this as well, but I'll leave it be until nessessary
#does SlOP always get used? it looks like it does
corrected_SloP_pval = gene_result['clusters'][cluster_id]['SloP']
corrected_gene_pval = gene_result['clusters'][cluster_id]['GeneP']
min_pval = min([corrected_SloP_pval, corrected_gene_pval])

if not (min_pval < poisson_cutoff):
verboseprint("Failed Gene Pvalue: %s and failed SloP Pvalue: %s for cluster_id %s" % (corrected_gene_pval, corrected_SloP_pval, cluster_id))
meets_cutoff = False

if meets_cutoff:
#print >> sys.stderr, cluster_id, cluster
chrom, g_start, g_stop, peak_name, geneP, signstrand, thick_start, thick_stop = cluster_id.split("\t")

#adds beadline to total peaks that worked
allpeaks.add("%s\t%d\t%d\t%s\t%s\t%s\t%d\t%d" % (chrom, int(g_start), int(g_stop), peak_name, min_pval, signstrand, int(thick_start), int(thick_stop)))

except NameError as error:
print >> sys.stderr, error
print >> sys.stderr, "parsing failed"
raise error

return allpeaks



def main(options):

if options.np == 'autodetect':
Expand Down Expand Up @@ -345,78 +464,22 @@ def main(options):
pickle_file = open(options.outfile + ".pickle", 'w')
pickle.dump(results, file=pickle_file)

#combine results
allpeaks = set([])

#count total number of reads in transcriptiome
transcriptome_reads = 0
transcriptome_reads = count_transcriptome_reads(results)

for gene_result in results:
if gene_result is not None:
verboseprint("nreads", gene_result['nreads'])
transcriptome_reads += gene_result['nreads']
print """Transcriptome size is %d, transcriptome
reads are %d""" % (transcriptome_size, transcriptome_reads)

for gener in results:
if gener['clusters'] is None:
print >> sys.stderr, gener, "no clusters"
continue

for cluster in gener['clusters'].keys():
try:
transcriptome_p = poissonP(transcriptome_reads,
gener['clusters'][cluster]['Nreads'],
transcriptome_size,
gener['clusters'][cluster]['size'])
if math.isnan(transcriptome_p):
verboseprint("""Transcriptome P is NaN, transcriptome_reads = %d,
cluster reads = %d, transcriptome_size = %d,
cluster_size = %d""" % (transcriptome_reads,
gener['clusters'][cluster]['Nreads'],
transcriptome_size,
gener['clusters'][cluster]['size']))

continue

if transcriptome_p > poisson_cutoff:
print """%s\n Failed Transcriptome cutoff with %s reads,
pval: %s""" % (cluster,
gener['clusters'][cluster]['Nreads'],
transcriptome_p)
continue

min_pval = 1

corrected_SloP_pval = gener['clusters'][cluster]['SloP']
corrected_gene_pval = gener['clusters'][cluster]['GeneP']

if (corrected_SloP_pval < poisson_cutoff or
corrected_gene_pval < poisson_cutoff):
min_pval = min([corrected_SloP_pval, corrected_gene_pval])
else:
verboseprint("Failed Gene Pvalue: %s and failed SloP Pvalue: %s for cluster %s" % (corrected_gene_pval, corrected_SloP_pval, cluster))
continue


(chrom, g_start, g_stop, peak_name, geneP, signstrand, thick_start, thick_stop) = cluster.split("\t")
#print >> sys.stderr, cluster
bedline = "%s\t%d\t%d\t%s\t%s\t%s\t%d\t%d" % (chrom, int(g_start), int(g_stop), peak_name, min_pval, signstrand, int(thick_start), int(thick_stop))
allpeaks.add(bedline)

except NameError as error:
print >> sys.stderr, error
print >> sys.stderr, "parsing failed"
raise error

verboseprint("""Transcriptome size is %d, transcriptome
reads are %d""" % (transcriptome_size, transcriptome_reads))

allpeaks = filter_results(results,
poisson_cutoff,
transcriptome_size,
transcriptome_reads,
options.global_cutoff)

outbed = options.outfile
color = options.color
pybedtools.BedTool("\n".join(allpeaks), from_string=True).sort(stream=True).saveas(outbed, trackline="track name=\"%s\" visibility=2 colorByStrand=\"%s %s\"" % (outbed, color, color))
print "wrote peaks to %s" % (options.outfile)
return 1


verboseprint("wrote peaks to %s" % (options.outfile))

def call_main():

usage = """\npython peakfinder.py -b <bamfile> -s <hg18/hg19/mm9>\n OR
Expand Down Expand Up @@ -446,6 +509,7 @@ def call_main():
parser.add_option("--trim", dest="trim", action="store_true", default=False, help="Trim reads with the same start/stop to count as 1")
parser.add_option("--premRNA", dest="premRNA", action="store_true", help="use premRNA length cutoff, default:%default", default=False)
parser.add_option("--poisson-cutoff", dest="poisson_cutoff", type="float", help="p-value cutoff for poisson test, Default:%default", default=0.05, metavar="P")
parser.add_option("--global-cutoff", dest="global_cutoff", action="store_false", help="apply global transcriptome level cutoff to CLIP-seq peaks, Default:%default", default=True, metavar="P")
parser.add_option("--FDR", dest="FDR_alpha", type="float", default=0.05, help="FDR cutoff for significant height estimation, default=%default")
parser.add_option("--threshold", dest="threshold", type="int", default=None, help="Skip FDR calculation and set a threshold yourself")
parser.add_option("--maxgenes", dest="maxgenes", default=None, help="stop computation after this many genes, for testing", metavar="NGENES")
Expand Down
Loading

0 comments on commit 4412085

Please sign in to comment.