##
## Utilities for working with paired-end reads and
## fragment distributions
##
import os
import glob
import time
from scipy import *
from numpy import *
from parse_csv import *
import Gene as gene_utils
import sam_utils

def load_insert_dist(insert_dist_filename):
    """
    Read an insert length distribution (one integer insert length per line).
    """
    insert_dist_file = open(insert_dist_filename, "r")
    insert_dist = array([int(line.strip()) \
                         for line in insert_dist_file])
    insert_dist_file.close()
    return insert_dist
#    insert_filename = glob.glob(os.path.join(insert_dir, "*.insert_dist"))
#    if len(insert_filename) == 0:
#        raise Exception, "No insert length file found in %s." %(insert_dir)
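# Usage sketch (not part of the original module; the filename is hypothetical):
# the file passed to load_insert_dist is expected to contain one integer per
# line, e.g. lines such as "250", "300", "275".
#
#   insert_dist = load_insert_dist("sample.bam.insert_len")
#   print "Loaded %d insert lengths" %(len(insert_dist))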

def compute_insert_len(bam_filename, gff_filename, output_dir,
                       min_exon_size):
    """
    Compute the insert length distribution of a paired-end BAM file, using
    only large constitutive exons, and output it to the given directory.
    """
    print "Computing insert length distribution of %s" %(bam_filename)
    print " - Using gene models from: %s" %(gff_filename)
    print " - Outputting to: %s" %(output_dir)
    print " - Minimum exon size used: %d" %(min_exon_size)

    if not os.path.isdir(output_dir):
        print "Making directory: %s" %(output_dir)
        os.makedirs(output_dir)

    output_filename = os.path.join(output_dir,
                                   "%s.insert_len" %(os.path.basename(bam_filename)))

    # Load BAM file with reads
    bamfile = sam_utils.load_bam_reads(bam_filename)

    # Load the genes from the GFF
    print "Loading genes from GFF..."
    t1 = time.time()
    gff_genes = gene_utils.load_genes_from_gff(gff_filename)
    t2 = time.time()
    print " - Loading genes from GFF took %.2f seconds" %(t2 - t1)

    insert_lengths = []
    t1 = time.time()
    relevant_regions = 0

    for gene_id, gene_info in gff_genes.iteritems():
        gene_obj = gene_info["gene_object"]
        # Get all the constitutive parts
        const_parts = gene_obj.get_const_parts()
        chrom = gene_obj.chrom
        # Consider only the large constitutive parts
        for part in const_parts:
            if part.len >= min_exon_size:
                # Get all the reads that land in the coordinates of the exon
                try:
                    exon_reads = bamfile.fetch(chrom, part.start, part.end)
                except ValueError:
                    print "Could not fetch from region: ", chrom, part.start, part.end
                    continue
                # Pair all the paired-end reads that land there
                paired_reads = sam_utils.pair_sam_reads(exon_reads)
                num_paired_reads = len(paired_reads)
                if num_paired_reads == 0:
                    continue
                relevant_regions += 1
                print "Found %d relevant regions so far" %(relevant_regions)
                # Compute the insert length of each read pair
                for read_pair_id, read_pair in paired_reads.iteritems():
                    if len(read_pair) != 2:
                        # Skip non-paired reads
                        continue
                    left_read, right_read = read_pair
                    insert_len = right_read.pos - left_read.pos + 1
                    if insert_len > 0:
                        insert_lengths.append(insert_len)
                    else:
                        print "Negative or zero insert length ignored..."

    # Output results to file
    output_file = open(output_filename, 'w')
    insert_length_str = "\n".join(map(str, insert_lengths))
    output_file.write(insert_length_str)
    output_file.close()

    t2 = time.time()
    print "Insert length computation took %.2f seconds." %(t2 - t1)
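# Sketch of downstream use (assumes compute_insert_len has already been run on
# a hypothetical "sample.bam"; paths are illustrative): the output file can be
# re-loaded with load_insert_dist and summarized with the numpy functions
# imported above.
#
#   dist = load_insert_dist("insert_dist_output/sample.bam.insert_len")
#   print "Insert length mean: %.1f, sdev: %.1f" %(mean(dist), std(dist))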

def main():
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option("--compute-insert-len", dest="compute_insert_len",
                      nargs=3, default=None,
                      help="Compute insert length for given sample. Takes as input "
                      "a sorted, indexed BAM file with headers, a GFF file with gene models, "
                      "and an output directory.")
    parser.add_option("--min-exon-size", dest="min_exon_size",
                      nargs=1, type="int", default=500,
                      help="Minimum size of constitutive exon (in nucleotides) that should be used "
                      "in the computation. Default is 500 bp.")
    (options, args) = parser.parse_args()

    if options.compute_insert_len is not None:
        bam_filename = os.path.abspath(os.path.expanduser(options.compute_insert_len[0]))
        gff_filename = os.path.abspath(os.path.expanduser(options.compute_insert_len[1]))
        output_dir = os.path.abspath(os.path.expanduser(options.compute_insert_len[2]))
        compute_insert_len(bam_filename, gff_filename, output_dir,
                           options.min_exon_size)


if __name__ == "__main__":
    main()
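
# Example command line (paths are hypothetical; the flags are the ones defined
# in main() above):
#
#   python pe_utils.py --compute-insert-len sample.bam genes.gff3 insert_dist_output/ \
#       --min-exon-size 500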