-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathextract_insert_sizes
executable file
·139 lines (108 loc) · 3.59 KB
/
extract_insert_sizes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#!/usr/bin/perl
use strict;
use Getopt::Long;
my $print_usage = 0;
my $fasta_file = undef;
my $read_length = 95;
my $usage = <<USAGE;
given a bowtie output (SAM) file, extract data to plot apparent insert length
as a function of genome position, for all reads
will output an tab separated x y pair for each read where
x = 5' base of forward read pair
y = insert size (apparent size of lib molecule)
Mark Stenglein Jan 6, 2014
usage: $0 [-h] [-f fasta_file] <sam_file>";
-h print this message.
-l read length (default = $read_length)
USAGE
# TODO: make this configurable?
# -o window_offset offset between sliding windows
# (default = $offset)
if (scalar @ARGV == 0) { warn $usage and die; }
GetOptions ( "h" => \$print_usage,
"l=s" => \$read_length,
"f=s" => \$fasta_file);
if ($print_usage) { print $usage and exit;}
my $fasta_fh = undef;
my $sequence_name = undef;
my $sequence_length = undef;
my %reads = ();
my %mapped_pairs = ();
while (my $sam_file = shift)
{
open (my $sam_fh, "<", $sam_file) or print "error: couldn't open SAM results file: $sam_file\n$usage" and exit;
if ($fasta_file)
{
## # try to figure out fasta filename
## $sam_file =~ /(.*\.(fasta|fa))(?!.*\.(fasta|fa))/;
## $fasta_file = $1;
open ($fasta_fh, "<", $fasta_file) or print "error: couldn't open fasta file: $fasta_file\n$usage" and exit;
}
my @fields = ();
my %ids = ();
# parse sam file, keeping track of query IDs
## warn "parsing sam file: $sam_file\n";
while (<$sam_fh>)
{
chomp;
if (/^@/)
{
# header lines
#
if (/\@SQ/)
{
if (/\@SQ\s+SN:(\S+)\s+LN:(\d+)/)
{
$sequence_name = $1;
$sequence_length = $2;
}
if (!$sequence_name or !$sequence_length)
{
die "error: couldn't parse sequence name or length from SAM file\nline: $_\n";
}
}
# mostly ignore headers
next;
}
# split line into tab-delimited components
@fields = split "\t";
my $query = $fields[0];
my $flag = $fields[1];
# see SAM format spec.
if ($flag & 4) { warn "skipping line: $_\n"; next; } # skip mismatches
my $insert_length = $fields[8];
if ($insert_length == 0)
{
#discordant alignment
my $left_coord = $fields[3];
# warn "$left_coord\t$left_coord\n";
}
elsif ($insert_length < 0)
{
# this is the line for the rev-comp alignment, skip it because the information we
# care about is mirrored in the fwd alignment info
}
else
{
my $left_coord = $fields[3];
my $pair_left_coord = $fields[7];
my $pair_right_coord = $pair_left_coord + $read_length - 1;
if ($left_coord < 0)
{
die "error: left coordinate ($left_coord) less than 0 on line: $_\n";
}
if ($pair_right_coord > $sequence_length)
{
# die "error: right coordinate ($pair_right_coord) greater than sequence length $sequence_length\non line: $_\n";
# just mapped past end
$pair_right_coord = $sequence_length;
}
my $insert_size = $pair_right_coord - $left_coord + 1;
if ($insert_size <= 100)
{
warn "insert size ($insert_size) < read length: lc: $left_coord pair_lc: $pair_left_coord\n";
}
print "$left_coord\t$insert_size\n";
}
}
}