-
Notifications
You must be signed in to change notification settings - Fork 3
/
ddrad_lowmem.pl
executable file
·413 lines (329 loc) · 14.5 KB
/
ddrad_lowmem.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
#!/usr/bin/perl
use warnings;
use strict;
use Getopt::Std;
use Chart::Gnuplot;
use Data::Dumper;
# kill program and print help if no command line arguments were given
if( scalar( @ARGV ) == 0 ){
&help;
die "Exiting program because no command line options were used.\n\n";
}
# take command line arguments
my %opts;
getopts( '1:2:e:f:hrs:', \%opts );
# if -h flag is used, or if no command line arguments were specified, kill program and print help
if( $opts{h} ){
&help;
die "Exiting program because help flag was used.\n\n";
}
# declare hash to hold restriction enzyme cut sites
my %re = (
'AciI' => 'C^CGC',
'AgeI' => 'A^CCGGT',
'AluI' => 'AG^CT',
#'ApeKI' => 'G^CWGC',
#'ApoI' => 'R^AATY',
'AseI' => 'AT^TAAT',
'BamHI' => 'G^GATCC',
'BfaI' => 'C^TAG',
'BgIII' => 'A^GATCT',
'BspDI' => 'AT^CGAT',
#'BstYI' => 'R^GATCY',
'ClaI' => 'AT^CGAT',
#'DdeI' => 'C^TNAG',
'DpnII' => '^GATC',
#'EaeI' => 'Y^GGCCR',
'EcoRV' => 'GAT^ATC',
'EcoT22I' => 'ATGCA^T',
'HindIII' => 'A^AGCTT',
'KpnI' => 'GGTAC^C',
'MluCI' => '^AATT',
'MseI' => 'T^TAA',
'NdeI' => 'CA^TATG',
'NheI' => 'G^CTAGC',
'NlaIII' => 'CATG^',
'NotI' => 'GC^GGCCGC',
'NsiI' => 'ATGCA^T',
'RsaI' => 'GT^AC',
'SacI' => 'GAGCT^C',
'Sau3AI' => '^GATC',
#'SexAI' => 'A^CCWGGT',
#'SgrAI' => 'CR^CCGGYG',
'SpeI' => 'A^CTAGT',
'SphI' => 'GCATG^C',
'TaqI' => 'T^CGA',
'XbaI' => 'T^CTAGA',
'XhoI' => 'C^TCGAG',
'SbfI' => 'CCTGCA^GG',
'EcoRI' => 'G^AATTC',
'PstI' => 'CTGCA^G',
'MspI' => 'C^CGG',
);
# if -r flag is used, kill program and print supported enzyme list
if( $opts{r} ){
&printenzymes( \%re );
die "Exiting program following request to print supported enzyme list.\n\n";
}
# parse the command line options and return values
my( $first, $second, $error, $file, $size ) = &parsecom( \%opts );
# get the restriction site sequences
my $re1 = "$re{$first}" or die "\nMatch for first restriction enzyme not found in the program.\nCheck your input for flag -1. Input is case sensitive.\nAlternatively, your enzyme may not be supported at this time.\n\n"; # string to hold sequence of first restriction enzyme
my $re2 = "$re{$second}" or die "\nMatch for second restriction enzyme not found in the program.\nCheck your input for flag -2. Input is case sensitive.\nAlternatively, your enzyme may not be supported at this time.\n\n"; # string to hold sequence of second restriction enzyme
my $cut1 = "$re{$first}"; # string to hold sequence of RE1 with carot in place of cut site
my $cut2 = "$re{$second}"; # string to hold sequence of RE2 with carot in place of cut site
my $select = 0; # counter to hold the number of RE1 x RE2 fragments that fall within the desired size range
my $discard = 0; # counter to hold the number of RE1 x RE2 fragments that do not fall within the desired size range
my $re1xre1select = 0; # counter to hold the number of RE1 x RE1 fragments that fall within the desired size range
my $re1xre1discard = 0; # counter to hold the number of RE1 x RE1 fragments that do not fall within the desired size range
my $re2xre2select = 0; # counter to hold the number of RE2 x RE2 fragments that fall within the desired size range
my $re2xre2discard = 0; # counter to hold the number of RE2 x RE2 fragments that do not fall within the desired size range
my $re1frags = 0; # counts the total number of RE1 x RE1 fragments
my $re2frags = 0; # counts the total number of RE2 x RE2 fragments
my $ddradfrags = 0; # counts the total number of RE1 x RE2 fragments
my $totalfrags = 0; # counts the total number of fragments produced
my $re1out = "$first.fasta";
my $re2out = "$second.fasta";
my $re1xre2out = "$first.by.$second.fasta";
my $genomelength = 0; # calculate total length of genome
my $adjgenomelength = 0; # adjust genome length for Ns in sequences
my $gc = 0; # count gc bases
my $n = 0; # count the Ns in the genome
#my @re1xre1; # array to hold RE1 x RE1 matches
#my @re1xre2; # array to hold RE1 x RE2 matches
#my @re2xre2; # array to hold RE2 x RE2 matches
my @lengths; # array to hold lengths of RE1 x RE2 matches
my @counts; # array to hold counts of lengths for RE1 x RE2 matches
my %hash; # hash to hold counts of all fragment lengths for RE1 x RE2 matches. key = fragment length, value = count
# output file name
my $out = $first . '_by_' . $second . '_' . $size . '_' . $error . '.log';
# remove carot from restriction enzyme cut sites for matching
$re1 =~ s/\^//;
$re2 =~ s/\^//;
# convert the restriction enzyme cut sites to symbols
$cut1 =~ s/\^/1\^1/;
$cut2 =~ s/\^/2\^2/;
# get minimum and maximum fragment lengths for size selection
my $min = ( $size - $error );
my $max = ( $size + $error );
# parse the fasta file and return strings with header and full sequence
#my( $head, $seqs ) = &fastaparse( $file, $seq, $genomelength, $gc, $n );
&fastaparse( \$file, \$genomelength, \$gc, \$n, $min, $max,
\$re1xre1select, \$re1xre1discard,
\$re2xre2select, \$re2xre2discard,
\$re1frags, \$re2frags,
\$ddradfrags, \$totalfrags,
\$select, \$discard,
\%hash
);
# open output file
open( OUT, '>', $out ) or die "Can't open $out: $!\n\n";
# print results summary statistics
# print genome length
print OUT "Genome length:\t", $genomelength, "\n\n";
# print GC Content
$adjgenomelength = ( $genomelength - $n );
my $gcpercent = ( $gc/$adjgenomelength );
print OUT "GC Content:\t", $gcpercent, "\n\n";
# print summary of all fragments
print OUT "All fragments (any combination):\n";
print OUT "Total:\t", $totalfrags, "\n\n";
# print summary of RE1 x RE2 fragments
print OUT "$re1 by $re2 fragments:\n";
print OUT $min, "bp to ", $max, "bp:\t", $select, "\n";
print OUT "Others:\t", $discard, "\n";
print OUT "Total:\t", $ddradfrags, "\n\n";
# print summary of RE1 x RE1 fragments
print OUT "$re1 by $re1 fragments:\n";
print OUT $min, "bp to ", $max, "bp:\t", $re1xre1select, "\n";
print OUT "Others:\t", $re1xre1discard, "\n";
print OUT "Total:\t", $re1frags, "\n\n";
# print summary of RE2 x RE2 fragments
print OUT "$re2 by $re2 fragments:\n";
print OUT $min, "bp to ", $max, "bp:\t", $re2xre2select, "\n";
print OUT "Others:\t", $re2xre2discard, "\n";
print OUT "Total:\t", $re2frags, "\n\n";
# calculate the percent of sequences that will be retained.
#my $total = ( $select + $discard );
my $unwanted = ($re1xre1select + $re2xre2select );
my $total = ( $select + $unwanted );
my $percent = ( $select / $total );
$percent = sprintf( '%.2f%%', 100 * $percent );
print OUT $percent, " of fragments between ", $min, " and ", $max, " bp will be selected.\n\n";
close OUT;
foreach my $key( sort {$a <=> $b} keys %hash ){
$key += 0;
push( @lengths, $key );
push( @counts, $hash{$key} );
}
#print Dumper(\%hash);
#make a file name to send to gnuplot
my $name = join('.', $first, $second, "histogram", "png");
&gnuhistogram( \@lengths, \@counts, $name );
print "There are $re1frags $re1 by $re1 fragments\n\n";
print "There are $re2frags $re2 by $re2 fragments\n\n";
print "There are $ddradfrags $re1 by $re2 fragments\n\n";
exit;
#####################################################################################################
############################################ Subroutines ############################################
#####################################################################################################
# subroutine to print help
sub help{
print "\nddradsim is a program developed by Steven Michael Mussmann\n\n";
print "To report bugs send an email to mussmann\@email.uark.edu\n";
print "When submitting bugs please include all input files, options used for the program, and all error messages that were printed to the screen.\n\n";
print "Program Options:\n";
print "\t\t[ -1 | -2 | -e | -f | -h | -r | -s ]\n\n";
print "\t-1:\tUse this to specify the first restriction enzyme.\n";
print "\t\tProgram will terminate if no enzyme is specified.\n\n";
print "\t-2:\tUse this to specify the second restriction enzyme.\n";
print "\t\tProgram will terminate if no enzyme is specified.\n\n";
print "\t-e:\tUse this to specify the error around the median value of your selected size range.\n";
print "\t\tFor example, if you have an -s value of 300 and an -e value of 25, the program will calculate the number of fragments between 275bp and 325bp.\n";
print "\t\tProgram will terminate if no value is specified.\n\n";
print "\t-f:\tUse this to specify the input file name.\n";
print "\t\tProgram will terminate if no file name is provided.\n\n";
print "\t-h:\tUse this command to display this help message.\n";
print "\t\tProgram will terminate after displaying this help message.\n\n";
print "\t-r:\tUse this command to display a list of supported restriction enzymes.\n";
print "\t\tProgram will terminate after displaying the enzyme list.\n\n";
print "\t-s:\tUse this to specify the median value of your selected size range.\n";
print "\t\tFor example, if you have an -s value of 300 and an -e value of 25, the program will calculate the number of fragments between 275bp and 325bp.\n";
print "\t\tProgram will terminate if no value is specified.\n\n";
}
#####################################################################################################
# subroutine to print supported enzyme list
sub printenzymes{
my( $re ) = @_;
my %enz_seq = %$re;
print "\nEnzyme", "\t\t", "Target sequence (^ denotes the cut site)", "\n\n";
#print the list of enzymes
foreach my $enzyme( sort{ $a cmp $b } keys %enz_seq ){
print $enzyme, "\t\t", $enz_seq{$enzyme}, "\n";
}
print "\n";
}
#####################################################################################################
# subroutine to parse the command line options
sub parsecom{
my( $params ) = @_;
my %opts = %$params;
# set default values for command line arguments
my $first = $opts{1} or die "\nFirst restriction enzyme not specified\n\n";
my $second = $opts{2} or die "\nSecond restriction enzyme not specified\n\n";
my $error = $opts{e} or die "\nError for size selection not specified (+ or - how many bases?)\n\n";
my $file = $opts{f} or die "\nNo input file specified\n\n";
my $size = $opts{s} or die "\nNo size selection value provided (how many bases?)\n\n";
return( $first, $second, $error, $file, $size );
}
#####################################################################################################
# subroutine to parse a fasta file
sub fastaparse{
my( $file, $genomelength, $gc, $n, $min, $max, $re1xre1select, $re1xre1discard, $re2xre2select, $re2xre2discard, $re1frags, $re2frags, $ddradfrags, $totalfrags, $select, $discard, $hash ) = @_;
#print Dumper(\%$hash);
# declare strings to hold header and sequence
my @head;
my $seq;
my $counter = 0;
# open the fasta file
open( FASTA, $$file ) or die "Can't open $file: $!\n\n";
# loop through fasta file and extract data
while( my $line = <FASTA> ){
chomp( $line );
if( $line =~/^>/ ){
push( @head, $line );
# push the previous sequence off to the @seqs array
$counter++;
print "Now working on contig $counter.\n";
if( $seq ){
&cutseq(\$seq, \$genomelength, \$gc, \$n, $min, $max, \$re1xre1select, \$re1xre1discard, \$re2xre2select, \$re2xre2discard, \$re1frags, \$re2frags, \$ddradfrags, \$totalfrags, \$select, \$discard, \$hash );
# undefine the $seq string
undef( $seq );
}
}else{
$seq .= uc($line); # if the line does not start with > then append the sequence to the $seq string
}
}
# take care of the last sequence by pushing it to @seqs array
if( $seq ){
$seq = uc($seq);
&cutseq(\$seq, \$genomelength, \$gc, \$n, $min, $max );
undef( $seq );
}
close FASTA;
}
#####################################################################################################
# subroutine to count fragments of a desired size range
sub sizeselect{
my( $fragment, $select, $discard, $min, $max ) = @_;
if( length( $$fragment ) ~~ [$$min..$$max] ){
$$$$select++;
}else{
$$$$discard++;
}
}
#####################################################################################################
# subroutine to remove numbers from fragments and push sequences to appropriate arrays
sub transform{
my( $seq, $frags ) = @_;
$$seq =~ s/[12]//g;
$$$$frags++;
}
#####################################################################################################
# subroutine to cut a sequence on the restriction sites - designed to be more memory efficient
sub cutseq{
my( $seq, $genomelength, $gc, $n, $min, $max, $re1xre1select, $re1xre1discard, $re2xre2select, $re2xre2discard, $re1frags, $re2frags, $ddradfrags, $totalfrags, $select, $discard, $hash ) = @_;
#print Dumper(\%$$hash);
# get total length of genome
$$$genomelength += length( $$seq );
$$$gc += ( $$seq =~ s/G/G/g );
$$$gc += ( $$seq =~ s/C/C/g );
$$$n += ( $$seq =~ s/N/N/g );
# find all restriction sites for the two enzymes and replace them with the cuts
$$seq =~ s/$re1/$cut1/eg;
$$seq =~ s/$re2/$cut2/eg;
# split the string at the cut sites
my @fragments = split( /\^/, $$seq );
# add to the running total of fragments
$$$totalfrags += scalar(@fragments);
foreach my $item( @fragments ){
if( $item =~ /^1.*1$/ ){
&sizeselect( \$item, \$re1xre1select, \$re1xre1discard, \$min, \$max );
&transform( \$item, \$re1frags );
}elsif( $item =~ /^1.*2$|^2.*1$/ ){
&sizeselect( \$item, \$select, \$discard, \$min, \$max );
&transform( \$item, \$ddradfrags );
my $len = length( $item );
$$$hash{$len}++;
}elsif( $item =~ /^2.*2$/ ){
&sizeselect( \$item, \$re2xre2select, \$re2xre2discard, \$min, \$max );
&transform( \$item, \$re2frags );
}
}
#print Dumper(\%hash);
}
#####################################################################################################
sub gnuhistogram{
my( $data, $counts, $name ) = @_;
# Create the chart object
my $chart = Chart::Gnuplot->new(
output => "$name",
xlabel => "Fragment Size",
ylabel => "Fragment Count",
yrange => [0, '*'],
xrange => [0, 500],
xtics => {
labels => [0, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500],
},
);
# Data set object
my $dataSet = Chart::Gnuplot::DataSet->new(
xdata => \@$data,
ydata => \@$counts,
title => "ddrad sim",
style => "impulses",
);
# Plot the graph
$chart->plot2d($dataSet);
}
#####################################################################################################