#!/usr/bin/env perl
use warnings;
$| = 1;
# Add use lib statements assuming there is a directory called 'lib' at the same level as the directory containing this script (e.g., bin/)
use FindBin;
use lib "$FindBin::Bin/../lib";
use lib "$FindBin::Bin";
use strict;
use FileHandle;
use aomisc;
use primerid;
use Cwd;
use diagnostics;
use Getopt::Long;
use Data::Dumper;
#use File::Basename;
#use Array::Utils qw(:all);
#use File::Slurp qw( prepend_file );
#use File::Which;
# Andrew J. Oler, PhD
# Computational Biology Section
# Bioinformatics and Computational Biosciences Branch (BCBB)
# OCICB/OSMO/OD/NIAID/NIH
# Bethesda, MD 20892
#
# andrew.oler@gmail.com
#
#This package is free software; you can redistribute it and/or modify
#it under the terms of the GPL (either version 1, or at your option,
#any later version) or the Artistic License 2.0.
#my %options; #Hash in which to store option arguments
#use vars qw($opt_s $opt_f $opt_n $opt);
#getopts ('s:f:n:bda:g:om:i:r:ecu:kh:j:l:ypx:',\%options);
#Print out the options
if (@ARGV){ print STDERR "Arguments: ", join " ", @ARGV, "\n"; }
my $PWD = pwd_for_hpc();
my $save;
my $dir;
my $verbose;
my $output;
my $gzip;
my $ref;
my $debug;
my $fullpeptide;
my $prefix;
my $variant_threshold;
my $use_sample_col;
my $sample;
GetOptions('save=s' => \$save,
'output=s' => \$output,
'verbose' => \$verbose,
'dir|d|i|input|files=s' => \$dir,
'gzip' => \$gzip,
'ref|r=s' => \$ref,
'debug' => \$debug,
'fullpeptide=s' => \$fullpeptide,
'prefix=s' => \$prefix,
'variant_threshold=s' => \$variant_threshold,
'sample=s' => \$sample,
'use_sample_col' => \$use_sample_col,
);
#-----------------------------------------------------------------------------
#----------------------------------- MAIN ------------------------------------
#-----------------------------------------------------------------------------
my $usage = "
merge_tally.pl takes a directory containing output from convert_reads_to_amino_acid.pl
and merges the results. The merged results are output as frequency tables of nucleotide,
codon, and amino acid sequence by position -- the same format as the output to
convert_reads_to_amino_acid.pl. A merged table containing frequencies for all three is also
produced. For now, Phylip files for nucleotide and peptide sequence are not merged.
Required Arguments:
-d/--dir/-i/--input/--files Directory containing files to merge.
--sample Sample Id for all of the reads. This will force merge all of the tally files
in the input directory, regardless of the 'Sample' column. No spaces or commas
allowed in the id.
OR
--use_sample_col Use the 'Sample' column in the output of convert_reads_to_amino_acid.pl
to group reads into separate sets of output files, one set of files for each unique
sample id.
Optional Arguments:
--prefix Prefix for output files. Default = Merge_convert_reads. Sample id will be
added to the prefix. If the prefix value contains a directory, the directory must exist.
--variant_threshold Minimum threshold frequency for output variant file. Default = 0
(i.e., print all variants). (Note that in calculate_linkage_disequilibrium.pl
there is an option to filter for a variant_threshold as well.)
--verbose Print out verbose progress/error messages.
Example:
merge_tally.pl --dir 30_S1_tallies --sample Parental --prefix 30_S1.contigs.pi.btrim.0.majority.cons
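Example using the Sample column in the tally files (one set of output files per sample id; same hypothetical input directory as above):
merge_tally.pl --dir 30_S1_tallies --use_sample_col --prefix 30_S1.contigs.pi.btrim.0.majority.cons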
";
# To Do:
# Change log
# 2017-02-04
# Created script, based on convert_reads_to_amino_acid.pl
# 2017-02-11
# Removed --save option. Just include any directory in the --prefix. ($prefix can contain absolute or relative path).
# --save Directory in which to save files. Default = pwd. If folder doesn't exist, it
# will be created.
# 2017-02-16
# Added 6 more columns to convert_reads output, so modified this to match. Also modified some subs in primerid.pm, so
# modified calls here.
unless ($dir||$ARGV[0]){ print STDERR "$usage\n"; exit; } # an input directory is required
unless (defined($sample) || $use_sample_col){ print STDERR "Please specify --sample or --use_sample_col.\n"; exit; }
if ($use_sample_col && defined($sample)){
print STDERR "Provide a sample id (--sample) or use the Sample column in the tally files (--use_sample_col), not both.\n";
exit 1;
}
if (defined($sample) && $sample =~ m/[\s+,\|]/){
print STDERR "Sample id invalid: $sample\n";
exit 1;
}
my $start_time = time;
my @suffixes = (qw(.bed .bed.gz .bed12 .bed12.gz .txt .txt.gz .BED .BED.gz .BED12 .BED12.gz .fasta .fa .FA .FASTA .FAS .fas), @SUFFIXES); #for fileparse. Feel free to add more accepted extensions. @SUFFIXES comes from aomisc.pm.
# Defaults for input options
$prefix ||= 'Merge_convert_reads';
$variant_threshold //= 0; # Was 0.0001 (1 variant out of 10000 sequences). 0 is a valid value, hence // instead of ||.
# Make hash to convert codons to amino acids (maybe this should go into primerid.pm too?)
my %converter = make_codon_to_aa_hash();
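# e.g., $converter{'ATG'} should map to 'M' (illustrative; the actual hash is built by make_codon_to_aa_hash() in primerid.pm)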
# Get the tally files
my @all_files = &aomisc::get_files($dir, '(nuc|aa|codon).tally.xls'); # This will add $dir to each of the file names. Was: #opendir(DIR, "$dir"); my @all_files = grep(/\.(nuc|aa|codon).tally.xls$/,readdir(DIR)); closedir(DIR);
my @nuc_files = grep(/\.nuc.tally.xls$/,@all_files);
my @codon_files = grep(/\.codon.tally.xls$/,@all_files);
my @aa_files = grep(/\.aa.tally.xls$/,@all_files);
# print STDERR join "\n", @all_files, "\n";
# Get the gene name. Note that all of the files should be for the same gene. Check to make sure and exit if not.
my $gene = get_gene(\@all_files);
my @NUC = qw(A C G T N);
my @AA = qw(A C D E F G H I K L M N P Q R S T V W Y * X -); # All amino acids, and * for stop, X for unknown, and - for deletion in read compared to reference
my @CODON = get_codons("$codon_files[0]"); # Take the first file named *.codon.tally.xls in the directory and read the codon order from there.
# print join " ", @CODON; print "\n"; exit;
# Read all files and store in data structure.
# Reference CDS sequence -- coding nucleotides, codons, and amino acids by position.
# Global variable to store the reference sequence
# $cds->{'aa'}->{$aa_pos} = $aa;
# Other first keys are codon, nuc
# Store this as we're reading the input files
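# Illustrative shapes of the two structures built below (values taken from the example rows documented in read_input_tally):
# $sample_nuc_aa_codon_tally->{'151_78_S2.0.000'}->{'codon'}->{13}->{'AGG'} = 9877;
# $cds->{'codon'}->{13} = 'AGG';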
my ($sample_nuc_aa_codon_tally, $cds) = read_input_tally(\@nuc_files, \@codon_files, \@aa_files);
# print Dumper($nuc_aa_codon_tally); exit;
print STDERR "Done processing " . scalar(@all_files) . " files.";
&elapsed($start_time, ' Elapsed', $verbose);
my @samples = keys %$sample_nuc_aa_codon_tally;
# print Dumper(\@samples); exit;
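# One set of output files is written per sample id: per-position nuc/codon/aa tallies, a merged table of all three, and a variant report filtered by --variant_threshold.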
for (my $i = 0; $i < @samples; $i++){
my $prefix_with_sample = $prefix . "." . $samples[$i];
print_reports($sample_nuc_aa_codon_tally->{$samples[$i]}, $samples[$i], $gene, \@NUC, \@CODON, \@AA, $cds, \%converter, $prefix_with_sample, $start_time, $verbose, $debug);
print_merged_report($sample_nuc_aa_codon_tally->{$samples[$i]}, $samples[$i], $gene, \@CODON, $cds, $prefix_with_sample);
print_variants($sample_nuc_aa_codon_tally->{$samples[$i]}, $samples[$i], $gene, $variant_threshold, $prefix_with_sample, $start_time, $verbose);
}
&elapsed($start_time, 'Total', $verbose);
#-----------------------------------------------------------------------------
#---------------------------------- SUBS -------------------------------------
#-----------------------------------------------------------------------------
sub get_gene {
my $all_file_list = shift;
# Reads all of the files, getting gene from column 2
# If there is only one gene across files, then it returns the gene name
# If there is more than one gene, it will exit.
# #name gene nucleotidePosition refNucleotide consensusNucleotide refDiffConsensus Sample coverageDepth unambigCoverageDepth unambigConsensus numUnambigConsensus numUnambigNonConsensus majorAltAllele numMajorAltAllele numA numC numG numT numN numOther
# HA_full:37:A HA_full 37 A A 151_78_S2.0.000 9922 9922 A 9889 33 G 23 9889 3 23 7 0 0
# HA_full:38:G HA_full 38 G G 151_78_S2.0.000 9922 9922 G 9918 4 A 3 3 0 9918 1 0 0
my $genes; # hashref to store all gene id(s). Should only have one gene across all files, theoretically.
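# e.g., after reading the two example data rows above, $genes would hold { 'HA_full' => 2 }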
print STDERR "Checking to see if same gene listed for all files...\n";
foreach my $file (@$all_file_list){
my $readfh = open_to_read("$file");
while(<$readfh>){
chomp;
next if (m/^#/);
my @F = split(/\t/);
next unless (@F > 1);
$genes->{$F[1]}++; # gene is second column.
}
close($readfh);
}
# print Dumper($genes);
my @genes = keys %$genes;
if (@genes > 1){
print STDERR "More than one gene present in files! Exiting...\n";
exit;
}
print STDERR "Gene: $genes[0]\n";
return $genes[0];
}
#-----------------------------------------------------------------------------
sub get_codons {
my $codon_file = shift;
# print STDERR "$codon_file\n";
my @codons;
my @header = get_header($codon_file);
# #name gene codonPosition refCodon consensusCodon refDiffConsensus Sample coverageDepth unambigCoverageDepth unambigConsensus numUnambigConsensus numUnambigNonConsensus majorAltAllele numMajorAltAllele numGCA:A numGCB:A numGCC:A numGCD:A numGCG:A numGCH:A numGCK:A numGCM:A numGCN:A numGCR:A numGCS:A numGCT:A numGCV:A numGCW:A numGCY:A numTGC:C numTGT:C numTGY:C numGAC:D numGAT:D numGAY:D numGAA:E numGAG:E numGAR:E numTTC:F numTTT:F numTTY:F numGGA:G numGGB:G numGGC:G numGGD:G numGGG:G numGGH:G numGGK:G numGGM:G numGGN:G numGGR:G numGGS:G numGGT:G numGGV:G numGGW:G numGGY:G numCAC:H numCAT:H numCAY:H numATA:I numATC:I numATH:I numATM:I numATT:I numATW:I numATY:I numAAA:K numAAG:K numAAR:K numCTA:L numCTB:L numCTC:L numCTD:L numCTG:L numCTH:L numCTK:L numCTM:L numCTN:L numCTR:L numCTS:L numCTT:L numCTV:L numCTW:L numCTY:L numTTA:L numTTG:L numTTR:L numATG:M numAAC:N numAAT:N numAAY:N numCCA:P numCCB:P numCCC:P numCCD:P numCCG:P numCCH:P numCCK:P numCCM:P numCCN:P numCCR:P numCCS:P numCCT:P numCCV:P numCCW:P numCCY:P numCAA:Q numCAG:Q numCAR:Q numAGA:R numAGG:R numAGR:R numCGA:R numCGB:R numCGC:R numCGD:R numCGG:R numCGH:R numCGK:R numCGM:R numCGN:R numCGR:R numCGS:R numCGT:R numCGV:R numCGW:R numCGY:R numAGC:S numAGT:S numAGY:S numTCA:S numTCB:S numTCC:S numTCD:S numTCG:S numTCH:S numTCK:S numTCM:S numTCN:S numTCR:S numTCS:S numTCT:S numTCV:S numTCW:S numTCY:S numACA:T numACB:T numACC:T numACD:T numACG:T numACH:T numACK:T numACM:T numACN:T numACR:T numACS:T numACT:T numACV:T numACW:T numACY:T numGTA:V numGTB:V numGTC:V numGTD:V numGTG:V numGTH:V numGTK:V numGTM:V numGTN:V numGTR:V numGTS:V numGTT:V numGTV:V numGTW:V numGTY:V numTGG:W numTAC:Y numTAT:Y numTAY:Y numTAA:* numTAG:* numTGA:* numOther
@codons = @header[14..$#header -1]; # Not "numOther". This is added by print_reports (via print_tally_line())
s/num// for @codons;
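# After stripping the leading "num", entries look like 'GCA:A', 'TGC:C', ..., 'TGA:*' (codon:aminoAcid), matching the header shown above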
print STDERR "codons: ", join ", ", @codons, "\n" if $verbose;
return @codons; # Return the array (not arrayref)
}
#-----------------------------------------------------------------------------
sub read_input_tally {
my ($nuc_files, $codon_files, $aa_files) = @_; # lists of files inside $dir (the file names already include the $dir path, added by get_files)
my $sample_nuc_aa_codon_tally; # data structure for saving all of the count information. e.g., $sample_nuc_aa_codon_tally->{$sample}->{$type}->{$pos}->{$base_codon_aa} += $seq_count;
my $cds; # data structure for saving reference sequence as nuc, codon, aa. e.g., $cds->{'nuc'}->{$pos} = $nuc; first keys: nuc, aa, codon
# Global variable to store the reference sequence
# $cds->{'aa'}->{$aa_pos} = $aa;
# Other first keys are codon, nuc
# Store this as we're reading the input files
## Example input. (old -- see below for new)
# nuc:
# #name gene nucleotidePosition refNucleotide consensusNucleotide refDiffConsensus Sample coverageDepth unambigCoverageDepth unambigConsensus numUnambigConsensus numUnambigNonConsensus majorAltAllele numMajorAltAllele numA numC numG numT numN numOther
# HA_full:37:A HA_full 37 A A 151_78_S2.0.000 9922 9922 A 9889 33 G 23 9889 3 23 7 0 0
# HA_full:38:G HA_full 38 G G 151_78_S2.0.000 9922 9922 G 9918 4 A 3 3 0 9918 1 0 0
# codon:
# #name gene codonPosition refCodon consensusCodon refDiffConsensus Sample coverageDepth unambigCoverageDepth unambigConsensus numUnambigConsensus numUnambigNonConsensus majorAltAllele numMajorAltAllele numGCA:A numGCB:A numGCC:A numGCD:A numGCG:A numGCH:A numGCK:A numGCM:A numGCN:A numGCR:A numGCS:A numGCT:A numGCV:A numGCW:A numGCY:A numTGC:C numTGT:C numTGY:C numGAC:D numGAT:D numGAY:D numGAA:E numGAG:E numGAR:E numTTC:F numTTT:F numTTY:F numGGA:G numGGB:G numGGC:G numGGD:G numGGG:G numGGH:G numGGK:G numGGM:G numGGN:G numGGR:G numGGS:G numGGT:G numGGV:G numGGW:G numGGY:G numCAC:H numCAT:H numCAY:H numATA:I numATC:I numATH:I numATM:I numATT:I numATW:I numATY:I numAAA:K numAAG:K numAAR:K numCTA:L numCTB:L numCTC:L numCTD:L numCTG:L numCTH:L numCTK:L numCTM:L numCTN:L numCTR:L numCTS:L numCTT:L numCTV:L numCTW:L numCTY:L numTTA:L numTTG:L numTTR:L numATG:M numAAC:N numAAT:N numAAY:N numCCA:P numCCB:P numCCC:P numCCD:P numCCG:P numCCH:P numCCK:P numCCM:P numCCN:P numCCR:P numCCS:P numCCT:P numCCV:P numCCW:P numCCY:P numCAA:Q numCAG:Q numCAR:Q numAGA:R numAGG:R numAGR:R numCGA:R numCGB:R numCGC:R numCGD:R numCGG:R numCGH:R numCGK:R numCGM:R numCGN:R numCGR:R numCGS:R numCGT:R numCGV:R numCGW:R numCGY:R numAGC:S numAGT:S numAGY:S numTCA:S numTCB:S numTCC:S numTCD:S numTCG:S numTCH:S numTCK:S numTCM:S numTCN:S numTCR:S numTCS:S numTCT:S numTCV:S numTCW:S numTCY:S numACA:T numACB:T numACC:T numACD:T numACG:T numACH:T numACK:T numACM:T numACN:T numACR:T numACS:T numACT:T numACV:T numACW:T numACY:T numGTA:V numGTB:V numGTC:V numGTD:V numGTG:V numGTH:V numGTK:V numGTM:V numGTN:V numGTR:V numGTS:V numGTT:V numGTV:V numGTW:V numGTY:V numTGG:W numTAC:Y numTAT:Y numTAY:Y numTAA:* numTAG:* numTGA:* numOther
# HA_full:13:AGG HA_full 13 AGG AGG 151_78_S2.0.000 9922 9922 AGG 9877 45 GGG 23 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 23 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 7 9877 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 7 0 0 0 0 0 0 0
# HA_full:14:CAA HA_full 14 CAA CAA 151_78_S2.0.000 9922 9922 CAA 9888 34 CAG 15 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 9888 15 0 0 0 0 9 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 7 0 0 0
# aa:
# #name gene aminoAcidPosition refAminoAcid consensusAminoAcid refDiffConsensus Sample coverageDepth unambigCoverageDepth unambigConsensus numUnambigConsensus numUnambigNonConsensus majorAltAllele numMajorAltAllele numA numC numD numE numF numG numH numI numK numL numM numN numP numQ numR numS numT numV numW numY num* numX num- numOther
# HA_full:13:R HA_full 13 R R 151_78_S2.0.000 9922 9922 R 9887 35 G 23 0 0 0 0 0 23 0 0 3 0 1 0 0 0 9887 1 0 0 7 0 0 0 0 0
# HA_full:14:Q HA_full 14 Q Q 151_78_S2.0.000 9922 9922 Q 9903 19 R 9 0 0 0 1 0 0 0 0 1 1 0 0 0 9903 9 0 0 0 0 0 7 0 0 0
## ** Example input 2017-02-16 (additional columns)
# #name gene nucleotidePosition refNucleotide consensusNucleotide refDiffConsensus Sample coverageDepth unambigCoverageDepth unambigConsensus numUnambigConsensus numUnambigNonConsensus majorAltAllele numMajorAltAllele cMAF cMAF_95%_CI_low cMAF_95%_CI_high medianUnambigFreq median95%ConfIntHigh passThreshold numA numC numG numT numN numOther
# HA:427:C HA 427 C C SV12.1 467 467 C 467 0 0 0.00000 0.00000 0.00787 0.00428 0.01538 no 0 467 0 0 0 0
# HA:428:A HA 428 A A SV12.1 467 467 A 466 1 G 1 0.00214 0.00005 0.01187 0.00428 0.01538 no 466 0 1 0 0 0
# HA:469:A HA 469 A A SV12.1 467 467 A 397 70 G 70 0.14989 0.11875 0.18555 0.00428 0.01538 yes 397 0 70 0 0 0
my @type = qw(nuc codon aa);
my @file_lists_by_type = ($nuc_files, $codon_files, $aa_files);
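# Index $i pairs each type with its file list: 'nuc' with the *.nuc.tally.xls files, 'codon' with *.codon.tally.xls, 'aa' with *.aa.tally.xls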
# Tally up all "num..." columns except "numOther" -- "numOther" is added by print_reports (actually print_tally_line)
# Need to remove the "num" part
# I can either assume that the order of @NUC, @AA, and @CODON matches the order of the columns (which it should), or I can look up the header and use that.
# I would hate to be off by one for some reason, so I'll just look up the header from the first file of each type.
my $files_done = 0;
for (my $i = 0; $i < @type; $i++){
my $type = $type[$i];
my $file_list = $file_lists_by_type[$i];
my @header = get_header("$file_list->[0]"); # get header array from first file
s/^num// for @header;
s/:.+// for @header;
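# e.g., header entry 'numGCA:A' becomes 'GCA', and 'numA' becomes 'A', so header names match the tally keys used below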
my $max_coverage = 0;
foreach my $file (@$file_list){
print STDERR "reading file: $file\n" if $verbose;
my $wc = `wc -l < $file`;
# print STDERR "wc: $wc\n";
if ($files_done && ($files_done % 100) == 0){
my $total_files = scalar(@all_files);
print "Done processing $files_done of $total_files files.";
&elapsed($start_time, ' Elapsed', $verbose);
}
$files_done++;
next if $wc < 2;
my $readfh = open_to_read("$file");
while(<$readfh>){
chomp;
next if (m/^#/);
my @F = split(/\t/);
my $coverage = $F[7];
$max_coverage = $coverage > $max_coverage ? $coverage : $max_coverage;
next unless $max_coverage; # Guard against division by zero if no coverage has been seen yet (such rows have no counts to add anyway)
next if ( ($coverage / $max_coverage) < 0.01 ); # Skip very low coverage rows
# my ($pos,$ref_residue,$sample) = ($F[2],$F[3],$F[6]);
my ($pos,$ref_residue) = ($F[2],$F[3]);
my $sample = defined($sample) ? $sample : $F[6]; # If user specifies $sample, use that as the sample id for all reads. Otherwise, use the 'Sample' column value.
# Populate $cds for reference
if (exists($cds->{$type}->{$pos}) && $cds->{$type}->{$pos} ne $ref_residue){
print STDERR "Reference residue $ref_residue different from stored value for this type ($type) and position ($pos): $cds->{$type}->{$pos}\n";
} else {
$cds->{$type}->{$pos} = $ref_residue;
}
for (my $c = 20; $c < (scalar(@header) - 1); $c++){ # i.e., start in column index 20, go all the way to the second to last column. The last column is numOther, which we don't want to capture
$sample_nuc_aa_codon_tally->{$sample}->{$type}->{$pos}->{$header[$c]} += $F[$c] if ($F[$c] > 0);
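# Using += merges counts for the same sample/type/position across all of the input tally files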
}
}
# print Dumper($sample_nuc_aa_codon_tally); exit;
close($readfh);
}
}
return ($sample_nuc_aa_codon_tally, $cds);
}
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#Useful things
#my $total_lines = 0;
# $total_lines++; #somewhere in subroutine, if I run through the file first.
# my $lines_done = 0;
# $lines_done++; #in while loop, increment as each new line is processed
# if (($lines_done % 25) == 0){ #put this in the while loop too.
# print "Done processing $lines_done of $total_lines sequences.";
# &elapsed($start_time, ' Elapsed', $verbose);
# }