-
Notifications
You must be signed in to change notification settings - Fork 0
/
concat_and_trim_stefan_sanger_seqs.pl
executable file
·127 lines (104 loc) · 3.19 KB
/
concat_and_trim_stefan_sanger_seqs.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/perl
=pod
written by Will Overholt
4-3-2013
-----------------------------------------------------
PURPOSE OF SCRIPT
-----------------
Sometimes Stefan doesn't send us trimmed sequences
This script takes the .seq, .qual, and .TXT files
that are generated from phred and from ABI and uses
them to trim out the crap.
It takes as input the directory containing all the
individual sequences you are interested in trimming. It
also takes a description as to where and what the resulting
output files will be called
(e.g. ./this_scrip /path/to/input/dir /path/to/output/output_identifier_string)
When this script was written each individual sequence
had 3 files associated with it. Therefore the first step
was to make three concatenated multifasta files.
Then I call Lucy (http://lucy.sourceforge.net/) and use it
to identify bad regions.
Finally I trim the shit and output the final file (XXX_trimmed.fasta).
=cut
use warnings;
use strict;
use Bio::SeqIO; #you'll need to have the bioperl core modules installed
my $dir = $ARGV[0]; #directory containing all the sequences you want to trim
my $out = $ARGV[1]; #path and unique identifier you want to use for the sequences
opendir(DIR, $dir);
my @files = readdir(DIR);
closedir(DIR);
my @good_files;
my @fasta;
my @qual;
my @second;
#remove Linux's default directors (. and ..)
foreach (@files) {
if ($_ =~ m/[A-z]/) {
push(@good_files, $_);
}
}
#set up 3 arrays for each file type
foreach (@good_files) {
if ($_ =~ m/seq/) {
push(@fasta, $_);
}
elsif ($_ =~ m/qual/) {
push(@qual, $_);
}
elsif ($_ =~ m/TXT/) {
push(@second, $_);
}
}
@fasta = sort(@fasta);
@qual = sort(@qual);
@second = sort(@second);
open(FASTA, ">>", $out.".fasta");
open(QUAL, ">>", $out.".qual");
open(SECOND, ">>", $out.".TXT");
#for whatever reason the fasta sequences didn't have a header
#We first add the sequence name to the output file, then
#add the sequence
foreach (@fasta) {
my @fields = split('\.', $_);
my $seq_name = $fields[0];
print FASTA ">".$seq_name."\n";
open(FILE, $dir.$_);
my @seq = <FILE>;
print FASTA @seq;
close(FILE);
}
#write the qual files to a new file
foreach (@qual) {
open(FILE2, $dir.$_);
my @quality = <FILE2>;
print QUAL @quality;
close(FILE2);
}
#write the TXT files to a new file
foreach (@second) {
open(FILE3, $dir.$_);
my @seq2 = <FILE3>;
print SECOND @seq2;
close(FILE3);
}
close(FASTA);
close(QUAL);
close(SECOND);
#run lucy using default parameters
`lucy $out.fasta $out.qual $out.TXT -output $dir"lucy_out.fasta" $dir"lucy_out.qual"`;
#remove the damn spaces lucy adds to the names so Bio::SeqIO can parse the sequence headers
`sed -i 's/ /_/g' $dir"lucy_out.fasta"`;
open(FINAL, ">", $out."_trimmed.fasta");
#trim the sequences, since lucy is a little bitch and doesn't trim them herself
my $new_fasta = $dir."lucy_out.fasta";
my $seqio_obj = Bio::SeqIO->new( -file => $new_fasta, -format => "fasta");
while (my $seq_obj = $seqio_obj->next_seq) {
my $seq_name = $seq_obj->primary_id;
my @fields = split("_", $seq_name);
my $start = $fields[4];
my $end = $fields[5];
print FINAL ">".$fields[0]."\n";
print FINAL $seq_obj->subseq($start, $end)."\n";
}