-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathfilter_seq
executable file
·157 lines (125 loc) · 3.01 KB
/
filter_seq
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/usr/bin/env perl
use warnings;
use strict;
use FileHandle;
my $USAGE = "Usage: filter_seq [-index seq.fa] [-v] subset.fa all.fa\n";
my $HELPTEXT = qq~
Extract specified fasta records listed in subset.fa from a master file all.fa.
If available, use the index file all.fa.idx to allow random access. Create the index file
by running 'filter_seq -index all.fa'.
$USAGE
Options
-------
-index Create an index file of the fasta file\n
-v skip everything listed in the subset.fa file\n
~;
my $createindex = 0;
my $skiplisted = 0;
my $good = shift @ARGV or die $USAGE;
if ($good eq "-help")
{
die $HELPTEXT;
}
elsif ($good eq "-index")
{
$createindex = 1;
$good = shift @ARGV or die $USAGE;
}
elsif ($good eq "-v")
{
$skiplisted = 1;
$good = shift @ARGV or die $USAGE;
print STDERR "Skipping everything listed in $good\n";
}
if ($createindex)
{
my $orig = new FileHandle "$good", "<"
or die("Can't open $good ($!)");
open IDX, "> $good.idx"
or die("Can't open $good.idx ($!)");
while (!$orig->eof())
{
my $pos = $orig->tell();
my $line = $orig->getline();
if ($line =~ /^\>(\S+)/)
{
print IDX "$1 $pos\n";
}
}
close IDX;
}
else
{
my $copy = shift @ARGV or die $USAGE;
my %sequencelist;
## Find the seqnames from the good list
open GOOD, "< $good"
or die("Could't open $good ($!)");
while (<GOOD>)
{
if (/^\#(\S+)\(/ || /^\>(\S+)/)
{
$sequencelist{$1} = 1;
}
}
close GOOD;
if ((!$skiplisted) && (-r "$copy.idx"))
{
## Create the index as: grep -b '>' tvg2.qual | tr -d ':' | tr '>' ' ' | awk '{print $2" "$1}' > tvg2.qual.idx
my %offsettable;
open IDX, "< $copy.idx"
or die("Couldnt open $copy.idx ($!)");
while (<IDX>)
{
my @val = split / /, $_;
$offsettable{$val[0]} = $val[1]
if (exists $sequencelist{$val[0]});
}
close IDX;
my $copy = new FileHandle "$copy", "r"
or die("Couldnt open $copy ($!)");
foreach my $seqname (keys %sequencelist)
{
if (exists $offsettable{$seqname})
{
$sequencelist{$seqname} = 0;
$copy->seek($offsettable{$seqname}, 0);
## Print the headerline for sure
my $line = $copy->getline();
print $line;
## loop until next record
$line = $copy->getline();
while ($line !~ /^>/)
{
print $line;
last if $copy->eof();
$line = $copy->getline();
}
}
}
}
else
{
## Pull the sequences out of the copy file
my $printid = 0;
open COPY, "< $copy"
or die("Couldnt open $copy ($!)");
while (<COPY>)
{
if (/^\>(\S+)/)
{
$printid = $sequencelist{$1};
if ($skiplisted) { $printid = !$printid; }
$sequencelist{$1} = 0;
}
print $_ if $printid;
}
close COPY;
}
## Make sure we found each id
foreach my $seqname (keys %sequencelist)
{
print STDERR "$seqname in $good but not in $copy"
if ($sequencelist{$seqname});
}
}