-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFAlite.pm
executable file
·129 lines (96 loc) · 2.71 KB
/
FAlite.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
package FAlite;
use strict;
# Construct a FAlite reader around an already-open filehandle.
#
# Arguments:
#   $class - class name (or an existing object, whose class is reused)
#   $fh    - filehandle reference: a GLOB reference such as \*STDIN or a
#            lexical handle, a blessed glob such as an IO::File object, or
#            an IO reference
#
# Returns a blessed hash with the handle stored under FH.  Leading blank
# lines are skipped and the first definition line is stored, chomped, under
# LASTLINE.  When the input is empty or does not begin with '>', a warning
# is issued and the object is returned without LASTLINE.
#
# Dies when $fh is not a filehandle reference.
sub new {
    my ($class, $fh) = @_;

    # Note: "ref $fh !~ /GLOB/" parses as ref($fh !~ /GLOB/) because !~ binds
    # tighter than a named unary operator, so that test could never fire.
    # reftype() looks through blessing, so handle objects are still accepted.
    require Scalar::Util;
    my $type = Scalar::Util::reftype($fh);
    if (not defined $type or $type !~ /\A(?:GLOB|IO)\z/) {
        die ref($fh), "\n", "FAlite ERROR: expect a GLOB reference\n";
    }

    # Two-argument bless so that subclasses get objects of their own class.
    my $this = bless {}, ref($class) || $class || __PACKAGE__;
    $this->{FH} = $fh;

    # Skip leading blank lines (not supposed to be there, but...).  A lexical
    # line variable keeps the caller's $_ intact.
    my $firstline;
    while (defined(my $line = <$fh>)) {
        next if $line !~ /\S/;
        $firstline = $line;
        last;
    }

    if (not defined $firstline) {
        warn "FAlite: Empty\n";
        return $this;
    }
    if ($firstline !~ /^>/) {
        warn "FAlite: Not FASTA formatted\n";
        return $this;
    }

    chomp $firstline;
    $this->{LASTLINE} = $firstline;
    return $this;
}
# Read the next FASTA record from the underlying filehandle.
#
# The definition line of the record was already consumed by the previous
# call (or by the constructor) and is waiting in LASTLINE.  Sequence lines
# are read until the next definition line, which is stashed in LASTLINE for
# the following call, or until end of input.
#
# Returns a FAlite::Entry, or 0 when there are no more records.  A record
# that consists of a definition line only is returned with an empty
# sequence, whether it appears in the middle or at the very end of the input.
sub nextEntry {
    my ($this) = @_;
    return 0 if not defined $this->{LASTLINE};

    my $fh  = $this->{FH};
    my $def = $this->{LASTLINE};

    # Assume this is the final record until another definition line shows up;
    # an undefined LASTLINE is what ends the iteration on the next call.
    $this->{LASTLINE} = undef;

    my @seq;
    # A lexical line variable keeps the caller's $_ intact.
    while (defined(my $line = <$fh>)) {
        chomp $line;
        if ($line =~ /^>/) {
            $this->{LASTLINE} = $line;
            last;
        }
        push @seq, $line;
    }

    # FAlite::Entry::new is a plain function taking (definition, array ref).
    return FAlite::Entry::new($def, \@seq);
}
package FAlite::Entry;
use overload '""' => 'all';

# Build an entry from a definition line and a reference to an array of
# sequence lines.  This is called as a plain function (no class argument);
# the object is blessed into the current package.
sub new {
    my ($def, $seqarry) = @_;

    # Glue the lines together, then drop any whitespace left inside them.
    (my $residues = join("", @{$seqarry})) =~ s/\s//g;

    return bless {
        DEF => $def,
        SEQ => $residues,
    };
}

# Accessor: the definition line.
sub def {
    my ($entry) = @_;
    return $entry->{DEF};
}

# Accessor: the sequence with all whitespace removed.
sub seq {
    my ($entry) = @_;
    return $entry->{SEQ};
}

# The whole record as text: definition line, newline, sequence, newline.
# Also used for string overloading.
sub all {
    my ($entry) = @_;
    return $entry->{DEF} . "\n" . $entry->{SEQ} . "\n";
}
1;
__END__
=head1 NAME
FAlite - lightweight parser for FASTA files and databases
=head1 SYNOPSIS
use FAlite;
my $fasta = FAlite->new(\*STDIN);
while(my $entry = $fasta->nextEntry) {
$entry->def;
$entry->seq;
}
=head1 DESCRIPTION
FAlite is a package for parsing FASTA files and databases. The FASTA format is
widely used in bioinformatics. It consists of a definition line followed by
sequence with an arbitrary number of lines and line lengths.
A FASTA file looks like this:
>identifier descriptive text
GAATTC
A FASTA database looks like this:
>identifier1 some text describing this entry
GAATTC
ACTAGT
>identifier2 some text describing this entry
AAACCT
GCTAAT
=head2 Object
FAlite has two kinds of objects, the file and the entry.
my $fasta_file = FAlite->new(\*STDIN); # or any other filehandle
$entry = $fasta_file->nextEntry; # single fasta file
while(my $entry = $fasta_file->nextEntry) {
# canonical form of use for fasta database
}
The entry has two attributes (def and seq).
$entry->def; # access the def line
$entry->seq; # access the sequence
"$entry"; # overload to fasta file ($entry->def . "\n" . $entry->seq . "\n")
=head1 AUTHOR
Ian Korf (ikorf@sapiens.wustl.edu, http://sapiens.wustl.edu/~ikorf)
=head1 ACKNOWLEDGEMENTS
This software was developed at the Genome Sequencing Center at Washington
University, St. Louis, MO.
=head1 COPYRIGHT
Copyright (C) 1999 Ian Korf. All Rights Reserved.
=head1 DISCLAIMER
This software is provided "as is" without warranty of any kind.
=cut