Skip to content

Commit 1dd9f0d

Browse files
committed
Moved optimized \n and \r removal subroutine from Bio::DB::Fasta
to a more central location at Bio::DB::IndexedBase, from where its used by Bio::DB::Fasta and Bio::DB::Qual. Also made a small tweak to nhx.t and updated Changes.
1 parent 47a3c45 commit 1dd9f0d

File tree

6 files changed

+53
-49
lines changed

6 files changed

+53
-49
lines changed

Bio/DB/Fasta.pm

+2-42
Original file line numberDiff line numberDiff line change
@@ -139,46 +139,6 @@ use base qw(Bio::DB::IndexedBase);
139139
our $obj_class = 'Bio::PrimarySeq::Fasta';
140140
our $file_glob = '*.{fa,FA,fasta,FASTA,fast,FAST,dna,DNA,fna,FNA,faa,FAA,fsa,FSA}';
141141

142-
# Compiling the below regular expressions speeds up the Pure Perl
143-
# seq/subseq() by about 7% from 7.76s to 7.22s over 32358 calls on
144-
# Variant Effect Prediction data.
145-
my $nl = qr/\n/;
146-
my $cr = qr/\r/;
147-
148-
# Remove carriage returns (\r) and newlines (\n) from a string. When
149-
# called from subseq, this can take a signficiant portion of time, in
150-
# Variant Effect Prediction. Therefore we compile the match
151-
# portion.
152-
sub strip_crnl {
153-
my $str = shift;
154-
$str =~ s/$nl//g;
155-
$str =~ s/$cr//g;
156-
return $str;
157-
}
158-
159-
# C can do perfrom strip_crnl much faster. But this requires the
160-
# Inline::C module which we don't require people to have. So we make
161-
# this optional by wrapping the C code in an eval. If the eval works,
162-
# the Perl strip_crnl() function is overwritten.
163-
eval q{
164-
use Inline C => <<'END_OF_C_CODE';
165-
/* Strip all new line (\n) and carriage return (\r) characters
166-
from string str
167-
*/
168-
char* strip_crnl(char* str) {
169-
char *s;
170-
char *s2 = str;
171-
for (s = str; *s; *s++) {
172-
if (*s != '\n' && *s != '\r') {
173-
*s2++ = *s;
174-
}
175-
}
176-
*s2 = '\0';
177-
return str;
178-
}
179-
END_OF_C_CODE
180-
};
181-
182142

183143
=head2 new
184144
@@ -329,7 +289,7 @@ sub subseq {
329289
seek($fh, $filestart,0);
330290
read($fh, $data, $filestop-$filestart+1);
331291

332-
$data = strip_crnl($data);
292+
$data = Bio::DB::IndexedBase::_strip_crnl($data);
333293

334294
if ($strand == -1) {
335295
# Reverse-complement the sequence
@@ -371,7 +331,7 @@ sub header {
371331
read($fh, $data, $headerlen);
372332
# On Windows chomp remove '\n' but leaves '\r'
373333
# when reading '\r\n' in binary mode
374-
$data = strip_crnl($data);
334+
$data = Bio::DB::IndexedBase::_strip_crnl($data);
375335
substr($data, 0, 1) = '';
376336
return $data;
377337
}

Bio/DB/IndexedBase.pm

+40
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,46 @@ use constant DIE_ON_MISSMATCHED_LINES => 1;
268268
# you can avoid dying if you want but you may get incorrect results
269269

270270

271+
# Compiling the below regular expressions speeds up the Pure Perl
272+
# seq/subseq() from Bio::DB::Fasta by about 7% from 7.76s to 7.22s
273+
# over 32358 calls on Variant Effect Prediction data.
274+
my $nl = qr/\n/;
275+
my $cr = qr/\r/;
276+
277+
# Remove carriage returns (\r) and newlines (\n) from a string. When
278+
# called from subseq, this can take a signficiant portion of time, in
279+
# Variant Effect Prediction. Therefore we compile the match portion.
280+
sub _strip_crnl {
281+
my $str = shift;
282+
$str =~ s/$nl//g;
283+
$str =~ s/$cr//g;
284+
return $str;
285+
}
286+
287+
# C can do perfrom _strip_crnl much faster. But this requires the
288+
# Inline::C module which we don't require people to have. So we make
289+
# this optional by wrapping the C code in an eval. If the eval works,
290+
# the Perl strip_crnl() function is overwritten.
291+
eval q{
292+
use Inline C => <<'END_OF_C_CODE';
293+
/* Strip all new line (\n) and carriage return (\r) characters
294+
from string str
295+
*/
296+
char* _strip_crnl(char* str) {
297+
char *s;
298+
char *s2 = str;
299+
for (s = str; *s; *s++) {
300+
if (*s != '\n' && *s != '\r') {
301+
*s2++ = *s;
302+
}
303+
}
304+
*s2 = '\0';
305+
return str;
306+
}
307+
END_OF_C_CODE
308+
};
309+
310+
271311
=head2 new
272312
273313
Title : new

Bio/DB/Qual.pm

+4-5
Original file line numberDiff line numberDiff line change
@@ -335,8 +335,7 @@ sub subqual {
335335
read($fh, $data, $filestop-$filestart+1);
336336

337337
# Process quality score
338-
$data =~ s/\n//g;
339-
$data =~ s/\r//g;
338+
Bio::DB::IndexedBase::_strip_crnl($data);
340339
my $subqual = 0;
341340
$subqual = 1 if ( $start || $stop );
342341
my @data;
@@ -379,9 +378,9 @@ sub header {
379378
seek($fh, $offset, 0);
380379
read($fh, $data, $headerlen);
381380
# On Windows chomp remove '\n' but leaves '\r'
382-
# when reading '\r\n' in binary mode
383-
$data =~ s/\n//g;
384-
$data =~ s/\r//g;
381+
# when reading '\r\n' in binary mode,
382+
# _strip_crnl removes both
383+
$data = Bio::DB::IndexedBase::_strip_crnl($data);
385384
substr($data, 0, 1) = '';
386385
return $data;
387386
}

Build.PL

+1
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ my %recommends = (
107107

108108
'Inline::C' => [0.67,
109109
'Speeding up code like Fasta Bio::DB::Fasta'],
110+
110111
'IO::Scalar' => [0,
111112
'Deal with non-seekable filehandles/Bio::Tools::GuessSeqFormat'],
112113

Changes

+2
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ CPAN releases are branched from 'master'.
3434
* Issue #81: Small updates to make sure possible memory leaks are detected [cjfields]
3535
* Issue #84: EMBL format wrapping problem [nyamned]
3636
* Issue #90: Missing entries for translation tables 24 and 25 [fjossandon]
37+
* Issue #95: Speed up of Bio::DB::Fasta::subseq by using a compiled regex
38+
or compiled C code (when Inline::C is installed) [rocky]
3739
* Fix various Bio::Tools::Analysis remote server config problems [cjfields]
3840
* Added several missing 'Data::Stag' and 'LWP::UserAgent' requirements [fjossandon]
3941
* Added a workaround in Bio::DB::Registry to get Username in Windows [fjossandon]

t/Tree/TreeIO/nhx.t

+4-2
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ BEGIN {
1313
}
1414

1515
my $verbose = 0; #test_debug();
16+
my $nl = qr/\n/;
17+
my $cr = qr/\r/;
1618

1719
my $treeio = Bio::TreeIO->new(
1820
-format => 'nhx',
@@ -85,8 +87,8 @@ sub read_file {
8587
binmode $IN;
8688
$string = <$IN>;
8789
close $IN;
88-
$string =~ s/\n//g;
89-
$string =~ s/\r//g; # For files with Windows line-endings
90+
$string =~ s/$nl//g;
91+
$string =~ s/$cr//g; # For files with Windows line-endings
9092
#print STDERR "STR: $string\n";
9193
return $string;
9294
}

0 commit comments

Comments
 (0)