Skip to content

Commit

Permalink
Merge pull request #8 from andrewjpage/master
Browse files Browse the repository at this point in the history
try to fix broken input fastas and more overall tests
  • Loading branch information
andrewjpage committed Jul 30, 2012
2 parents 805b9a7 + 63ed9ab commit 43dbedd
Show file tree
Hide file tree
Showing 33 changed files with 575 additions and 28 deletions.
3 changes: 1 addition & 2 deletions get_sequence_type
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ path-help@sanger.ac.uk
package SequenceType::Main;

BEGIN { unshift(@INC, './modules') }
use lib "/software/pathogen/internal/prod/lib";
use Moose;
use Getopt::Long;
use Cwd;
Expand Down Expand Up @@ -102,5 +103,3 @@ else
);
$multiple_fastas->create_result_files;
}

# list all available MLST databases
3 changes: 2 additions & 1 deletion modules/MLST/CompareAlleles.pm
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,9 @@ use File::Basename;
use Bio::SeqIO;
use MLST::Blast::Database;
use MLST::Blast::BlastN;
use MLST::Types;

has 'sequence_filename' => ( is => 'ro', isa => 'Str', required => 1 );
has 'sequence_filename' => ( is => 'ro', isa => 'MLST::File', required => 1 );
has 'allele_filenames' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
has 'makeblastdb_exec' => ( is => 'ro', isa => 'Str', default => 'makeblastdb' );
has 'blastn_exec' => ( is => 'ro', isa => 'Str', default => 'blastn' );
Expand Down
7 changes: 7 additions & 0 deletions modules/MLST/Exceptions.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
=head1 NAME

MLST::Exceptions - Exception classes shared by the MLST modules

=head1 SYNOPSIS

use MLST::Exceptions;
MLST::Exceptions::FileDoestExist->throw( error => 'Cant find file' );

=cut

package MLST::Exceptions;

use strict;
use warnings;

# The class name is quoted so it is always passed to Exception::Class as a
# plain string instead of relying on how a '::' bareword is parsed before '=>'.
# NOTE(review): 'FileDoestExist' looks like a typo for 'FileDoesntExist'; the
# name is kept unchanged because callers outside this file may reference it.
use Exception::Class (
    'MLST::Exceptions::FileDoestExist' => { description => 'File doesnt exist' },
);

1;
19 changes: 11 additions & 8 deletions modules/MLST/MultipleFastas.pm
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ use Moose;
use Parallel::ForkManager;
use MLST::ProcessFasta;
use MLST::Spreadsheet::File;
use MLST::NormaliseFasta;
use File::Temp;

has 'species' => ( is => 'ro', isa => 'Str', required => 1 );
has 'base_directory' => ( is => 'ro', isa => 'Str', required => 1 );
Expand All @@ -41,6 +43,7 @@ has '_input_fasta_files' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, build

has '_concat_names' => ( is => 'rw', isa => 'ArrayRef', default => sub {[]} );
has '_concat_sequences' => ( is => 'rw', isa => 'ArrayRef', default => sub {[]} );
has '_working_directory' => ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir(CLEANUP => 1); });

sub _generate_spreadsheet_rows
{
Expand Down Expand Up @@ -93,18 +96,18 @@ sub _generate_spreadsheet_rows
# Builder for the lazy '_input_fasta_files' attribute.
#
# Checks that every file in raw_input_fasta_files exists, passes each one
# through MLST::NormaliseFasta (using this object's temporary working
# directory) and collects the filename reported by processed_fasta_filename().
#
# Returns: ArrayRef of filenames, in the same order as raw_input_fasta_files.
# Dies:    if one of the input files does not exist.
sub _build__input_fasta_files
{
  my($self) = @_;
  my $working_directory = $self->_working_directory->dirname();
  my @normalised_fasta_files;

  for my $fastafile (@{$self->raw_input_fasta_files})
  {
    # Fail early with a readable message before handing the file on.
    if(!(-e $fastafile ))
    {
      die "Input file doesnt exist: $fastafile\n";
    }
    my $output_fasta_obj = MLST::NormaliseFasta->new(
      fasta_filename    => $fastafile,
      working_directory => $working_directory
    );
    push(@normalised_fasta_files,$output_fasta_obj->processed_fasta_filename());
  }

  # Return the normalised names, not the raw input list, so that the
  # normalisation above is actually used by the caller.
  return \@normalised_fasta_files;
}

sub create_result_files
Expand Down
71 changes: 71 additions & 0 deletions modules/MLST/NormaliseFasta.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
=head1 NAME
MLST::NormaliseFasta - Take in a FASTA file, check the sequence names for invalid characters and build a corrected file if needed. This is needed because NCBI makeblastdb does not accept the pipe character in a sequence name.
=head1 SYNOPSIS
use MLST::NormaliseFasta;
my $output_fasta = MLST::NormaliseFasta->new(
fasta_filename => 'Filename.fasta',
working_directory => '/path/to/working_directory'
);
$output_fasta->processed_fasta_filename();
=cut

package MLST::NormaliseFasta;
use Moose;
use Bio::SeqIO;
use File::Basename;
use MLST::Types;

has 'fasta_filename' => ( is => 'ro', isa => 'MLST::File', required => 1 );
has 'working_directory' => ( is => 'ro', isa => 'Str', required => 1 );

has '_normalised_fasta_filename' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build__normalised_fasta_filename' );

# Builder for the lazy '_normalised_fasta_filename' attribute.
#
# Reads the input FASTA sequence by sequence. As soon as a sequence id
# containing a pipe character is found, the result of _rename_sequences() is
# returned. If no id contains a pipe, the original filename is returned.
sub _build__normalised_fasta_filename
{
  my($self) = @_;
  my $source_filename = $self->fasta_filename;
  my $reader = Bio::SeqIO->new( -file => $source_filename , -format => 'Fasta');

  while(my $record = $reader->next_seq())
  {
    next unless($record->id =~ m/\|/);
    return $self->_rename_sequences();
  }

  return $source_filename;
}

# Copies the input FASTA into the working directory, replacing every sequence
# id with a running number starting at 1.
#
# The output path is the working directory joined with the base name of the
# input file, so inputs that share a base name and a working directory also
# share an output path.
#
# Returns the path of the file that was written.
sub _rename_sequences
{
  my($self) = @_;
  my $reader = Bio::SeqIO->new( -file => $self->fasta_filename , -format => 'Fasta');
  my($filename, $directories, $suffix) = fileparse($self->fasta_filename);
  my $output_filename = join('/', $self->working_directory, $filename.$suffix);
  my $writer = Bio::SeqIO->new(-file => "+>".$output_filename , -format => 'Fasta');

  my $sequence_number = 0;
  while(my $record = $reader->next_seq())
  {
    $sequence_number++;
    # Appending "" passes the number to id() as a string.
    $record->id($sequence_number."");
    $writer->write_seq($record);
  }
  return $output_filename;
}

# Public accessor: returns the value of the lazily built
# '_normalised_fasta_filename' attribute.
sub processed_fasta_filename
{
  my $self = shift;
  return $self->_normalised_fasta_filename;
}

no Moose;
__PACKAGE__->meta->make_immutable;
1;

3 changes: 2 additions & 1 deletion modules/MLST/OutputFasta.pm
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,12 @@ use File::Basename;
use File::Path qw(make_path);
use Bio::PrimarySeq;
use Bio::SeqIO;
use MLST::Types;

has 'matching_sequences' => ( is => 'ro', isa => 'Maybe[HashRef]', required => 1 );
has 'non_matching_sequences' => ( is => 'ro', isa => 'Maybe[HashRef]', required => 1 );
has 'output_directory' => ( is => 'ro', isa => 'Str', required => 1 );
has 'input_fasta_file' => ( is => 'ro', isa => 'Str', required => 1 );
has 'input_fasta_file' => ( is => 'ro', isa => 'MLST::File', required => 1 );

has '_fasta_filename' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build__fasta_filename' );
has 'concat_sequence' => ( is => 'rw', isa => 'Maybe[Str]' );
Expand Down
3 changes: 2 additions & 1 deletion modules/MLST/ProcessFasta.pm
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,11 @@ use MLST::CompareAlleles;
use MLST::SequenceType;
use MLST::OutputFasta;
use MLST::Spreadsheet::Row;
use MLST::Types;

has 'species' => ( is => 'ro', isa => 'Str', required => 1 );
has 'base_directory' => ( is => 'ro', isa => 'Str', required => 1 );
has 'fasta_file' => ( is => 'ro', isa => 'Str', required => 1 );
has 'fasta_file' => ( is => 'ro', isa => 'MLST::File', required => 1 );
has 'makeblastdb_exec' => ( is => 'ro', isa => 'Str', required => 1 );
has 'blastn_exec' => ( is => 'ro', isa => 'Str', required => 1 );
has 'output_directory' => ( is => 'ro', isa => 'Str', required => 1 );
Expand Down
3 changes: 2 additions & 1 deletion modules/MLST/SearchForFiles.pm
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,12 @@ $search_results->profiles_filename();

package MLST::SearchForFiles;
use Moose;
use MLST::Types;

has 'species_name' => ( is => 'ro', isa => 'Str', required => 1 );
has 'base_directory' => ( is => 'ro', isa => 'Str', required => 1 );

has 'profiles_filename' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build_profiles_filename');
has 'profiles_filename' => ( is => 'ro', isa => 'MLST::File', lazy => 1, builder => '_build_profiles_filename');
has 'allele_filenames' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build_allele_filenames');
has 'search_base_directory' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build__search_base_directory');

Expand Down
3 changes: 2 additions & 1 deletion modules/MLST/SequenceType.pm
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,9 @@ $st->sequence_type();

package MLST::SequenceType;
use Moose;
use MLST::Types;

has 'profiles_filename' => ( is => 'ro', isa => 'Str', required => 1 );
has 'profiles_filename' => ( is => 'ro', isa => 'MLST::File', required => 1 );
has 'sequence_names' => ( is => 'ro', isa => 'ArrayRef', required => 1 );

has 'allele_to_number' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_allele_to_number' );
Expand Down
5 changes: 5 additions & 0 deletions modules/MLST/Types.pm
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,16 @@ package MLST::Types;
use Moose;
use Moose::Util::TypeConstraints;
use MLST::Validate::Executable;
use MLST::Validate::File;

subtype 'MLST::Executable',
as 'Str',
where { MLST::Validate::Executable->new()->does_executable_exist($_) };

subtype 'MLST::File',
as 'Str',
where { MLST::Validate::File->new()->does_file_exist($_) };

no Moose;
no Moose::Util::TypeConstraints;
__PACKAGE__->meta->make_immutable;
Expand Down
22 changes: 22 additions & 0 deletions modules/MLST/Validate/File.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
=head1 NAME
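MLST::Validate::File - Check whether a file exists
=head1 SYNOPSIS
use MLST::Validate::File;
MLST::Validate::File->new()->does_file_exist('Filename.fasta');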
=cut

package MLST::Validate::File;
use Moose;

# Returns 1 if the given path passes Perl's -e (exists) file test, 0 otherwise.
sub does_file_exist
{
  my($self, $file) = @_;
  return (-e $file) ? 1 : 0;
}

no Moose;
__PACKAGE__->meta->make_immutable;
1;
32 changes: 32 additions & 0 deletions t/Input/NormaliseFasta.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/usr/bin/env perl
use strict;
use warnings;
use File::Temp;
use Bio::SeqIO;

BEGIN { unshift(@INC, './modules') }
BEGIN {
use Test::Most;
use_ok('MLST::NormaliseFasta');
}

# Scratch directory that receives any rewritten FASTA file.
my $tmpdirectory_obj = File::Temp->newdir(CLEANUP => 1);
my $tmpdirectory = $tmpdirectory_obj->dirname();

# Case 1: no pipe characters in the sequence names, so the original filename
# is expected back.
ok((my $output_fasta = MLST::NormaliseFasta->new(
fasta_filename => 't/data/contigs.fa',
working_directory => $tmpdirectory
)),'Initalise file wihtout pipe characters in sequence names');
is($output_fasta->processed_fasta_filename(),'t/data/contigs.fa', 'file without pipe characters shouldnt change at all');


# Case 2: pipe characters in the sequence names, so a renumbered copy is
# expected in the working directory.
ok(($output_fasta = MLST::NormaliseFasta->new(
fasta_filename => 't/data/contigs_pipe_character_in_seq_name.fa',
working_directory => $tmpdirectory
)),'Initialise file with pipe characters in sequence names');
is($output_fasta->processed_fasta_filename(), $tmpdirectory.'/contigs_pipe_character_in_seq_name.fa', 'file with pipe characters should be rewritten into the working directory');
ok((my $in_fasta_obj = Bio::SeqIO->new( -file => $tmpdirectory.'/contigs_pipe_character_in_seq_name.fa' , -format => 'Fasta')), 'Open temp fasta file');
is($in_fasta_obj->next_seq()->id, '1', 'seq name now 1');
is($in_fasta_obj->next_seq()->id, '2', 'seq name now 2');
is($in_fasta_obj->next_seq()->id, '3', 'seq name now 3');
done_testing();
83 changes: 83 additions & 0 deletions t/Output/MultipleFastas.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#!/usr/bin/env perl
use strict;
use warnings;
use File::Temp;

BEGIN { unshift(@INC, './modules') }
BEGIN {
use Test::Most;
use_ok('MLST::MultipleFastas');
}

# Each scenario below writes its results into a fresh temporary output
# directory and compares the generated files with expected copies in t/data.

# Scenario 1: a single FASTA file.
my $tmpdirectory_obj = File::Temp->newdir(CLEANUP => 1);
my $tmpdirectory = $tmpdirectory_obj->dirname();

ok((my $multiple_fastas = MLST::MultipleFastas->new(
species => "E.coli",
base_directory => 't/data',
raw_input_fasta_files => ['t/data/contigs.fa'],
makeblastdb_exec => 'makeblastdb',
blastn_exec => 'blastn',
output_directory => $tmpdirectory,
output_fasta_files => 1,
spreadsheet_basename => 'mlst_results',
parallel_processes => 1
)),'Initialise single valid fasta');
ok(($multiple_fastas->create_result_files),'create all the results files for a single valid fasta');
compare_files('t/data/expected_mlst_results.genomic.csv', $tmpdirectory.'/mlst_results.genomic.csv');
compare_files('t/data/expected_mlst_results.allele.csv', $tmpdirectory.'/mlst_results.allele.csv');
compare_files('t/data/expected_concatenated_alleles.fa', $tmpdirectory.'/concatenated_alleles.fa');

# Scenario 2: two FASTA files; the second one has pipe characters in its
# sequence names.
$tmpdirectory_obj = File::Temp->newdir(CLEANUP => 1);
$tmpdirectory = $tmpdirectory_obj->dirname();
ok(($multiple_fastas = MLST::MultipleFastas->new(
species => "E.coli",
base_directory => 't/data',
raw_input_fasta_files => ['t/data/contigs.fa','t/data/contigs_pipe_character_in_seq_name.fa'],
makeblastdb_exec => 'makeblastdb',
blastn_exec => 'blastn',
output_directory => $tmpdirectory,
output_fasta_files => 1,
spreadsheet_basename => 'mlst_results',
parallel_processes => 1
)),'Initialise 2 files, one with pipe char and no hits');
ok(($multiple_fastas->create_result_files),'create all the results files for two fastas');
compare_files('t/data/expected_two_mlst_results.genomic.csv', $tmpdirectory.'/mlst_results.genomic.csv');
compare_files('t/data/expected_two_mlst_results.allele.csv', $tmpdirectory.'/mlst_results.allele.csv');
compare_files('t/data/expected_two_concatenated_alleles.fa', $tmpdirectory.'/concatenated_alleles.fa');


# Scenario 3: three FASTA files; in addition to the spreadsheets and the
# concatenated alleles, two unknown-allele FASTA files are compared.
$tmpdirectory_obj = File::Temp->newdir(CLEANUP => 1);
$tmpdirectory = $tmpdirectory_obj->dirname();
ok(($multiple_fastas = MLST::MultipleFastas->new(
species => "E.coli",
base_directory => 't/data',
raw_input_fasta_files => ['t/data/contigs.fa','t/data/contigs_pipe_character_in_seq_name.fa','t/data/contigs_one_unknown.tfa'],
makeblastdb_exec => 'makeblastdb',
blastn_exec => 'blastn',
output_directory => $tmpdirectory,
output_fasta_files => 1,
spreadsheet_basename => 'mlst_results',
parallel_processes => 1
)),'Initialise 3 files where 1 has near matches');
ok(($multiple_fastas->create_result_files),'create all the results files for three fastas');
compare_files('t/data/expected_three_mlst_results.genomic.csv', $tmpdirectory.'/mlst_results.genomic.csv');
compare_files('t/data/expected_three_mlst_results.allele.csv', $tmpdirectory.'/mlst_results.allele.csv');
compare_files('t/data/expected_three_concatenated_alleles.fa', $tmpdirectory.'/concatenated_alleles.fa');
compare_files('t/data/expected_three_contigs_one_unknown.unknown_allele.adk-2.fa', $tmpdirectory.'/contigs_one_unknown.unknown_allele.adk-2.fa');
compare_files('t/data/expected_three_contigs_one_unknown.unknown_allele.recA-1.fa', $tmpdirectory.'/contigs_one_unknown.unknown_allele.recA-1.fa');


done_testing();

# Test helper: asserts that the results file exists and that its content is
# identical to the content of the expected file.
#
# Arguments: ($expected_file, $actual_file) - paths of the two files.
# Returns:   the result of the final content assertion, or the (false) result
#            of fail() when one of the files cannot be opened.
sub compare_files
{
  my($expected_file, $actual_file) = @_;
  ok((-e $actual_file),' results file exist');

  my @contents;
  for my $file ($expected_file, $actual_file)
  {
    # Lexical handle and 3-arg open. An unreadable file is reported as a test
    # failure instead of both sides silently comparing as undef.
    open(my $fh, '<', $file) or return fail("Could not open $file: $!");
    local $/ = undef; # slurp the whole file in one read
    push(@contents, scalar(<$fh>));
    close($fh);
  }

  my($expected_content, $actual_content) = @contents;
  # is() takes the value under test first and the expected value second.
  is($actual_content, $expected_content, 'Content matches expected');
}
Loading

0 comments on commit 43dbedd

Please sign in to comment.