Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bugfixes/merge fixes 110 #833

Merged
merged 10 commits into from
Oct 16, 2023
68 changes: 68 additions & 0 deletions modules/Bio/EnsEMBL/Production/Pipeline/Common/Gzip.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
=head1 LICENSE

Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
Copyright [2016-2023] EMBL-European Bioinformatics Institute

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

=head1 NAME

Bio::EnsEMBL::Production::Pipeline::Common::Gzip

=head1 DESCRIPTION

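A simple pipeline module that gzips each file given in the 'compress' parameter (a single path or an array of paths), reports any compression errors, and removes the original file after successful compression.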

=head1 AUTHOR

ckong@ebi.ac.uk

=cut
package Bio::EnsEMBL::Production::Pipeline::Common::Gzip;;

use strict;
use warnings;
use base qw/Bio::EnsEMBL::Production::Pipeline::Common::Base/;
use IO::Compress::Gzip qw(gzip $GzipError) ;

# Input-fetching hook. This module needs no preparation before run(), so the
# method deliberately does nothing and returns an empty value.
sub fetch_input {
    my $self = shift;
    return;
}

# Compress every file named by the 'compress' parameter.
#
# 'compress' may be a single path or an array reference of paths. Each file is
# gzipped to "<file>.gz" and the original is removed once compression has
# succeeded. A failure on one file is reported on STDOUT and does not stop the
# remaining files from being processed.
#
# Returns nothing.
sub run {
    my ($self) = @_;

    # Read the parameter once; accept either a single path or a list of paths.
    my $compress = $self->param_required('compress');
    my @compress = ref($compress) eq 'ARRAY' ? @{$compress} : ($compress);

    foreach my $file (@compress) {
        my $output_file = $file . '.gz';

        # The trailing "1" lets us detect failure from eval's return value,
        # which stays reliable even if $@ is clobbered during unwinding.
        my $ok = eval {
            local $SIG{PIPE} = sub { die "gzip interrupted by SIGPIPE\n" };
            gzip $file => $output_file
                or die "gzip failed: $GzipError\n";

            # Only report the original as removed if unlink really succeeded.
            unlink $file
                or die "compressed to '$output_file' but could not remove the original file: $!\n";
            1;
        };

        if ($ok) {
            print "Compressed '$file' to '$output_file' and removed the original file\n";
        }
        else {
            my $error = $@ || "unknown error\n";
            print "Error compressing '$file': $error\n";
        }
    }
    return;
}

1;
13 changes: 7 additions & 6 deletions modules/Bio/EnsEMBL/Production/Pipeline/Flatfile/DumpFile.pm
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,8 @@ sub run {
push(@chromosomes, $s) if $chr;
push(@non_chromosomes, $s) if ! $chr;
}

my @compress = ();

if(@non_chromosomes) {
my $path = $self->_generate_file_name('nonchromosomal');
$self->info('Dumping non-chromosomal data to %s', $path);
Expand All @@ -123,12 +124,11 @@ sub run {
}
return;
});
$self->run_cmd("gzip -n $path");
push (@compress, $path);
} else {
$self->info('Did not find any non-chromosomal data');
}

my @compress = ();
foreach my $slice (@chromosomes) {
$self->fine('Dumping chromosome %s', $slice->name());
my $path = $self->_generate_file_name($slice->coord_system_name(), $slice->seq_region_name());
Expand All @@ -147,9 +147,10 @@ sub run {
return;
});
}

map { $self->run_cmd("gzip -n $_") } @compress;

unless (@compress == 0) {
$self->dataflow_output_id(
{ "compress" => \@compress }, 4);
}
$self->_create_README();
$self->core_dbc()->disconnect_if_idle();
$self->hive_dbc()->disconnect_if_idle();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -363,30 +363,35 @@ sub pipeline_analyses {
-parameters => { type => 'embl', },
-hive_capacity => 50,
-rc_name => '4GB',
-flow_into => { '-1' => 'embl_32GB', },
-flow_into => { '-1' => 'embl_32GB',
'4' => 'compress_file' },
},

{ -logic_name => 'embl_32GB',
-module => 'Bio::EnsEMBL::Production::Pipeline::Flatfile::DumpFile',
-parameters => { type => 'embl', },
-hive_capacity => 50,
-rc_name => '32GB',
-flow_into => { '-1' => 'embl_64GB', },
-flow_into => { '-1' => 'embl_64GB',
'4' => 'compress_file' },

},

{ -logic_name => 'embl_64GB',
-module => 'Bio::EnsEMBL::Production::Pipeline::Flatfile::DumpFile',
-parameters => { type => 'embl', },
-hive_capacity => 50,
-rc_name => '64GB',
-flow_into => { '-1' => 'embl_128GB', },
-flow_into => { '-1' => 'embl_128GB',
'4' => 'compress_file' },
},

{ -logic_name => 'embl_128GB',
-module => 'Bio::EnsEMBL::Production::Pipeline::Flatfile::DumpFile',
-parameters => { type => 'embl', },
-hive_capacity => 50,
-rc_name => '128GB',
-flow_into => { '4' => 'compress_file' },
},

### GENBANK
Expand All @@ -395,30 +400,34 @@ sub pipeline_analyses {
-parameters => { type => 'genbank', },
-hive_capacity => 50,
-rc_name => '4GB',
-flow_into => { -1 => 'genbank_32GB', },
-flow_into => { -1 => 'genbank_32GB',
'4' => 'compress_file' },
},

{ -logic_name => 'genbank_32GB',
-module => 'Bio::EnsEMBL::Production::Pipeline::Flatfile::DumpFile',
-parameters => { type => 'genbank', },
-hive_capacity => 50,
-rc_name => '32GB',
-flow_into => { -1 => 'genbank_64GB', },
-flow_into => { -1 => 'genbank_64GB',
'4' => 'compress_file' },
},

{ -logic_name => 'genbank_64GB',
-module => 'Bio::EnsEMBL::Production::Pipeline::Flatfile::DumpFile',
-parameters => { type => 'genbank', },
-hive_capacity => 50,
-rc_name => '64GB',
-flow_into => { -1 => 'genbank_128GB', },
-flow_into => { -1 => 'genbank_128GB',
'4' => 'compress_file' },
},

{ -logic_name => 'genbank_128GB',
-module => 'Bio::EnsEMBL::Production::Pipeline::Flatfile::DumpFile',
-parameters => { type => 'genbank', },
-hive_capacity => 50,
-rc_name => '128GB',
-flow_into => { '4' => 'compress_file' },
},

### FASTA (cdna, cds, dna, pep, ncrna)
Expand Down Expand Up @@ -585,6 +594,7 @@ sub pipeline_analyses {
},
-hive_capacity => 50,
-rc_name => '2GB',
-flow_into => { '4' => 'compress_file', },
},

{ -logic_name => 'tsv_refseq',
Expand All @@ -595,6 +605,7 @@ sub pipeline_analyses {
},
-hive_capacity => 50,
-rc_name => '2GB',
-flow_into => { '4' => 'compress_file', },
},

{ -logic_name => 'tsv_entrez',
Expand All @@ -605,19 +616,28 @@ sub pipeline_analyses {
},
-hive_capacity => 50,
-rc_name => '2GB',
-flow_into => { '4' => 'compress_file', },
},


{ -logic_name => 'tsv_ena',
-module => 'Bio::EnsEMBL::Production::Pipeline::TSV::DumpFileEna',
-hive_capacity => 50,
-rc_name => '2GB',
-flow_into => { '4' => 'compress_file', },
},

{ -logic_name => 'tsv_metadata',
-module => 'Bio::EnsEMBL::Production::Pipeline::TSV::DumpFileMetadata',
-hive_capacity => 50,
-rc_name => '2GB',
-flow_into => { '4' => 'compress_file', },

},
{ -logic_name => 'compress_file',
-module => 'Bio::EnsEMBL::Production::Pipeline::Common::Gzip',
-hive_capacity => 50,
-rc_name => '4GB',
},

];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -201,14 +201,13 @@ sub pipeline_analyses {
},
{
-logic_name => 'MySQL_Compress',
-module => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
-module => 'Bio::EnsEMBL::Production::Pipeline::Common::Gzip',
-max_retry_count => 1,
-analysis_capacity => 10,
-batch_size => 10,
-parameters => {
cmd => 'gzip -n -f "#output_filename#"',
compress => "#output_filename#",
},
-rc_name => '1GB',
},
{
-logic_name => 'Checksum',
Expand Down
9 changes: 1 addition & 8 deletions modules/Bio/EnsEMBL/Production/Pipeline/TSV/DumpFile.pm
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ return;

sub run {
my ($self) = @_;

$self->info( "Starting tsv dump for " . $self->param('species'));
$self->_write_tsv();
$self->_create_README();
Expand Down Expand Up @@ -115,14 +114,8 @@ sub _write_tsv {
}#transcript
}#gene
}#slice
close $fh;
close $fh;
$self->core_dbc()->disconnect_if_idle();
$self->info( "Compressing tsv dump for " . $self->param('species'));
my $unzip_out_file = $out_file;
`gzip -n $unzip_out_file`;

if (-e $unzip_out_file) { `rm $unzip_out_file`; }

return;
}

Expand Down
14 changes: 6 additions & 8 deletions modules/Bio/EnsEMBL/Production/Pipeline/TSV/DumpFileEna.pm
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ sub _write_tsv {
my ($self) = @_;

my $out_file = $self->_generate_file_name();
my $header = $self->_build_headers();
my $header = $self->_build_headers();

open my $fh, '>', $out_file or die "cannot open $out_file for writing!";
print $fh join ("\t", @$header);
Expand Down Expand Up @@ -107,8 +107,8 @@ sub _write_tsv {
if(!defined $row->[5]){
$row->[5] = $self->_find_contig($ta, $contig_ids, $row->[3] );
} elsif( !defined $row->[6] && defined $row->[4]){
$row->[6] = $cds2acc->{$row->[4]};
}
$row->[6] = $cds2acc->{$row->[4]};
}

if (defined $row->[5]) {
$row->[5] =~ s/\.[0-9]+$//;
Expand All @@ -121,11 +121,9 @@ sub _write_tsv {
close $fh;

if ($xrefs_exist == 1) {
$self->info( "Compressing ENA tsv dump for " . $self->param('species'));
my $unzip_out_file = $out_file;
`gzip -n $unzip_out_file`;
} else {
# If we have no xrefs, delete the file (which will just have a header).
$self->dataflow_output_id(
{ "compress" => [ $out_file ] }, 4);
}else{
unlink $out_file or die "failed to delete $out_file!";
}

Expand Down
13 changes: 5 additions & 8 deletions modules/Bio/EnsEMBL/Production/Pipeline/TSV/DumpFileMetadata.pm
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,9 @@ return;

sub run {
my ($self) = @_;

$self->_make_karyotype_file();

return;
}

Expand All @@ -81,7 +81,7 @@ sub _make_karyotype_file {

my $sp = $self->param_required('species');
my $sa = Bio::EnsEMBL::Registry->get_adaptor($sp, 'core', 'slice');

if(! $sa) {
$self->info("Cannot continue as we cannot find a core:slice DBAdaptor for %s", $sp);
return;
Expand All @@ -92,7 +92,7 @@ sub _make_karyotype_file {
my $slices = $sa->fetch_all_karyotype();
# If we don't have any slices (ie. chromosomes), don't make the file
return unless(scalar(@$slices));

my $file = $self->_generate_file_name();

work_with_file($file, 'w', sub {
Expand All @@ -103,10 +103,7 @@ sub _make_karyotype_file {
}
});

$self->info( "Compressing tsv dump for " . $sp);
my $unzip_file = $file;
`gzip -n $unzip_file`;

$self->dataflow_output_id({ "compress" => [$file] }, 4);
return;
}

Expand Down
Loading