Merge pull request #833 from Ensembl/bugfixes/merge-fixes-110
Bugfixes/merge fixes 110
marcoooo authored Oct 16, 2023
2 parents 45f88d3 + 19a9198 commit b0bd310
Showing 8 changed files with 126 additions and 51 deletions.
68 changes: 68 additions & 0 deletions modules/Bio/EnsEMBL/Production/Pipeline/Common/Gzip.pm
@@ -0,0 +1,68 @@
=head1 LICENSE
Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
Copyright [2016-2023] EMBL-European Bioinformatics Institute
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
=head1 NAME
Bio::EnsEMBL::Production::Pipeline::Common::Gzip
=head1 DESCRIPTION
A simple module for gzipping files and catching compression errors
=head1 AUTHOR
ckong@ebi.ac.uk
=cut
package Bio::EnsEMBL::Production::Pipeline::Common::Gzip;

use strict;
use warnings;
use base qw/Bio::EnsEMBL::Production::Pipeline::Common::Base/;
use IO::Compress::Gzip qw(gzip $GzipError);

sub fetch_input {
  my ($self) = @_;
  return;
}

sub run {
  my ($self) = @_;

  # Normalise the 'compress' parameter: accept a single path or an arrayref of paths.
  my @compress = ();
  if (ref $self->param_required('compress') eq 'ARRAY') {
    @compress = @{$self->param_required('compress')};
  } else {
    push(@compress, $self->param_required('compress'));
  }

  foreach my $file (@compress) {
    my $output_file = $file.'.gz';
    eval {
      local $SIG{PIPE} = sub { die "gzip interrupted by SIGPIPE\n" };
      gzip $file => $output_file
        or die "gzip failed: $GzipError\n";
      # Only remove the uncompressed original once compression has succeeded.
      unlink $file;
    };
    if ($@) {
      print "Error compressing '$file': $@\n";
    } else {
      print "Compressed '$file' to '$output_file' and removed the original file\n";
    }
  }
  return;
}

1;
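
Elsewhere in this commit, files reach this runnable over dataflow branch 4; the 'compress' parameter arrives either as an arrayref of paths (the flat-file, ENA and karyotype dumps) or as a single scalar path (the MySQL dump config). A minimal producer-side sketch, assuming it sits inside an upstream runnable's run() and using hypothetical paths:

    # Sketch only: hand finished dump files to the 'compress_file' analysis on branch 4.
    $self->dataflow_output_id({ compress => \@written_files }, 4);        # arrayref of paths
    $self->dataflow_output_id({ compress => '/dumps/example.txt' }, 4);   # single path (hypothetical)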
13 changes: 7 additions & 6 deletions modules/Bio/EnsEMBL/Production/Pipeline/Flatfile/DumpFile.pm
@@ -111,7 +111,8 @@ sub run {
push(@chromosomes, $s) if $chr;
push(@non_chromosomes, $s) if ! $chr;
}

my @compress = ();

if(@non_chromosomes) {
my $path = $self->_generate_file_name('nonchromosomal');
$self->info('Dumping non-chromosomal data to %s', $path);
@@ -123,12 +124,11 @@
}
return;
});
$self->run_cmd("gzip -n $path");
push (@compress, $path);
} else {
$self->info('Did not find any non-chromosomal data');
}

my @compress = ();
foreach my $slice (@chromosomes) {
$self->fine('Dumping chromosome %s', $slice->name());
my $path = $self->_generate_file_name($slice->coord_system_name(), $slice->seq_region_name());
@@ -147,9 +147,10 @@ sub run {
return;
});
}

map { $self->run_cmd("gzip -n $_") } @compress;

unless (@compress == 0) {
$self->dataflow_output_id(
{ "compress" => \@compress }, 4);
}
$self->_create_README();
$self->core_dbc()->disconnect_if_idle();
$self->hive_dbc()->disconnect_if_idle();
@@ -363,30 +363,35 @@ sub pipeline_analyses {
-parameters => { type => 'embl', },
-hive_capacity => 50,
-rc_name => '4GB',
-flow_into => { '-1' => 'embl_32GB', },
-flow_into => { '-1' => 'embl_32GB',
'4' => 'compress_file' },
},

{ -logic_name => 'embl_32GB',
-module => 'Bio::EnsEMBL::Production::Pipeline::Flatfile::DumpFile',
-parameters => { type => 'embl', },
-hive_capacity => 50,
-rc_name => '32GB',
-flow_into => { '-1' => 'embl_64GB', },
-flow_into => { '-1' => 'embl_64GB',
'4' => 'compress_file' },

},

{ -logic_name => 'embl_64GB',
-module => 'Bio::EnsEMBL::Production::Pipeline::Flatfile::DumpFile',
-parameters => { type => 'embl', },
-hive_capacity => 50,
-rc_name => '64GB',
-flow_into => { '-1' => 'embl_128GB', },
-flow_into => { '-1' => 'embl_128GB',
'4' => 'compress_file' },
},

{ -logic_name => 'embl_128GB',
-module => 'Bio::EnsEMBL::Production::Pipeline::Flatfile::DumpFile',
-parameters => { type => 'embl', },
-hive_capacity => 50,
-rc_name => '128GB',
-flow_into => { '4' => 'compress_file' },
},

### GENBANK
@@ -395,30 +400,34 @@ sub pipeline_analyses {
-parameters => { type => 'genbank', },
-hive_capacity => 50,
-rc_name => '4GB',
-flow_into => { -1 => 'genbank_32GB', },
-flow_into => { -1 => 'genbank_32GB',
'4' => 'compress_file' },
},

{ -logic_name => 'genbank_32GB',
-module => 'Bio::EnsEMBL::Production::Pipeline::Flatfile::DumpFile',
-parameters => { type => 'genbank', },
-hive_capacity => 50,
-rc_name => '32GB',
-flow_into => { -1 => 'genbank_64GB', },
-flow_into => { -1 => 'genbank_64GB',
'4' => 'compress_file' },
},

{ -logic_name => 'genbank_64GB',
-module => 'Bio::EnsEMBL::Production::Pipeline::Flatfile::DumpFile',
-parameters => { type => 'genbank', },
-hive_capacity => 50,
-rc_name => '64GB',
-flow_into => { -1 => 'genbank_128GB', },
-flow_into => { -1 => 'genbank_128GB',
'4' => 'compress_file' },
},

{ -logic_name => 'genbank_128GB',
-module => 'Bio::EnsEMBL::Production::Pipeline::Flatfile::DumpFile',
-parameters => { type => 'genbank', },
-hive_capacity => 50,
-rc_name => '128GB',
-flow_into => { '4' => 'compress_file' },
},

### FASTA (cdna, cds, dna, pep, ncrna)
@@ -585,6 +594,7 @@
},
-hive_capacity => 50,
-rc_name => '2GB',
-flow_into => { '4' => 'compress_file', },
},

{ -logic_name => 'tsv_refseq',
@@ -595,6 +605,7 @@
},
-hive_capacity => 50,
-rc_name => '2GB',
-flow_into => { '4' => 'compress_file', },
},

{ -logic_name => 'tsv_entrez',
@@ -605,19 +616,28 @@
},
-hive_capacity => 50,
-rc_name => '2GB',
-flow_into => { '4' => 'compress_file', },
},


{ -logic_name => 'tsv_ena',
-module => 'Bio::EnsEMBL::Production::Pipeline::TSV::DumpFileEna',
-hive_capacity => 50,
-rc_name => '2GB',
-flow_into => { '4' => 'compress_file', },
},

{ -logic_name => 'tsv_metadata',
-module => 'Bio::EnsEMBL::Production::Pipeline::TSV::DumpFileMetadata',
-hive_capacity => 50,
-rc_name => '2GB',
-flow_into => { '4' => 'compress_file', },

},
{ -logic_name => 'compress_file',
-module => 'Bio::EnsEMBL::Production::Pipeline::Common::Gzip',
-hive_capacity => 50,
-rc_name => '4GB',
},

];
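
Distilled from the hunks above, and with the surrounding options omitted, the new wiring in this PipeConfig amounts to the sketch below: each dumper analysis flows the files it wrote out on branch 4, and a single compress_file analysis gzips them with the new common runnable.

    { -logic_name    => 'embl',
      -module        => 'Bio::EnsEMBL::Production::Pipeline::Flatfile::DumpFile',
      -parameters    => { type => 'embl', },
      -hive_capacity => 50,
      -rc_name       => '4GB',
      -flow_into     => { '-1' => 'embl_32GB',
                          '4'  => 'compress_file' },
    },
    { -logic_name    => 'compress_file',
      -module        => 'Bio::EnsEMBL::Production::Pipeline::Common::Gzip',
      -hive_capacity => 50,
      -rc_name       => '4GB',
    },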
@@ -201,14 +201,13 @@ sub pipeline_analyses {
},
{
-logic_name => 'MySQL_Compress',
-module => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
-module => 'Bio::EnsEMBL::Production::Pipeline::Common::Gzip',
-max_retry_count => 1,
-analysis_capacity => 10,
-batch_size => 10,
-parameters => {
cmd => 'gzip -n -f "#output_filename#"',
compress => "#output_filename#",
},
-rc_name => '1GB',
},
{
-logic_name => 'Checksum',
9 changes: 1 addition & 8 deletions modules/Bio/EnsEMBL/Production/Pipeline/TSV/DumpFile.pm
@@ -55,7 +55,6 @@ return;

sub run {
my ($self) = @_;

$self->info( "Starting tsv dump for " . $self->param('species'));
$self->_write_tsv();
$self->_create_README();
@@ -115,14 +114,8 @@ sub _write_tsv {
}#transcript
}#gene
}#slice
close $fh;
close $fh;
$self->core_dbc()->disconnect_if_idle();
$self->info( "Compressing tsv dump for " . $self->param('species'));
my $unzip_out_file = $out_file;
`gzip -n $unzip_out_file`;

if (-e $unzip_out_file) { `rm $unzip_out_file`; }

return;
}

14 changes: 6 additions & 8 deletions modules/Bio/EnsEMBL/Production/Pipeline/TSV/DumpFileEna.pm
@@ -64,7 +64,7 @@ sub _write_tsv {
my ($self) = @_;

my $out_file = $self->_generate_file_name();
my $header = $self->_build_headers();
my $header = $self->_build_headers();

open my $fh, '>', $out_file or die "cannot open $out_file for writing!";
print $fh join ("\t", @$header);
@@ -107,8 +107,8 @@ sub _write_tsv {
if(!defined $row->[5]){
$row->[5] = $self->_find_contig($ta, $contig_ids, $row->[3] );
} elsif( !defined $row->[6] && defined $row->[4]){
$row->[6] = $cds2acc->{$row->[4]};
}
$row->[6] = $cds2acc->{$row->[4]};
}

if (defined $row->[5]) {
$row->[5] =~ s/\.[0-9]+$//;
@@ -121,11 +121,9 @@
close $fh;

if ($xrefs_exist == 1) {
$self->info( "Compressing ENA tsv dump for " . $self->param('species'));
my $unzip_out_file = $out_file;
`gzip -n $unzip_out_file`;
} else {
# If we have no xrefs, delete the file (which will just have a header).
$self->dataflow_output_id(
{ "compress" => [ $out_file ] }, 4);
}else{
unlink $out_file or die "failed to delete $out_file!";
}

13 changes: 5 additions & 8 deletions modules/Bio/EnsEMBL/Production/Pipeline/TSV/DumpFileMetadata.pm
@@ -70,9 +70,9 @@ return;

sub run {
my ($self) = @_;

$self->_make_karyotype_file();

return;
}

@@ -81,7 +81,7 @@ sub _make_karyotype_file {

my $sp = $self->param_required('species');
my $sa = Bio::EnsEMBL::Registry->get_adaptor($sp, 'core', 'slice');

if(! $sa) {
$self->info("Cannot continue as we cannot find a core:slice DBAdaptor for %s", $sp);
return;
@@ -92,7 +92,7 @@ sub _make_karyotype_file {
my $slices = $sa->fetch_all_karyotype();
# If we don't have any slices (ie. chromosomes), don't make the file
return unless(scalar(@$slices));

my $file = $self->_generate_file_name();

work_with_file($file, 'w', sub {
@@ -103,10 +103,7 @@ sub _make_karyotype_file {
}
});

$self->info( "Compressing tsv dump for " . $sp);
my $unzip_file = $file;
`gzip -n $unzip_file`;

$self->dataflow_output_id({ "compress" => [$file] }, 4);
return;
}
