Merge pull request #833 from Ensembl/bugfixes/merge-fixes-110
Bugfixes/merge fixes 110
marcoooo authored Oct 16, 2023
2 parents 45f88d3 + 19a9198 commit b0bd310
Showing 8 changed files with 126 additions and 51 deletions.
68 changes: 68 additions & 0 deletions modules/Bio/EnsEMBL/Production/Pipeline/Common/Gzip.pm
@@ -0,0 +1,68 @@
=head1 LICENSE
Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
Copyright [2016-2023] EMBL-European Bioinformatics Institute
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
=head1 NAME
Bio::EnsEMBL::Production::Pipeline::Common::Gzip
=head1 DESCRIPTION
A simple module for gzipping files and catching compression errors
=head1 AUTHOR
ckong@ebi.ac.uk
=cut
package Bio::EnsEMBL::Production::Pipeline::Common::Gzip;

use strict;
use warnings;
use base qw/Bio::EnsEMBL::Production::Pipeline::Common::Base/;
use IO::Compress::Gzip qw(gzip $GzipError);

sub fetch_input {
  my ($self) = @_;
  return;
}

sub run {
  my ($self) = @_;

  # Normalise the 'compress' parameter: accept a single path or an arrayref of paths.
  my @compress = ();
  if (ref $self->param_required('compress') eq 'ARRAY') {
    @compress = @{$self->param_required('compress')};
  } else {
    push(@compress, $self->param_required('compress'));
  }

  foreach my $file (@compress) {
    my $output_file = $file.'.gz';
    eval {
      local $SIG{PIPE} = sub { die "gzip interrupted by SIGPIPE\n" };
      gzip $file => $output_file
        or die "gzip failed: $GzipError\n";
      # Only remove the uncompressed original once compression has succeeded.
      unlink $file;
    };
    if ($@) {
      print "Error compressing '$file': $@\n";
    } else {
      print "Compressed '$file' to '$output_file' and removed the original file\n";
    }
  }
  return;
}

1;
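
Elsewhere in this commit, files reach this runnable over dataflow branch 4; the 'compress' parameter arrives either as an arrayref of paths (the flat-file, ENA and karyotype dumps) or as a single scalar path (the MySQL dump config). A minimal producer-side sketch, assuming it sits inside an upstream runnable's run() and using hypothetical paths:

    # Sketch only: hand finished dump files to the 'compress_file' analysis on branch 4.
    $self->dataflow_output_id({ compress => \@written_files }, 4);        # arrayref of paths
    $self->dataflow_output_id({ compress => '/dumps/example.txt' }, 4);   # single path (hypothetical)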
13 changes: 7 additions & 6 deletions modules/Bio/EnsEMBL/Production/Pipeline/Flatfile/DumpFile.pm
@@ -111,7 +111,8 @@ sub run {
push(@chromosomes, $s) if $chr;
push(@non_chromosomes, $s) if ! $chr;
}

my @compress = ();

if(@non_chromosomes) {
my $path = $self->_generate_file_name('nonchromosomal');
$self->info('Dumping non-chromosomal data to %s', $path);
@@ -123,12 +124,11 @@
}
return;
});
$self->run_cmd("gzip -n $path");
push (@compress, $path);
} else {
$self->info('Did not find any non-chromosomal data');
}

my @compress = ();
foreach my $slice (@chromosomes) {
$self->fine('Dumping chromosome %s', $slice->name());
my $path = $self->_generate_file_name($slice->coord_system_name(), $slice->seq_region_name());
@@ -147,9 +147,10 @@ sub run {
return;
});
}

map { $self->run_cmd("gzip -n $_") } @compress;

unless (@compress == 0) {
$self->dataflow_output_id(
{ "compress" => \@compress }, 4);
}
$self->_create_README();
$self->core_dbc()->disconnect_if_idle();
$self->hive_dbc()->disconnect_if_idle();
@@ -363,30 +363,35 @@ sub pipeline_analyses {
-parameters => { type => 'embl', },
-hive_capacity => 50,
-rc_name => '4GB',
-flow_into => { '-1' => 'embl_32GB', },
-flow_into => { '-1' => 'embl_32GB',
'4' => 'compress_file' },
},

{ -logic_name => 'embl_32GB',
-module => 'Bio::EnsEMBL::Production::Pipeline::Flatfile::DumpFile',
-parameters => { type => 'embl', },
-hive_capacity => 50,
-rc_name => '32GB',
-flow_into => { '-1' => 'embl_64GB', },
-flow_into => { '-1' => 'embl_64GB',
'4' => 'compress_file' },

},

{ -logic_name => 'embl_64GB',
-module => 'Bio::EnsEMBL::Production::Pipeline::Flatfile::DumpFile',
-parameters => { type => 'embl', },
-hive_capacity => 50,
-rc_name => '64GB',
-flow_into => { '-1' => 'embl_128GB', },
-flow_into => { '-1' => 'embl_128GB',
'4' => 'compress_file' },
},

{ -logic_name => 'embl_128GB',
-module => 'Bio::EnsEMBL::Production::Pipeline::Flatfile::DumpFile',
-parameters => { type => 'embl', },
-hive_capacity => 50,
-rc_name => '128GB',
-flow_into => { '4' => 'compress_file' },
},

### GENBANK
@@ -395,30 +400,34 @@ sub pipeline_analyses {
-parameters => { type => 'genbank', },
-hive_capacity => 50,
-rc_name => '4GB',
-flow_into => { -1 => 'genbank_32GB', },
-flow_into => { -1 => 'genbank_32GB',
'4' => 'compress_file' },
},

{ -logic_name => 'genbank_32GB',
-module => 'Bio::EnsEMBL::Production::Pipeline::Flatfile::DumpFile',
-parameters => { type => 'genbank', },
-hive_capacity => 50,
-rc_name => '32GB',
-flow_into => { -1 => 'genbank_64GB', },
-flow_into => { -1 => 'genbank_64GB',
'4' => 'compress_file' },
},

{ -logic_name => 'genbank_64GB',
-module => 'Bio::EnsEMBL::Production::Pipeline::Flatfile::DumpFile',
-parameters => { type => 'genbank', },
-hive_capacity => 50,
-rc_name => '64GB',
-flow_into => { -1 => 'genbank_128GB', },
-flow_into => { -1 => 'genbank_128GB',
'4' => 'compress_file' },
},

{ -logic_name => 'genbank_128GB',
-module => 'Bio::EnsEMBL::Production::Pipeline::Flatfile::DumpFile',
-parameters => { type => 'genbank', },
-hive_capacity => 50,
-rc_name => '128GB',
-flow_into => { '4' => 'compress_file' },
},

### FASTA (cdna, cds, dna, pep, ncrna)
@@ -585,6 +594,7 @@
},
-hive_capacity => 50,
-rc_name => '2GB',
-flow_into => { '4' => 'compress_file', },
},

{ -logic_name => 'tsv_refseq',
@@ -595,6 +605,7 @@
},
-hive_capacity => 50,
-rc_name => '2GB',
-flow_into => { '4' => 'compress_file', },
},

{ -logic_name => 'tsv_entrez',
@@ -605,19 +616,28 @@
},
-hive_capacity => 50,
-rc_name => '2GB',
-flow_into => { '4' => 'compress_file', },
},


{ -logic_name => 'tsv_ena',
-module => 'Bio::EnsEMBL::Production::Pipeline::TSV::DumpFileEna',
-hive_capacity => 50,
-rc_name => '2GB',
-flow_into => { '4' => 'compress_file', },
},

{ -logic_name => 'tsv_metadata',
-module => 'Bio::EnsEMBL::Production::Pipeline::TSV::DumpFileMetadata',
-hive_capacity => 50,
-rc_name => '2GB',
-flow_into => { '4' => 'compress_file', },

},
{ -logic_name => 'compress_file',
-module => 'Bio::EnsEMBL::Production::Pipeline::Common::Gzip',
-hive_capacity => 50,
-rc_name => '4GB',
},

];
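
Distilled from the hunks above, and with the surrounding options omitted, the new wiring in this PipeConfig amounts to the sketch below: each dumper analysis flows the files it wrote out on branch 4, and a single compress_file analysis gzips them with the new common runnable.

    { -logic_name    => 'embl',
      -module        => 'Bio::EnsEMBL::Production::Pipeline::Flatfile::DumpFile',
      -parameters    => { type => 'embl', },
      -hive_capacity => 50,
      -rc_name       => '4GB',
      -flow_into     => { '-1' => 'embl_32GB',
                          '4'  => 'compress_file' },
    },
    { -logic_name    => 'compress_file',
      -module        => 'Bio::EnsEMBL::Production::Pipeline::Common::Gzip',
      -hive_capacity => 50,
      -rc_name       => '4GB',
    },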
@@ -201,14 +201,13 @@ sub pipeline_analyses {
},
{
-logic_name => 'MySQL_Compress',
-module => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
-module => 'Bio::EnsEMBL::Production::Pipeline::Common::Gzip',
-max_retry_count => 1,
-analysis_capacity => 10,
-batch_size => 10,
-parameters => {
cmd => 'gzip -n -f "#output_filename#"',
compress => "#output_filename#",
},
-rc_name => '1GB',
},
{
-logic_name => 'Checksum',
9 changes: 1 addition & 8 deletions modules/Bio/EnsEMBL/Production/Pipeline/TSV/DumpFile.pm
@@ -55,7 +55,6 @@ return;

sub run {
my ($self) = @_;

$self->info( "Starting tsv dump for " . $self->param('species'));
$self->_write_tsv();
$self->_create_README();
@@ -115,14 +114,8 @@ sub _write_tsv {
}#transcript
}#gene
}#slice
close $fh;
close $fh;
$self->core_dbc()->disconnect_if_idle();
$self->info( "Compressing tsv dump for " . $self->param('species'));
my $unzip_out_file = $out_file;
`gzip -n $unzip_out_file`;

if (-e $unzip_out_file) { `rm $unzip_out_file`; }

return;
}

14 changes: 6 additions & 8 deletions modules/Bio/EnsEMBL/Production/Pipeline/TSV/DumpFileEna.pm
@@ -64,7 +64,7 @@ sub _write_tsv {
my ($self) = @_;

my $out_file = $self->_generate_file_name();
my $header = $self->_build_headers();
my $header = $self->_build_headers();

open my $fh, '>', $out_file or die "cannot open $out_file for writing!";
print $fh join ("\t", @$header);
@@ -107,8 +107,8 @@ sub _write_tsv {
if(!defined $row->[5]){
$row->[5] = $self->_find_contig($ta, $contig_ids, $row->[3] );
} elsif( !defined $row->[6] && defined $row->[4]){
$row->[6] = $cds2acc->{$row->[4]};
}
$row->[6] = $cds2acc->{$row->[4]};
}

if (defined $row->[5]) {
$row->[5] =~ s/\.[0-9]+$//;
@@ -121,11 +121,9 @@
close $fh;

if ($xrefs_exist == 1) {
$self->info( "Compressing ENA tsv dump for " . $self->param('species'));
my $unzip_out_file = $out_file;
`gzip -n $unzip_out_file`;
} else {
# If we have no xrefs, delete the file (which will just have a header).
$self->dataflow_output_id(
{ "compress" => [ $out_file ] }, 4);
}else{
unlink $out_file or die "failed to delete $out_file!";
}

13 changes: 5 additions & 8 deletions modules/Bio/EnsEMBL/Production/Pipeline/TSV/DumpFileMetadata.pm
@@ -70,9 +70,9 @@ return;

sub run {
my ($self) = @_;

$self->_make_karyotype_file();

return;
}

@@ -81,7 +81,7 @@ sub _make_karyotype_file {

my $sp = $self->param_required('species');
my $sa = Bio::EnsEMBL::Registry->get_adaptor($sp, 'core', 'slice');

if(! $sa) {
$self->info("Cannot continue as we cannot find a core:slice DBAdaptor for %s", $sp);
return;
@@ -92,7 +92,7 @@ sub _make_karyotype_file {
my $slices = $sa->fetch_all_karyotype();
# If we don't have any slices (ie. chromosomes), don't make the file
return unless(scalar(@$slices));

my $file = $self->_generate_file_name();

work_with_file($file, 'w', sub {
@@ -103,10 +103,7 @@ sub _make_karyotype_file {
}
});

$self->info( "Compressing tsv dump for " . $sp);
my $unzip_file = $file;
`gzip -n $unzip_file`;

$self->dataflow_output_id({ "compress" => [$file] }, 4);
return;
}
