Ensembl · dglemos · Apr 6, 2023 · Apr 6, 2023 · Apr 6, 2023 · Apr 27, 2023
diff --git a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/MergeFiles.pm b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/MergeFiles.pm
@@ -84,6 +84,11 @@ sub merge_vcf_files {
 
   # Sort final file
   $self->run_system_command("bcftools sort -o $final_file_sorted $final_file");
+  $self->run_system_command("bgzip $final_file_sorted");
+  $self->run_system_command("tabix -p vcf $final_file_sorted.gz");
+
+  # Remove unsorted file
+  $self->run_system_command("rm $final_file");
 }
 
 1;
diff --git a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/RunSpliceAI.pm b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/RunSpliceAI.pm
@@ -68,7 +68,12 @@ sub run_spliceai {
     die("Directory ($output_vcf_files_dir) doesn't exist");
   }
 
-  my $cmd = "spliceai -I $vcf_input_dir_chr/$vcf_file -O $output_vcf_files_dir/$vcf_file -R $fasta_file -A $gene_annotation";
+  my $tmp_dir = $main_dir . "/tmp";
+
+  # activate conda env
+  $self->run_system_command("source activate spliceai");
+
+  my $cmd = "spliceai -I $vcf_input_dir_chr/$vcf_file -O $output_vcf_files_dir/$vcf_file -R $fasta_file -A $gene_annotation -B 4096 -T 256 -t $tmp_dir";
 
   # Add option to calculate masked scores
   if($masked_scores) {

diff --git a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm
@@ -54,21 +54,21 @@ sub default_options {
         pipeline_name              => 'spliceai_scores',
         main_dir                   => $self->o('main_dir'), # main directory where all files and directories are going to be stored
         input_directory            => $self->o('main_dir') . '/input_vcf_files', # input files
-	split_vcf_no_header_dir    => $self->o('main_dir') . '/split_vcf_no_header', # contains the input files after being splitted (files without headers)
+        split_vcf_no_header_dir    => $self->o('main_dir') . '/split_vcf_no_header', # contains the input files after being splitted (files without headers)
         split_vcf_input_dir        => $self->o('main_dir') . '/split_vcf_input', # contains the splitted input vcf files with headers, these are the files used to run SpliceAI
         split_vcf_output_dir       => $self->o('main_dir') . '/split_vcf_output', # temporary output files, still splitted
         output_dir                 => $self->o('main_dir') . '/output', # final output files already merged by chromosome
         fasta_file                 => $self->o('fasta_file'),
         gene_annotation            => $self->o('gene_annotation'),
-        step_size                  => 4000, # number of variants used to split the main vcf files
+        step_size                  => 500000, # number of variants used to split the main vcf files
         check_transcripts          => 0, # if set to 1 checks which are the new MANE Select transcripts for the last months and only calculates SpliceAI scores for these variants overlapping these transcripts
         transcripts_from_file      => undef,
-	time_interval              => 4, # checks which transcripts were updated/created in the last 4 months; only used if check_transcripts = 1 and we want to check the new transcripts in the core db
-	masked_scores              => 1, # calculate masked scores
+        time_interval              => 4, # checks which transcripts were updated/created in the last 4 months; only used if check_transcripts = 1 and we want to check the new transcripts in the core db
+        masked_scores              => 1, # calculate masked scores
         registry                   => undef, # database where new MANE transcripts are going to be checked; only used if check_transcripts = 1
-        output_file_name           => 'spliceai_final_scores_',
+        output_file_name           => 'spliceai_final_scores_' . $self->o('ensembl_release') . '_',
 
-        pipeline_wide_analysis_capacity => 500,
+        pipeline_wide_analysis_capacity => 80,
 
         pipeline_db => {
             -host   => $self->o('hive_db_host'),
@@ -85,8 +85,18 @@ sub resource_classes {
     my ($self) = @_;
     return {
         %{$self->SUPER::resource_classes},
-        '8Gb_8c_job'  => {'LSF' => '-n 8 -q production -R"select[mem>8000]  rusage[mem=8000]" -M8000' },
-        '4Gb_job'     => {'LSF' => '-q production -R"select[mem>4000] rusage[mem=4000]" -M4000'},
+        'gpu'      => {
+                        'LSF'   => '-q gpu-a100 -gpu "num=1:gmem=80000"', # the queue gpu-a100 can access /hps/nobackup
+                        'SLURM' => '--time=5:00:00 --gres=gpu:a100:1 --mem=24G'
+                      },
+        '4Gb_job'  => {
+                        'LSF' => '-q production -R"select[mem>4000] rusage[mem=4000]" -M4000',
+                        'SLURM' => "--partition=standard --time=4:00:00 --mem=4G"
+                      },
+         'default' => {
+                        'LSF' => '-R"select[mem>1000] rusage[mem=1000]" -M1000',
+                        'SLURM' => "--partition=standard --time=1:00:00 --mem=2G"
+                      }
     };
 }
 
@@ -99,11 +109,11 @@ sub pipeline_analyses {
           -input_ids  => [{}],
           -parameters => {
             'input_directory' => $self->o('input_directory'),
-            'inputcmd'        => 'find #input_directory# -type f -name "all_snps_ensembl_*.vcf" -printf "%f\n"',
+            'inputcmd'        => 'find #input_directory# -type f -name "all_snps_ensembl_*.vcf.gz" -printf "%f\n"',
           },
           -flow_into  => {
             '2->A' => {'split_files' => {'vcf_file' => '#_0#'}},
-            'A->1' => ['get_chr_dir'],
+            'A->1' => ['get_chr_dir']
           },
       },
       { -logic_name => 'split_files',
@@ -151,7 +161,7 @@ sub pipeline_analyses {
         -hive_capacity => $self->o('pipeline_wide_analysis_capacity'),
         -analysis_capacity => $self->o('pipeline_wide_analysis_capacity'),
         -input_ids  => [],
-        -rc_name => '8Gb_8c_job',
+        -rc_name => 'gpu',
         -parameters => {
           'main_dir'             => $self->o('main_dir'),
           'split_vcf_input_dir'  => $self->o('split_vcf_input_dir'),

diff --git a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SplitFiles.pm b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SplitFiles.pm
@@ -59,12 +59,15 @@ sub run {
     $self->check_split_vcf_file();
   }
   $self->split_vcf_file();
+
+  # create tmp dir where spliceai tmp files will be stored
+  $self->create_dir($self->param_required('main_dir') . '/tmp');
 }
 
 sub set_chr_from_filename {
   my $self = shift;
   my $vcf_file = $self->param_required('vcf_file');
-  $vcf_file =~ /.*_chr(.*)\.vcf$/;
+  $vcf_file =~ /.*_chr(.*)\.vcf.gz$/;
   my $chr = $1;
   if (!$chr) {
     die("Could not get chromosome name from file name ($vcf_file).");
@@ -82,8 +85,19 @@ sub split_vcf_file {
   my $split_vcf_output_dir = $self->param_required('split_vcf_output_dir');
   my $step_size = $self->param_required('step_size');
 
+  # If we are checking the transcripts then the input file is a VCF
+  # Remove .gz
+  if($check) {
+    $vcf_file =~ s/vcf.gz/vcf/;
+  }
+
   my $vcf_file_path = $input_dir . '/' . $vcf_file;
 
+  if(!$check && $vcf_file =~ /vcf.gz/) {
+    $self->run_system_command("gunzip $vcf_file_path");
+    $vcf_file_path =~ s/vcf.gz/vcf/;
+  }
+
   if (! -d $input_dir) {
     die("Directory ($input_dir) doesn't exist");
   }
@@ -173,8 +187,13 @@ sub check_split_vcf_file {
   }
 
   # prepare input vcf file for tabix
-  $self->run_system_command("bgzip $vcf_file_path");
-  $self->run_system_command("tabix -p vcf $vcf_file_path.gz");
+  if($vcf_file !~ /vcf.gz/) {
+    $self->run_system_command("bgzip $vcf_file_path");
+    $self->run_system_command("tabix -p vcf $vcf_file_path.gz");
+  }
+  else {
+    $self->run_system_command("tabix -p vcf $vcf_file_path");
+  }
 
   # Create directory to store the input file only with the variants of interest (overlapping new transcripts)
   my $vcf_file_path_subset = $main_dir . '/input_vcf_files_subset';
@@ -184,7 +203,10 @@ sub check_split_vcf_file {
 
   $self->param('input_dir_subset', $vcf_file_path_subset);
 
-  open(my $write, '>', $vcf_file_path_subset . '/' . $vcf_file) or die $!;
+  my $vcf_file_sub = $vcf_file;
+  $vcf_file_sub =~ s/.gz//;
+
+  open(my $write, '>', $vcf_file_path_subset . '/' . $vcf_file_sub) or die $!;
 
   my $positions_of_interest = $transcripts->{$chr};
   foreach my $position (@$positions_of_interest) {
@@ -194,8 +216,8 @@ sub check_split_vcf_file {
 
     my $pos_string = sprintf("%s:%i-%i", $chr, $transcript_start, $transcript_end);
 
-    (open my $read, "tabix  " . $input_dir . "/" . $vcf_file . ".gz $pos_string |")
-      || die "Failed to read from input vcf file " . $input_dir . "/" . $vcf_file . ".gz \n" ;
+    (open my $read, "tabix  " . $input_dir . "/" . $vcf_file . " $pos_string |")
+      or die "Failed to read from input vcf file " . $input_dir . "/" . $vcf_file . " \n" ;
 
     while (my $row = <$read>) {
       chomp $row;
@@ -212,8 +234,8 @@ sub check_split_vcf_file {
   close($write);
 
   # Sort new vcf file
-  my $vcf_file_subset = $vcf_file_path_subset . '/' . $vcf_file;
-  my $vcf_file_subset_sorted = $vcf_file_path_subset . '/sorted_' . $vcf_file;
+  my $vcf_file_subset = $vcf_file_path_subset . '/' . $vcf_file_sub;
+  my $vcf_file_subset_sorted = $vcf_file_path_subset . '/sorted_' . $vcf_file_sub;
   $self->run_system_command("sort -o $vcf_file_subset_sorted -k1,1 -k2,2n $vcf_file_subset");
   $self->run_system_command("mv $vcf_file_subset_sorted $vcf_file_subset");
 }
@@ -291,11 +313,4 @@ sub get_new_transcripts_file {
   $self->param('transcripts', \%new_transcripts);
 }
 
-sub count_lines {
-  my $self = shift;
-  my $filename = shift;
-
-
-}
-
 1;
diff --git a/scripts/python/spliceai_annotation_file.py b/scripts/python/spliceai_annotation_file.py
@@ -0,0 +1,187 @@
+#!/usr/bin/env python3
+
+"""
+    Script to generate the gene annotation file for SpliceAI.
+    This file contains data about the transcript that is being used for each gene.
+    SpliceAI only annotates variants overlapping these transcripts.
+
+    Template provided by SpliceAI: https://github.com/Illumina/SpliceAI/blob/master/spliceai/annotations/grch38.txt
+
+    Gene annotation file format:
+        #NAME   CHROM   STRAND  TX_START    TX_END  EXON_START  EXON_END
+        KRTAP27-1   21  -   30337013    30337694    30337013,   30337694,
+
+    Options:
+            --output_file   gene annotation output file (Optional. Default: gene_annotation.txt)
+            --species       species name                (Optional. Default: homo_sapiens)
+            --assembly      assembly version            (Optional. Default: 38)
+            --host          core database host          (Mandatory)
+            --port          host port                   (Mandatory)
+            --user          host user                   (Mandatory)
+            --release       core database version       (Mandatory)
+"""
+
+import argparse
+import mysql.connector
+from mysql.connector import Error
+
+
+def fetch_transcripts(species, assembly, release, host, port, user):
+    gene_annotation = {}
+    database = f"{species}_core_{release}_{assembly}"
+
+    # For human we select 'MANE_Select' transcripts and the gene name is a gene attrib
+    if species == "homo_sapiens":
+        sql_select = """
+                        SELECT ga.value,s.name,t.seq_region_strand,t.seq_region_start,t.seq_region_end,
+                        e.seq_region_start,e.seq_region_end FROM transcript t
+                        JOIN transcript_attrib ta ON t.transcript_id = ta.transcript_id
+                        JOIN attrib_type atr ON ta.attrib_type_id = atr.attrib_type_id
+                        JOIN seq_region s ON t.seq_region_id = s.seq_region_id
+                        JOIN gene g ON g.gene_id = t.gene_id
+                        JOIN gene_attrib ga ON g.gene_id = ga.gene_id
+                        JOIN exon_transcript et ON t.transcript_id = et.transcript_id
+                        JOIN exon e ON e.exon_id = et.exon_id
+                        WHERE t.stable_id like 'ENST%' and t.biotype = 'protein_coding'
+                        and ga.attrib_type_id = 4 and atr.code = 'MANE_Select'
+                        order by ga.value,s.name,t.seq_region_start,t.seq_region_end,e.seq_region_start,e.seq_region_end
+                """
+    else:
+        # For other species we select the canonical transcripts and the gene name is in xref
+        sql_select = """
+                        SELECT DISTINCT g.stable_id,s.name,t.seq_region_strand,t.seq_region_start,t.seq_region_end,
+                        e.seq_region_start,e.seq_region_end FROM transcript t
+                        JOIN transcript_attrib ta ON t.transcript_id = ta.transcript_id
+                        JOIN attrib_type atr ON ta.attrib_type_id = atr.attrib_type_id
+                        JOIN seq_region s ON t.seq_region_id = s.seq_region_id
+                        JOIN gene g ON g.gene_id = t.gene_id
+                        JOIN exon_transcript et ON t.transcript_id = et.transcript_id
+                        JOIN exon e ON e.exon_id = et.exon_id
+                        JOIN xref xr ON g.display_xref_id = xr.xref_id
+                        WHERE t.stable_id like 'ENS%' and t.biotype = 'protein_coding' and atr.code = 'is_canonical'
+                        order by xr.display_label,s.name,t.seq_region_start,t.seq_region_end,e.seq_region_start,e.seq_region_end
+                     """
+
+    connection = mysql.connector.connect(host=host,
+                                         database=database,
+                                         user=user,
+                                         password='',
+                                         port=port)
+
+    try:
+        if connection.is_connected():
+            cursor = connection.cursor()
+            cursor.execute(sql_select)
+            data = cursor.fetchall()
+            for row in data:
+                strand = row[2]
+                if strand == 1:
+                    strand = "+"
+                else:
+                    strand = "-"
+
+                if row[0] not in gene_annotation:
+                    exons_start = []
+                    exons_end = []
+                    exons_start.append(str(row[5]))
+                    exons_end.append(str(row[6]))
+
+                    gene_annotation[row[0]] = {
+                        "chr": row[1],
+                        "strand": strand,
+                        "start": row[3],
+                        "end": row[4],
+                        "exons_start": exons_start,
+                        "exons_end": exons_end
+                    }
+                else:
+                    gene_annotation[row[0]]["exons_start"].append(str(row[5]))
+                    gene_annotation[row[0]]["exons_end"].append(str(row[6]))
+
+    except Error as e:
+        print(f"Error while connecting to MySQL {database}", e)
+    finally:
+        if connection.is_connected():
+            cursor.close()
+            connection.close()
+
+    return gene_annotation
+
+def sanity_checks(transcripts_list):
+    fail_list = []
+
+    for gene, data in transcripts_list.items():
+        check = 1
+
+        # transcript start > end
+        if data["start"] >= data["end"]:
+            check = 0
+
+        # the exon start has to be bigger than the last exon end
+        i = 0
+        for exon_start in data["exons_start"]:
+            if i > 0:
+                last_exon_end = data["exons_end"][i - 1]
+                if(int(exon_start) < int(last_exon_end)):
+                    check = 0
+
+            i += 1
+
+        if check == 0:
+            fail_list.append(gene)
+
+    return fail_list
+
+def write_output(transcripts_list, output_file):
+    # Write to output file
+    with open(output_file, "w") as f:
+        f.write("#NAME\tCHROM\tSTRAND\tTX_START\tTX_END\tEXON_START\tEXON_END\n")
+
+        for gene, data in transcripts_list.items():
+            chr = data["chr"]
+            strand = data["strand"]
+            start = data["start"]
+            end = data["end"]
+            exons_start = (",").join(data["exons_start"])
+            exons_end = (",").join(data["exons_end"])
+
+            f.write(f"{gene}\t{chr}\t{strand}\t{start}\t{end}\t{exons_start},\t{exons_end},\n")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Generate the gene annotation file for SpliceAI")
+    parser.add_argument("-o", "--output_file",
+                        default="gene_annotation.txt",
+                        help="output file (default: gene_annotation.txt)")
+    parser.add_argument("-sp", "--species",
+                        default="homo_sapiens",
+                        help="species (default: homo_sapiens)")
+    parser.add_argument("-a", "--assembly",
+                        default="38",
+                        help="species assembly (default: 38)")
+    parser.add_argument("-r", "--release", required=True)
+    parser.add_argument("--host", required=True)
+    parser.add_argument("--port", required=True)
+    parser.add_argument("--user", required=True)
+    args = parser.parse_args()
+
+    output_file = args.output_file
+    species = args.species
+    assembly = args.assembly
+    release = args.release
+    host = args.host
+    port = args.port
+    user = args.user
+
+    transcripts_list = fetch_transcripts(species, assembly, release, host, port, user)
+
+    check = sanity_checks(transcripts_list)
+
+    if not check:
+        write_output(transcripts_list, output_file)
+    else:
+        print("Sanity checks failed for the following genes: ", (", ").join(check))
+
+
+if __name__ == '__main__':
+    main()