diff --git a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/MergeFiles.pm b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/MergeFiles.pm index 054cb76620..52cbdd6209 100644 --- a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/MergeFiles.pm +++ b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/MergeFiles.pm @@ -84,6 +84,11 @@ sub merge_vcf_files { # Sort final file $self->run_system_command("bcftools sort -o $final_file_sorted $final_file"); + $self->run_system_command("bgzip $final_file_sorted"); + $self->run_system_command("tabix -p vcf $final_file_sorted.gz"); + + # Remove unsorted file + $self->run_system_command("rm $final_file"); } 1; diff --git a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/RunSpliceAI.pm b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/RunSpliceAI.pm index be5c5b2be9..fd0aa12b62 100644 --- a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/RunSpliceAI.pm +++ b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/RunSpliceAI.pm @@ -68,7 +68,12 @@ sub run_spliceai { die("Directory ($output_vcf_files_dir) doesn't exist"); } - my $cmd = "spliceai -I $vcf_input_dir_chr/$vcf_file -O $output_vcf_files_dir/$vcf_file -R $fasta_file -A $gene_annotation"; + my $tmp_dir = $main_dir . "/tmp"; + + # activate conda env + $self->run_system_command("source activate spliceai"); + + my $cmd = "spliceai -I $vcf_input_dir_chr/$vcf_file -O $output_vcf_files_dir/$vcf_file -R $fasta_file -A $gene_annotation -B 4096 -T 256 -t $tmp_dir"; # Add option to calculate masked scores if($masked_scores) { diff --git a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm index 5f5cc0f484..7efd097775 100644 --- a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm +++ b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm @@ -54,21 +54,21 @@ sub default_options { pipeline_name => 'spliceai_scores', main_dir => $self->o('main_dir'), # main directory where all files and directories are going to be stored input_directory => $self->o('main_dir') . '/input_vcf_files', # input files - split_vcf_no_header_dir => $self->o('main_dir') . '/split_vcf_no_header', # contains the input files after being splitted (files without headers) + split_vcf_no_header_dir => $self->o('main_dir') . '/split_vcf_no_header', # contains the input files after being splitted (files without headers) split_vcf_input_dir => $self->o('main_dir') . '/split_vcf_input', # contains the splitted input vcf files with headers, these are the files used to run SpliceAI split_vcf_output_dir => $self->o('main_dir') . '/split_vcf_output', # temporary output files, still splitted output_dir => $self->o('main_dir') . '/output', # final output files already merged by chromosome fasta_file => $self->o('fasta_file'), gene_annotation => $self->o('gene_annotation'), - step_size => 4000, # number of variants used to split the main vcf files + step_size => 500000, # number of variants used to split the main vcf files check_transcripts => 0, # if set to 1 checks which are the new MANE Select transcripts for the last months and only calculates SpliceAI scores for these variants overlapping these transcripts transcripts_from_file => undef, - time_interval => 4, # checks which transcripts were updated/created in the last 4 months; only used if check_transcripts = 1 and we want to check the new transcripts in the core db - masked_scores => 1, # calculate masked scores + time_interval => 4, # checks which transcripts were updated/created in the last 4 months; only used if check_transcripts = 1 and we want to check the new transcripts in the core db + masked_scores => 1, # calculate masked scores registry => undef, # database where new MANE transcripts are going to be checked; only used if check_transcripts = 1 - output_file_name => 'spliceai_final_scores_', + output_file_name => 'spliceai_final_scores_' . $self->o('ensembl_release') . '_', - pipeline_wide_analysis_capacity => 500, + pipeline_wide_analysis_capacity => 80, pipeline_db => { -host => $self->o('hive_db_host'), @@ -85,8 +85,18 @@ sub resource_classes { my ($self) = @_; return { %{$self->SUPER::resource_classes}, - '8Gb_8c_job' => {'LSF' => '-n 8 -q production -R"select[mem>8000] rusage[mem=8000]" -M8000' }, - '4Gb_job' => {'LSF' => '-q production -R"select[mem>4000] rusage[mem=4000]" -M4000'}, + 'gpu' => { + 'LSF' => '-q gpu-a100 -gpu "num=1:gmem=80000"', # the queue gpu-a100 can access /hps/nobackup + 'SLURM' => '--time=5:00:00 --gres=gpu:a100:1 --mem=24G' + }, + '4Gb_job' => { + 'LSF' => '-q production -R"select[mem>4000] rusage[mem=4000]" -M4000', + 'SLURM' => "--partition=standard --time=4:00:00 --mem=4G" + }, + 'default' => { + 'LSF' => '-R"select[mem>1000] rusage[mem=1000]" -M1000', + 'SLURM' => "--partition=standard --time=1:00:00 --mem=2G" + } }; } @@ -99,11 +109,11 @@ sub pipeline_analyses { -input_ids => [{}], -parameters => { 'input_directory' => $self->o('input_directory'), - 'inputcmd' => 'find #input_directory# -type f -name "all_snps_ensembl_*.vcf" -printf "%f\n"', + 'inputcmd' => 'find #input_directory# -type f -name "all_snps_ensembl_*.vcf.gz" -printf "%f\n"', }, -flow_into => { '2->A' => {'split_files' => {'vcf_file' => '#_0#'}}, - 'A->1' => ['get_chr_dir'], + 'A->1' => ['get_chr_dir'] }, }, { -logic_name => 'split_files', @@ -151,7 +161,7 @@ sub pipeline_analyses { -hive_capacity => $self->o('pipeline_wide_analysis_capacity'), -analysis_capacity => $self->o('pipeline_wide_analysis_capacity'), -input_ids => [], - -rc_name => '8Gb_8c_job', + -rc_name => 'gpu', -parameters => { 'main_dir' => $self->o('main_dir'), 'split_vcf_input_dir' => $self->o('split_vcf_input_dir'), diff --git a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SplitFiles.pm b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SplitFiles.pm index 1a89d0be10..a67be4b5fc 100644 --- a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SplitFiles.pm +++ b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SplitFiles.pm @@ -59,12 +59,15 @@ sub run { $self->check_split_vcf_file(); } $self->split_vcf_file(); + + # create tmp dir where spliceai tmp files will be stored + $self->create_dir($self->param_required('main_dir') . '/tmp'); } sub set_chr_from_filename { my $self = shift; my $vcf_file = $self->param_required('vcf_file'); - $vcf_file =~ /.*_chr(.*)\.vcf$/; + $vcf_file =~ /.*_chr(.*)\.vcf.gz$/; my $chr = $1; if (!$chr) { die("Could not get chromosome name from file name ($vcf_file)."); @@ -82,8 +85,19 @@ sub split_vcf_file { my $split_vcf_output_dir = $self->param_required('split_vcf_output_dir'); my $step_size = $self->param_required('step_size'); + # If we are checking the transcripts then the input file is a VCF + # Remove .gz + if($check) { + $vcf_file =~ s/vcf.gz/vcf/; + } + my $vcf_file_path = $input_dir . '/' . $vcf_file; + if(!$check && $vcf_file =~ /vcf.gz/) { + $self->run_system_command("gunzip $vcf_file_path"); + $vcf_file_path =~ s/vcf.gz/vcf/; + } + if (! -d $input_dir) { die("Directory ($input_dir) doesn't exist"); } @@ -173,8 +187,13 @@ sub check_split_vcf_file { } # prepare input vcf file for tabix - $self->run_system_command("bgzip $vcf_file_path"); - $self->run_system_command("tabix -p vcf $vcf_file_path.gz"); + if($vcf_file !~ /vcf.gz/) { + $self->run_system_command("bgzip $vcf_file_path"); + $self->run_system_command("tabix -p vcf $vcf_file_path.gz"); + } + else { + $self->run_system_command("tabix -p vcf $vcf_file_path"); + } # Create directory to store the input file only with the variants of interest (overlapping new transcripts) my $vcf_file_path_subset = $main_dir . '/input_vcf_files_subset'; @@ -184,7 +203,10 @@ sub check_split_vcf_file { $self->param('input_dir_subset', $vcf_file_path_subset); - open(my $write, '>', $vcf_file_path_subset . '/' . $vcf_file) or die $!; + my $vcf_file_sub = $vcf_file; + $vcf_file_sub =~ s/.gz//; + + open(my $write, '>', $vcf_file_path_subset . '/' . $vcf_file_sub) or die $!; my $positions_of_interest = $transcripts->{$chr}; foreach my $position (@$positions_of_interest) { @@ -194,8 +216,8 @@ sub check_split_vcf_file { my $pos_string = sprintf("%s:%i-%i", $chr, $transcript_start, $transcript_end); - (open my $read, "tabix " . $input_dir . "/" . $vcf_file . ".gz $pos_string |") - || die "Failed to read from input vcf file " . $input_dir . "/" . $vcf_file . ".gz \n" ; + (open my $read, "tabix " . $input_dir . "/" . $vcf_file . " $pos_string |") + or die "Failed to read from input vcf file " . $input_dir . "/" . $vcf_file . " \n" ; while (my $row = <$read>) { chomp $row; @@ -212,8 +234,8 @@ sub check_split_vcf_file { close($write); # Sort new vcf file - my $vcf_file_subset = $vcf_file_path_subset . '/' . $vcf_file; - my $vcf_file_subset_sorted = $vcf_file_path_subset . '/sorted_' . $vcf_file; + my $vcf_file_subset = $vcf_file_path_subset . '/' . $vcf_file_sub; + my $vcf_file_subset_sorted = $vcf_file_path_subset . '/sorted_' . $vcf_file_sub; $self->run_system_command("sort -o $vcf_file_subset_sorted -k1,1 -k2,2n $vcf_file_subset"); $self->run_system_command("mv $vcf_file_subset_sorted $vcf_file_subset"); } @@ -291,11 +313,4 @@ sub get_new_transcripts_file { $self->param('transcripts', \%new_transcripts); } -sub count_lines { - my $self = shift; - my $filename = shift; - - -} - 1; diff --git a/scripts/python/spliceai_annotation_file.py b/scripts/python/spliceai_annotation_file.py new file mode 100644 index 0000000000..29afcb4e0a --- /dev/null +++ b/scripts/python/spliceai_annotation_file.py @@ -0,0 +1,187 @@ +#!/usr/bin/env python3 + +""" + Script to generate the gene annotation file for SpliceAI. + This file contains data about the transcript that is being used for each gene. + SpliceAI only annotates variants overlapping these transcripts. + + Template provided by SpliceAI: https://github.com/Illumina/SpliceAI/blob/master/spliceai/annotations/grch38.txt + + Gene annotation file format: + #NAME CHROM STRAND TX_START TX_END EXON_START EXON_END + KRTAP27-1 21 - 30337013 30337694 30337013, 30337694, + + Options: + --output_file gene annotation output file (Optional. Default: gene_annotation.txt) + --species species name (Optional. Default: homo_sapiens) + --assembly assembly version (Optional. Default: 38) + --host core database host (Mandatory) + --port host port (Mandatory) + --user host user (Mandatory) + --release core database version (Mandatory) +""" + +import argparse +import mysql.connector +from mysql.connector import Error + + +def fetch_transcripts(species, assembly, release, host, port, user): + gene_annotation = {} + database = f"{species}_core_{release}_{assembly}" + + # For human we select 'MANE_Select' transcripts and the gene name is a gene attrib + if species == "homo_sapiens": + sql_select = """ + SELECT ga.value,s.name,t.seq_region_strand,t.seq_region_start,t.seq_region_end, + e.seq_region_start,e.seq_region_end FROM transcript t + JOIN transcript_attrib ta ON t.transcript_id = ta.transcript_id + JOIN attrib_type atr ON ta.attrib_type_id = atr.attrib_type_id + JOIN seq_region s ON t.seq_region_id = s.seq_region_id + JOIN gene g ON g.gene_id = t.gene_id + JOIN gene_attrib ga ON g.gene_id = ga.gene_id + JOIN exon_transcript et ON t.transcript_id = et.transcript_id + JOIN exon e ON e.exon_id = et.exon_id + WHERE t.stable_id like 'ENST%' and t.biotype = 'protein_coding' + and ga.attrib_type_id = 4 and atr.code = 'MANE_Select' + order by ga.value,s.name,t.seq_region_start,t.seq_region_end,e.seq_region_start,e.seq_region_end + """ + else: + # For other species we select the canonical transcripts and the gene name is in xref + sql_select = """ + SELECT DISTINCT g.stable_id,s.name,t.seq_region_strand,t.seq_region_start,t.seq_region_end, + e.seq_region_start,e.seq_region_end FROM transcript t + JOIN transcript_attrib ta ON t.transcript_id = ta.transcript_id + JOIN attrib_type atr ON ta.attrib_type_id = atr.attrib_type_id + JOIN seq_region s ON t.seq_region_id = s.seq_region_id + JOIN gene g ON g.gene_id = t.gene_id + JOIN exon_transcript et ON t.transcript_id = et.transcript_id + JOIN exon e ON e.exon_id = et.exon_id + JOIN xref xr ON g.display_xref_id = xr.xref_id + WHERE t.stable_id like 'ENS%' and t.biotype = 'protein_coding' and atr.code = 'is_canonical' + order by xr.display_label,s.name,t.seq_region_start,t.seq_region_end,e.seq_region_start,e.seq_region_end + """ + + connection = mysql.connector.connect(host=host, + database=database, + user=user, + password='', + port=port) + + try: + if connection.is_connected(): + cursor = connection.cursor() + cursor.execute(sql_select) + data = cursor.fetchall() + for row in data: + strand = row[2] + if strand == 1: + strand = "+" + else: + strand = "-" + + if row[0] not in gene_annotation: + exons_start = [] + exons_end = [] + exons_start.append(str(row[5])) + exons_end.append(str(row[6])) + + gene_annotation[row[0]] = { + "chr": row[1], + "strand": strand, + "start": row[3], + "end": row[4], + "exons_start": exons_start, + "exons_end": exons_end + } + else: + gene_annotation[row[0]]["exons_start"].append(str(row[5])) + gene_annotation[row[0]]["exons_end"].append(str(row[6])) + + except Error as e: + print(f"Error while connecting to MySQL {database}", e) + finally: + if connection.is_connected(): + cursor.close() + connection.close() + + return gene_annotation + +def sanity_checks(transcripts_list): + fail_list = [] + + for gene, data in transcripts_list.items(): + check = 1 + + # transcript start > end + if data["start"] >= data["end"]: + check = 0 + + # the exon start has to be bigger than the last exon end + i = 0 + for exon_start in data["exons_start"]: + if i > 0: + last_exon_end = data["exons_end"][i - 1] + if(int(exon_start) < int(last_exon_end)): + check = 0 + + i += 1 + + if check == 0: + fail_list.append(gene) + + return fail_list + +def write_output(transcripts_list, output_file): + # Write to output file + with open(output_file, "w") as f: + f.write("#NAME\tCHROM\tSTRAND\tTX_START\tTX_END\tEXON_START\tEXON_END\n") + + for gene, data in transcripts_list.items(): + chr = data["chr"] + strand = data["strand"] + start = data["start"] + end = data["end"] + exons_start = (",").join(data["exons_start"]) + exons_end = (",").join(data["exons_end"]) + + f.write(f"{gene}\t{chr}\t{strand}\t{start}\t{end}\t{exons_start},\t{exons_end},\n") + + +def main(): + parser = argparse.ArgumentParser(description="Generate the gene annotation file for SpliceAI") + parser.add_argument("-o", "--output_file", + default="gene_annotation.txt", + help="output file (default: gene_annotation.txt)") + parser.add_argument("-sp", "--species", + default="homo_sapiens", + help="species (default: homo_sapiens)") + parser.add_argument("-a", "--assembly", + default="38", + help="species assembly (default: 38)") + parser.add_argument("-r", "--release", required=True) + parser.add_argument("--host", required=True) + parser.add_argument("--port", required=True) + parser.add_argument("--user", required=True) + args = parser.parse_args() + + output_file = args.output_file + species = args.species + assembly = args.assembly + release = args.release + host = args.host + port = args.port + user = args.user + + transcripts_list = fetch_transcripts(species, assembly, release, host, port, user) + + check = sanity_checks(transcripts_list) + + if not check: + write_output(transcripts_list, output_file) + else: + print("Sanity checks failed for the following genes: ", (", ").join(check)) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/tools/variant_simulator/simulate_variation b/tools/variant_simulator/simulate_variation index 582fb76d9b..203c8bfe9f 100755 --- a/tools/variant_simulator/simulate_variation +++ b/tools/variant_simulator/simulate_variation @@ -34,7 +34,7 @@ use Bio::EnsEMBL::Registry; use Getopt::Long qw(GetOptions); my ($help, $chromOnly, $gene, $registryFile, -$exonOnly, $intronOnly, $codingOnly,$onlyMane,$refseq_genes,$spliceai +$exonOnly, $intronOnly, $codingOnly, $canonicalOnly, $onlyMane, $refseq_genes, $spliceai ); my ($species, $assembly, $edges, $outFile) = ("homo_sapiens", "grch38", 0, "simulated.vcf"); @@ -47,6 +47,7 @@ GetOptions( "species=s" => \$species, "output|o=s" => \$outFile, "exonsOnly" => \$exonOnly, "codingOnly" => \$codingOnly, + "canonicalOnly" => \$canonicalOnly, "onlyMane" => \$onlyMane, "intronsOnly" => \$intronOnly, "assembly=s" => \$assembly, @@ -60,7 +61,7 @@ usage() if ($help); die "Illegal arguments -exonsOnly -intronsOnly can't be used together !" if $exonOnly && $intronOnly; die "Illegal arguments -intronsOnly -codingsOnly can't be used together !" if $codingOnly && $intronOnly; -die "Illegal arguments -onlyMane can only be used alone !" if $onlyMane && ($exonOnly || $codingOnly || $intronOnly); +die "Illegal arguments -onlyMane can only be used alone!" if $onlyMane && ($exonOnly || $codingOnly || $intronOnly); my $registry = 'Bio::EnsEMBL::Registry'; if (defined $registryFile){ @@ -88,11 +89,12 @@ my $slice_adaptor = $registry->get_adaptor( $species, $dbtype, 'slice' ); my $gene_adaptor = $registry->get_adaptor( $species, $dbtype, "gene" ); my $biotype = 'protein_coding'; +my %list_variants; my $FHO; open ($FHO, ">$outFile") or die "Can't open file to write: $outFile, $!\n"; -print_header($FHO, $assembly, $spliceai); +print_header($FHO, $assembly, $spliceai, $species, $chromOnly); my @geneIDs; if ($chromOnly || $gene){ #if chrom or gene specified then use the most stringent @@ -126,36 +128,24 @@ sub print_header { my $FHO = shift; my $assembly = shift; my $spliceai = shift; + my $species = shift; + my $chromOnly = shift; print $FHO "##fileformat=VCFv4.2", "\n"; + if(defined $spliceai) { - my $reference = $assembly eq "grch38" ? "GRCh38/hg38" : "GRCh37/hg19"; + my $reference = $assembly; + + if($species eq "homo_sapiens" || $species eq "human") { + $reference = $assembly eq "grch38" ? "GRCh38/hg38" : "GRCh37/hg19"; + } print $FHO "##reference=$reference", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; + + # Get chromosome size + my $chr_slice = $slice_adaptor->fetch_by_region( 'chromosome', $chromOnly); + my $length = $chr_slice->seq_region_length(); + + print $FHO "##contig=", "\n"; } else { print $FHO '##INFO=',"\n"; @@ -194,6 +184,13 @@ sub processGenes{ @parts = ($transcript); } } + } elsif ($canonicalOnly) { + my @transc = @{ $geneID->get_all_Transcripts() }; + while ( my $transcript = shift @transc ) { + if($transcript->is_canonical()) { + @parts = ($transcript); + } + } } else { @parts = ($geneID); @@ -211,6 +208,7 @@ sub generateSNPs { my $spliceai = shift; my @bases = qw/A C T G/; + while( my $part = shift (@$elements)){ my $start = $part->seq_region_start - $edges; my $pos = $start; @@ -220,8 +218,10 @@ sub generateSNPs { $partID = $part->display_id if $intronOnly; $partID = $part->display_id; my $seq = $slice_adaptor->fetch_by_region("chromosome", $chrom, $start ,$part->seq_region_end + $edges)->seq; + my @seqAr = split("", $seq); my ($id, $qual, $filter, $info) = (".", ".", ".", "."); + for ( my $i = 0; $i < @seqAr; $i++ ) { my $ref_seq = $seqAr[$i]; my @tmp_alts = grep {$_ ne $ref_seq} @bases; @@ -230,11 +230,18 @@ sub generateSNPs { $info = 'GENE='.$gene_name.';'.'FEATURE='.$partID; } while(my $tmp_alt = shift @tmp_alts) { + my $key = $chrom."-".$pos."-".$ref_seq."-".$tmp_alt; + if(!$spliceai) { - $id = $chrom."-".$pos."-".$ref_seq."-".$tmp_alt; + $id = $key; + } + + # write to file only if variant is unique + if(!$list_variants{$key}) { + $list_variants{$key} = 1; + printf $fho "%s\t%s\t%s\t%s\t%s\t", $chrom, $pos, $id, $ref_seq, $tmp_alt; + printf $fho "%s\t%s\t%s\n", $qual, $filter, $info; } - printf $fho "%s\t%s\t%s\t%s\t%s\t", $chrom, $pos, $id, $ref_seq, $tmp_alt; - printf $fho "%s\t%s\t%s\n", $qual, $filter, $info; } } } @@ -265,6 +272,7 @@ sub usage { -exonsOnly Generate SNPs only for all exons of the genes -intronsOnly Generate SNPs only for all introns of the genes -onlyMane Generate SNPs only for MANE transcripts + -canonicalOnly Generate SNPs only for canonical transcripts -edge Length of flanking region in which to generate SNPs (Default: 0)