Commit 1814c60
MikiSchikora committed Aug 30, 2023 at 18:07:05
1 parent c63fcc5
Showing 10 changed files with 2,248 additions and 115 deletions.
29 changes: 12 additions & 17 deletions installation/test_installation/test_installation_modules.py
@@ -69,7 +69,7 @@

# redefine the gff
gff = "%s/reduced_annotations.gff"%testing_outputs_dir
if fun.file_is_empty(gff): fun.run_cmd("cp %s %s"%(test_gff, gff))
if fun.file_is_empty(gff): fun.run_cmd("rsync %s %s"%(test_gff, gff))

# redefine the mutated genome location
mut_genome = "%s/mutated_genome.fasta"%testing_outputs_dir
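For readers outside the repo: fun is perSVade's sv_functions module, and neither file_is_empty nor run_cmd is shown in this diff. A minimal sketch of their assumed behavior (an assumption, not the real implementations), which also makes clear that the cp-to-rsync change above only swaps the copy command:

import os, subprocess

def file_is_empty(path):
    # assumed semantics: a missing file or a zero-byte file counts as "empty"
    return (not os.path.isfile(path)) or os.path.getsize(path) == 0

def run_cmd(cmd):
    # assumed semantics: run a shell command, raising CalledProcessError on failure
    subprocess.run(cmd, shell=True, check=True)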
@@ -146,7 +146,7 @@
if fun.file_is_empty("%s/perSVade_finished_file.txt"%outdir_call_coverage_per_gene): fun.run_cmd("%s get_cov_genes --threads %i --fraction_available_mem 1.0 -o %s --ref %s --min_chromosome_len 100 -gff %s -sbam %s/aligned_reads.bam.sorted"%(fun.perSVade_modules, threads, outdir_call_coverage_per_gene, ref_genome, gff, outdir_align_reads_SVs))

# perform various normal varcall ops per alignment
sorted_aligners = ["segemehl", "bowtie2_local", "bowtie2", "hisat2", "bwa_mem"]
sorted_aligners = ["segemehl", "bowtie2", "hisat2", "bwa_mem"] # bowtie2_local, hisat2_no_spliced
for aligner in sorted_aligners:
print("testing variant calling for %s..."%aligner)

@@ -162,32 +162,25 @@
outdir_small_vars = "%s/calling_small_vars_%s"%(testing_outputs_dir, aligner)
if fun.file_is_empty("%s/perSVade_finished_file.txt"%outdir_small_vars): fun.run_cmd("%s call_small_variants --threads %i --fraction_available_mem 1.0 --min_chromosome_len 100 -o %s -r %s -sbam %s/aligned_reads.bam.sorted --repeats_file skip -p 1 --callers bcftools,freebayes,HaplotypeCaller --min_AF 0.9 --min_coverage 0 --outdir_callCNVs %s"%(fun.perSVade_modules, threads, outdir_small_vars, Cglabrata_genome, outdir_align_reads, "%s/call_CNVs_Cglab_subsampledReads_%s"%(testing_outputs_dir, sorted_aligners[0])))

-dakhgdgajgad


# print the overlaps in variants
-comparisons = {}
+comparisons = set()
for aln1 in sorted_aligners:
    for aln2 in sorted_aligners:
        if aln1==aln2: continue
-        comparison_tuple = tuple(sorted(aln1, aln2))
+        comparison_tuple = tuple(sorted([aln1, aln2]))
        if comparison_tuple in comparisons: continue

        # load vars
-        min_nPASS = 2
-        vars1 = set(fun.get_df_and_header_from_vcf("%s/calling_small_vars_%s/.vcf"%(testing_outputs_dir, aln1))[0].ID)
-        vars2 = set(fun.get_df_and_header_from_vcf("%s/calling_small_vars_%s/.vcf"%(testing_outputs_dir, aln2))[0].ID)
+        min_nPASS = 3
+        vars1 = set(fun.get_df_and_header_from_vcf("%s/calling_small_vars_%s/variants_atLeast%iPASS_ploidy1.vcf"%(testing_outputs_dir, aln1, min_nPASS))[0].ID)
+        vars2 = set(fun.get_df_and_header_from_vcf("%s/calling_small_vars_%s/variants_atLeast%iPASS_ploidy1.vcf"%(testing_outputs_dir, aln2, min_nPASS))[0].ID)

        # report
        all_vars = vars1.union(vars2)
        overlap_vars = vars1.intersection(vars2)
-        print("NPASS>=%i. %s-vs-%s. %i/%i, %.2f%s"%(min_nPASS, aln1, aln2, len(overlap_vars), len(all_vars), (len(overlap_vars)/len(all_vars))*100, "%"))
+        #print("NPASS>=%i. %s-vs-%s. %i/%i, %.2f%s"%(min_nPASS, aln1, aln2, len(overlap_vars), len(all_vars), (len(overlap_vars)/len(all_vars))*100, "%"))
        comparisons.add(comparison_tuple)
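Both fixes in this hunk are real bug fixes rather than style changes: in Python 3, sorted(aln1, aln2) raises a TypeError because sorted() takes a single iterable, and comparisons = {} builds a dict, which has no .add method. A self-contained sketch of the corrected pairwise-overlap logic, with made-up variant IDs standing in for the VCF-derived ones:

# toy variant sets per aligner (placeholders for the IDs read from the VCFs)
aligner_to_vars = {
    "bwa_mem": {"chr1_100_A_T", "chr1_200_G_C", "chr1_300_T_G"},
    "bowtie2": {"chr1_100_A_T", "chr1_300_T_G"},
    "hisat2": {"chr1_100_A_T", "chr1_400_C_A"},
}

comparisons = set()  # a set of already-seen pairs, not a dict
for aln1 in aligner_to_vars:
    for aln2 in aligner_to_vars:
        if aln1 == aln2: continue
        pair = tuple(sorted([aln1, aln2]))  # sorted() needs one iterable argument
        if pair in comparisons: continue
        overlap = aligner_to_vars[aln1] & aligner_to_vars[aln2]
        union = aligner_to_vars[aln1] | aligner_to_vars[aln2]
        print("%s-vs-%s: %i/%i shared (%.2f%%)"%(pair[0], pair[1], len(overlap), len(union), 100.0*len(overlap)/len(union)))
        comparisons.add(pair)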

-adkjagdajdakgd
-HGFHGFHFHG

# annotate small variants
outdir_annotate_small_vars = "%s/annotate_small_vars"%testing_outputs_dir
fun.run_cmd("%s annotate_small_vars --threads %i --fraction_available_mem 1.0 -o %s --ref %s --min_chromosome_len 10000 --mitochondrial_chromosome mito_C_glabrata_CBS138 --merged_vcf %s/merged_vcfs_allVars_ploidy1.vcf -gff %s -mcode 3 -gcode 1"%(fun.perSVade_modules, threads, outdir_annotate_small_vars, ref_genome, outdir_small_vars, Cglabrata_annotations))
@@ -206,8 +199,10 @@

# annotate SVs
outdir_annotate_SVs = "%s/annotate_SVs"%testing_outputs_dir
fun.run_cmd("%s annotate_SVs --threads %i --fraction_available_mem 1.0 -o %s --ref %s --min_chromosome_len 10000 --mitochondrial_chromosome mito_C_glabrata_CBS138 --SV_CNV_vcf %s/SV_and_CNV_variant_calling.vcf -gff %s -mcode 3 -gcode 1"%(fun.perSVade_modules, threads, outdir_annotate_SVs, ref_genome, outdir_merged_calling, gff))

gff_all_Cglab_input = "%s/annots_ABmito.gff"%testing_inputs_dir
gff_all_Cglab = "%s/annots_ABmito.gff"%testing_outputs_dir
if fun.file_is_empty(gff_all_Cglab): fun.rsync_file(gff_all_Cglab_input, gff_all_Cglab)
fun.run_cmd("%s annotate_SVs --threads %i --fraction_available_mem 1.0 -o %s --ref %s --min_chromosome_len 10000 --mitochondrial_chromosome mito_C_glabrata_CBS138 --SV_CNV_vcf %s/SV_and_CNV_variant_calling.vcf -gff %s -mcode 3 -gcode 1 --replace"%(fun.perSVade_modules, threads, outdir_annotate_SVs, ref_genome, outdir_merged_calling, gff_all_Cglab))

# get homologous regions in C. albicans
outdir_homRegions = "%s/find_hom_regions"%testing_outputs_dir
1,965 changes: 1,965 additions & 0 deletions installation/test_installation/testing_inputs/annots_ABmito.gff

Large diffs are not rendered by default.

Binary file modified scripts/__pycache__/sv_functions.cpython-36.pyc
5 changes: 1 addition & 4 deletions scripts/align_reads
@@ -53,7 +53,7 @@ parser.add_argument("--min_chromosome_len", dest="min_chromosome_len", default=1
parser.add_argument("--skip_marking_duplicates", dest="skip_marking_duplicates", default=False, action="store_true", help="Don't mark the duplicate reads in the .bam file. This can save you some time.")
parser.add_argument("--replace", dest="replace", action="store_true", help="Re-run all the steps by deleting the output directory.")
parser.add_argument("--verbose", dest="verbose", action="store_true", default=False, help="Print a verbose log.")
parser.add_argument("--aligner", dest="aligner", default='bwa_mem', type=str, help="The aligner algorithm for the short reads. It should map the used to generate the --sortedbam. It can be any of 'hisat2', 'segemehl', 'bowtie2', 'bowtie2_local', 'bwa_mem'. Most of the testing of perSVade was done on bwa_mem. For 'hisat2', 'segemehl', the bam files get the secondary alignments removed. In addition, for segemehl the bam file will not contain unmapped reads, as these create compatibility problems.")
parser.add_argument("--aligner", dest="aligner", default='bwa_mem', type=str, help="The aligner algorithm for the short reads. It should map the used to generate the --sortedbam. It can be any of 'hisat2', 'hisat2_no_spliced', 'segemehl', 'bowtie2', 'bowtie2_local', 'bwa_mem'. Most of the testing of perSVade was done on bwa_mem. For 'hisat2', 'hisat2_no_spliced', 'segemehl', the bam files get the secondary alignments removed. In addition, for segemehl the bam file will not contain unmapped reads, as these create compatibility problems.")

# resources
parser.add_argument("--fraction_available_mem", dest="fraction_available_mem", default=None, type=float, help="This pipeline calculates the available RAM for several steps, and it may not work well in some systems (i.e. HPC clusters). This parameter allows you to correct possible errors. If --fraction_available_mem is not provided (default behavior), this pipeline will calculate the available RAM by filling the memory, which may give errors. If you want to use all the available memory you should specify --fraction_available_mem 1.0. See the FAQ 'How does the --fraction_available_mem work?' from https://github.com/Gabaldonlab/perSVade/wiki/8.-FAQs for more info.")
@@ -139,9 +139,6 @@ index_bam = "%s.bai"%sorted_bam
if opt.skip_marking_duplicates is True: bwa_mem_MarkDuplicates = False
else: bwa_mem_MarkDuplicates = True

-# check the aligner
-if opt.aligner not in {"hisat2", "segemehl", "bowtie2", "bwa_mem", "bowtie2_local"}: raise ValueError("invalid aligner %s"%opt.aligner)

# check that the reads are different (this generated errors in the insert size calculation)
fun.check_that_paired_reads_are_different(opt.fastq1, opt.fastq2, "%s/checking_that_reads_are_different"%opt.outdir)
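With the explicit membership check above deleted, the aligner string is presumably validated elsewhere. An alternative worth noting (an assumption, not what this commit does) is to let argparse enforce the whitelist itself:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--aligner", dest="aligner", default="bwa_mem", type=str,
                    choices=["hisat2", "hisat2_no_spliced", "segemehl", "bowtie2", "bowtie2_local", "bwa_mem"],
                    help="The aligner algorithm for the short reads.")
print(parser.parse_args(["--aligner", "segemehl"]).aligner)  # invalid values now fail at parse time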

3 changes: 2 additions & 1 deletion scripts/call_small_variants
@@ -71,6 +71,7 @@ parser.add_argument("-c", "--min_coverage", dest="min_coverage", required=True,
parser.add_argument("--pooled_sequencing", dest="pooled_sequencing", action="store_true", default=False, help="It runs variant calling assuming that the input is a pool of different strains (i.e. a population). If provided, the only caller used will be 'freebayes'. In addition, note that the only positions considered for variant calling will be those with at least one allele with more reads than --min_coverage.")
parser.add_argument("--stop_before_VariantIntegration", dest="stop_before_VariantIntegration", action="store_true", default=False, help="It stops the execution after the 'Variant calling' step.")
parser.add_argument("--window_freebayes_bp", dest="window_freebayes_bp", default=10000, type=int, help="The window (in bp) in which freebayes regions are split to run in parallel. If you increase this number the splitting will be in larger chunks of the genome.")
parser.add_argument("--fb_use_best_n_alleles", dest="fb_use_best_n_alleles", default=20, type=int, help="The maximum number of alleles considered by freebayes in pooled mode. Default is 20.")
parser.add_argument("--min_chromosome_len", dest="min_chromosome_len", default=100000, type=int, help="The minimum length to consider chromosomes from the provided fasta for calculating the window length (used in may steps of perSVade to parallelize across fractions of the genome).")
parser.add_argument("--outdir_callCNVs", dest="outdir_callCNVs", default=None, help="The path to the output directory of the 'call_CNVs' module. If provided, this module will add a 'relative_CN' column to the file 'variant_calling_ploidy<ploidy>.tab', which includes the relative copy number of the region in which each variant is found. This may be used to filter out variants that are in certain regions (i.e. duplicated or deleted).")
parser.add_argument("--replace", dest="replace", action="store_true", help="Re-run all the steps by deleting the output directory.")
@@ -251,7 +252,7 @@ if "freebayes" in callers:
outdir_freebayes = "%s/freebayes_ploidy%i_out"%(opt.outdir, opt.ploidy)

# run freebayes in normal configuration, in parallel for each chromosome
-freebayes_filtered = fun.run_freebayes_parallel_regions(outdir_freebayes, opt.ref, sorted_bam, opt.ploidy, opt.min_coverage, replace=opt.replace, threads=opt.threads, pooled_sequencing=opt.pooled_sequencing, window_fb=opt.window_freebayes_bp)
+freebayes_filtered = fun.run_freebayes_parallel_regions(outdir_freebayes, opt.ref, sorted_bam, opt.ploidy, opt.min_coverage, replace=opt.replace, threads=opt.threads, pooled_sequencing=opt.pooled_sequencing, window_fb=opt.window_freebayes_bp, fb_use_best_n_alleles=opt.fb_use_best_n_alleles)

# keep
filtered_vcf_results.append(freebayes_filtered)
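freebayes itself exposes this cap as --use-best-n-alleles. The body of run_freebayes_parallel_regions is not part of this diff, so the following is only a sketch of how the new keyword argument might reach the freebayes command line (the wrapper is hypothetical; the freebayes flags are standard):

import subprocess

def run_freebayes_region(ref, bam, region, ploidy, min_coverage, out_vcf, fb_use_best_n_alleles=20):
    # hypothetical per-region worker; -f, -p, -r, --min-coverage and
    # --use-best-n-alleles are standard freebayes options
    cmd = ["freebayes", "-f", ref, "-p", str(ploidy),
           "--min-coverage", str(min_coverage),
           "--use-best-n-alleles", str(fb_use_best_n_alleles),
           "-r", region, bam]
    with open(out_vcf, "w") as out_fh:
        subprocess.run(cmd, stdout=out_fh, check=True)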
10 changes: 7 additions & 3 deletions scripts/create_random_simulatedSVgenome.R
@@ -26,6 +26,8 @@ argp = add_argument(argp, "--number_Tra", default=0, help="The number of translo
argp = add_argument(argp, "--number_Dup", default=0, help="The number of duplications to generate")

argp = add_argument(argp, "--len_shortest_chr", default=1000, help="Len of the shortest chrom")
argp = add_argument(argp, "--max_max_time_rearrangement", default=100000, help="Maximum max_time_rearrangement tried")
argp = add_argument(argp, "--max_fraction_shortest_chr_to_consider", default=1.0, help="Maximum fraction tried")
argp = add_argument(argp, "--percCopiedIns", default=0.5, help="The fraction of INS that are copy-and-paste")
argp = add_argument(argp, "--percBalancedTrans", default=1, help="The fraction of TRA that are balanced")
#argp = add_argument(argp, "--replace", flag=TRUE, default=FALSE, help="Replace genomes that a")
@@ -54,14 +56,16 @@ rearranged_genome_generated = FALSE # a tag that defines that the rearranged gen

# define the fraction of the shortest chromosome that will be considered
all_fraction_shortest_chr_to_consider = c(0.1, 0.07, 0.05, 0.03, 0.02, 0.01, 0.005)
+all_fraction_shortest_chr_to_consider = all_fraction_shortest_chr_to_consider[all_fraction_shortest_chr_to_consider <= argv$max_fraction_shortest_chr_to_consider]
#all_fraction_shortest_chr_to_consider = c(0.005)

# define the fraction of nevents that will be considered
all_fraction_n_events = c(1, 0.8, 0.7, 0.5, 0.3, 0.2, 0.1, 0.05, 0.01, 0.05)
#all_fraction_n_events = c(1)

-# define the max_time_rearrangement
-all_max_time_rearrangement = c(200, 300, 400, 500, 600, 700, 800, 900, 1000, 2000, 3000, 4000, 5000, 10000)
+# define the max_time_rearrangement and filter
+all_max_time_rearrangement = c(100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 2000, 3000, 4000, 5000, 10000)
+all_max_time_rearrangement = all_max_time_rearrangement[all_max_time_rearrangement <= argv$max_max_time_rearrangement]

# first iterate through the number of events. It is more important to get as many events as possible
for (fraction_n_events in all_fraction_n_events) {
@@ -112,7 +116,7 @@ for (fraction_n_events in all_fraction_n_events) {
if (class(rearranged_genome)[1]=="DNAStringSet"){ break }

}

+# at the end check that the genome has been created
if (!rearranged_genome_generated){stop("It has not been possible to generate the rearranged genome. You may try different combinations of fraction_shortest_chr_to_consider")}
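These loops implement a retry ladder: keep as many simulated events as possible, then progressively relax the chromosome fraction and the rearrangement time budget until the simulator returns a DNAStringSet. The same control flow, sketched in Python with a stubbed simulator (try_simulation is hypothetical, and the nesting of the two inner loops is assumed):

def try_simulation(n_events_frac, chr_frac, max_time):
    # stub: in the R script this is the actual genome-rearrangement call
    return max_time >= 500  # pretend only generous time budgets succeed

done = False
for n_events_frac in [1, 0.8, 0.7, 0.5, 0.3, 0.2, 0.1, 0.05, 0.01]:  # keep events first
    for chr_frac in [0.1, 0.07, 0.05, 0.03, 0.02, 0.01, 0.005]:      # then relax the fraction
        for max_time in [100, 200, 500, 1000, 10000]:                # then relax the time budget
            if try_simulation(n_events_frac, chr_frac, max_time):
                done = True
                break
        if done: break
    if done: break

if not done: raise RuntimeError("could not generate the rearranged genome")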

15 changes: 8 additions & 7 deletions scripts/get_stats_optimization
@@ -50,11 +50,11 @@ parser.add_argument("-o", "--outdir", dest="outdir", action="store", required=Tr
parser.add_argument("--samples_file", dest="samples_file", action="store", required=True, help="Tab-sepparated csv-file with 'sampleID' and 'sorted_bam' columns. The 'sorted_bam' column should have the paths to the bam files to analyze.")
parser.add_argument("-r", "--ref", dest="ref", required=True, help="Reference genome. It has to end with .fasta.")
parser.add_argument("-mchr", "--mitochondrial_chromosome", dest="mitochondrial_chromosome", required=True, type=str, help="The name of the mitochondrial chromosome. If there is no mitochondria just put 'no_mitochondria'. If there is more than one mitochindrial scaffold, provide them as comma-sepparated IDs, like '--mitochondrial_chromosome chr_mito_1,chr_mito_2'.")

parser.add_argument("--overlap_coverage", dest="overlap_coverage", default=10, type=int, help="The maximum difference in coverage, in %, to say that two samples have the same coverage.")
parser.add_argument("--overlap_insert_size", dest="overlap_insert_size", default=10, type=int, help="The maximum difference in insert size, in %, to say that two samples have the same insert size.")
parser.add_argument("--overlap_insert_size_sd", dest="overlap_insert_size_sd", default=10, type=int, help="The maximum difference in insert size, in %, SD to say that two samples have the same insert size SD.")
parser.add_argument("--overlap_read_len", dest="overlap_read_len", default=10, type=int, help="The maximum difference in read length, in %, to say that two samples have the same read length.")
parser.add_argument("--overlap_coverage", dest="overlap_coverage", default=10, type=int, help="The maximum difference in coverage, in percentage, to say that two samples have the same coverage.")
parser.add_argument("--overlap_coverage_mtDNA", dest="overlap_coverage_mtDNA", default=None, type=int, help="The maximum difference in coverage, in percentage, to say that two samples have the same coverage in terms of mtDNA. By default it's None, meaning that its the same as overlap_coverage, but it can be set.")
parser.add_argument("--overlap_insert_size", dest="overlap_insert_size", default=10, type=int, help="The maximum difference in insert size, in percentage, to say that two samples have the same insert size.")
parser.add_argument("--overlap_insert_size_sd", dest="overlap_insert_size_sd", default=10, type=int, help="The maximum difference in insert size, in percentage, SD to say that two samples have the same insert size SD.")
parser.add_argument("--overlap_read_len", dest="overlap_read_len", default=10, type=int, help="The maximum difference in read length, in percentage, to say that two samples have the same read length.")

# optional args
parser.add_argument("--replace", dest="replace", action="store_true", help="Re-run all the steps by deleting the output directory.")
@@ -163,7 +163,8 @@ df_parameters = df_parameters.sort_values(by=["gDNA_median_coverage", "mtDNA_med

# define the pairs of similar samples according to the overlapping distances parameters
fun.print_with_runtime("Calculating pairs of strains that have similar parameters")
-field_to_overlapping_distance = {"gDNA_median_coverage":opt.overlap_coverage, "mtDNA_median_coverage":opt.overlap_coverage, "read_length":opt.overlap_read_len, "median_insert_size":opt.overlap_insert_size, "median_insert_size_sd":opt.overlap_insert_size_sd,}
+if opt.overlap_coverage_mtDNA is None: opt.overlap_coverage_mtDNA = opt.overlap_coverage
+field_to_overlapping_distance = {"gDNA_median_coverage":opt.overlap_coverage, "mtDNA_median_coverage":opt.overlap_coverage_mtDNA, "read_length":opt.overlap_read_len, "median_insert_size":opt.overlap_insert_size, "median_insert_size_sd":opt.overlap_insert_size_sd,}

pairs_similar_samples = set()
for s1 in sorted_samples:
@@ -205,7 +206,7 @@ for s in sorted_samples:

# create final file with all data
df_parameters["representative_sampleID"] = df_parameters.sampleID.apply(lambda x: sampleID_to_repSample[x])
fun.save_df_as_tab(df_parameters[["sampleID", "representative_sampleID", "gDNA_median_coverage", "mtDNA_median_coverage", "read_length", "median_insert_size", "median_insert_size_sd"]], "%s/samples_parameter_optimization.tab"%opt.outdir)
fun.save_df_as_tab(df_parameters.reset_index(drop=True)[["representative_sampleID", "sampleID", "gDNA_median_coverage", "mtDNA_median_coverage", "read_length", "median_insert_size", "median_insert_size_sd"]].sort_values(by=["representative_sampleID", "gDNA_median_coverage", "mtDNA_median_coverage", "read_length", "median_insert_size", "median_insert_size_sd", "sampleID"]), "%s/samples_parameter_optimization.tab"%opt.outdir)
fun.print_with_runtime("There are %i/%i representative samples"%(len(set(df_parameters.representative_sampleID)), len(sorted_samples)))

#################################
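The distance test behind pairs_similar_samples is not visible in this diff; one plausible reading (assumed semantics, using per-field percentage thresholds like those defined above) is that two samples are similar when every statistic differs by at most its threshold:

field_to_pct = {"gDNA_median_coverage": 10, "mtDNA_median_coverage": 10,
                "read_length": 10, "median_insert_size": 10, "median_insert_size_sd": 10}

def samples_are_similar(row1, row2, field_to_pct=field_to_pct):
    # assumed: percentage difference taken relative to the larger of the two values
    for field, max_pct in field_to_pct.items():
        denom = max(row1[field], row2[field])
        if denom == 0: continue  # both values are zero: identical for this field
        if 100.0 * abs(row1[field] - row2[field]) / denom > max_pct: return False
    return True

s1 = {"gDNA_median_coverage": 50, "mtDNA_median_coverage": 500, "read_length": 100, "median_insert_size": 300, "median_insert_size_sd": 50}
s2 = {"gDNA_median_coverage": 52, "mtDNA_median_coverage": 520, "read_length": 100, "median_insert_size": 310, "median_insert_size_sd": 52}
print(samples_are_similar(s1, s2))  # True under these thresholds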
