Commit 1814c60
MikiSchikora committed Aug 30, 2023 at 18:07:05
1 parent c63fcc5
Showing 10 changed files with 2,248 additions and 115 deletions.
29 changes: 12 additions & 17 deletions installation/test_installation/test_installation_modules.py
@@ -69,7 +69,7 @@

# redefine the gff
gff = "%s/reduced_annotations.gff"%testing_outputs_dir
if fun.file_is_empty(gff): fun.run_cmd("cp %s %s"%(test_gff, gff))
if fun.file_is_empty(gff): fun.run_cmd("rsync %s %s"%(test_gff, gff))

# redefine the mutated genome location
mut_genome = "%s/mutated_genome.fasta"%testing_outputs_dir
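For readers outside the repo: fun is perSVade's sv_functions module, and neither file_is_empty nor run_cmd is shown in this diff. A minimal sketch of their assumed behavior (an assumption, not the real implementations), which also makes clear that the cp-to-rsync change above only swaps the copy command:

import os, subprocess

def file_is_empty(path):
    # assumed semantics: a missing file or a zero-byte file counts as "empty"
    return (not os.path.isfile(path)) or os.path.getsize(path) == 0

def run_cmd(cmd):
    # assumed semantics: run a shell command, raising CalledProcessError on failure
    subprocess.run(cmd, shell=True, check=True)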
@@ -146,7 +146,7 @@
if fun.file_is_empty("%s/perSVade_finished_file.txt"%outdir_call_coverage_per_gene): fun.run_cmd("%s get_cov_genes --threads %i --fraction_available_mem 1.0 -o %s --ref %s --min_chromosome_len 100 -gff %s -sbam %s/aligned_reads.bam.sorted"%(fun.perSVade_modules, threads, outdir_call_coverage_per_gene, ref_genome, gff, outdir_align_reads_SVs))

# perform various normal varcall ops per alignment
sorted_aligners = ["segemehl", "bowtie2_local", "bowtie2", "hisat2", "bwa_mem"]
sorted_aligners = ["segemehl", "bowtie2", "hisat2", "bwa_mem"] # bowtie2_local, hisat2_no_spliced
for aligner in sorted_aligners:
print("testing variant calling for %s..."%aligner)

@@ -162,32 +162,25 @@
outdir_small_vars = "%s/calling_small_vars_%s"%(testing_outputs_dir, aligner)
if fun.file_is_empty("%s/perSVade_finished_file.txt"%outdir_small_vars): fun.run_cmd("%s call_small_variants --threads %i --fraction_available_mem 1.0 --min_chromosome_len 100 -o %s -r %s -sbam %s/aligned_reads.bam.sorted --repeats_file skip -p 1 --callers bcftools,freebayes,HaplotypeCaller --min_AF 0.9 --min_coverage 0 --outdir_callCNVs %s"%(fun.perSVade_modules, threads, outdir_small_vars, Cglabrata_genome, outdir_align_reads, "%s/call_CNVs_Cglab_subsampledReads_%s"%(testing_outputs_dir, sorted_aligners[0])))

-dakhgdgajgad


# print the overlaps in variants
-comparisons = {}
+comparisons = set()
for aln1 in sorted_aligners:
    for aln2 in sorted_aligners:
        if aln1==aln2: continue
-        comparison_tuple = tuple(sorted(aln1, aln2))
+        comparison_tuple = tuple(sorted([aln1, aln2]))
        if comparison_tuple in comparisons: continue

        # load vars
-        min_nPASS = 2
-        vars1 = set(fun.get_df_and_header_from_vcf("%s/calling_small_vars_%s/.vcf"%(testing_outputs_dir, aln1))[0].ID)
-        vars2 = set(fun.get_df_and_header_from_vcf("%s/calling_small_vars_%s/.vcf"%(testing_outputs_dir, aln2))[0].ID)
+        min_nPASS = 3
+        vars1 = set(fun.get_df_and_header_from_vcf("%s/calling_small_vars_%s/variants_atLeast%iPASS_ploidy1.vcf"%(testing_outputs_dir, aln1, min_nPASS))[0].ID)
+        vars2 = set(fun.get_df_and_header_from_vcf("%s/calling_small_vars_%s/variants_atLeast%iPASS_ploidy1.vcf"%(testing_outputs_dir, aln2, min_nPASS))[0].ID)

        # report
        all_vars = vars1.union(vars2)
        overlap_vars = vars1.intersection(vars2)
-        print("NPASS>=%i. %s-vs-%s. %i/%i, %.2f%s"%(min_nPASS, aln1, aln2, len(overlap_vars), len(all_vars), (len(overlap_vars)/len(all_vars))*100, "%"))
+        #print("NPASS>=%i. %s-vs-%s. %i/%i, %.2f%s"%(min_nPASS, aln1, aln2, len(overlap_vars), len(all_vars), (len(overlap_vars)/len(all_vars))*100, "%"))
        comparisons.add(comparison_tuple)
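Both fixes in this hunk are real bug fixes rather than style changes: in Python 3, sorted(aln1, aln2) raises a TypeError because sorted() takes a single iterable, and comparisons = {} builds a dict, which has no .add method. A self-contained sketch of the corrected pairwise-overlap logic, with made-up variant IDs standing in for the VCF-derived ones:

# toy variant sets per aligner (placeholders for the IDs read from the VCFs)
aligner_to_vars = {
    "bwa_mem": {"chr1_100_A_T", "chr1_200_G_C", "chr1_300_T_G"},
    "bowtie2": {"chr1_100_A_T", "chr1_300_T_G"},
    "hisat2": {"chr1_100_A_T", "chr1_400_C_A"},
}

comparisons = set()  # a set of already-seen pairs, not a dict
for aln1 in aligner_to_vars:
    for aln2 in aligner_to_vars:
        if aln1 == aln2: continue
        pair = tuple(sorted([aln1, aln2]))  # sorted() needs one iterable argument
        if pair in comparisons: continue
        overlap = aligner_to_vars[aln1] & aligner_to_vars[aln2]
        union = aligner_to_vars[aln1] | aligner_to_vars[aln2]
        print("%s-vs-%s: %i/%i shared (%.2f%%)"%(pair[0], pair[1], len(overlap), len(union), 100.0*len(overlap)/len(union)))
        comparisons.add(pair)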

-adkjagdajdakgd
-HGFHGFHFHG

# annotate small variants
outdir_annotate_small_vars = "%s/annotate_small_vars"%testing_outputs_dir
fun.run_cmd("%s annotate_small_vars --threads %i --fraction_available_mem 1.0 -o %s --ref %s --min_chromosome_len 10000 --mitochondrial_chromosome mito_C_glabrata_CBS138 --merged_vcf %s/merged_vcfs_allVars_ploidy1.vcf -gff %s -mcode 3 -gcode 1"%(fun.perSVade_modules, threads, outdir_annotate_small_vars, ref_genome, outdir_small_vars, Cglabrata_annotations))
@@ -206,8 +199,10 @@

# annotate SVs
outdir_annotate_SVs = "%s/annotate_SVs"%testing_outputs_dir
fun.run_cmd("%s annotate_SVs --threads %i --fraction_available_mem 1.0 -o %s --ref %s --min_chromosome_len 10000 --mitochondrial_chromosome mito_C_glabrata_CBS138 --SV_CNV_vcf %s/SV_and_CNV_variant_calling.vcf -gff %s -mcode 3 -gcode 1"%(fun.perSVade_modules, threads, outdir_annotate_SVs, ref_genome, outdir_merged_calling, gff))

gff_all_Cglab_input = "%s/annots_ABmito.gff"%testing_inputs_dir
gff_all_Cglab = "%s/annots_ABmito.gff"%testing_outputs_dir
if fun.file_is_empty(gff_all_Cglab): fun.rsync_file(gff_all_Cglab_input, gff_all_Cglab)
fun.run_cmd("%s annotate_SVs --threads %i --fraction_available_mem 1.0 -o %s --ref %s --min_chromosome_len 10000 --mitochondrial_chromosome mito_C_glabrata_CBS138 --SV_CNV_vcf %s/SV_and_CNV_variant_calling.vcf -gff %s -mcode 3 -gcode 1 --replace"%(fun.perSVade_modules, threads, outdir_annotate_SVs, ref_genome, outdir_merged_calling, gff_all_Cglab))

# get homologous regions in C. albicans
outdir_homRegions = "%s/find_hom_regions"%testing_outputs_dir
1,965 changes: 1,965 additions & 0 deletions installation/test_installation/testing_inputs/annots_ABmito.gff

Large diffs are not rendered by default.

Binary file modified scripts/__pycache__/sv_functions.cpython-36.pyc
5 changes: 1 addition & 4 deletions scripts/align_reads
@@ -53,7 +53,7 @@ parser.add_argument("--min_chromosome_len", dest="min_chromosome_len", default=1
parser.add_argument("--skip_marking_duplicates", dest="skip_marking_duplicates", default=False, action="store_true", help="Don't mark the duplicate reads in the .bam file. This can save you some time.")
parser.add_argument("--replace", dest="replace", action="store_true", help="Re-run all the steps by deleting the output directory.")
parser.add_argument("--verbose", dest="verbose", action="store_true", default=False, help="Print a verbose log.")
parser.add_argument("--aligner", dest="aligner", default='bwa_mem', type=str, help="The aligner algorithm for the short reads. It should map the used to generate the --sortedbam. It can be any of 'hisat2', 'segemehl', 'bowtie2', 'bowtie2_local', 'bwa_mem'. Most of the testing of perSVade was done on bwa_mem. For 'hisat2', 'segemehl', the bam files get the secondary alignments removed. In addition, for segemehl the bam file will not contain unmapped reads, as these create compatibility problems.")
parser.add_argument("--aligner", dest="aligner", default='bwa_mem', type=str, help="The aligner algorithm for the short reads. It should map the used to generate the --sortedbam. It can be any of 'hisat2', 'hisat2_no_spliced', 'segemehl', 'bowtie2', 'bowtie2_local', 'bwa_mem'. Most of the testing of perSVade was done on bwa_mem. For 'hisat2', 'hisat2_no_spliced', 'segemehl', the bam files get the secondary alignments removed. In addition, for segemehl the bam file will not contain unmapped reads, as these create compatibility problems.")

# resources
parser.add_argument("--fraction_available_mem", dest="fraction_available_mem", default=None, type=float, help="This pipeline calculates the available RAM for several steps, and it may not work well in some systems (i.e. HPC clusters). This parameter allows you to correct possible errors. If --fraction_available_mem is not provided (default behavior), this pipeline will calculate the available RAM by filling the memory, which may give errors. If you want to use all the available memory you should specify --fraction_available_mem 1.0. See the FAQ 'How does the --fraction_available_mem work?' from https://github.com/Gabaldonlab/perSVade/wiki/8.-FAQs for more info.")
@@ -139,9 +139,6 @@ index_bam = "%s.bai"%sorted_bam
if opt.skip_marking_duplicates is True: bwa_mem_MarkDuplicates = False
else: bwa_mem_MarkDuplicates = True

-# check the aligner
-if opt.aligner not in {"hisat2", "segemehl", "bowtie2", "bwa_mem", "bowtie2_local"}: raise ValueError("invalid aligner %s"%opt.aligner)

# check that the reads are different (this generated errors in the insert size calculation)
fun.check_that_paired_reads_are_different(opt.fastq1, opt.fastq2, "%s/checking_that_reads_are_different"%opt.outdir)
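With the explicit membership check above deleted, the aligner string is presumably validated elsewhere. An alternative worth noting (an assumption, not what this commit does) is to let argparse enforce the whitelist itself:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--aligner", dest="aligner", default="bwa_mem", type=str,
                    choices=["hisat2", "hisat2_no_spliced", "segemehl", "bowtie2", "bowtie2_local", "bwa_mem"],
                    help="The aligner algorithm for the short reads.")
print(parser.parse_args(["--aligner", "segemehl"]).aligner)  # invalid values now fail at parse time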

3 changes: 2 additions & 1 deletion scripts/call_small_variants
@@ -71,6 +71,7 @@ parser.add_argument("-c", "--min_coverage", dest="min_coverage", required=True,
parser.add_argument("--pooled_sequencing", dest="pooled_sequencing", action="store_true", default=False, help="It runs variant calling assuming that the input is a pool of different strains (i.e. a population). If provided, the only caller used will be 'freebayes'. In addition, note that the only positions considered for variant calling will be those with at least one allele with more reads than --min_coverage.")
parser.add_argument("--stop_before_VariantIntegration", dest="stop_before_VariantIntegration", action="store_true", default=False, help="It stops the execution after the 'Variant calling' step.")
parser.add_argument("--window_freebayes_bp", dest="window_freebayes_bp", default=10000, type=int, help="The window (in bp) in which freebayes regions are split to run in parallel. If you increase this number the splitting will be in larger chunks of the genome.")
parser.add_argument("--fb_use_best_n_alleles", dest="fb_use_best_n_alleles", default=20, type=int, help="The maximum number of alleles considered by freebayes in pooled mode. Default is 20.")
parser.add_argument("--min_chromosome_len", dest="min_chromosome_len", default=100000, type=int, help="The minimum length to consider chromosomes from the provided fasta for calculating the window length (used in may steps of perSVade to parallelize across fractions of the genome).")
parser.add_argument("--outdir_callCNVs", dest="outdir_callCNVs", default=None, help="The path to the output directory of the 'call_CNVs' module. If provided, this module will add a 'relative_CN' column to the file 'variant_calling_ploidy<ploidy>.tab', which includes the relative copy number of the region in which each variant is found. This may be used to filter out variants that are in certain regions (i.e. duplicated or deleted).")
parser.add_argument("--replace", dest="replace", action="store_true", help="Re-run all the steps by deleting the output directory.")
@@ -251,7 +252,7 @@ if "freebayes" in callers:
outdir_freebayes = "%s/freebayes_ploidy%i_out"%(opt.outdir, opt.ploidy)

# run freebayes in normal configuration, in parallel for each chromosome
-freebayes_filtered = fun.run_freebayes_parallel_regions(outdir_freebayes, opt.ref, sorted_bam, opt.ploidy, opt.min_coverage, replace=opt.replace, threads=opt.threads, pooled_sequencing=opt.pooled_sequencing, window_fb=opt.window_freebayes_bp)
+freebayes_filtered = fun.run_freebayes_parallel_regions(outdir_freebayes, opt.ref, sorted_bam, opt.ploidy, opt.min_coverage, replace=opt.replace, threads=opt.threads, pooled_sequencing=opt.pooled_sequencing, window_fb=opt.window_freebayes_bp, fb_use_best_n_alleles=opt.fb_use_best_n_alleles)

# keep
filtered_vcf_results.append(freebayes_filtered)
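freebayes itself exposes this cap as --use-best-n-alleles. The body of run_freebayes_parallel_regions is not part of this diff, so the following is only a sketch of how the new keyword argument might reach the freebayes command line (the wrapper is hypothetical; the freebayes flags are standard):

import subprocess

def run_freebayes_region(ref, bam, region, ploidy, min_coverage, out_vcf, fb_use_best_n_alleles=20):
    # hypothetical per-region worker; -f, -p, -r, --min-coverage and
    # --use-best-n-alleles are standard freebayes options
    cmd = ["freebayes", "-f", ref, "-p", str(ploidy),
           "--min-coverage", str(min_coverage),
           "--use-best-n-alleles", str(fb_use_best_n_alleles),
           "-r", region, bam]
    with open(out_vcf, "w") as out_fh:
        subprocess.run(cmd, stdout=out_fh, check=True)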
10 changes: 7 additions & 3 deletions scripts/create_random_simulatedSVgenome.R
@@ -26,6 +26,8 @@ argp = add_argument(argp, "--number_Tra", default=0, help="The number of translo
argp = add_argument(argp, "--number_Dup", default=0, help="The number of duplications to generate")

argp = add_argument(argp, "--len_shortest_chr", default=1000, help="Len of the shortest chrom")
argp = add_argument(argp, "--max_max_time_rearrangement", default=100000, help="Maximum max_time_rearrangement tried")
argp = add_argument(argp, "--max_fraction_shortest_chr_to_consider", default=1.0, help="Maximum fraction tried")
argp = add_argument(argp, "--percCopiedIns", default=0.5, help="The fraction of INS that are copy-and-paste")
argp = add_argument(argp, "--percBalancedTrans", default=1, help="The fraction of TRA that are balanced")
#argp = add_argument(argp, "--replace", flag=TRUE, default=FALSE, help="Replace genomes that a")
@@ -54,14 +56,16 @@ rearranged_genome_generated = FALSE # a tag that defines that the rearranged gen

# define the fraction of the shortest chromosome that will be considered
all_fraction_shortest_chr_to_consider = c(0.1, 0.07, 0.05, 0.03, 0.02, 0.01, 0.005)
+all_fraction_shortest_chr_to_consider = all_fraction_shortest_chr_to_consider[all_fraction_shortest_chr_to_consider <= argv$max_fraction_shortest_chr_to_consider]
#all_fraction_shortest_chr_to_consider = c(0.005)

# define the fraction of nevents that will be considered
all_fraction_n_events = c(1, 0.8, 0.7, 0.5, 0.3, 0.2, 0.1, 0.05, 0.01, 0.05)
#all_fraction_n_events = c(1)

-# define the max_time_rearrangement
-all_max_time_rearrangement = c(200, 300, 400, 500, 600, 700, 800, 900, 1000, 2000, 3000, 4000, 5000, 10000)
+# define the max_time_rearrangement and filter
+all_max_time_rearrangement = c(100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 2000, 3000, 4000, 5000, 10000)
+all_max_time_rearrangement = all_max_time_rearrangement[all_max_time_rearrangement <= argv$max_max_time_rearrangement]

# first iterate through the number of events. It is more important to get as many events as possible
for (fraction_n_events in all_fraction_n_events) {
@@ -112,7 +116,7 @@ for (fraction_n_events in all_fraction_n_events) {
if (class(rearranged_genome)[1]=="DNAStringSet"){ break }

}

+# at the end check that the genome has been created
if (!rearranged_genome_generated){stop("It has not been possible to generate the rearranged genome. You may try different combinations of fraction_shortest_chr_to_consider")}
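These loops implement a retry ladder: keep as many simulated events as possible, then progressively relax the chromosome fraction and the rearrangement time budget until the simulator returns a DNAStringSet. The same control flow, sketched in Python with a stubbed simulator (try_simulation is hypothetical, and the nesting of the two inner loops is assumed):

def try_simulation(n_events_frac, chr_frac, max_time):
    # stub: in the R script this is the actual genome-rearrangement call
    return max_time >= 500  # pretend only generous time budgets succeed

done = False
for n_events_frac in [1, 0.8, 0.7, 0.5, 0.3, 0.2, 0.1, 0.05, 0.01]:  # keep events first
    for chr_frac in [0.1, 0.07, 0.05, 0.03, 0.02, 0.01, 0.005]:      # then relax the fraction
        for max_time in [100, 200, 500, 1000, 10000]:                # then relax the time budget
            if try_simulation(n_events_frac, chr_frac, max_time):
                done = True
                break
        if done: break
    if done: break

if not done: raise RuntimeError("could not generate the rearranged genome")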

15 changes: 8 additions & 7 deletions scripts/get_stats_optimization
@@ -50,11 +50,11 @@ parser.add_argument("-o", "--outdir", dest="outdir", action="store", required=Tr
parser.add_argument("--samples_file", dest="samples_file", action="store", required=True, help="Tab-sepparated csv-file with 'sampleID' and 'sorted_bam' columns. The 'sorted_bam' column should have the paths to the bam files to analyze.")
parser.add_argument("-r", "--ref", dest="ref", required=True, help="Reference genome. It has to end with .fasta.")
parser.add_argument("-mchr", "--mitochondrial_chromosome", dest="mitochondrial_chromosome", required=True, type=str, help="The name of the mitochondrial chromosome. If there is no mitochondria just put 'no_mitochondria'. If there is more than one mitochindrial scaffold, provide them as comma-sepparated IDs, like '--mitochondrial_chromosome chr_mito_1,chr_mito_2'.")

parser.add_argument("--overlap_coverage", dest="overlap_coverage", default=10, type=int, help="The maximum difference in coverage, in %, to say that two samples have the same coverage.")
parser.add_argument("--overlap_insert_size", dest="overlap_insert_size", default=10, type=int, help="The maximum difference in insert size, in %, to say that two samples have the same insert size.")
parser.add_argument("--overlap_insert_size_sd", dest="overlap_insert_size_sd", default=10, type=int, help="The maximum difference in insert size, in %, SD to say that two samples have the same insert size SD.")
parser.add_argument("--overlap_read_len", dest="overlap_read_len", default=10, type=int, help="The maximum difference in read length, in %, to say that two samples have the same read length.")
parser.add_argument("--overlap_coverage", dest="overlap_coverage", default=10, type=int, help="The maximum difference in coverage, in percentage, to say that two samples have the same coverage.")
parser.add_argument("--overlap_coverage_mtDNA", dest="overlap_coverage_mtDNA", default=None, type=int, help="The maximum difference in coverage, in percentage, to say that two samples have the same coverage in terms of mtDNA. By default it's None, meaning that its the same as overlap_coverage, but it can be set.")
parser.add_argument("--overlap_insert_size", dest="overlap_insert_size", default=10, type=int, help="The maximum difference in insert size, in percentage, to say that two samples have the same insert size.")
parser.add_argument("--overlap_insert_size_sd", dest="overlap_insert_size_sd", default=10, type=int, help="The maximum difference in insert size, in percentage, SD to say that two samples have the same insert size SD.")
parser.add_argument("--overlap_read_len", dest="overlap_read_len", default=10, type=int, help="The maximum difference in read length, in percentage, to say that two samples have the same read length.")

# optional args
parser.add_argument("--replace", dest="replace", action="store_true", help="Re-run all the steps by deleting the output directory.")
@@ -163,7 +163,8 @@ df_parameters = df_parameters.sort_values(by=["gDNA_median_coverage", "mtDNA_med

# define the pairs of similar samples according to the overlapping distances parameters
fun.print_with_runtime("Calculating pairs of strains that have similar parameters")
-field_to_overlapping_distance = {"gDNA_median_coverage":opt.overlap_coverage, "mtDNA_median_coverage":opt.overlap_coverage, "read_length":opt.overlap_read_len, "median_insert_size":opt.overlap_insert_size, "median_insert_size_sd":opt.overlap_insert_size_sd,}
+if opt.overlap_coverage_mtDNA is None: opt.overlap_coverage_mtDNA = opt.overlap_coverage
+field_to_overlapping_distance = {"gDNA_median_coverage":opt.overlap_coverage, "mtDNA_median_coverage":opt.overlap_coverage_mtDNA, "read_length":opt.overlap_read_len, "median_insert_size":opt.overlap_insert_size, "median_insert_size_sd":opt.overlap_insert_size_sd,}

pairs_similar_samples = set()
for s1 in sorted_samples:
@@ -205,7 +206,7 @@ for s in sorted_samples:

# create final file with all data
df_parameters["representative_sampleID"] = df_parameters.sampleID.apply(lambda x: sampleID_to_repSample[x])
fun.save_df_as_tab(df_parameters[["sampleID", "representative_sampleID", "gDNA_median_coverage", "mtDNA_median_coverage", "read_length", "median_insert_size", "median_insert_size_sd"]], "%s/samples_parameter_optimization.tab"%opt.outdir)
fun.save_df_as_tab(df_parameters.reset_index(drop=True)[["representative_sampleID", "sampleID", "gDNA_median_coverage", "mtDNA_median_coverage", "read_length", "median_insert_size", "median_insert_size_sd"]].sort_values(by=["representative_sampleID", "gDNA_median_coverage", "mtDNA_median_coverage", "read_length", "median_insert_size", "median_insert_size_sd", "sampleID"]), "%s/samples_parameter_optimization.tab"%opt.outdir)
fun.print_with_runtime("There are %i/%i representative samples"%(len(set(df_parameters.representative_sampleID)), len(sorted_samples)))

#################################
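The distance test behind pairs_similar_samples is not visible in this diff; one plausible reading (assumed semantics, using per-field percentage thresholds like those defined above) is that two samples are similar when every statistic differs by at most its threshold:

field_to_pct = {"gDNA_median_coverage": 10, "mtDNA_median_coverage": 10,
                "read_length": 10, "median_insert_size": 10, "median_insert_size_sd": 10}

def samples_are_similar(row1, row2, field_to_pct=field_to_pct):
    # assumed: percentage difference taken relative to the larger of the two values
    for field, max_pct in field_to_pct.items():
        denom = max(row1[field], row2[field])
        if denom == 0: continue  # both values are zero: identical for this field
        if 100.0 * abs(row1[field] - row2[field]) / denom > max_pct: return False
    return True

s1 = {"gDNA_median_coverage": 50, "mtDNA_median_coverage": 500, "read_length": 100, "median_insert_size": 300, "median_insert_size_sd": 50}
s2 = {"gDNA_median_coverage": 52, "mtDNA_median_coverage": 520, "read_length": 100, "median_insert_size": 310, "median_insert_size_sd": 52}
print(samples_are_similar(s1, s2))  # True under these thresholds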
