Snakemake pipeline which leverages subseq to validate structural variants
ln -s /net/eichler/vol26/7200/software/pipelines/subseq/202209/runsnake
# Required directory tree . ├── runsnake -> /net/eichler/vol26/7200/software/pipelines/subseq/202209/runsnake └── definitions.snakefile
VARIANT_BED_PATTERN = '/net/eichler/vol27/projects/hprc/nobackups/svpop/freeze1/chm13/results/variant/{source}/{caller}/bed/{sample}/all/chm13_lc/byref/sv_{svtype}.bed'
# Aligned data to validate against (reads and assemblies)
ALNSOURCE_PATTERN_DICT = {
'hifi': '/net/eichler/vol27/projects/hprc/nobackups/variants/pbsv/pbsv_chm13_v1.1/align/{sample}/mapped_reads.bam'
}
# Location of the subseq script (extracts parts of aligned reads/contigs corresponding to a reference region and writes a FASTA)
SUBSEQ_EXE = '/net/eichler/vol27/projects/structural_variation/nobackups/tools/seqtools/201910/bin/subseqfa'
ALNSOURCE_PLOIDY_DICT = {
'hifi': subseqlib.stats.align_summary_diploid
}
# SV tuples:
# [0] Min svlen
# [1] Max svlen (exclusive)
# [2] Window (flank) bp
SET_DEF = {
'sv50-100':
(50, 100, 20),
'sv100-200':
(100, 200, 25),
'sv200-500':
(200, 500, 40),
'sv500-1000':
(500, 1000, 100),
'sv1-2k':
(1000, 2000, 200),
'sv2-4k':
(2000, 4000, 250),
'sv4k-max':
(4000, None, 300),
}
./runsnake 80 tables/validation/{sample}/caller_extern-pav/sv_{svtype}/{sample}_hifi/size{percent_size}_{minimum_reads}_{minimum_support}.tsv.gz
# Example
## ./runsnake 80 tables/validation/HG00733/caller_extern-pav/sv_insdel/HG00733_hifi/size50_2_4.tsv.gz
- {percent_size}
- Percent size of the SV the read differential has to be to support
- {minimum_reads}
- Minimum number of reads revruited to a region to make a call VALID/NOTVALID. If not recruited, NOCALL
- {minimum_support}
- Minimum number of reads which support the variant to require before passing VALID