diff --git a/workflow/leviosam2.py b/workflow/leviosam2.py new file mode 100644 index 0000000..dd778e9 --- /dev/null +++ b/workflow/leviosam2.py @@ -0,0 +1,255 @@ +''' +Nae-Chyun Chen +Johns Hopkins University +2021-2022 +''' + +import argparse +import pathlib +import subprocess +import typing + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser() + parser.add_argument('--leviosam2_binary', + default='leviosam2', + type=str, + help='Path to the leviosam2 executable') + parser.add_argument('--samtools_binary', + default='samtools', + type=str, + help='Path to the samtools executable') + parser.add_argument('--gnu_time_binary', + default='gtime', + type=str, + help=('Path to the GNU Time executable ' + '(see https://www.gnu.org/software/time/)')) + parser.add_argument('--measure_time', + action='store_true', + help=('Activate to measure time with ' + '`--gnu_time_binary`')) + parser.add_argument('--keep_tmp_files', + action='store_true', + help=('Activate to keep temp files ' + 'generated by the workflow')) + parser.add_argument('-t', + '--num_threads', + type=int, + default=8, + help='Number of threads to use') + parser.add_argument('--sequence_type', + type=str, + required=True, + choices=['ilmn_pe', 'ilmn_se', 'pb_hifi', 'ont'], + help='Type of sequence data') + parser.add_argument( + '--aligner', + type=str, + required=True, + choices=['bowtie2', 'bwamem', 'bwamem2', 'minimap2', 'winnowmap2']) + parser.add_argument('--source_label', + type=str, + default='source', + help='Label of the source reference') + parser.add_argument('--target_label', + type=str, + default='target', + help='Label of the target reference') + parser.add_argument('--read_group', type=str, help='Read group string') + parser.add_argument('-g', + '--lift_max_gap', + type=int, + help='[lift] Max chain gap size allowed') + parser.add_argument('--lift_commit_min_mapq', + type=int, + help='[lift] Min MAPQ to commit') + parser.add_argument('--lift_commit_min_score', + type=int, + help='[lift] Min alignment score (AS:i tag) to commit') + parser.add_argument('--lift_commit_max_frac_clipped', + type=float, + help='[lift] Min fraction of clipped bases to commit') + parser.add_argument('--lift_commit_max_isize', + type=int, + help='[lift] Max template length (isize) to commit') + parser.add_argument('--lift_commit_max_hdist', + type=int, + help='[lift] Max edit distance (NM:i tag) to commit') + parser.add_argument('--lift_bed_commit_source', + type=str, + help=('[lift] Path to a BED (source coordinates) ' + 'where reads in the regions are always ' + 'committed (often for suppress annotations)')) + parser.add_argument('--lift_bed_defer_target', + type=str, + help=('[lift] Path to a BED (target cooridnates' + 'where reads in the regions are always ' + 'deferred')) + parser.add_argument('--lift_realign_config', + type=str, + help=('[lift] Path to the config file for ' + 'realignment')) + parser.add_argument('-i', + '--input_alignment', + type=str, + required=True, + help='Path to the input SAM/BAM/CRAM file') + parser.add_argument('-o', + '--out_prefix', + type=str, + required=True, + help='Output prefix') + parser.add_argument('-C', + '--leviosam2_index', + type=str, + required=True, + help='Path to the leviosam2 index') + parser.add_argument('-f', + '--target_fasta', + type=str, + required=True, + help='Path to the target reference (FASTA file)') + parser.add_argument('--dryrun', + action='store_true', + help='Activate the dryrun mode') + parser.add_argument('--forcerun', + action='store_true', + help='Activate the forcerun mode. Rerun everything') + # parser.add_argument() + # parser.add_argument() + # parser.add_argument() + # parser.add_argument() + # parser.add_argument() + + args = parser.parse_args() + return args + + +def run_leviosam2(time_cmd: str, leviosam2: str, clft: str, fn_input: str, + out_prefix: str, num_threads: int, + lift_commit_min_mapq: typing.Union[int, None], + lift_commit_min_score: typing.Union[int, None], + lift_commit_max_frac_clipped: typing.Union[float, None], + lift_commit_max_isize: typing.Union[int, None], + lift_commit_max_hdist: typing.Union[int, None], + lift_max_gap: typing.Union[int, None], + lift_bed_commit_source: str, lift_bed_defer_target: str, + lift_realign_config: str, target_fasta: str, dryrun: bool): + lift_commit_min_mapq_arg = f'-S mapq:{lift_commit_min_mapq} ' \ + if lift_commit_min_mapq else '' + lift_commit_min_score_arg = f'-S aln_score:{lift_commit_min_score} ' \ + if lift_commit_min_score else '' + lift_commit_max_frac_clipped_arg = \ + f'-S clipped_frac:{lift_commit_max_frac_clipped} ' \ + if lift_commit_max_frac_clipped else '' + lift_commit_max_isize_arg = f'-S isize:{lift_commit_max_isize} ' \ + if lift_commit_max_isize else '' + lift_commit_max_hdist_arg = f'-S hdist:{lift_commit_max_hdist} ' \ + if lift_commit_max_hdist else '' + lift_max_gap_arg = f'-G {lift_max_gap} ' if lift_max_gap else '' + lift_bed_commit_source_arg = f'-r {lift_bed_commit_source} ' \ + if lift_bed_commit_source else '' + lift_bed_defer_target_arg = f'-D {lift_bed_defer_target} ' \ + if lift_bed_defer_target else '' + lift_realign_config_arg = f'-x {lift_realign_config} ' \ + if lift_realign_config else '' + + cmd = (f'{time_cmd} {leviosam2} lift -C {clft} ' + f'-a {fn_input} -p {out_prefix}' + f'-t {num_threads} -m -f {target_fasta} ' + f'{lift_commit_min_mapq_arg}' + f'{lift_commit_min_score_arg}' + f'{lift_commit_max_frac_clipped_arg}' + f'{lift_commit_max_hdist_arg}' + f'{lift_commit_max_isize_arg}' + f'{lift_max_gap_arg}' + f'{lift_bed_commit_source_arg}' + f'{lift_bed_defer_target_arg}' + f'{lift_realign_config_arg}') + if dryrun: + print(cmd) + else: + subprocess.run([cmd], shell=True) + + +def run_sort_committed(time_cmd: str, samtools: str, num_threads: int, + out_prefix: str, dryrun: bool, forcerun: bool): + fn_committed = pathlib.Path(f'{out_prefix}-committed.bam') + fn_committed_sorted = pathlib.Path(f'{out_prefix}-committed-sorted.bam') + + cmd = (f'{time_cmd} {samtools} sort -@ {num_threads} ' + f'-o {fn_committed_sorted} {fn_committed}') + if dryrun: + print(cmd) + else: + if not fn_committed.is_file(): + raise FileNotFoundError(f'{fn_committed} is not a file') + if (not forcerun) and fn_committed_sorted.is_file(): + print(f'[Info] Skip sort_committed -- ' + f'{fn_committed_sorted} exists') + return + subprocess.run([cmd], shell=True) + + +def run_collate(): + ''' + if [ ! -s ${PFX}-paired-deferred-R1.fq.gz ]; then + ${MT} ${LEVIOSAM} collate \ + -a ${PFX}-committed-sorted.bam -b ${PFX}-deferred.bam -p ${PFX}-paired + fi + ''' + pass + + +def run_workflow(args: argparse.Namespace): + time_cmd = '' + if args.measure_time: + time_cmd = f'{args.gnu_time_binary} -v -ao {args.out_prefix}.time_log' + run_leviosam2( + time_cmd=time_cmd, + leviosam2=args.leviosam2_binary, + clft=args.leviosam2_index, + fn_input=args.input_alignment, + out_prefix=args.out_prefix, + num_threads=args.num_threads, + lift_commit_min_mapq=args.lift_commit_min_mapq, + lift_commit_min_score=args.lift_commit_min_score, + lift_commit_max_frac_clipped=args.lift_commit_max_frac_clipped, + lift_commit_max_isize=args.lift_commit_max_isize, + lift_commit_max_hdist=args.lift_commit_max_hdist, + lift_max_gap=args.lift_max_gap, + lift_bed_commit_source=args.lift_bed_commit_source, + lift_bed_defer_target=args.lift_bed_defer_target, + lift_realign_config=args.lift_realign_config, + target_fasta=args.target_fasta, + dryrun=args.dryrun) + + run_sort_committed(time_cmd=time_cmd, + samtools=args.samtools_binary, + num_threads=args.num_threads, + out_prefix=args.out_prefix, + dryrun=args.dryrun, + forcerun=args.forcerun) + + if args.sequence_type in ['ilmn_pe']: + run_collate_pe() + run_realign_deferred_pe() + run_refflow_merge_pe() + run_merge_pe() + else: + run_bam_to_fastq_se() + run_realign_deferred_se() + run_merge_se() + + run_index() + run_clean() + + +def validate_binary(): + args.gnu_time_binary + + +if __name__ == '__main__': + args = parse_args() + run_workflow(args) diff --git a/workflow/leviosam2.sh b/workflow/leviosam2.sh index c341ee2..9ee65de 100644 --- a/workflow/leviosam2.sh +++ b/workflow/leviosam2.sh @@ -69,6 +69,11 @@ function print_usage_and_exit { echo " -M Toggle to measure time using GNU time [off]" echo " -T path Path to the GNU time binary [time]" echo " -K Toggle to keep tmp files [off]" + echo " Align to source:" + echo " -s path Path to the aligner index (source reference) []" + echo " -F path Path to the source FASTA file []" + echo " -w path Path to the input FASTQ (read 1) []" + echo " -W path Path to the input FASTQ (read 2, optional) []" echo " LevioSAM2-lift:" echo " -g INT Number of gaps allowed during leviosam2-lift [0]" echo " -x path Path to the levioSAM2 re-alignment config YAML []" @@ -82,7 +87,7 @@ function print_usage_and_exit { echo " -B float Bed intersect threshold. See 'leviosam2 lift -h' for details. [0]" echo " Aligner:" echo " -a string Aligner to use (bowtie2|bwamem|bwamem2|minimap2|winnowmap2) [bowtie2]" - echo " -b string Prefix to the aligner index" + echo " -b path Path to the aligner index (target reference)" echo " -l string Aligner mode for long read aligner (map-hifi|map-ont) [map-hifi]" echo " -S Toggle to use single-end mode [off]" echo " -r string The read group (RG) string []" @@ -90,7 +95,7 @@ function print_usage_and_exit { exit 1 } -while getopts hKMSa:A:b:B:C:D:f:H:g:i:l:L:m:o:p:q:r:R:t:T:x: flag +while getopts hKMSa:A:b:B:C:D:f:F:H:g:i:l:L:m:o:p:q:r:R:s:t:T:w:W:x: flag do case "${flag}" in h) print_usage_and_exit;; @@ -104,6 +109,7 @@ do C) CLFT=${OPTARG};; D) DEFER_DEST_BED=" -D ${OPTARG}";; f) REF=${OPTARG};; + F) REF_SOURCE=${OPTARG};; H) HDIST=" -S hdist:${OPTARG}";; g) ALLOWED_GAPS=${OPTARG};; i) INPUT=${OPTARG};; @@ -114,14 +120,17 @@ do p) FRAC_CLIPPED=" -S clipped_frac:${OPTARG}";; q) MAPQ=" -S mapq:${OPTARG}";; r) ALN_RG=${OPTARG};; + s) ALN_IDX_SOURCE=${OPTARG};; R) COMMIT_SOURCE_BED=" -r ${OPTARG}";; t) THR=${OPTARG};; T) TIME=${OPTARG};; + w) INPUT_FQ_1=${OPTARG};; + W) INPUT_FQ_2=${OPTARG};; x) REALN_CONFIG=" -x ${OPTARG}";; esac done -if [[ ${INPUT} == "" ]]; then +if [[ ${INPUT} == "" && ${INPUT_FQ_1} == "" ]]; then echo "Input is not set" print_usage_and_exit fi @@ -135,7 +144,7 @@ if [[ ${REF} == "" ]]; then fi MT="" -if (( ${MEASURE_TIME} > 0 )); then +if [[ ${MEASURE_TIME} > 0 ]]; then MT="${TIME} -v -ao leviosam2.time_log " fi @@ -153,6 +162,71 @@ fi set -xp +# Align to the source reference +if [[ ${INPUT} == "" ]]; then + INPUT=${PFX}.bam + if [[ ${SINGLE_END} == 1 ]]; then + if [[ ${ALN} == "bowtie2" ]]; then + ${MT} bowtie2 ${ALN_RG} -p ${THR} -x ${ALN_IDX_SOURCE} \ + -U ${INPUT_FQ_1} |\ + ${MT} samtools sort -o ${INPUT} + elif [[ ${ALN} == "bwamem" ]]; then + if [[ ${ALN_RG} != "" ]]; then + ALN_RG="-R ${ALN_RG}" + fi + ${MT} bwa mem -t ${THR} ${ALN_RG} ${ALN_IDX_SOURCE} \ + ${INPUT_FQ_1} |\ + ${MT} samtools sort -o ${INPUT} + elif [[ ${ALN} == "bwamem2" ]]; then + if [[ ${ALN_RG} != "" ]]; then + ALN_RG="-R ${ALN_RG}" + fi + ${MT} bwa-mem2 mem -t ${THR} ${ALN_RG} ${ALN_IDX_SOURCE} \ + ${INPUT_FQ_1} |\ + ${MT} samtools sort -o ${INPUT} + elif [[ ${ALN} =~ ^(minimap2|winnowmap2)$ ]]; then + if [[ ${ALN_RG} != "" ]]; then + ALN_RG="-R ${ALN_RG}" + fi + ${MT} ${ALN} -ax ${LR_MODE} --MD -t ${THR} ${ALN_IDX_SOURCE} \ + ${REF_SOURCE} ${INPUT_FQ_1} | \ + ${MT} samtools sort -o ${INPUT} + else + print_usage_and_exit + fi + else + if [[ ${ALN} == "bowtie2" ]]; then + ${MT} bowtie2 ${ALN_RG} -p ${THR} -x ${ALN_IDX_SOURCE} \ + -1 ${INPUT_FQ_1} -2 ${INPUT_FQ_2} |\ + ${MT} samtools sort -o ${INPUT} + elif [[ ${ALN} == "bwamem" ]]; then + if [[ ${ALN_RG} != "" ]]; then + ALN_RG="-R ${ALN_RG}" + fi + ${MT} bwa mem -t ${THR} ${ALN_RG} ${ALN_IDX_SOURCE} \ + ${INPUT_FQ_1} ${INPUT_FQ_2} |\ + ${MT} samtools sort -o ${INPUT} + elif [[ ${ALN} == "bwamem2" ]]; then + if [[ ${ALN_RG} != "" ]]; then + ALN_RG="-R ${ALN_RG}" + fi + ${MT} bwa-mem2 mem -t ${THR} ${ALN_RG} ${ALN_IDX_SOURCE} \ + ${INPUT_FQ_1} ${INPUT_FQ_2} |\ + ${MT} samtools sort -o ${INPUT} + elif [[ ${ALN} =~ ^(minimap2|winnowmap2)$ ]]; then + if [[ ${ALN_RG} != "" ]]; then + ALN_RG="-R ${ALN_RG}" + fi + ${MT} ${ALN} -ax ${LR_MODE} --MD -t ${THR} ${ALN_IDX_SOURCE} \ + ${REF_SOURCE} ${INPUT_FQ_1} ${INPUT_FQ_2} | \ + ${MT} samtools sort -o ${INPUT} + else + print_usage_and_exit + fi + fi + +fi + # Lifting over using leviosam2 if [ ! -s ${PFX}-committed.bam ]; then ${MT} ${LEVIOSAM} lift -C ${CLFT} -a ${INPUT} -t ${THR} -p ${PFX} -O bam \ @@ -168,7 +242,7 @@ if [ ! -s ${PFX}-committed-sorted.bam ]; then -o ${PFX}-committed-sorted.bam ${PFX}-committed.bam fi -if (( ${SINGLE_END} == 1 )); then +if [[ ${SINGLE_END} == 1 ]]; then # Convert deferred reads to FASTQ if [ ! -s ${PFX}-deferred.fq.gz ]; then ${MT} samtools fastq ${PFX}-deferred.bam | \ @@ -215,7 +289,7 @@ if (( ${SINGLE_END} == 1 )); then fi # Clean tmp files - if (( ${KEEP_TMP} < 1 )); then + if [[ ${KEEP_TMP} < 1 ]]; then rm ${PFX}-committed.bam ${PFX}-committed-sorted.bam rm ${PFX}-deferred.bam ${PFX}-deferred.fq.gz fi @@ -274,7 +348,7 @@ else fi # Clean tmp files - if (( ${KEEP_TMP} < 1 )); then + if [[ ${KEEP_TMP} < 1 ]]; then rm ${PFX}-paired-deferred.bam ${PFX}-paired-deferred-sorted_n.bam rm ${PFX}-paired-realigned.bam ${PFX}-paired-realigned-sorted_n.bam rm ${PFX}-paired-deferred-reconciled.bam ${PFX}-paired-committed.bam