From 654e5ffcca1fa591fe296b1aa09bf2939644bfce Mon Sep 17 00:00:00 2001 From: Pay Giesselmann Date: Sun, 31 Mar 2019 23:54:42 +0200 Subject: [PATCH 01/19] tagged output for demux mode --- nanopype.yaml | 1 + rules/alignment.smk | 69 ++++++++++------- rules/basecalling.smk | 62 +++++++++------ rules/demux.smk | 42 +++++++--- rules/methylation.smk | 76 +++++++++---------- rules/storage.smk | 37 ++++----- rules/sv.smk | 14 ++++ rules/utils/get_file.py | 56 ++++---------- rules/utils/storage.py | 2 +- rules/utils/storage_fast5Index.py | 122 ++++++++++++++++++++++++++++++ 10 files changed, 316 insertions(+), 165 deletions(-) create mode 100644 rules/utils/storage_fast5Index.py diff --git a/nanopype.yaml b/nanopype.yaml index 848cb96..ee7409d 100755 --- a/nanopype.yaml +++ b/nanopype.yaml @@ -61,6 +61,7 @@ methylation_flappie_qval_threshold: 3 # demux +demux_batch_size: 4000 deepbinner_models: default: SQK-RBK004_read_starts EXP-NBD103: EXP-NBD103_read_starts diff --git a/rules/alignment.smk b/rules/alignment.smk index 4dbcf72..7703ac2 100755 --- a/rules/alignment.smk +++ b/rules/alignment.smk @@ -36,25 +36,28 @@ import os, glob from rules.utils.get_file import get_batch_ids_raw, get_sequence_batch from rules.utils.storage import get_ID # local rules -localrules: graphmap_index, ngmlr_index, aligner_merge_batches_run, aligner_1D2 +localrules: graphmap_index, ngmlr_index, aligner_merge_batches, aligner_merge_tag, aligner_1D2 #ruleorder: aligner_sam2bam > aligner_merge_batches_run - # get batches def get_batches_aligner(wildcards, config): - if wildcards.tag in config['runnames']: - return expand("alignments/{aligner}/{sequence_workflow}/batches/{runname}/{batch}.{reference}.bam", - aligner=wildcards.aligner, - sequence_workflow=wildcards.sequence_workflow, - runname=wildcards.tag, - batch=get_batch_ids_raw(wildcards.tag, config), - reference=wildcards.reference) - else: - return expand("alignments/{aligner}/{sequence_workflow}/{runname}.{reference}.bam", - aligner=wildcards.aligner, - sequence_workflow=wildcards.sequence_workflow, - runname=[runname for runname in config['runnames']], - reference=wildcards.reference) + r = expand("alignments/{aligner}/{sequence_workflow}/batches/{tag}/{runname}/{batch}.{reference}.bam", + aligner=wildcards.aligner, + sequence_workflow=wildcards.sequence_workflow, + tag=wildcards.tag, + runname=wildcards.runname, + batch=get_batch_ids_raw(wildcards.runname, config), + reference=wildcards.reference) + return r + +def get_batches_runname(wildcards, config): + r = expand("alignments/{aligner}/{sequence_workflow}/batches/{tag}/{runname}.{reference}.bam", + aligner=wildcards.aligner, + sequence_workflow=wildcards.sequence_workflow, + tag=wildcards.tag, + runname=[runname for runname in config['runnames']], + reference=wildcards.reference) + return r # minimap alignment @@ -63,7 +66,7 @@ rule minimap2: sequence = lambda wildcards: get_sequence_batch(wildcards, config), reference = lambda wildcards: config['references'][wildcards.reference]['genome'] output: - pipe("alignments/minimap2/{sequence_workflow, ((?!batches).)*}/batches/{batch, [^.]*}.{reference}.sam") + pipe("alignments/minimap2/{sequence_workflow}/batches/{tag, [^\/]*}/{runname, [^.\/]*}/{batch, [^.]*}.{reference}.sam") threads: config['threads_alignment'] group: "minimap2" resources: @@ -83,7 +86,7 @@ rule graphmap: reference = lambda wildcards: config['references'][wildcards.reference]['genome'], index = lambda wildcards: config['references'][wildcards.reference]['genome'] + ".gmidx" output: - 
pipe("alignments/graphmap/{sequence_workflow, ((?!batches).)*}/batches/{batch, [^.]*}.{reference}.sam") + pipe("alignments/graphmap/{sequence_workflow}/batches/{tag, [^\/]*}/{runname, [^.\/]*}/{batch, [^.]*}.{reference}.sam") threads: config['threads_alignment'] group: "graphmap" resources: @@ -117,7 +120,7 @@ rule ngmlr: index = lambda wildcards : directory(os.path.dirname(config['references'][wildcards.reference]['genome'])), index_flag = lambda wildcards: config['references'][wildcards.reference]['genome'] + '.ngm' output: - pipe("alignments/ngmlr/{sequence_workflow, ((?!batches).)*}/batches/{batch, [^.]*}.{reference}.sam") + pipe("alignments/ngmlr/{sequence_workflow}/batches/{tag, [^\/]*}/{runname, [^.\/]*}/{batch, [^.]*}.{reference}.sam") threads: config['threads_alignment'] group: "ngmlr" resources: @@ -147,10 +150,10 @@ rule ngmlr_index: # sam to bam conversion and RG tag rule aligner_sam2bam: input: - sam = "alignments/{aligner}/{sequence_workflow}/batches/{batch}.{reference}.sam" + sam = "alignments/{aligner}/{sequence_workflow}/batches/{tag}/{runname}/{batch}.{reference}.sam" output: - bam = "alignments/{aligner, [^.\/]*}/{sequence_workflow}/batches/{batch, [^.]*}.{reference, [^.]*}.bam", - bai = "alignments/{aligner, [^.\/]*}/{sequence_workflow}/batches/{batch, [^.]*}.{reference, [^.]*}.bam.bai" + bam = "alignments/{aligner, [^.\/]*}/{sequence_workflow}/batches/{tag, [^\/]*}/{runname, [^.\/]*}/{batch, [^.]*}.{reference, [^.]*}.bam", + bai = "alignments/{aligner, [^.\/]*}/{sequence_workflow}/batches/{tag, [^\/]*}/{runname, [^.\/]*}/{batch, [^.]*}.{reference, [^.]*}.bam.bai" shadow: "minimal" threads: 1 resources: @@ -164,12 +167,26 @@ rule aligner_sam2bam: """ # merge batch files -rule aligner_merge_batches_run: +rule aligner_merge_batches: input: bam = lambda wildcards: get_batches_aligner(wildcards, config) output: - bam = "alignments/{aligner, [^.\/]*}/{sequence_workflow, ((?!batches).)*}/{tag, [^.\/]*}.{reference, [^.]*}.bam", - bai = "alignments/{aligner, [^.\/]*}/{sequence_workflow, ((?!batches).)*}/{tag, [^.\/]*}.{reference, [^.]*}.bam.bai" + bam = "alignments/{aligner, [^.\/]*}/{sequence_workflow}/batches/{tag, [^\/]*}/{runname, [^.\/]*}.{reference, [^.]*}.bam", + bai = "alignments/{aligner, [^.\/]*}/{sequence_workflow}/batches/{tag, [^\/]*}/{runname, [^.\/]*}.{reference, [^.]*}.bam.bai" + singularity: + "docker://nanopype/alignment:{tag}".format(tag=config['version']['tag']) + shell: + """ + {config[bin_singularity][samtools]} merge {output.bam} {input} -p + {config[bin_singularity][samtools]} index {output.bam} + """ + +rule aligner_merge_tag: + input: + bam = lambda wildcards: get_batches_runname(wildcards, config) + output: + bam = "alignments/{aligner, [^.\/]*}/{sequence_workflow, ((?!batches).)*}/{tag, [^\/]*}.{reference, [^.]*}.bam", + bai = "alignments/{aligner, [^.\/]*}/{sequence_workflow, ((?!batches).)*}/{tag, [^\/]*}.{reference, [^.]*}.bam.bai" singularity: "docker://nanopype/alignment:{tag}".format(tag=config['version']['tag']) shell: @@ -181,9 +198,9 @@ rule aligner_merge_batches_run: # match 1D2 read pairs rule aligner_1D2: input: - "alignments/{aligner}/{basecaller}/runs/{runname}.{reference}.bam" + "alignments/{aligner}/{sequence_workflow}/batches/{tag}/{runname}.{reference}.bam" output: - "alignments/{aligner, [^.\/]*}/{basecaller, [^.\/]*}/runs/{runname, [^.\/]*}.{reference, [^.\/]*}.1D2.tsv" + "alignments/{aligner, [^.\/]*}/{sequence_workflow}/batches/{tag, [^\/]*}/{runname, [^.\/]*}.{reference, [^.]*}.1D2.tsv" params: buffer = 200, tolerance = 200 
diff --git a/rules/basecalling.smk b/rules/basecalling.smk index 2a1bc23..ce3de29 100755 --- a/rules/basecalling.smk +++ b/rules/basecalling.smk @@ -36,28 +36,34 @@ import os, sys from rules.utils.get_file import get_batch_ids_raw, get_batch_ext from rules.utils.storage import get_flowcell, get_kit # local rules -localrules: basecaller_merge_run, basecaller_qc +localrules: basecaller_merge_batches, basecaller_merge_tag, basecaller_qc # get batches def get_batches_basecaller(wildcards, config): - if wildcards.tag in config['runnames']: - return expand("sequences/{sequence_workflow}/batches/{runname}/{batch}.{format}.gz", - sequence_workflow=wildcards.sequence_workflow, - runname=wildcards.tag, - batch=get_batch_ids_raw(wildcards.tag, config), - format=wildcards.format) - else: - return expand("sequences/{sequence_workflow}/{runname}.{format}.gz", - sequence_workflow=wildcards.sequence_workflow, - runname=[runname for runname in config['runnames']], - format=wildcards.format) + r = expand("sequences/{sequence_workflow}/batches/{tag}/{runname}/{batch}.{format}.gz", + sequence_workflow=wildcards.sequence_workflow, + tag=wildcards.tag, + runname=wildcards.runname, + batch=get_batch_ids_raw(wildcards.runname, config), + format=wildcards.format) + return r + +def get_batches_runname(wildcards, config): + r = expand("sequences/{sequence_workflow}/batches/{tag}/{runname}.{format}.gz", + sequence_workflow=wildcards.sequence_workflow, + tag=wildcards.tag, + runname=[runname for runname in config['runnames']], + format=wildcards.format) + return r # albacore basecalling rule albacore: input: - lambda wildcards : "{data_raw}/{{runname}}/reads/{{batch}}.{ext}".format(data_raw = config["storage_data_raw"], ext=get_batch_ext(wildcards, config)) + lambda wildcards : "{data_raw}/{{runname}}/reads/{{batch}}.{ext}".format( + data_raw = config["storage_data_raw"], + ext=get_batch_ext(wildcards, config)) output: - "sequences/albacore/batches/{runname, [^.\/]*}/{batch, [^.]*}.{format, (fasta|fastq|fa|fq)}.gz" + "sequences/albacore/batches/{tag, [^\/]*}/{runname, [^.\/]*}/{batch, [^.]*}.{format, (fasta|fastq|fa|fq)}.gz" shadow: "minimal" threads: config['threads_basecalling'] resources: @@ -88,9 +94,11 @@ rule albacore: # guppy basecalling rule guppy: input: - lambda wildcards : "{data_raw}/{{runname}}/reads/{{batch}}.{ext}".format(data_raw = config["storage_data_raw"], ext=get_batch_ext(wildcards, config)) + lambda wildcards : "{data_raw}/{{runname}}/reads/{{batch}}.{ext}".format( + data_raw = config["storage_data_raw"], + ext=get_batch_ext(wildcards, config)) output: - "sequences/guppy/batches/{runname, [^.\/]*}/{batch, [^.]*}.{format, (fasta|fastq|fa|fq)}.gz" + "sequences/guppy/batches/{tag, [^\/]*}/{runname, [^.\/]*}/{batch, [^.]*}.{format, (fasta|fastq|fa|fq)}.gz" shadow: "minimal" threads: config['threads_basecalling'] resources: @@ -107,7 +115,7 @@ rule guppy: """ mkdir -p raw {config[sbin_singularity][storage_batch2fast5.sh]} {input} raw/ {config[sbin_singularity][base]} {config[bin_singularity][python]} - {config[bin_singularity][guppy]} -i raw/ --recursive --cpu_threads_per_caller {threads} -s workspace/ --flowcell {params.flowcell} --kit {params.kit} {params.filtering} {config[basecalling_guppy_flags]} + {config[bin_singularity][guppy]} -i raw/ --recursive --num_callers 1 --cpu_threads_per_caller {threads} -s workspace/ --flowcell {params.flowcell} --kit {params.kit} {params.filtering} {config[basecalling_guppy_flags]} FASTQ_DIR='workspace/pass' if [ \'{params.filtering}\' = '' ]; then 
FASTQ_DIR='workspace' @@ -123,10 +131,12 @@ rule guppy: # flappie basecalling rule flappie: input: - lambda wildcards : "{data_raw}/{{runname}}/reads/{{batch}}.{ext}".format(data_raw = config["storage_data_raw"], ext=get_batch_ext(wildcards, config)) + lambda wildcards : "{data_raw}/{{runname}}/reads/{{batch}}.{ext}".format( + data_raw = config["storage_data_raw"], + ext=get_batch_ext(wildcards, config)) output: - sequence = "sequences/flappie/batches/{runname, [^.\/]*}/{batch, [^.]*}.{format, (fasta|fastq|fa|fq)}.gz", - methyl_marks = "sequences/flappie/batches/{runname, [^.\/]*}/{batch, [^.]*}.{format, (fasta|fastq|fa|fq)}.tsv.gz" + sequence = "sequences/flappie/batches/{tag, [^\/]*}/{runname, [^.\/]*}/{batch, [^.]*}.{format, (fasta|fastq|fa|fq)}.gz", + methyl_marks = "sequences/flappie/batches/{tag, [^\/]*}/{runname, [^.\/]*}/{batch, [^.]*}.{format, (fasta|fastq|fa|fq)}.tsv.gz" shadow: "minimal" threads: config['threads_basecalling'] resources: @@ -152,11 +162,19 @@ rule flappie: """ # merge and compression -rule basecaller_merge_run: +rule basecaller_merge_batches: input: lambda wildcards: get_batches_basecaller(wildcards, config) output: - "sequences/{sequence_workflow, ((?!batches).)*}/{tag, [^.\/]*}.{format, (fasta|fastq|fa|fq)}.gz" + "sequences/{sequence_workflow}/batches/{tag, [^\/]*}/{runname, [^.\/]*}.{format, (fasta|fastq|fa|fq)}.gz" + shell: + "cat {input} > {output}" + +rule basecaller_merge_tag: + input: + lambda wildcards: get_batches_runname(wildcards, config) + output: + "sequences/{sequence_workflow, ((?!batches).)*}/{tag, [^\/]*}.{format, (fasta|fastq|fa|fq)}.gz" shell: "cat {input} > {output}" diff --git a/rules/demux.smk b/rules/demux.smk index 17570aa..62f460f 100755 --- a/rules/demux.smk +++ b/rules/demux.smk @@ -34,19 +34,22 @@ # imports from rules.utils.get_file import get_batch_ids_raw # local rules -localrules: demux_merge_run +localrules: demux_split_barcodes # get batches -def get_batches_demux(wildcards): - return expand("demux/{wildcards.demultiplexer}/{wildcards.runname}/{{batch}}.tsv".format(wildcards=wildcards), batch=get_batch_ids_raw(wildcards, config)) +def get_batches_demux(wildcards, config): + return expand("demux/{demultiplexer}/batches/{runname}/{batch}.tsv", + demultiplexer=wildcards.demultiplexer, + runname=wildcards.runname, + batch=get_batch_ids_raw(wildcards.runname, config)) # deepbinner demux rule deepbinner: input: - signals = lambda wildcards : "{data_raw}/{{runname}}/reads/{{batch}}.{ext}".format(data_raw = config["storage_data_raw"], ext=get_batch_ext(wildcards, config)), + signals = lambda wildcards : get_signal_batch(wildcards, config), model = lambda wildcards : config["deepbinner_models"][get_kit(wildcards)] if get_kit(wildcards, config) in config["deepbinner_models"] else config["deepbinner_models"]['default'] output: - "demux/deepbinner/{runname, [^.\/]*}/{batch, [^.\/]*}.tsv" + "demux/deepbinner/batches/{runname, [^.\/]*}/{batch, [^.\/]*}.tsv" shadow: "minimal" threads: config['threads_demux'] resources: @@ -61,13 +64,30 @@ rule deepbinner: {config[bin_singularity][python]} {config[bin_singularity][deepbinner]} classify raw -s {input.model} --intra_op_parallelism_threads {threads} --omp_num_threads {threads} --inter_op_parallelism_threads {threads} | tail -n +2 > {output} """ -# merge -rule demux_merge_run: +# split into barcode id list per run +checkpoint demux_split_barcodes: input: - get_batches_demux + batches = lambda wildcards: get_batches_demux(wildcards, config) output: - "demux/{demultiplexer, [^.\/]*}/{runname, 
[^.\/]*}.tsv" + barcodes = directory("demux/{demultiplexer, [^.\/]*}/barcodes/{runname}") singularity: "docker://nanopype/demux:{tag}".format(tag=config['version']['tag']) - shell: - "cat {input} > {output}" + run: + import os, itertools, collections + os.makedirs(output.barcodes) + barcode_ids = collections.defaultdict(list) + for f in input.batches: + read_barcodes = [] + with open(f, 'r') as fp: + read_barcodes = [tuple(line.split('\t')) for line in fp.read().split('\n') if line] + read_barcodes.sort(key=lambda x : x[1]) + for bc, ids in itertools.groupby(read_barcodes, key=lambda x:x[1]): + barcode_ids[bc].extend([id for id, barcode in ids]) + def chunked(l, n): + for i in range(0, len(l), n): + yield l[i:i + n] + for barcode, ids in barcode_ids.items(): + os.mkdir(os.path.join(output.barcodes, barcode)) + for i, batch in enumerate(chunked(ids, config['demux_batch_size'])): + with open(os.path.join(output.barcodes, barcode, str(i) + '.txt'), 'w') as fp: + print('\n'.join(batch), file=fp) diff --git a/rules/methylation.smk b/rules/methylation.smk index e8b12aa..66fb3e7 100755 --- a/rules/methylation.smk +++ b/rules/methylation.smk @@ -41,31 +41,25 @@ localrules: methylation_bedGraph, methylation_bigwig, methylation_compress # get batches def get_batches_methylation(wildcards, config, methylation_caller): - if wildcards.tag in config['runnames']: - return expand("methylation/{methylation_caller}/{aligner}/{sequence_workflow}/batches/{runname}/{batch}.{reference}.tsv", + r = expand("methylation/{methylation_caller}/{aligner}/{sequence_workflow}/batches/{tag}/{runname}/{batch}.{reference}.tsv", methylation_caller=methylation_caller, aligner=wildcards.aligner, sequence_workflow=wildcards.sequence_workflow, + tag=wildcards.tag, + runname=wildcards.runname, reference=wildcards.reference, - runname=wildcards.tag, - batch=get_batch_ids_raw(wildcards.tag, config)) + batch=get_batch_ids_raw(wildcards.runname, config)) + return r -def get_tsv_methylation(wildcards, config, methylation_caller): - if wildcards.tag in config['runnames']: - return 'methylation/{methylation_caller}/{aligner}/{sequence_workflow}/{runname}.{reference}.tsv.gz'.format( - methylation_caller=methylation_caller, - aligner=wildcards.aligner, - sequence_workflow=wildcards.sequence_workflow, - runname=wildcards.tag, - reference=wildcards.reference - ) - else: - return expand('methylation/{methylation_caller}/{aligner}/{sequence_workflow}/{tag}.{reference}.tsv.gz', - methylation_caller=methylation_caller, - aligner=wildcards.aligner, - sequence_workflow=wildcards.sequence_workflow, - tag=[r for r in config['runnames']], - reference=wildcards.reference) +def get_batches_runname(wildcards, config, methylation_caller): + r = expand("methylation/{methylation_caller}/{aligner}/{sequence_workflow}/batches/{tag}/{runname}.{reference}.tsv.gz", + methylation_caller=methylation_caller, + aligner=wildcards.aligner, + sequence_workflow=wildcards.sequence_workflow, + tag=wildcards.tag, + runname=[runname for runname in config['runnames']], + reference=wildcards.reference) + return r # parse min coverage from wildcards def get_min_coverage(wildcards): @@ -85,7 +79,7 @@ rule methylation_nanopolish: bai = lambda wildcards : get_alignment_batch(wildcards, config) + '.bai', reference = lambda wildcards: config['references'][wildcards.reference]['genome'] output: - "methylation/nanopolish/{aligner, [^.\/]*}/{sequence_workflow, ((?!batches).)*}/batches/{batch, [^.]*}.{reference, [^.\/]*}.tsv" + "methylation/nanopolish/{aligner, 
[^.\/]*}/{sequence_workflow}/batches/{tag, [^\/]*}/{runname, [^.\/]*}/{batch, [^.]*}.{reference, [^.\/]*}.tsv" shadow: "minimal" threads: config['threads_methylation'] resources: @@ -97,9 +91,9 @@ rule methylation_nanopolish: """ mkdir -p raw {config[sbin_singularity][storage_batch2fast5.sh]} {input.signals} raw/ {config[sbin_singularity][base]} {config[bin_singularity][python]} - zcat {input.sequences} > sequences.fasta - {config[bin_singularity][nanopolish]} index -d raw/ sequences.fasta - {config[bin_singularity][nanopolish]} call-methylation -t {threads} -r sequences.fasta -g {input.reference} -b {input.bam} > {output} + zcat {input.sequences} > sequences.fastx + {config[bin_singularity][nanopolish]} index -d raw/ sequences.fastx + {config[bin_singularity][nanopolish]} call-methylation -t {threads} -r sequences.fastx -g {input.reference} -b {input.bam} > {output} """ # merge batch tsv files and split connected CpGs @@ -107,7 +101,7 @@ rule methylation_nanopolish_merge_run: input: lambda wildcards: get_batches_methylation(wildcards, config, 'nanopolish') output: - temp("methylation/nanopolish/{aligner, [^.\/]*}/{sequence_workflow, ((?!batches).)*}/{tag, [^.\/]*}.{reference, [^.\/]*}.tsv") + temp("methylation/nanopolish/{aligner, [^.\/]*}/{sequence_workflow}/batches/{tag, [^\/]*}/{runname, [^.\/]*}.{reference, [^.\/]*}.tsv") run: from rules.utils.methylation_nanopolish import tsvParser recordIterator = tsvParser() @@ -129,7 +123,7 @@ rule methylation_flappie: bam = lambda wildcards, config=config : get_alignment_batch(wildcards, config), tsv = lambda wildcards ,config=config : re.sub('.gz$', '.tsv.gz', get_sequence_batch(wildcards, config)) output: - "methylation/flappie/{aligner, [^.\/]*}/{sequence_workflow, ((?!batches).)*}/batches/{batch, [^.]*}.{reference, [^.\/]*}.tsv" + "methylation/flappie/{aligner, [^.\/]*}/{sequence_workflow}/batches/{tag, [^\/]*}/{runname, [^.\/]*}/{batch, [^.]*}.{reference, [^.\/]*}.tsv" shadow: "minimal" threads: 1 resources: @@ -147,7 +141,7 @@ rule methylation_flappie_merge_run: input: lambda wildcards: get_batches_methylation(wildcards, config, 'flappie') output: - temp("methylation/flappie/{aligner, [^.\/]*}/{sequence_workflow, ((?!batches).)*}/{tag, [^.\/]*}.{reference, [^.\/]*}.tsv") + temp("methylation/flappie/{aligner, [^.\/]*}/{sequence_workflow}/batches/{tag, [^\/]*}/{runname, [^.\/]*}.{reference, [^.\/]*}.tsv") singularity: "docker://nanopype/methylation:{tag}".format(tag=config['version']['tag']) shell: @@ -158,9 +152,9 @@ rule methylation_flappie_merge_run: # compress methylation caller tsv output rule methylation_compress: input: - "methylation/{methylation_caller}/{aligner}/{sequence_workflow}/{runname}.{reference}.tsv" + "methylation/{methylation_caller}/{aligner}/{sequence_workflow}/batches/{tag}/{runname}.{reference}.tsv" output: - "methylation/{methylation_caller, [^.\/]*}/{aligner, [^.\/]*}/{sequence_workflow, ((?!batches).)*}/{runname, [^.\/]*}.{reference, [^.\/]*}.tsv.gz" + "methylation/{methylation_caller, [^.\/]*}/{aligner, [^.\/]*}/{sequence_workflow, ((?!batches).)*}/batches/{tag, [^\/]*}/{runname, [^.\/]*}.{reference, [^.\/]*}.tsv.gz" singularity: "docker://nanopype/methylation:{tag}".format(tag=config['version']['tag']) shell: @@ -169,11 +163,11 @@ rule methylation_compress: # single read methylation tracks for IGV/GViz rule methylation_single_read_run: input: - tsv = "methylation/{methylation_caller}/{aligner}/{sequence_workflow}/batches/{runname}.{reference}.tsv.gz", - bam = 
"alignments/{aligner}/{sequence_workflow}/batches/{runname}.{reference}.bam" + tsv = "methylation/{methylation_caller}/{aligner}/{sequence_workflow}/batches/{tag}/{runname}.{reference}.tsv.gz", + bam = "alignments/{aligner}/{sequence_workflow}/batches/{tag}/{runname}.{reference}.bam" output: - bam = "methylation/{methylation_caller, [^.\/]*}/{aligner, [^.\/]*}/{sequence_workflow, ((?!batches).)*}/batches/{runname, [^.\/]*}.{reference, [^.\/]*}.bam", - bai = "methylation/{methylation_caller, [^.\/]*}/{aligner, [^.\/]*}/{sequence_workflow, ((?!batches).)*}/batches/{runname, [^.\/]*}.{reference, [^.\/]*}.bam.bai" + bam = "methylation/{methylation_caller, [^.\/]*}/{aligner, [^.\/]*}/{sequence_workflow, ((?!batches).)*}/batches/{tag, [^\/]*}/{runname, [^.\/]*}.{reference, [^.\/]*}.bam", + bai = "methylation/{methylation_caller, [^.\/]*}/{aligner, [^.\/]*}/{sequence_workflow, ((?!batches).)*}/batches/{tag, [^\/]*}/{runname, [^.\/]*}.{reference, [^.\/]*}.bam.bai" params: reference = lambda wildcards: os.path.abspath(config['references'][wildcards.reference]['genome']), threshold = lambda wildcards : config['methylation_nanopolish_logp_threshold'] if wildcards.methylation_caller == 'nanopolish' else config['methylation_flappie_qval_threshold'] if wildcards.methylation_caller == 'flappie' else 0 @@ -188,9 +182,9 @@ rule methylation_single_read_run: # nanopolish methylation probability to frequencies rule methylation_nanopolish_frequencies: input: - lambda wildcards : get_tsv_methylation(wildcards, config, 'nanopolish') + lambda wildcards : get_batches_runname(wildcards, config, 'nanopolish') output: - "methylation/nanopolish/{aligner, [^.\/]*}/{sequence_workflow, ((?!batches).)*}/{tag, [^.\/]*}.{reference, [^.\/]*}.frequencies.tsv" + "methylation/nanopolish/{aligner, [^.\/]*}/{sequence_workflow}/{tag, [^\/]*}.{reference, [^.\/]*}.frequencies.tsv" params: log_p_threshold = config['methylation_nanopolish_logp_threshold'] singularity: @@ -203,9 +197,9 @@ rule methylation_nanopolish_frequencies: # flappie methylation with sequences quality to frequencies rule methylation_flappie_frequencies: input: - lambda wildcards : get_tsv_methylation(wildcards, config, 'flappie') + lambda wildcards : get_batches_runname(wildcards, config, 'flappie') output: - "methylation/flappie/{aligner, [^.\/]*}/{sequence_workflow, ((?!batches).)*}/{tag, [^.\/]*}.{reference, [^.\/]*}.frequencies.tsv" + "methylation/flappie/{aligner, [^.\/]*}/{sequence_workflow, ((?!batches).)*}/{tag, [^\/]*}.{reference, [^.\/]*}.frequencies.tsv" params: qval_threshold = config['methylation_flappie_qval_threshold'] singularity: @@ -236,7 +230,7 @@ rule methylation_bigwig: bedGraph = "methylation/{methylation_caller}/{aligner}/{sequence_workflow}/{tag}.{coverage}.{reference}.bedGraph", chr_sizes = lambda wildcards : config["references"][wildcards.reference]["chr_sizes"] output: - "methylation/{methylation_caller, [^.\/]*}/{aligner, [^.\/]*}/{sequence_workflow}/{tag, [^.\/]*}.{coverage, [^.\/]*}.{reference, [^.\/]*}.bw" + "methylation/{methylation_caller, [^.\/]*}/{aligner, [^.\/]*}/{sequence_workflow}/{tag, [^\/]*}.{coverage, [^.\/]*}.{reference, [^.\/]*}.bw" singularity: "docker://nanopype/methylation:{tag}".format(tag=config['version']['tag']) shell: @@ -247,10 +241,10 @@ rule methylation_bigwig: # 1D2 matched methylation table rule methylation_1D2: input: - values = "methylation/{methylation_caller}/{aligner}/{sequence_workflow}/batches/{runname}.{reference}.tsv.gz", - pairs = 
"alignments/{aligner}/{sequence_workflow}/batches/{runname}.{reference}.1D2.tsv" + values = "methylation/{methylation_caller}/{aligner}/{sequence_workflow}/batches/{tag}/{runname}.{reference}.tsv.gz", + pairs = "alignments/{aligner}/{sequence_workflow}/batches/{tag}/{runname}.{reference}.1D2.tsv" output: - "methylation/{methylation_caller, [^.\/]*}/{aligner, [^.\/]*}/{sequence_workflow, ((?!batches).)*}/batches/{runname, [^.\/]*}.{reference, [^.\/]*}.1D2.tsv.gz" + "methylation/{methylation_caller, [^.\/]*}/{aligner, [^.\/]*}/{sequence_workflow, ((?!batches).)*}/batches/{tag, [^\/]*}/{runname, [^.\/]*}.{reference, [^.\/]*}.1D2.tsv.gz" singularity: "docker://nanopype/methylation:{tag}".format(tag=config['version']['tag']) shell: diff --git a/rules/storage.smk b/rules/storage.smk index 6f4458e..a2d4b6b 100755 --- a/rules/storage.smk +++ b/rules/storage.smk @@ -39,12 +39,17 @@ localrules: storage_index_run, storage_extract LOC_RAW = "/Raw/" def get_batches_indexing(wildcards): - return expand("{data_raw}/{wildcards.runname}/reads/{{batch}}.fofn".format(data_raw = config["storage_data_raw"], wildcards=wildcards), batch=get_batch_ids_raw(wildcards, config=config)) + return expand("{data_raw}/{runname}/reads/{batch}.fofn", + data_raw = config["storage_data_raw"], + runname=wildcards.runname, + batch=get_batch_ids_raw(wildcards, config=config)) # extract read ID from individual fast5 files rule storage_index_batch: input: - "{data_raw}/{{runname}}/reads/{{batch}}.tar".format(data_raw = config["storage_data_raw"]) + lambda wildcards : "{data_raw}/{{runname}}/reads/{{batch}}.{ext}".format( + data_raw = config["storage_data_raw"], + ext=get_batch_ext(wildcards, config)) output: temp("{data_raw}/{{runname}}/reads/{{batch}}.fofn".format(data_raw = config["storage_data_raw"])) shadow: "minimal" @@ -52,21 +57,10 @@ rule storage_index_batch: resources: mem_mb = lambda wildcards, attempt: int((1.0 + (0.1 * (attempt - 1))) * 4000), time_min = 15 - run: - import os, subprocess, h5py - os.mkdir('reads') - subprocess.run('tar -C reads/ -xf {input}'.format(input=input), check=True, shell=True, stdout=subprocess.PIPE) - f5files = [os.path.join(dirpath, f) for dirpath, _, files in os.walk('reads/') for f in files if f.endswith('.fast5')] - with open(output[0], 'w') as fp_out: - # scan files - for f5file in f5files: - try: - with h5py.File(f5file, 'r') as f5: - s = f5[LOC_RAW].visit(lambda name: name if 'Signal' in name else None) - ID = str(f5[LOC_RAW + '/' + s.rpartition('/')[0]].attrs['read_id'], 'utf-8') - print('\t'.join([os.path.join('reads', wildcards.batch + '.tar', os.path.relpath(f5file, start='./reads')), ID]), file=fp_out) - except: - pass + shell: + """ + {config[bin][python]} {config[sbin][storage_fast5Index.py]} index {input} --out_prefix reads --tmp_prefix $(pwd) > {output} + """ # merge batch indices rule storage_index_run: @@ -74,11 +68,10 @@ rule storage_index_run: batches = get_batches_indexing output: fofn = "{data_raw}/{{runname}}/reads.fofn".format(data_raw = config["storage_data_raw"]) - run: - with open(output.fofn, 'w') as fp_out: - for f in input.batches: - with open(f, 'r') as fp_in: - fp_out.write(fp_in.read()) + shell: + """ + cat {input} > {output} + """ # index multiple runs rule storage_index_runs: diff --git a/rules/sv.smk b/rules/sv.smk index 022bb87..13a61b9 100755 --- a/rules/sv.smk +++ b/rules/sv.smk @@ -65,3 +65,17 @@ rule sv_compress: """ cat {input} | gzip > {output} """ + +rule strique: + input: + signal = "", + bam = "" + output: + "sv/strique/{aligner, 
[^.\/]*}/{sequence_workflow, [^.\/]*}/{tag, [^.\/]*}.{reference, [^.\/]*}.tsv" + threads: 1 + singularity: + "docker://nanopype/sv:{tag}".format(tag=config['version']['tag']) + shell: + """ + + """ diff --git a/rules/utils/get_file.py b/rules/utils/get_file.py index bce9429..a38ea93 100755 --- a/rules/utils/get_file.py +++ b/rules/utils/get_file.py @@ -53,9 +53,8 @@ def get_batch_ext(wildcards, config): pass def get_signal_batch(wildcards, config): - batch_type, batch = wildcards.batch.split('/', 1) raw_dir = config['storage_data_raw'] - batch_file = os.path.join(raw_dir, batch_type, 'reads', batch) + batch_file = os.path.join(raw_dir, wildcards.runname, 'reads', wildcards.batch) if os.path.isfile(batch_file + '.tar'): return batch_file + '.tar' elif os.path.isfile(batch_file + '.fast5'): @@ -65,52 +64,25 @@ def get_signal_batch(wildcards, config): # get available batch sequence def get_sequence_batch(wildcards, config): - base = "sequences/{wildcards.sequence_workflow}/batches/{wildcards.batch}".format(wildcards=wildcards) + base = "sequences/{sequence_workflow}/batches/{tag}/{runname}/{batch}".format( + sequence_workflow=wildcards.sequence_workflow, + tag=wildcards.tag, + runname=wildcards.runname, + batch=wildcards.batch) extensions = ['.fa', '.fasta', '.fq', '.fastq'] for ext in extensions: if os.path.isfile(base + ext + '.gz') or os.path.isfile(base + ext): return base + ext + '.gz' return base + '.fastq.gz' -# get available merged sequence -# def get_sequence(wildcards, config, force_basecaller=None): - # if force_basecaller: - # basecaller = force_basecaller - # elif hasattr(wildcards, 'basecaller'): - # basecaller = wildcards.basecaller - # else: - # raise RuntimeError("Unable to determine sequence file with wildcards: {wildcards}".format( - # wildcards=', '.join([str(key) + ':' + str(value) for key,value in wildcards]))) - # if hasattr(wildcards, 'runname') and wildcards.runname: - # base = "sequences/{basecaller}/runs/{wildcards.runname}".format(wildcards=wildcards, basecaller=basecaller) - # else: - # base = "sequences/{basecaller}/{wildcards.tag}".format(wildcards=wildcards, basecaller=basecaller) - # extensions = ['.fa', '.fasta', '.fq', '.fastq'] - # for ext in extensions: - # if os.path.isfile(base + ext + '.gz'): - # return base + ext + '.gz' - # return base + '.fastq.gz' - # get alignment batch with default basecaller and aligner def get_alignment_batch(wildcards, config): - bam = "alignments/{wildcards.aligner}/{wildcards.sequence_workflow}/batches/{wildcards.batch}.{wildcards.reference}.bam".format( - wildcards=wildcards) + bam = "alignments/{aligner}/{sequence_workflow}/batches/{tag}/{runname}/{batch}.{reference}.bam".format( + aligner=wildcards.aligner, + sequence_workflow=wildcards.sequence_workflow, + tag=wildcards.tag, + runname=wildcards.runname, + batch=wildcards.batch, + reference=wildcards.reference + ) return bam - -# get alignment with default basecaller and aligner -# def get_alignment(wildcards, config, force_basecaller=None, force_aligner=None): - # if force_basecaller: - # basecaller = force_basecaller - # elif hasattr(wildcards, 'basecaller'): - # basecaller = wildcards.basecaller - # else: - # raise RuntimeError("Unable to determine alignment file with wildcards: {wildcards}".format( - # wildcards=', '.join([str(key) + ':' + str(value) for key,value in wildcards]))) - # if force_aligner: - # aligner = force_aligner - # elif hasattr(wildcards, 'aligner'): - # aligner = wildcards.aligner - # else: - # aligner = config['alignment_default'] - # bam = 
"alignments/{aligner}/{basecaller}/{wildcards.tag}.{wildcards.reference}.bam".format(wildcards=wildcards, basecaller=basecaller, aligner=aligner) - # return bam diff --git a/rules/utils/storage.py b/rules/utils/storage.py index 2b47d84..18df0d2 100755 --- a/rules/utils/storage.py +++ b/rules/utils/storage.py @@ -46,7 +46,7 @@ def get_flowcell(wildcards, config): if flowcell == 'FLO-PRO002': flowcell = 'FLO-PRO001' # end - return fields[config['storage_runname']['field_flowcell']] + return flowcell else: raise ValueError('Could not detect flowcell from ' + wildcards.runname) diff --git a/rules/utils/storage_fast5Index.py b/rules/utils/storage_fast5Index.py new file mode 100644 index 0000000..ace4f81 --- /dev/null +++ b/rules/utils/storage_fast5Index.py @@ -0,0 +1,122 @@ +# \HEADER\------------------------------------------------------------------------- +# +# CONTENTS : fast5 raw data index +# +# DESCRIPTION : indexing and extraction of single reads +# +# RESTRICTIONS : none +# +# REQUIRES : none +# +# --------------------------------------------------------------------------------- +# Copyright (c) 2018-2019, Pay Giesselmann, Max Planck Institute for Molecular Genetics +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +# Written by Pay Giesselmann +# --------------------------------------------------------------------------------- +import os, sys, argparse +import subprocess, tempfile +import tarfile +import h5py + + +class fast5_index(): + def __init__(self): + parser = argparse.ArgumentParser( + description='Nanopore raw fast5 index', + usage='''storage_fast5Index.py [] +Available commands are: + index Index batch(es) of bulk-fast5 or tar archived single fast5 + extract Extract single reads from indexed sequencing run +''') + parser.add_argument('command', help='Subcommand to run') + args = parser.parse_args(sys.argv[1:2]) + if not hasattr(self, args.command): + print('Unrecognized command', file=sys.stderr) + parser.print_help(file=sys.stderr) + exit(1) + getattr(self, args.command)(sys.argv[2:]) + + def __is_multi_read__(self, f5_file): + with h5py.File(f5_file, 'r') as f5: + if "/Raw" in f5: + return False + else: + return True + + def __get_ID_single__(self, f5_file): + with h5py.File(f5_file, 'r') as f5: + s = f5["/Raw/"].visit(lambda name: name if 'Signal' in name else None) + return str(f5["/Raw/" + s.rpartition('/')[0]].attrs['read_id'], 'utf-8') + + def __get_ID_multi__(self, f5_file): + with h5py.File(f5_file, 'r') as f5: + reads = [] + for group in f5: + s = f5[group + "/Raw/"].visit(lambda name: name if 'Signal' in name else None) + ID = (str(f5[group + "/Raw/" + s.rpartition('/')[0]].attrs['read_id'], 'utf-8')) + reads.append((group, ID)) + return reads + + def index(self, argv): + parser = argparse.ArgumentParser(description="Flappie methylation reference alignment") + parser.add_argument("input", help="Input batch or directory of batches") + parser.add_argument("--recursive", action='store_true', help="Recursively scan input") + parser.add_argument("--out_prefix", default="", help="Prefix for file paths in output") + parser.add_argument("--tmp_prefix", default=None, help="Prefix for temporary data") + args = parser.parse_args(argv) + input_files = [] + # scan input + if os.path.isfile(args.input): + input_files.append(args.input) + else: + if args.recursive: + input_files.extend([os.path.join(dirpath, f) for dirpath, _, files in os.walk(args.input) for f in files if f.endswith('.fast5') or f.endswith('.tar')]) + else: + input_files.extend(glob.glob(os.path.join(args.input, '*.fast5'))) + input_files.extend(glob.glob(os.path.join(args.input, '*.tar'))) + # index all provided files + for input_file in input_files: + # extract reads from packed tar archive and retrieve read IDs + input_relative = os.path.normpath(os.path.join(args.out_prefix, + os.path.dirname(os.path.relpath(input_file, start=args.input)), + os.path.basename(input_file))) + if input_file.endswith('.tar'): + with tempfile.TemporaryDirectory(prefix=args.tmp_prefix) as tmpdirname, tarfile.open(input_file) as fp_tar: + fp_tar.extractall(path=tmpdirname) + f5files = [os.path.join(dirpath, f) for dirpath, _, files in os.walk(tmpdirname) for f in files if f.endswith('.fast5')] + for f5file in f5files: + try: + ID = self.__get_ID_single__(f5file) + print("\t".join([os.path.normpath(os.path.join(input_relative, + os.path.relpath(f5file, start=tmpdirname))), ID])) + except: + print("Error opening {f5}, skip file for indexing".format(f5=f5file)) + else: + if self.__is_multi_read__(input_file): + reads = self.__get_ID_multi__(input_file) + print("\n".join(['\t'.join((os.path.join(f,group), ID)) for f, (group, ID) in zip([input_relative] * len(reads), reads)])) + + + def extract(self, argv): + pass + +if __name__ == "__main__": + 
fast5_index() From 9c71eb24560a186451f515759246df10585dc704 Mon Sep 17 00:00:00 2001 From: Pay Giesselmann Date: Mon, 1 Apr 2019 00:02:04 +0200 Subject: [PATCH 02/19] fix batch getter names --- rules/alignment.smk | 4 ++-- rules/basecalling.smk | 4 ++-- rules/methylation.smk | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/rules/alignment.smk b/rules/alignment.smk index 7703ac2..f4c4587 100755 --- a/rules/alignment.smk +++ b/rules/alignment.smk @@ -50,7 +50,7 @@ def get_batches_aligner(wildcards, config): reference=wildcards.reference) return r -def get_batches_runname(wildcards, config): +def get_batches_aligner2(wildcards, config): r = expand("alignments/{aligner}/{sequence_workflow}/batches/{tag}/{runname}.{reference}.bam", aligner=wildcards.aligner, sequence_workflow=wildcards.sequence_workflow, @@ -183,7 +183,7 @@ rule aligner_merge_batches: rule aligner_merge_tag: input: - bam = lambda wildcards: get_batches_runname(wildcards, config) + bam = lambda wildcards: get_batches_aligner2(wildcards, config) output: bam = "alignments/{aligner, [^.\/]*}/{sequence_workflow, ((?!batches).)*}/{tag, [^\/]*}.{reference, [^.]*}.bam", bai = "alignments/{aligner, [^.\/]*}/{sequence_workflow, ((?!batches).)*}/{tag, [^\/]*}.{reference, [^.]*}.bam.bai" diff --git a/rules/basecalling.smk b/rules/basecalling.smk index ce3de29..0f76a7a 100755 --- a/rules/basecalling.smk +++ b/rules/basecalling.smk @@ -48,7 +48,7 @@ def get_batches_basecaller(wildcards, config): format=wildcards.format) return r -def get_batches_runname(wildcards, config): +def get_batches_basecaller2(wildcards, config): r = expand("sequences/{sequence_workflow}/batches/{tag}/{runname}.{format}.gz", sequence_workflow=wildcards.sequence_workflow, tag=wildcards.tag, @@ -172,7 +172,7 @@ rule basecaller_merge_batches: rule basecaller_merge_tag: input: - lambda wildcards: get_batches_runname(wildcards, config) + lambda wildcards: get_batches_basecaller2(wildcards, config) output: "sequences/{sequence_workflow, ((?!batches).)*}/{tag, [^\/]*}.{format, (fasta|fastq|fa|fq)}.gz" shell: diff --git a/rules/methylation.smk b/rules/methylation.smk index 66fb3e7..4c3fbac 100755 --- a/rules/methylation.smk +++ b/rules/methylation.smk @@ -51,7 +51,7 @@ def get_batches_methylation(wildcards, config, methylation_caller): batch=get_batch_ids_raw(wildcards.runname, config)) return r -def get_batches_runname(wildcards, config, methylation_caller): +def get_batches_methylation2(wildcards, config, methylation_caller): r = expand("methylation/{methylation_caller}/{aligner}/{sequence_workflow}/batches/{tag}/{runname}.{reference}.tsv.gz", methylation_caller=methylation_caller, aligner=wildcards.aligner, @@ -182,7 +182,7 @@ rule methylation_single_read_run: # nanopolish methylation probability to frequencies rule methylation_nanopolish_frequencies: input: - lambda wildcards : get_batches_runname(wildcards, config, 'nanopolish') + lambda wildcards : get_batches_methylation2(wildcards, config, 'nanopolish') output: "methylation/nanopolish/{aligner, [^.\/]*}/{sequence_workflow}/{tag, [^\/]*}.{reference, [^.\/]*}.frequencies.tsv" params: @@ -197,7 +197,7 @@ rule methylation_nanopolish_frequencies: # flappie methylation with sequences quality to frequencies rule methylation_flappie_frequencies: input: - lambda wildcards : get_batches_runname(wildcards, config, 'flappie') + lambda wildcards : get_batches_methylation2(wildcards, config, 'flappie') output: "methylation/flappie/{aligner, [^.\/]*}/{sequence_workflow, ((?!batches).)*}/{tag, 
[^\/]*}.{reference, [^.\/]*}.frequencies.tsv" params: From 7d6b7822ffabbbeff644219f3d588075dac685a1 Mon Sep 17 00:00:00 2001 From: Pay Giesselmann Date: Mon, 1 Apr 2019 18:34:37 +0200 Subject: [PATCH 03/19] test barcoded output --- Snakefile | 17 +++-- nanopype.yaml | 1 + rules/basecalling.smk | 46 ++++++------- rules/demux.smk | 13 ++-- rules/storage.smk | 6 +- rules/utils/get_file.py | 45 ++++++++---- rules/utils/storage_fast5Index.py | 110 ++++++++++++++++++++++++++++-- 7 files changed, 179 insertions(+), 59 deletions(-) mode change 100644 => 100755 rules/utils/storage_fast5Index.py diff --git a/Snakefile b/Snakefile index 5c27f3c..760c9f4 100755 --- a/Snakefile +++ b/Snakefile @@ -38,7 +38,7 @@ from snakemake.utils import min_version # snakemake config -min_version("5.4.0") +min_version("5.4.3") configfile: "nanopype.yaml" @@ -64,7 +64,7 @@ if hasattr(workflow, "shadow_prefix") and workflow.shadow_prefix: shadow_prefix = workflow.shadow_prefix if not os.environ['USER'] in shadow_prefix: shadow_prefix = os.path.join(shadow_prefix, os.environ['USER']) - print("Shadow prefix is changed from {p1} to {p2} to be user-specific".format( + print("[INFO] Shadow prefix is changed from {p1} to {p2} to be user-specific".format( p1=workflow.shadow_prefix, p2=shadow_prefix), file=sys.stderr) workflow.shadow_prefix = shadow_prefix @@ -83,11 +83,11 @@ if 'references' in nanopype_env: genome = values['genome'] chr_sizes = values['chr_sizes'] if not os.path.isfile(genome): - print("Genome for {name} not found in {genome}, skipping entry".format( + print("[WARNING] Genome for {name} not found in {genome}, skipping entry".format( name=name, genome=genome), file=sys.stderr) continue if not os.path.isfile(chr_sizes): - print("Chromosome sizes for {name} not found in {chr_sizes}, skipping entry".format( + print("[WARNING] Chromosome sizes for {name} not found in {chr_sizes}, skipping entry".format( name=name, chr_sizes=chr_sizes), file=sys.stderr) continue config['references'][name] = {"genome":genome, "chr_sizes":chr_sizes} @@ -174,6 +174,15 @@ if os.path.isfile('runnames.txt'): config['runnames'] = runnames +# barcode mappings +barcodes = {} +if os.path.isfile('barcodes.yaml'): + with open("barcodes.yaml", 'r') as fp: + barcode_map = yaml.load(fp) + barcodes = barcode_map +config['barcodes'] = barcodes + + # include modules include : "rules/storage.smk" include : "rules/basecalling.smk" diff --git a/nanopype.yaml b/nanopype.yaml index ee7409d..647409d 100755 --- a/nanopype.yaml +++ b/nanopype.yaml @@ -62,6 +62,7 @@ methylation_flappie_qval_threshold: 3 # demux demux_batch_size: 4000 +demux_default: 'deepbinner' deepbinner_models: default: SQK-RBK004_read_starts EXP-NBD103: EXP-NBD103_read_starts diff --git a/rules/basecalling.smk b/rules/basecalling.smk index 0f76a7a..f93871c 100755 --- a/rules/basecalling.smk +++ b/rules/basecalling.smk @@ -33,35 +33,33 @@ # --------------------------------------------------------------------------------- # imports import os, sys -from rules.utils.get_file import get_batch_ids_raw, get_batch_ext +from rules.utils.get_file import get_batch_ids_raw, get_signal_batch from rules.utils.storage import get_flowcell, get_kit # local rules localrules: basecaller_merge_batches, basecaller_merge_tag, basecaller_qc -# get batches -def get_batches_basecaller(wildcards, config): +# get batches +def get_batches_basecaller(wildcards): r = expand("sequences/{sequence_workflow}/batches/{tag}/{runname}/{batch}.{format}.gz", sequence_workflow=wildcards.sequence_workflow, tag=wildcards.tag, 
runname=wildcards.runname, - batch=get_batch_ids_raw(wildcards.runname, config), + batch=get_batch_ids_raw(wildcards.runname, config=config, tag=wildcards.tag, checkpoints=checkpoints), format=wildcards.format) + print(r) return r -def get_batches_basecaller2(wildcards, config): - r = expand("sequences/{sequence_workflow}/batches/{tag}/{runname}.{format}.gz", +def get_batches_basecaller2(wildcards): + return expand("sequences/{sequence_workflow}/batches/{tag}/{runname}.{format}.gz", sequence_workflow=wildcards.sequence_workflow, tag=wildcards.tag, runname=[runname for runname in config['runnames']], format=wildcards.format) - return r # albacore basecalling rule albacore: input: - lambda wildcards : "{data_raw}/{{runname}}/reads/{{batch}}.{ext}".format( - data_raw = config["storage_data_raw"], - ext=get_batch_ext(wildcards, config)) + signal = lambda wildcards : get_signal_batch(wildcards, config, checkpoints) output: "sequences/albacore/batches/{tag, [^\/]*}/{runname, [^.\/]*}/{batch, [^.]*}.{format, (fasta|fastq|fa|fq)}.gz" shadow: "minimal" @@ -73,11 +71,12 @@ rule albacore: flowcell = lambda wildcards: get_flowcell(wildcards, config), kit = lambda wildcards: get_kit(wildcards, config), barcoding = lambda wildcards : '--barcoding' if config['basecalling_albacore_barcoding'] else '', - filtering = lambda wildcards : '--disable_filtering' if config['basecalling_albacore_disable_filtering'] else '' + filtering = lambda wildcards : '--disable_filtering' if config['basecalling_albacore_disable_filtering'] else '', + index = lambda wildcards : os.path.join(config['storage_data_raw'], wildcards.runname, 'reads.fofn') shell: """ mkdir -p raw - {config[sbin][storage_batch2fast5.sh]} {input} raw/ {config[sbin][base]} {config[bin][python]} + {config[bin][python]} {config[sbin][storage_fast5Index.py]} extract {input.signal[0]} raw/ --index {params.index} --output_format lazy {config[bin][albacore]} -i raw/ --recursive -t {threads} -s raw/ --flowcell {params.flowcell} --kit {params.kit} --output_format fastq {params.filtering} {params.barcoding} {config[basecalling_albacore_flags]} FASTQ_DIR='raw/workspace/' if [ \'{params.filtering}\' = '' ]; then @@ -94,9 +93,8 @@ rule albacore: # guppy basecalling rule guppy: input: - lambda wildcards : "{data_raw}/{{runname}}/reads/{{batch}}.{ext}".format( - data_raw = config["storage_data_raw"], - ext=get_batch_ext(wildcards, config)) + batch = lambda wildcards : get_signal_batch(wildcards, config, checkpoints), + run = lambda wildcards : os.path.join(config['storage_data_raw'], wildcards.runname) output: "sequences/guppy/batches/{tag, [^\/]*}/{runname, [^.\/]*}/{batch, [^.]*}.{format, (fasta|fastq|fa|fq)}.gz" shadow: "minimal" @@ -108,13 +106,15 @@ rule guppy: flowcell = lambda wildcards: get_flowcell(wildcards, config), kit = lambda wildcards: get_kit(wildcards, config), #barcoding = lambda wildcards : '--barcoding' if config['basecalling_albacore_barcoding'] else '', - filtering = lambda wildcards : '--qscore_filtering --min_qscore {score}'.format(score = config['basecalling_guppy_qscore_filter']) if config['basecalling_guppy_qscore_filter'] > 0 else '' + filtering = lambda wildcards : '--qscore_filtering --min_qscore {score}'.format(score = config['basecalling_guppy_qscore_filter']) if config['basecalling_guppy_qscore_filter'] > 0 else '', + index = lambda wildcards : os.path.join(config['storage_data_raw'], wildcards.runname, 'reads.fofn') singularity: "docker://nanopype/basecalling:{tag}".format(tag=config['version']['tag']) shell: """ mkdir -p raw - 
{config[sbin_singularity][storage_batch2fast5.sh]} {input} raw/ {config[sbin_singularity][base]} {config[bin_singularity][python]} + echo 'extract {input.batch} raw/ --index {params.index} --output_format lazy' + {config[bin_singularity][python]} {config[sbin_singularity][storage_fast5Index.py]} extract {input.batch} raw/ --index {params.index} --output_format lazy {config[bin_singularity][guppy]} -i raw/ --recursive --num_callers 1 --cpu_threads_per_caller {threads} -s workspace/ --flowcell {params.flowcell} --kit {params.kit} {params.filtering} {config[basecalling_guppy_flags]} FASTQ_DIR='workspace/pass' if [ \'{params.filtering}\' = '' ]; then @@ -131,9 +131,7 @@ rule guppy: # flappie basecalling rule flappie: input: - lambda wildcards : "{data_raw}/{{runname}}/reads/{{batch}}.{ext}".format( - data_raw = config["storage_data_raw"], - ext=get_batch_ext(wildcards, config)) + signal = lambda wildcards : get_signal_batch(wildcards, config, checkpoints) output: sequence = "sequences/flappie/batches/{tag, [^\/]*}/{runname, [^.\/]*}/{batch, [^.]*}.{format, (fasta|fastq|fa|fq)}.gz", methyl_marks = "sequences/flappie/batches/{tag, [^\/]*}/{runname, [^.\/]*}/{batch, [^.]*}.{format, (fasta|fastq|fa|fq)}.tsv.gz" @@ -142,13 +140,15 @@ rule flappie: resources: mem_mb = lambda wildcards, threads, attempt: int((1.0 + (0.1 * (attempt - 1))) * (4000 + 5000 * threads)), time_min = lambda wildcards, threads, attempt: int((5760 / threads) * attempt) # 360 min / 16 threads + params: + index = lambda wildcards : os.path.join(config['storage_data_raw'], wildcards.runname, 'reads.fofn') singularity: "docker://nanopype/basecalling:{tag}".format(tag=config['version']['tag']) shell: """ export OPENBLAS_NUM_THREADS=1 mkdir -p raw - {config[sbin_singularity][storage_batch2fast5.sh]} {input} raw/ {config[sbin_singularity][base]} {config[bin_singularity][python]} + {config[bin_singularity][python]} {config[sbin_singularity][storage_fast5Index.py]} extract {input.signal[0]} raw/ --index {params.index} --output_format lazy find raw/ -regextype posix-extended -regex '^.*fast5' -type f -exec du -h {{}} + | sort -r -h | cut -f2 > raw.fofn split -e -n r/{threads} raw.fofn raw.fofn.part. 
ls raw.fofn.part.* | xargs -n 1 -P {threads} -I {{}} $SHELL -c 'cat {{}} | shuf | xargs -n 1 {config[bin_singularity][flappie]} --model {config[basecalling_flappie_model]} {config[basecalling_flappie_flags]} > raw/{{}}.fastq' @@ -164,7 +164,7 @@ rule flappie: # merge and compression rule basecaller_merge_batches: input: - lambda wildcards: get_batches_basecaller(wildcards, config) + lambda wildcards: get_batches_basecaller(wildcards) output: "sequences/{sequence_workflow}/batches/{tag, [^\/]*}/{runname, [^.\/]*}.{format, (fasta|fastq|fa|fq)}.gz" shell: @@ -172,7 +172,7 @@ rule basecaller_merge_batches: rule basecaller_merge_tag: input: - lambda wildcards: get_batches_basecaller2(wildcards, config) + lambda wildcards: get_batches_basecaller2(wildcards) output: "sequences/{sequence_workflow, ((?!batches).)*}/{tag, [^\/]*}.{format, (fasta|fastq|fa|fq)}.gz" shell: diff --git a/rules/demux.smk b/rules/demux.smk index 62f460f..cb961ac 100755 --- a/rules/demux.smk +++ b/rules/demux.smk @@ -37,16 +37,17 @@ from rules.utils.get_file import get_batch_ids_raw localrules: demux_split_barcodes # get batches -def get_batches_demux(wildcards, config): +def get_batches_demux(wildcards): return expand("demux/{demultiplexer}/batches/{runname}/{batch}.tsv", demultiplexer=wildcards.demultiplexer, runname=wildcards.runname, - batch=get_batch_ids_raw(wildcards.runname, config)) + batch=get_batch_ids_raw(wildcards.runname, config=config)) + # deepbinner demux rule deepbinner: input: - signals = lambda wildcards : get_signal_batch(wildcards, config), + signal = lambda wildcards : get_signal_batch(wildcards, config), model = lambda wildcards : config["deepbinner_models"][get_kit(wildcards)] if get_kit(wildcards, config) in config["deepbinner_models"] else config["deepbinner_models"]['default'] output: "demux/deepbinner/batches/{runname, [^.\/]*}/{batch, [^.\/]*}.tsv" @@ -60,16 +61,16 @@ rule deepbinner: shell: """ mkdir -p raw - {config[sbin_singularity][storage_batch2fast5.sh]} {input.signals} raw/ {config[sbin_singularity][base]} {config[bin_singularity][python]} + {config[sbin_singularity][storage_batch2fast5.sh]} {input.signal} raw/ {config[sbin_singularity][base]} {config[bin_singularity][python]} {config[bin_singularity][python]} {config[bin_singularity][deepbinner]} classify raw -s {input.model} --intra_op_parallelism_threads {threads} --omp_num_threads {threads} --inter_op_parallelism_threads {threads} | tail -n +2 > {output} """ # split into barcode id list per run checkpoint demux_split_barcodes: input: - batches = lambda wildcards: get_batches_demux(wildcards, config) + batches = lambda wildcards: get_batches_demux(wildcards) output: - barcodes = directory("demux/{demultiplexer, [^.\/]*}/barcodes/{runname}") + barcodes = directory("demux/{demultiplexer, [^.\/]*}/barcodes/{runname}/") singularity: "docker://nanopype/demux:{tag}".format(tag=config['version']['tag']) run: diff --git a/rules/storage.smk b/rules/storage.smk index a2d4b6b..7a18309 100755 --- a/rules/storage.smk +++ b/rules/storage.smk @@ -47,9 +47,7 @@ def get_batches_indexing(wildcards): # extract read ID from individual fast5 files rule storage_index_batch: input: - lambda wildcards : "{data_raw}/{{runname}}/reads/{{batch}}.{ext}".format( - data_raw = config["storage_data_raw"], - ext=get_batch_ext(wildcards, config)) + unpack(lambda wildcards : get_signal_batch(wildcards, config)), output: temp("{data_raw}/{{runname}}/reads/{{batch}}.fofn".format(data_raw = config["storage_data_raw"])) shadow: "minimal" @@ -59,7 +57,7 @@ rule 
storage_index_batch: time_min = 15 shell: """ - {config[bin][python]} {config[sbin][storage_fast5Index.py]} index {input} --out_prefix reads --tmp_prefix $(pwd) > {output} + {config[bin][python]} {config[sbin][storage_fast5Index.py]} index {input.signal} --out_prefix reads --tmp_prefix $(pwd) > {output} """ # merge batch indices diff --git a/rules/utils/get_file.py b/rules/utils/get_file.py index a38ea93..0d3026a 100755 --- a/rules/utils/get_file.py +++ b/rules/utils/get_file.py @@ -35,25 +35,38 @@ from snakemake.io import glob_wildcards from .storage import get_kit -# prefix of raw read batches -def get_batch_ids_raw(runname, config): - batches_tar, = glob_wildcards("{datadir}/{runname}/reads/{{id}}.tar".format(datadir=config["storage_data_raw"], runname=runname)) - batches_fast5, = glob_wildcards("{datadir}/{runname}/reads/{{id}}.fast5".format(datadir=config["storage_data_raw"], runname=runname)) - return batches_tar + batches_fast5 -# get type of fast5 batch from basename -def get_batch_ext(wildcards, config): - raw_prefix = "{datadir}/{wildcards.runname}/reads/{wildcards.batch}".format(datadir=config["storage_data_raw"], wildcards=wildcards) - # TODO idx prefix - if os.path.isfile(raw_prefix + ".tar"): - return "tar" - elif os.path.isfile(raw_prefix + ".fast5"): - return "fast5" +# check if tag maps to barcode +def get_tag_barcode(tag, runname, config): + if runname in config['barcodes']: + bc_batches = [config['barcodes'][runname][bc] for bc in config['barcodes'][runname].keys() if bc in tag] + elif '__default__' in config['barcodes']: + bc_batches = [config['barcodes']['__default__'][bc] for bc in config['barcodes']['__default__'].keys() if bc in tag] + else: + bc_batches = [None] + return bc_batches[0] + + +# prefix of raw read batches +def get_batch_ids_raw(runname, config, tag=None, checkpoints=None): + tag_barcode = get_tag_barcode(tag, runname, config) if tag else None + if tag_barcode and checkpoints: + barcode_batch_dir = checkpoints.demux_split_barcodes.get(demultiplexer=config['demux_default'], runname=runname).output.barcodes + barcode_batch = os.path.join(barcode_batch_dir, tag_barcode, '{id}.txt') + batches_txt, = glob_wildcards(barcode_batch) + return batches_txt else: - pass + batches_tar, = glob_wildcards("{datadir}/{runname}/reads/{{id}}.tar".format(datadir=config["storage_data_raw"], runname=runname)) + batches_fast5, = glob_wildcards("{datadir}/{runname}/reads/{{id}}.fast5".format(datadir=config["storage_data_raw"], runname=runname)) + return batches_tar + batches_fast5 -def get_signal_batch(wildcards, config): + +# get batch of reads as IDs or fast5 +def get_signal_batch(wildcards, config, checkpoints=None): raw_dir = config['storage_data_raw'] + if hasattr(wildcards, 'tag'): + tag_barcode = get_tag_barcode(wildcards.tag, wildcards.runname, config) + return os.path.join('demux/deepbinner/barcodes', wildcards.runname, tag_barcode, wildcards.batch + '.txt') batch_file = os.path.join(raw_dir, wildcards.runname, 'reads', wildcards.batch) if os.path.isfile(batch_file + '.tar'): return batch_file + '.tar' @@ -62,6 +75,7 @@ def get_signal_batch(wildcards, config): else: return [] + # get available batch sequence def get_sequence_batch(wildcards, config): base = "sequences/{sequence_workflow}/batches/{tag}/{runname}/{batch}".format( @@ -75,6 +89,7 @@ def get_sequence_batch(wildcards, config): return base + ext + '.gz' return base + '.fastq.gz' + # get alignment batch with default basecaller and aligner def get_alignment_batch(wildcards, config): bam = 
"alignments/{aligner}/{sequence_workflow}/batches/{tag}/{runname}/{batch}.{reference}.bam".format( diff --git a/rules/utils/storage_fast5Index.py b/rules/utils/storage_fast5Index.py old mode 100644 new mode 100755 index ace4f81..245e578 --- a/rules/utils/storage_fast5Index.py +++ b/rules/utils/storage_fast5Index.py @@ -31,10 +31,16 @@ # # Written by Pay Giesselmann # --------------------------------------------------------------------------------- -import os, sys, argparse -import subprocess, tempfile -import tarfile +import os, sys, re, argparse +import itertools +import shutil +import tempfile, tarfile import h5py +from ont_fast5_api.fast5_file import Fast5File +from ont_fast5_api.multi_fast5 import MultiFast5File +from ont_fast5_api.conversion_tools import multi_to_single_fast5, single_to_multi_fast5 + + class fast5_index(): @@ -76,7 +82,7 @@ def __get_ID_multi__(self, f5_file): return reads def index(self, argv): - parser = argparse.ArgumentParser(description="Flappie methylation reference alignment") + parser = argparse.ArgumentParser(description="Fast5 Index") parser.add_argument("input", help="Input batch or directory of batches") parser.add_argument("--recursive", action='store_true', help="Recursively scan input") parser.add_argument("--out_prefix", default="", help="Prefix for file paths in output") @@ -94,10 +100,10 @@ def index(self, argv): input_files.extend(glob.glob(os.path.join(args.input, '*.tar'))) # index all provided files for input_file in input_files: - # extract reads from packed tar archive and retrieve read IDs input_relative = os.path.normpath(os.path.join(args.out_prefix, os.path.dirname(os.path.relpath(input_file, start=args.input)), os.path.basename(input_file))) + # extract reads from packed tar archive and retrieve read IDs if input_file.endswith('.tar'): with tempfile.TemporaryDirectory(prefix=args.tmp_prefix) as tmpdirname, tarfile.open(input_file) as fp_tar: fp_tar.extractall(path=tmpdirname) @@ -109,14 +115,104 @@ def index(self, argv): os.path.relpath(f5file, start=tmpdirname))), ID])) except: print("Error opening {f5}, skip file for indexing".format(f5=f5file)) + # bulk and single read fast5 else: if self.__is_multi_read__(input_file): reads = self.__get_ID_multi__(input_file) print("\n".join(['\t'.join((os.path.join(f,group), ID)) for f, (group, ID) in zip([input_relative] * len(reads), reads)])) + else: + print('\t'.join([input_relative, self.__get_ID_single__(input_relative)])) + + def extract(self, argv): + parser = argparse.ArgumentParser(description="Fast5 extraction") + parser.add_argument("batch", help="Input batch") + parser.add_argument("output", help="Output directory") + parser.add_argument("--index", default='', help="Read index") + parser.add_argument("--output_format", default='single', choices=['single', 'bulk', 'lazy'], help="Output as single, bulk or with minimal conversion overhead") + parser.add_argument("--tmp_prefix", default=None, help="Prefix for temporary data") + args = parser.parse_args(argv) + if not os.path.exists(args.output): + os.makedirs(os.path.dirname(output_file)) + batch_name, batch_ext = os.path.splitext(args.batch) + # packed single reads in tar archive + if batch_ext == '.tar': + if args.output_format in ['single', 'lazy']: + with tarfile.open(args.batch) as fp_tar: + fp_tar.extractall(path=args.output) + else: + output_bulk = os.path.join(args.output, os.path.basename(batch_name) + '.fast5') + with tempfile.TemporaryDirectory(prefix=args.tmp_prefix) as tmpdirname, tarfile.open(args.batch) as fp_tar: + 
fp_tar.extractall(path=tmpdirname) + f5files = [os.path.join(dirpath, f) for dirpath, _, files in os.walk(tmpdirname) for f in files if f.endswith('.fast5')] + single_to_multi_fast5.create_multi_read_file(f5files, output_bulk) + # bulk fast5 + elif batch_ext == '.fast': + if args.output_format in ['bulk', 'lazy']: + shutil.copy(args.batch, args.output) + else: + multi_to_single_fast5.convert_multi_to_single(args.batch, args.output, '') + # read IDs to be extracted + elif batch_ext == '.txt': + # load index and requested IDs + if not os.path.exists(args.index): + raise RuntimeError("[Error] Raw fast5 index file {} not found.".format(args.index)) + else: + with open(args.batch, 'r') as fp: + batch_ids = [id.strip() for id in fp.read().split('\n') if id] + with open(args.index, 'r') as fp: + run_index = {id:path for path,id in [line.split('\t') for line in fp.read().split('\n') if line]} + # sort in affected batches + batch_id_files = [tuple( [id] + re.split('(\.fast5|\.tar)\/', run_index[id]) ) for id in batch_ids if id in run_index] + batch_id_files.sort(key=lambda x : (x[1], x[2]) if len(x) > 2 else x[1]) + def copy_reads_to(fofns, output): + if len(fofns) == 1: + # single read fast5 + id, src_file = fofns[0] + shutil.copy(os.path.join(os.path.dirname(args.index), src_file), output) + else: + _, batch_file, batch_ext, _ = fofns[0] + tarFiles = set([x[3] for x in fofns]) + # single read fast5 batch in tar archive + if batch_ext == '.tar': + tar_file = os.path.join(os.path.dirname(args.index), batch_file + batch_ext) + with tarfile.open(tar_file) as fp_tar: + tar_members = fp_tar.getmembers() + for tar_member in tar_members: + if any(s in tar_member.name for s in tarFiles): + try: + tar_member.name = os.path.basename(tar_member.name) + fp_tar.extract(tar_member, path=output) + except: + RuntimeError('[ERROR] Could not extract {id} from {batch}.'.format(id=tar_member.name, batch=tar_file)) + elif batch_ext == '.fast5': + f5_file = os.path.join(os.path.dirname(args.index), batch_file + batch_ext) + with MultiFast5File(f5_file, 'r') as multi_f5: + target_ids = set([x[0] for x in fofns]) + for read_id in multi_f5.get_read_ids(): + if read_id in target_ids: + try: + read = multi_f5.get_read(read_id) + output_file = os.path.join(output, "{}.fast5".format(read_id)) + multi_to_single_fast5.create_single_f5(output_file, read) + except: + RuntimeError('[ERROR] Could not extract {id} from {batch}.'.format(id=read_id, batch=f5_file)) + else: + pass + for batch_file, id_batch_paths in itertools.groupby(batch_id_files, key=lambda x : (x[1], x[2]) if len(x) > 2 else x[1]): + if args.output_format in ['single', 'lazy']: + copy_reads_to(list(id_batch_paths), args.output) + else: + output_bulk = os.path.join(args.output, os.path.basename(batch_name) + '.fast5') + with tempfile.TemporaryDirectory(prefix=args.tmp_prefix) as tmpdirname: + copy_reads_to(list(id_batch_paths), tmpdirname) + f5files = [os.path.join(dirpath, f) for dirpath, _, files in os.walk(tmpdirname) for f in files if f.endswith('.fast5')] + single_to_multi_fast5.create_multi_read_file(f5files, output_bulk) + + else: + raise RuntimeError('[ERROR] Raw fast5 batch extension {} not supported.'.format(batch_ext)) + - def extract(self, argv): - pass if __name__ == "__main__": fast5_index() From a429878d6f5ad4ea54d0fc1c0a7709cb4d0e81c5 Mon Sep 17 00:00:00 2001 From: Pay Giesselmann Date: Mon, 1 Apr 2019 22:35:20 +0200 Subject: [PATCH 04/19] barcode support in methylation module --- rules/basecalling.smk | 19 +++++++++---------- rules/methylation.smk | 
7 ++++--- rules/utils/get_file.py | 19 ++++++++++--------- rules/utils/storage_fast5Index.py | 6 +++--- 4 files changed, 26 insertions(+), 25 deletions(-) diff --git a/rules/basecalling.smk b/rules/basecalling.smk index f93871c..4572118 100755 --- a/rules/basecalling.smk +++ b/rules/basecalling.smk @@ -38,16 +38,14 @@ from rules.utils.storage import get_flowcell, get_kit # local rules localrules: basecaller_merge_batches, basecaller_merge_tag, basecaller_qc -# get batches +# get batches def get_batches_basecaller(wildcards): - r = expand("sequences/{sequence_workflow}/batches/{tag}/{runname}/{batch}.{format}.gz", + return expand("sequences/{sequence_workflow}/batches/{tag}/{runname}/{batch}.{format}.gz", sequence_workflow=wildcards.sequence_workflow, tag=wildcards.tag, runname=wildcards.runname, batch=get_batch_ids_raw(wildcards.runname, config=config, tag=wildcards.tag, checkpoints=checkpoints), format=wildcards.format) - print(r) - return r def get_batches_basecaller2(wildcards): return expand("sequences/{sequence_workflow}/batches/{tag}/{runname}.{format}.gz", @@ -59,7 +57,8 @@ def get_batches_basecaller2(wildcards): # albacore basecalling rule albacore: input: - signal = lambda wildcards : get_signal_batch(wildcards, config, checkpoints) + batch = lambda wildcards : get_signal_batch(wildcards, config), + run = lambda wildcards : os.path.join(config['storage_data_raw'], wildcards.runname) output: "sequences/albacore/batches/{tag, [^\/]*}/{runname, [^.\/]*}/{batch, [^.]*}.{format, (fasta|fastq|fa|fq)}.gz" shadow: "minimal" @@ -76,7 +75,7 @@ rule albacore: shell: """ mkdir -p raw - {config[bin][python]} {config[sbin][storage_fast5Index.py]} extract {input.signal[0]} raw/ --index {params.index} --output_format lazy + {config[bin][python]} {config[sbin][storage_fast5Index.py]} extract {input.batch} raw/ --index {params.index} --output_format single {config[bin][albacore]} -i raw/ --recursive -t {threads} -s raw/ --flowcell {params.flowcell} --kit {params.kit} --output_format fastq {params.filtering} {params.barcoding} {config[basecalling_albacore_flags]} FASTQ_DIR='raw/workspace/' if [ \'{params.filtering}\' = '' ]; then @@ -93,7 +92,7 @@ rule albacore: # guppy basecalling rule guppy: input: - batch = lambda wildcards : get_signal_batch(wildcards, config, checkpoints), + batch = lambda wildcards : get_signal_batch(wildcards, config), run = lambda wildcards : os.path.join(config['storage_data_raw'], wildcards.runname) output: "sequences/guppy/batches/{tag, [^\/]*}/{runname, [^.\/]*}/{batch, [^.]*}.{format, (fasta|fastq|fa|fq)}.gz" @@ -113,7 +112,6 @@ rule guppy: shell: """ mkdir -p raw - echo 'extract {input.batch} raw/ --index {params.index} --output_format lazy' {config[bin_singularity][python]} {config[sbin_singularity][storage_fast5Index.py]} extract {input.batch} raw/ --index {params.index} --output_format lazy {config[bin_singularity][guppy]} -i raw/ --recursive --num_callers 1 --cpu_threads_per_caller {threads} -s workspace/ --flowcell {params.flowcell} --kit {params.kit} {params.filtering} {config[basecalling_guppy_flags]} FASTQ_DIR='workspace/pass' @@ -131,7 +129,8 @@ rule guppy: # flappie basecalling rule flappie: input: - signal = lambda wildcards : get_signal_batch(wildcards, config, checkpoints) + batch = lambda wildcards : get_signal_batch(wildcards, config), + run = lambda wildcards : os.path.join(config['storage_data_raw'], wildcards.runname) output: sequence = "sequences/flappie/batches/{tag, [^\/]*}/{runname, [^.\/]*}/{batch, [^.]*}.{format, (fasta|fastq|fa|fq)}.gz", 
methyl_marks = "sequences/flappie/batches/{tag, [^\/]*}/{runname, [^.\/]*}/{batch, [^.]*}.{format, (fasta|fastq|fa|fq)}.tsv.gz" @@ -148,7 +147,7 @@ rule flappie: """ export OPENBLAS_NUM_THREADS=1 mkdir -p raw - {config[bin_singularity][python]} {config[sbin_singularity][storage_fast5Index.py]} extract {input.signal[0]} raw/ --index {params.index} --output_format lazy + {config[bin_singularity][python]} {config[sbin_singularity][storage_fast5Index.py]} extract {input.batch} raw/ --index {params.index} --output_format lazy find raw/ -regextype posix-extended -regex '^.*fast5' -type f -exec du -h {{}} + | sort -r -h | cut -f2 > raw.fofn split -e -n r/{threads} raw.fofn raw.fofn.part. ls raw.fofn.part.* | xargs -n 1 -P {threads} -I {{}} $SHELL -c 'cat {{}} | shuf | xargs -n 1 {config[bin_singularity][flappie]} --model {config[basecalling_flappie_model]} {config[basecalling_flappie_flags]} > raw/{{}}.fastq' diff --git a/rules/methylation.smk b/rules/methylation.smk index 4c3fbac..afbbd4b 100755 --- a/rules/methylation.smk +++ b/rules/methylation.smk @@ -48,7 +48,7 @@ def get_batches_methylation(wildcards, config, methylation_caller): tag=wildcards.tag, runname=wildcards.runname, reference=wildcards.reference, - batch=get_batch_ids_raw(wildcards.runname, config)) + batch=get_batch_ids_raw(wildcards.runname, config=config, tag=wildcards.tag, checkpoints=checkpoints)) return r def get_batches_methylation2(wildcards, config, methylation_caller): @@ -73,7 +73,8 @@ def get_min_coverage(wildcards): # nanopolish methylation detection rule methylation_nanopolish: input: - signals = lambda wildcards : get_signal_batch(wildcards, config), + batch = lambda wildcards : get_signal_batch(wildcards, config), + run = lambda wildcards : os.path.join(config['storage_data_raw'], wildcards.runname), sequences = lambda wildcards : get_sequence_batch(wildcards, config), bam = lambda wildcards : get_alignment_batch(wildcards, config), bai = lambda wildcards : get_alignment_batch(wildcards, config) + '.bai', @@ -90,7 +91,7 @@ rule methylation_nanopolish: shell: """ mkdir -p raw - {config[sbin_singularity][storage_batch2fast5.sh]} {input.signals} raw/ {config[sbin_singularity][base]} {config[bin_singularity][python]} + {config[bin][python]} {config[sbin][storage_fast5Index.py]} extract {input.batch} raw/ --index {params.index} --output_format single zcat {input.sequences} > sequences.fastx {config[bin_singularity][nanopolish]} index -d raw/ sequences.fastx {config[bin_singularity][nanopolish]} call-methylation -t {threads} -r sequences.fastx -g {input.reference} -b {input.bam} > {output} diff --git a/rules/utils/get_file.py b/rules/utils/get_file.py index 0d3026a..96011bc 100755 --- a/rules/utils/get_file.py +++ b/rules/utils/get_file.py @@ -38,15 +38,16 @@ # check if tag maps to barcode def get_tag_barcode(tag, runname, config): - if runname in config['barcodes']: - bc_batches = [config['barcodes'][runname][bc] for bc in config['barcodes'][runname].keys() if bc in tag] - elif '__default__' in config['barcodes']: - bc_batches = [config['barcodes']['__default__'][bc] for bc in config['barcodes']['__default__'].keys() if bc in tag] + bc_batches = [None] + if '__default__' in config['barcodes']: + bc_batches.extend([config['barcodes']['__default__'][bc] for bc in config['barcodes']['__default__'].keys() if bc in tag]) + elif runname in config['barcodes']: + bc_batches.extend([config['barcodes'][runname][bc] for bc in config['barcodes'][runname].keys() if bc in tag]) else: - bc_batches = [None] - return bc_batches[0] - - + 
pass + return bc_batches[-1] + + # prefix of raw read batches def get_batch_ids_raw(runname, config, tag=None, checkpoints=None): tag_barcode = get_tag_barcode(tag, runname, config) if tag else None @@ -62,7 +63,7 @@ def get_batch_ids_raw(runname, config, tag=None, checkpoints=None): # get batch of reads as IDs or fast5 -def get_signal_batch(wildcards, config, checkpoints=None): +def get_signal_batch(wildcards, config): raw_dir = config['storage_data_raw'] if hasattr(wildcards, 'tag'): tag_barcode = get_tag_barcode(wildcards.tag, wildcards.runname, config) diff --git a/rules/utils/storage_fast5Index.py b/rules/utils/storage_fast5Index.py index 245e578..0113fd8 100755 --- a/rules/utils/storage_fast5Index.py +++ b/rules/utils/storage_fast5Index.py @@ -122,7 +122,7 @@ def index(self, argv): print("\n".join(['\t'.join((os.path.join(f,group), ID)) for f, (group, ID) in zip([input_relative] * len(reads), reads)])) else: print('\t'.join([input_relative, self.__get_ID_single__(input_relative)])) - + def extract(self, argv): parser = argparse.ArgumentParser(description="Fast5 extraction") parser.add_argument("batch", help="Input batch") @@ -165,7 +165,7 @@ def extract(self, argv): batch_id_files = [tuple( [id] + re.split('(\.fast5|\.tar)\/', run_index[id]) ) for id in batch_ids if id in run_index] batch_id_files.sort(key=lambda x : (x[1], x[2]) if len(x) > 2 else x[1]) def copy_reads_to(fofns, output): - if len(fofns) == 1: + if len(fofns) == 1 and len(fofns[0]) == 2: # single read fast5 id, src_file = fofns[0] shutil.copy(os.path.join(os.path.dirname(args.index), src_file), output) @@ -207,7 +207,7 @@ def copy_reads_to(fofns, output): copy_reads_to(list(id_batch_paths), tmpdirname) f5files = [os.path.join(dirpath, f) for dirpath, _, files in os.walk(tmpdirname) for f in files if f.endswith('.fast5')] single_to_multi_fast5.create_multi_read_file(f5files, output_bulk) - + else: raise RuntimeError('[ERROR] Raw fast5 batch extension {} not supported.'.format(batch_ext)) From 0afb41944c178100e02f41aaf4c743aaa11e35e8 Mon Sep 17 00:00:00 2001 From: Pay Giesselmann Date: Thu, 4 Apr 2019 12:15:41 +0200 Subject: [PATCH 05/19] cleanup after raw signal batch rewrite --- rules/alignment.smk | 2 +- rules/install.smk | 16 ++ rules/storage.smk | 4 +- rules/sv.smk | 47 ++++- rules/utils/get_file.py | 6 +- rules/utils/storage_batch2fast5.sh | 65 ------- rules/utils/storage_fast5Index.py | 275 ++++++++++++++++------------- 7 files changed, 216 insertions(+), 199 deletions(-) delete mode 100755 rules/utils/storage_batch2fast5.sh diff --git a/rules/alignment.smk b/rules/alignment.smk index f4c4587..2a21141 100755 --- a/rules/alignment.smk +++ b/rules/alignment.smk @@ -46,7 +46,7 @@ def get_batches_aligner(wildcards, config): sequence_workflow=wildcards.sequence_workflow, tag=wildcards.tag, runname=wildcards.runname, - batch=get_batch_ids_raw(wildcards.runname, config), + batch=get_batch_ids_raw(wildcards.runname, config=config, tag=wildcards.tag, checkpoints=checkpoints), reference=wildcards.reference) return r diff --git a/rules/install.smk b/rules/install.smk index 56419a9..f08b07e 100755 --- a/rules/install.smk +++ b/rules/install.smk @@ -424,3 +424,19 @@ rule pinfish: cp polish_clusters/polish_clusters ../../bin/ cp spliced_bam2gff/spliced_bam2gff ../../bin/ """ + +rule strique: + output: + "bin/STRique.py" + shell: + """ + mkdir -p src && cd src + if [ ! 
-d STRique ]; then + git clone --recursive https://github.com/giesselmann/STRique --branch v0.2.0 && cd STRique + else + cd STRique && git fetch --all --tags --prune && git checkout v0.2.0 + fi + {config[python]} -m pip install -r requirements.txt --upgrade + {config[python]} setup.py install + cp scripts/STRique.py ../../bin/ + """ \ No newline at end of file diff --git a/rules/storage.smk b/rules/storage.smk index 7a18309..57832c1 100755 --- a/rules/storage.smk +++ b/rules/storage.smk @@ -47,7 +47,7 @@ def get_batches_indexing(wildcards): # extract read ID from individual fast5 files rule storage_index_batch: input: - unpack(lambda wildcards : get_signal_batch(wildcards, config)), + batch = lambda wildcards : get_signal_batch(wildcards, config) output: temp("{data_raw}/{{runname}}/reads/{{batch}}.fofn".format(data_raw = config["storage_data_raw"])) shadow: "minimal" @@ -57,7 +57,7 @@ rule storage_index_batch: time_min = 15 shell: """ - {config[bin][python]} {config[sbin][storage_fast5Index.py]} index {input.signal} --out_prefix reads --tmp_prefix $(pwd) > {output} + {config[bin][python]} {config[sbin][storage_fast5Index.py]} index {input.batch} --out_prefix reads --tmp_prefix $(pwd) > {output} """ # merge batch indices diff --git a/rules/sv.smk b/rules/sv.smk index 13a61b9..a1cd5bb 100755 --- a/rules/sv.smk +++ b/rules/sv.smk @@ -32,7 +32,25 @@ # Written by Pay Giesselmann # --------------------------------------------------------------------------------- # imports -localrules: sv_compress +localrules: sv_compress, strique_merge_batches, strique_merge_tag + +# get batches +def get_batches_strique(wildcards): + return expand("sv/strique/{aligner}/{sequence_workflow}/batches/{tag}/{runname}/{batch}.{reference}.tsv", + aligner=wildcards.aligner, + sequence_workflow=wildcards.sequence_workflow, + tag=wildcards.tag, + runname=wildcards.runname, + batch=get_batch_ids_raw(wildcards.runname, config=config, tag=wildcards.tag, checkpoints=checkpoints), + reference=wildcards.reference) + +def get_batches_strique2(wildcards): + return expand("sv/strique/{aligner}/{sequence_workflow}/batches/{tag}/{runname}.{reference}.tsv", + aligner=wildcards.aligner, + sequence_workflow=wildcards.sequence_workflow, + tag=wildcards.tag, + runname=[runname for runname in config['runnames']], + reference=wildcards.reference) # sniffles sv detection rule sniffles: @@ -68,10 +86,11 @@ rule sv_compress: rule strique: input: - signal = "", - bam = "" + index = lambda wildcards : os.path.join(config['storage_data_raw'], wildcards.runname, 'reads.fofn'), + bam = lambda wildcards : get_alignment_batch(wildcards, config), + config = lambda wildcards : config['sv_STRique_config'] output: - "sv/strique/{aligner, [^.\/]*}/{sequence_workflow, [^.\/]*}/{tag, [^.\/]*}.{reference, [^.\/]*}.tsv" + "sv/strique/{aligner, [^\/]*}/{sequence_workflow, ((?!batches).)*}/batches/{tag, [^\/]*}/{runname, [^\/]*}/{batch, [^.\/]*}.{reference}.tsv" threads: 1 singularity: "docker://nanopype/sv:{tag}".format(tag=config['version']['tag']) @@ -79,3 +98,23 @@ rule strique: """ """ + +rule strique_merge_batches: + input: + get_batches_strique + output: + "sv/strique/{aligner, [^\/]*}/{sequence_workflow}/batches/{tag, [^\/]*}/{runname, [^.\/]*}.{reference}.tsv" + shell: + """ + + """ + +rule strique_merge_tag: + input: + get_batches_strique2 + output: + "sv/strique/{aligner, [^\/]*}/{sequence_workflow, ((?!batches).)*}/{tag, [^\/]*}.{reference}.tsv" + shell: + """ + + """ \ No newline at end of file diff --git a/rules/utils/get_file.py 
b/rules/utils/get_file.py index 96011bc..0c20bef 100755 --- a/rules/utils/get_file.py +++ b/rules/utils/get_file.py @@ -67,7 +67,8 @@ def get_signal_batch(wildcards, config): raw_dir = config['storage_data_raw'] if hasattr(wildcards, 'tag'): tag_barcode = get_tag_barcode(wildcards.tag, wildcards.runname, config) - return os.path.join('demux/deepbinner/barcodes', wildcards.runname, tag_barcode, wildcards.batch + '.txt') + if tag_barcode: + return os.path.join('demux/deepbinner/barcodes', wildcards.runname, tag_barcode, wildcards.batch + '.txt') batch_file = os.path.join(raw_dir, wildcards.runname, 'reads', wildcards.batch) if os.path.isfile(batch_file + '.tar'): return batch_file + '.tar' @@ -99,6 +100,5 @@ def get_alignment_batch(wildcards, config): tag=wildcards.tag, runname=wildcards.runname, batch=wildcards.batch, - reference=wildcards.reference - ) + reference=wildcards.reference) return bam diff --git a/rules/utils/storage_batch2fast5.sh b/rules/utils/storage_batch2fast5.sh deleted file mode 100755 index 543e993..0000000 --- a/rules/utils/storage_batch2fast5.sh +++ /dev/null @@ -1,65 +0,0 @@ -# \HEADER\------------------------------------------------------------------------- -# -# CONTENTS : extract single fast5 from bulk or index file -# -# DESCRIPTION : none -# -# RESTRICTIONS : none -# -# REQUIRES : none -# -# --------------------------------------------------------------------------------- -# Copyright (c) 2018-2019, Pay Giesselmann, Max Planck Institute for Molecular Genetics -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -# Written by Pay Giesselmann -# --------------------------------------------------------------------------------- - -# cmd line arguments -batch_file=$1 -target_dir=$2 -pipeline_dir=$3 -python_bin=$4 - - -# batch type from file extension -batch_file_name=$(basename -- "$batch_file") -batch_file_ext="${batch_file_name##*.}" - - -# extract based on batch type -case $batch_file_ext in - "tar") - # batch of single read fast5 in .tar format - tar -C $target_dir -xf $batch_file - ;; - "fast5") - # batch of single reads in bulk file - $python_bin $pipeline_dir/submodules/ont_fast5_api/ont_fast5_api/conversion_tools/multi_to_single_fast5.py -i $batch_file -s $target_dir - ;; - "txt") - # batch of single read in file of IDs, requires indexed run - # TODO - ;; - *) - echo "Unrecognized batch type." 
- exit -1 - ;; -esac diff --git a/rules/utils/storage_fast5Index.py b/rules/utils/storage_fast5Index.py index 0113fd8..68e8309 100755 --- a/rules/utils/storage_fast5Index.py +++ b/rules/utils/storage_fast5Index.py @@ -36,43 +36,34 @@ import shutil import tempfile, tarfile import h5py +from ont_fast5_api.fast5_interface import is_multi_read from ont_fast5_api.fast5_file import Fast5File from ont_fast5_api.multi_fast5 import MultiFast5File from ont_fast5_api.conversion_tools import multi_to_single_fast5, single_to_multi_fast5 +class fast5_Index(): + def __init__(self, index_file=None, tmp_prefix=None): + self.index_file = index_file + self.tmp_prefix = tmp_prefix + if index_file and not os.path.exists(index_file): + raise RuntimeError("[Error] Raw fast5 index file {} not found.".format(index_file)) + elif index_file: + with open(index_file, 'r') as fp: + self.index_dict = {id:path for path,id in [line.split('\t') for line in fp.read().split('\n') if line]} + else: + self.index_dict = None + def __chunked__(l, n): + for i in range(0, len(l), n): + yield l[i:i + n] -class fast5_index(): - def __init__(self): - parser = argparse.ArgumentParser( - description='Nanopore raw fast5 index', - usage='''storage_fast5Index.py [] -Available commands are: - index Index batch(es) of bulk-fast5 or tar archived single fast5 - extract Extract single reads from indexed sequencing run -''') - parser.add_argument('command', help='Subcommand to run') - args = parser.parse_args(sys.argv[1:2]) - if not hasattr(self, args.command): - print('Unrecognized command', file=sys.stderr) - parser.print_help(file=sys.stderr) - exit(1) - getattr(self, args.command)(sys.argv[2:]) - - def __is_multi_read__(self, f5_file): - with h5py.File(f5_file, 'r') as f5: - if "/Raw" in f5: - return False - else: - return True - - def __get_ID_single__(self, f5_file): + def __get_ID_single__(f5_file): with h5py.File(f5_file, 'r') as f5: s = f5["/Raw/"].visit(lambda name: name if 'Signal' in name else None) return str(f5["/Raw/" + s.rpartition('/')[0]].attrs['read_id'], 'utf-8') - def __get_ID_multi__(self, f5_file): + def __get_ID_multi__(f5_file): with h5py.File(f5_file, 'r') as f5: reads = [] for group in f5: @@ -81,138 +72,174 @@ def __get_ID_multi__(self, f5_file): reads.append((group, ID)) return reads - def index(self, argv): - parser = argparse.ArgumentParser(description="Fast5 Index") - parser.add_argument("input", help="Input batch or directory of batches") - parser.add_argument("--recursive", action='store_true', help="Recursively scan input") - parser.add_argument("--out_prefix", default="", help="Prefix for file paths in output") - parser.add_argument("--tmp_prefix", default=None, help="Prefix for temporary data") - args = parser.parse_args(argv) + def __copy_reads_to__(self, read_ids, output): + if not os.path.exists(output): + os.makedirs(output) + batch_id_files = [tuple( [id] + re.split('(\.fast5|\.tar)\/', self.index_dict[id]) ) for id in read_ids if id in self.index_dict] + batch_id_files.sort(key=lambda x : (x[1], x[2]) if len(x) > 2 else x[1]) + for _, id_batch_paths in itertools.groupby(batch_id_files, key=lambda x : (x[1], x[2]) if len(x) > 2 else x[1]): + fofns = list(id_batch_paths) + if len(fofns) == 1 and len(fofns[0]) == 2: + # single read fast5 + id, src_file = fofns[0] + shutil.copy(os.path.join(os.path.dirname(args.index), src_file), output) + else: + _, batch_file, batch_ext, _ = fofns[0] + tarFiles = set([x[3] for x in fofns]) + # single read fast5 batch in tar archive + if batch_ext == '.tar': + tar_file = 
os.path.join(os.path.dirname(self.index_file), batch_file + batch_ext) + with tarfile.open(tar_file) as fp_tar: + tar_members = fp_tar.getmembers() + for tar_member in tar_members: + if any(s in tar_member.name for s in tarFiles): + try: + tar_member.name = os.path.basename(tar_member.name) + fp_tar.extract(tar_member, path=output) + except: + RuntimeError('[ERROR] Could not extract {id} from {batch}.'.format(id=tar_member.name, batch=tar_file)) + elif batch_ext == '.fast5': + f5_file = os.path.join(os.path.dirname(self.index_file), batch_file + batch_ext) + with MultiFast5File(f5_file, 'r') as multi_f5: + target_ids = set([x[0] for x in fofns]) + for read_id in multi_f5.get_read_ids(): + if read_id in target_ids: + try: + read = multi_f5.get_read(read_id) + output_file = os.path.join(output, "{}.fast5".format(read_id)) + multi_to_single_fast5.create_single_f5(output_file, read) + except: + RuntimeError('[ERROR] Could not extract {id} from {batch}.'.format(id=read_id, batch=f5_file)) + else: + pass + + def index(input, recursive=False, output_prefix="", tmp_prefix=None): + print(tmp_prefix, file=sys.stderr) + if tmp_prefix and not os.path.exists(tmp_prefix): + os.makedirs(tmp_prefix) input_files = [] # scan input - if os.path.isfile(args.input): - input_files.append(args.input) + if os.path.isfile(input): + input_files.append(input) else: - if args.recursive: - input_files.extend([os.path.join(dirpath, f) for dirpath, _, files in os.walk(args.input) for f in files if f.endswith('.fast5') or f.endswith('.tar')]) + if recursive: + input_files.extend([os.path.join(dirpath, f) for dirpath, _, files in os.walk(input) for f in files if f.endswith('.fast5') or f.endswith('.tar')]) else: - input_files.extend(glob.glob(os.path.join(args.input, '*.fast5'))) - input_files.extend(glob.glob(os.path.join(args.input, '*.tar'))) + input_files.extend(glob.glob(os.path.join(input, '*.fast5'))) + input_files.extend(glob.glob(os.path.join(input, '*.tar'))) # index all provided files for input_file in input_files: - input_relative = os.path.normpath(os.path.join(args.out_prefix, - os.path.dirname(os.path.relpath(input_file, start=args.input)), + input_relative = os.path.normpath(os.path.join(output_prefix, + os.path.dirname(os.path.relpath(input_file, start=input)), os.path.basename(input_file))) # extract reads from packed tar archive and retrieve read IDs if input_file.endswith('.tar'): - with tempfile.TemporaryDirectory(prefix=args.tmp_prefix) as tmpdirname, tarfile.open(input_file) as fp_tar: + with tempfile.TemporaryDirectory(prefix=tmp_prefix) as tmpdirname, tarfile.open(input_file) as fp_tar: fp_tar.extractall(path=tmpdirname) f5files = [os.path.join(dirpath, f) for dirpath, _, files in os.walk(tmpdirname) for f in files if f.endswith('.fast5')] for f5file in f5files: try: - ID = self.__get_ID_single__(f5file) + ID = fast5_Index.__get_ID_single__(f5file) print("\t".join([os.path.normpath(os.path.join(input_relative, os.path.relpath(f5file, start=tmpdirname))), ID])) except: - print("Error opening {f5}, skip file for indexing".format(f5=f5file)) + print("[ERROR] Failed to open {f5}, skip file for indexing".format(f5=f5file), file=sys.stderr) # bulk and single read fast5 else: - if self.__is_multi_read__(input_file): - reads = self.__get_ID_multi__(input_file) - print("\n".join(['\t'.join((os.path.join(f,group), ID)) for f, (group, ID) in zip([input_relative] * len(reads), reads)])) + if is_multi_read(input_file): + reads = fast5_Index.__get_ID_multi__(input_file) + for f, (group, ID) in 
zip([input_relative] * len(reads), reads): + yield '\t'.join((os.path.join(f,group), ID)) else: - print('\t'.join([input_relative, self.__get_ID_single__(input_relative)])) - - def extract(self, argv): - parser = argparse.ArgumentParser(description="Fast5 extraction") - parser.add_argument("batch", help="Input batch") - parser.add_argument("output", help="Output directory") - parser.add_argument("--index", default='', help="Read index") - parser.add_argument("--output_format", default='single', choices=['single', 'bulk', 'lazy'], help="Output as single, bulk or with minimal conversion overhead") - parser.add_argument("--tmp_prefix", default=None, help="Prefix for temporary data") - args = parser.parse_args(argv) - if not os.path.exists(args.output): - os.makedirs(os.path.dirname(output_file)) - batch_name, batch_ext = os.path.splitext(args.batch) + try: + ID = fast5_Index.__get_ID_single__(input_relative) + except: + print("[ERROR] Failed to open {f5}, skip file for indexing".format(f5=input_file), file=sys.stderr) + yield '\t'.join([input_relative, ID]) + + def extract(self, input, output, format='single'): + if not os.path.exists(output): + os.makedirs(output) + batch_name, batch_ext = os.path.splitext(input) # packed single reads in tar archive if batch_ext == '.tar': - if args.output_format in ['single', 'lazy']: - with tarfile.open(args.batch) as fp_tar: - fp_tar.extractall(path=args.output) + if format in ['single', 'lazy']: + with tarfile.open(input) as fp_tar: + fp_tar.extractall(path=output) else: - output_bulk = os.path.join(args.output, os.path.basename(batch_name) + '.fast5') - with tempfile.TemporaryDirectory(prefix=args.tmp_prefix) as tmpdirname, tarfile.open(args.batch) as fp_tar: + with tempfile.TemporaryDirectory(prefix=self.tmp_prefix) as tmpdirname, tarfile.open(input) as fp_tar: fp_tar.extractall(path=tmpdirname) f5files = [os.path.join(dirpath, f) for dirpath, _, files in os.walk(tmpdirname) for f in files if f.endswith('.fast5')] - single_to_multi_fast5.create_multi_read_file(f5files, output_bulk) + output_bulk_file = os.path.join(output, os.path.basename(batch_name) + '.fast5') + single_to_multi_fast5.create_multi_read_file(f5files, output_bulk_file) # bulk fast5 - elif batch_ext == '.fast': - if args.output_format in ['bulk', 'lazy']: - shutil.copy(args.batch, args.output) + elif batch_ext == '.fast5': + if format in ['bulk', 'lazy']: + shutil.copy(input, output) else: - multi_to_single_fast5.convert_multi_to_single(args.batch, args.output, '') + multi_to_single_fast5.convert_multi_to_single(input, output, '') # read IDs to be extracted elif batch_ext == '.txt': # load index and requested IDs - if not os.path.exists(args.index): - raise RuntimeError("[Error] Raw fast5 index file {} not found.".format(args.index)) + if not self.index_dict : + raise RuntimeError("[Error] Extraction of reads from IDs without index file provided.") + with open(input, 'r') as fp: + batch_ids = [id.strip() for id in fp.read().split('\n') if id] + if format in ['single', 'lazy']: + self.__copy_reads_to__(batch_ids, output) else: - with open(args.batch, 'r') as fp: - batch_ids = [id.strip() for id in fp.read().split('\n') if id] - with open(args.index, 'r') as fp: - run_index = {id:path for path,id in [line.split('\t') for line in fp.read().split('\n') if line]} - # sort in affected batches - batch_id_files = [tuple( [id] + re.split('(\.fast5|\.tar)\/', run_index[id]) ) for id in batch_ids if id in run_index] - batch_id_files.sort(key=lambda x : (x[1], x[2]) if len(x) > 2 else x[1]) - def 
copy_reads_to(fofns, output): - if len(fofns) == 1 and len(fofns[0]) == 2: - # single read fast5 - id, src_file = fofns[0] - shutil.copy(os.path.join(os.path.dirname(args.index), src_file), output) - else: - _, batch_file, batch_ext, _ = fofns[0] - tarFiles = set([x[3] for x in fofns]) - # single read fast5 batch in tar archive - if batch_ext == '.tar': - tar_file = os.path.join(os.path.dirname(args.index), batch_file + batch_ext) - with tarfile.open(tar_file) as fp_tar: - tar_members = fp_tar.getmembers() - for tar_member in tar_members: - if any(s in tar_member.name for s in tarFiles): - try: - tar_member.name = os.path.basename(tar_member.name) - fp_tar.extract(tar_member, path=output) - except: - RuntimeError('[ERROR] Could not extract {id} from {batch}.'.format(id=tar_member.name, batch=tar_file)) - elif batch_ext == '.fast5': - f5_file = os.path.join(os.path.dirname(args.index), batch_file + batch_ext) - with MultiFast5File(f5_file, 'r') as multi_f5: - target_ids = set([x[0] for x in fofns]) - for read_id in multi_f5.get_read_ids(): - if read_id in target_ids: - try: - read = multi_f5.get_read(read_id) - output_file = os.path.join(output, "{}.fast5".format(read_id)) - multi_to_single_fast5.create_single_f5(output_file, read) - except: - RuntimeError('[ERROR] Could not extract {id} from {batch}.'.format(id=read_id, batch=f5_file)) - else: - pass - for batch_file, id_batch_paths in itertools.groupby(batch_id_files, key=lambda x : (x[1], x[2]) if len(x) > 2 else x[1]): - if args.output_format in ['single', 'lazy']: - copy_reads_to(list(id_batch_paths), args.output) - else: - output_bulk = os.path.join(args.output, os.path.basename(batch_name) + '.fast5') - with tempfile.TemporaryDirectory(prefix=args.tmp_prefix) as tmpdirname: - copy_reads_to(list(id_batch_paths), tmpdirname) - f5files = [os.path.join(dirpath, f) for dirpath, _, files in os.walk(tmpdirname) for f in files if f.endswith('.fast5')] - single_to_multi_fast5.create_multi_read_file(f5files, output_bulk) - + with tempfile.TemporaryDirectory(prefix=self.tmp_prefix) as tmpdirname: + self.__copy_reads_to__(batch_ids, tmpdirname) + f5files = [os.path.join(dirpath, f) for dirpath, _, files in os.walk(tmpdirname) for f in files if f.endswith('.fast5')] + output_bulk_file = os.path.join(output, os.path.basename(batch_name) + '.fast5') + single_to_multi_fast5.create_multi_read_file(f5files, output_bulk_file) else: raise RuntimeError('[ERROR] Raw fast5 batch extension {} not supported.'.format(batch_ext)) + def raw(self, ID): + pass + + +class main(): + def __init__(self): + parser = argparse.ArgumentParser( + description='Nanopore raw fast5 index', + usage='''storage_fast5Index.py [] +Available commands are: + index Index batch(es) of bulk-fast5 or tar archived single fast5 + extract Extract single reads from indexed sequencing run +''') + parser.add_argument('command', help='Subcommand to run') + args = parser.parse_args(sys.argv[1:2]) + if not hasattr(self, args.command): + print('Unrecognized command', file=sys.stderr) + parser.print_help(file=sys.stderr) + exit(1) + getattr(self, args.command)(sys.argv[2:]) + + def index(self, argv): + parser = argparse.ArgumentParser(description="Fast5 Index") + parser.add_argument("input", help="Input batch or directory of batches") + parser.add_argument("--recursive", action='store_true', help="Recursively scan input") + parser.add_argument("--out_prefix", default="", help="Prefix for file paths in output") + parser.add_argument("--tmp_prefix", default=None, help="Prefix for temporary data") + 
args = parser.parse_args(argv) + for record in fast5_Index.index(args.input, recursive=args.recursive, output_prefix=args.out_prefix, tmp_prefix=args.tmp_prefix): + print(record) + + def extract(self, argv): + parser = argparse.ArgumentParser(description="Fast5 extraction") + parser.add_argument("batch", help="Input batch") + parser.add_argument("output", help="Output directory") + parser.add_argument("--index", default=None, help="Read index") + parser.add_argument("--output_format", default='single', choices=['single', 'bulk', 'lazy'], help="Output as single, bulk or with minimal conversion overhead") + parser.add_argument("--tmp_prefix", default=None, help="Prefix for temporary data") + args = parser.parse_args(argv) + f5_index = fast5_Index(args.index, tmp_prefix=args.tmp_prefix) + f5_index.extract(args.batch, args.output, format=args.output_format) + if __name__ == "__main__": - fast5_index() + main() From ca0c44bcb5ffa70d1f6efa0568434de8ef376e3a Mon Sep 17 00:00:00 2001 From: Pay Giesselmann Date: Fri, 5 Apr 2019 09:01:39 +0200 Subject: [PATCH 06/19] include STRique repeat counting --- env.yaml | 1 + nanopype.yaml | 3 ++- rules/sv.smk | 34 +++++++++++++++++++++--------- singularity/alignment/Dockerfile | 2 +- singularity/basecalling/Dockerfile | 2 +- singularity/demux/Dockerfile | 2 +- singularity/methylation/Dockerfile | 2 +- singularity/sv/Dockerfile | 5 ++++- singularity/transcript/Dockerfile | 2 +- 9 files changed, 36 insertions(+), 17 deletions(-) mode change 100644 => 100755 singularity/alignment/Dockerfile mode change 100644 => 100755 singularity/demux/Dockerfile mode change 100644 => 100755 singularity/methylation/Dockerfile mode change 100644 => 100755 singularity/sv/Dockerfile mode change 100644 => 100755 singularity/transcript/Dockerfile diff --git a/env.yaml b/env.yaml index 2112f13..2047148 100755 --- a/env.yaml +++ b/env.yaml @@ -22,3 +22,4 @@ bin: cluster_gff: cluster_gff collapse_partials: collapse_partials polish_clusters: polish_clusters + strique: STRique.py diff --git a/nanopype.yaml b/nanopype.yaml index 647409d..ae29742 100755 --- a/nanopype.yaml +++ b/nanopype.yaml @@ -72,7 +72,8 @@ deepbinner_models: # structural variation # command line flags directly passed to the sniffles execution sv_sniffles_flags: '-s 10 -l 30 -r 2000 --genotype' - +sv_STRique_config: 'STRique.tsv' +sv_STRique_model: 'template_median68pA6mer.model' # transcriptome transcript_use_pychopper: True diff --git a/rules/sv.smk b/rules/sv.smk index a1cd5bb..b39104b 100755 --- a/rules/sv.smk +++ b/rules/sv.smk @@ -88,15 +88,19 @@ rule strique: input: index = lambda wildcards : os.path.join(config['storage_data_raw'], wildcards.runname, 'reads.fofn'), bam = lambda wildcards : get_alignment_batch(wildcards, config), + model = lambda wildcards : config['sv_STRique_model'], config = lambda wildcards : config['sv_STRique_config'] output: "sv/strique/{aligner, [^\/]*}/{sequence_workflow, ((?!batches).)*}/batches/{tag, [^\/]*}/{runname, [^\/]*}/{batch, [^.\/]*}.{reference}.tsv" - threads: 1 + threads: config['threads_sv'] + resources: + mem_mb = lambda wildcards, threads, attempt: int((1.0 + (0.1 * (attempt - 1))) * (4000 + 1500 * threads)), + time_min = lambda wildcards, threads, attempt: int((3840 / threads) * attempt) # 240 min / 16 threads singularity: "docker://nanopype/sv:{tag}".format(tag=config['version']['tag']) shell: """ - + {config[bin_singularity][samtools]} view -F 2308 {input.bam} | {config[bin_singularity][python]} {config[bin_singularity][strique]} count {input.index} {input.model} 
{input.config} --t {threads} > {output} """ rule strique_merge_batches: @@ -104,17 +108,27 @@ rule strique_merge_batches: get_batches_strique output: "sv/strique/{aligner, [^\/]*}/{sequence_workflow}/batches/{tag, [^\/]*}/{runname, [^.\/]*}.{reference}.tsv" - shell: - """ - - """ + run: + with open(output[0], 'w') as fp_out: + for i, f in enumerate(input): + with open(f, 'r') as fp_in: + if i == 0: + print(fp_in.read(), file=fp_out) + else: + next(fp_in) + print(fp_in.read(), file=fp_out) rule strique_merge_tag: input: get_batches_strique2 output: "sv/strique/{aligner, [^\/]*}/{sequence_workflow, ((?!batches).)*}/{tag, [^\/]*}.{reference}.tsv" - shell: - """ - - """ \ No newline at end of file + run: + with open(output[0], 'w') as fp_out: + for i, f in enumerate(input): + with open(f, 'r') as fp_in: + if i == 0: + print(fp_in.read(), file=fp_out) + else: + next(fp_in) + print(fp_in.read(), file=fp_out) diff --git a/singularity/alignment/Dockerfile b/singularity/alignment/Dockerfile old mode 100644 new mode 100755 index 096045d..c37c96c --- a/singularity/alignment/Dockerfile +++ b/singularity/alignment/Dockerfile @@ -55,7 +55,7 @@ RUN python3 get-pip.py RUN mkdir -p /app COPY . /app/ WORKDIR /app -RUN pip3 install . --upgrade +RUN python3 -m pip install -r requirements.txt ## run setup rules RUN mkdir /build diff --git a/singularity/basecalling/Dockerfile b/singularity/basecalling/Dockerfile index d70d3eb..886b123 100755 --- a/singularity/basecalling/Dockerfile +++ b/singularity/basecalling/Dockerfile @@ -55,7 +55,7 @@ RUN python3 get-pip.py RUN mkdir -p /app COPY . /app/ WORKDIR /app -RUN pip3 install . --upgrade +RUN python3 -m pip install -r requirements.txt ## run setup rules RUN mkdir /build diff --git a/singularity/demux/Dockerfile b/singularity/demux/Dockerfile old mode 100644 new mode 100755 index e862a26..e1ea725 --- a/singularity/demux/Dockerfile +++ b/singularity/demux/Dockerfile @@ -58,7 +58,7 @@ WORKDIR /app RUN wget https://bootstrap.pypa.io/get-pip.py RUN python3 get-pip.py -RUN pip3 install . --upgrade +RUN python3 -m pip install -r requirements.txt RUN snakemake --snakefile rules/install.smk --directory /usr deepbinner diff --git a/singularity/methylation/Dockerfile b/singularity/methylation/Dockerfile old mode 100644 new mode 100755 index 4e21121..fea0d2a --- a/singularity/methylation/Dockerfile +++ b/singularity/methylation/Dockerfile @@ -56,7 +56,7 @@ RUN python3 get-pip.py RUN mkdir -p /app COPY . /app/ WORKDIR /app -RUN pip3 install . --upgrade +RUN python3 -m pip install -r requirements.txt ## run setup rules RUN mkdir /build diff --git a/singularity/sv/Dockerfile b/singularity/sv/Dockerfile old mode 100644 new mode 100755 index ccc90c6..ba29636 --- a/singularity/sv/Dockerfile +++ b/singularity/sv/Dockerfile @@ -55,7 +55,7 @@ RUN python3 get-pip.py RUN mkdir -p /app COPY . /app/ WORKDIR /app -RUN pip3 install . --upgrade +RUN python3 -m pip install -r requirements.txt ## run setup rules RUN mkdir /build @@ -86,6 +86,9 @@ RUN wget https://bootstrap.pypa.io/get-pip.py RUN python3 get-pip.py RUN python3 -m pip install -r requirements.txt +RUN snakemake --snakefile rules/install.smk --directory /usr strique +RUN rm -rf /usr/src/STRique + COPY --from=build_stage /build/bin/* /usr/bin/ WORKDIR / diff --git a/singularity/transcript/Dockerfile b/singularity/transcript/Dockerfile old mode 100644 new mode 100755 index 45841ec..86694b7 --- a/singularity/transcript/Dockerfile +++ b/singularity/transcript/Dockerfile @@ -57,7 +57,7 @@ RUN python3 get-pip.py RUN mkdir -p /app COPY . 
/app/ WORKDIR /app -RUN pip3 install . --upgrade +RUN python3 -m pip install -r requirements.txt ## run setup rules RUN mkdir /build From 85255da4b9ea8665469e11123a76de45c20cc05d Mon Sep 17 00:00:00 2001 From: Pay Giesselmann Date: Thu, 25 Apr 2019 10:32:00 +0200 Subject: [PATCH 07/19] fix flappie raw fast5 format --- rules/basecalling.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rules/basecalling.smk b/rules/basecalling.smk index 4572118..b382633 100755 --- a/rules/basecalling.smk +++ b/rules/basecalling.smk @@ -147,7 +147,7 @@ rule flappie: """ export OPENBLAS_NUM_THREADS=1 mkdir -p raw - {config[bin_singularity][python]} {config[sbin_singularity][storage_fast5Index.py]} extract {input.batch} raw/ --index {params.index} --output_format lazy + {config[bin_singularity][python]} {config[sbin_singularity][storage_fast5Index.py]} extract {input.batch} raw/ --index {params.index} --output_format single find raw/ -regextype posix-extended -regex '^.*fast5' -type f -exec du -h {{}} + | sort -r -h | cut -f2 > raw.fofn split -e -n r/{threads} raw.fofn raw.fofn.part. ls raw.fofn.part.* | xargs -n 1 -P {threads} -I {{}} $SHELL -c 'cat {{}} | shuf | xargs -n 1 {config[bin_singularity][flappie]} --model {config[basecalling_flappie_model]} {config[basecalling_flappie_flags]} > raw/{{}}.fastq' From cbc292ac741d8d2ace7b82f537b409aa34c77cb2 Mon Sep 17 00:00:00 2001 From: Pay Giesselmann Date: Thu, 25 Apr 2019 10:38:46 +0200 Subject: [PATCH 08/19] strique methylation model --- rules/sv.smk | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/rules/sv.smk b/rules/sv.smk index b39104b..94408d7 100755 --- a/rules/sv.smk +++ b/rules/sv.smk @@ -88,19 +88,24 @@ rule strique: input: index = lambda wildcards : os.path.join(config['storage_data_raw'], wildcards.runname, 'reads.fofn'), bam = lambda wildcards : get_alignment_batch(wildcards, config), - model = lambda wildcards : config['sv_STRique_model'], - config = lambda wildcards : config['sv_STRique_config'] + config = lambda wildcards : config['sv_STRique_config'], + models = lambda wildcards : [config['sv_STRique_model']] + [config['sv_STRique_mod_model']] if os.path.exists(config['sv_STRique_mod_model']) else [] output: "sv/strique/{aligner, [^\/]*}/{sequence_workflow, ((?!batches).)*}/batches/{tag, [^\/]*}/{runname, [^\/]*}/{batch, [^.\/]*}.{reference}.tsv" + shadow: "minimal" threads: config['threads_sv'] + params: + model = config['sv_STRique_model'], + mod_model = '--mod_model {}'.format(config['sv_STRique_mod_model']) if 'sv_STRique_mod_model' in config else '' resources: - mem_mb = lambda wildcards, threads, attempt: int((1.0 + (0.1 * (attempt - 1))) * (4000 + 1500 * threads)), + mem_mb = lambda wildcards, threads, attempt: int((1.0 + (0.1 * (attempt - 1))) * (16000 + 2000 * threads)), time_min = lambda wildcards, threads, attempt: int((3840 / threads) * attempt) # 240 min / 16 threads singularity: "docker://nanopype/sv:{tag}".format(tag=config['version']['tag']) shell: """ - {config[bin_singularity][samtools]} view -F 2308 {input.bam} | {config[bin_singularity][python]} {config[bin_singularity][strique]} count {input.index} {input.model} {input.config} --t {threads} > {output} + export TMPDIR=$(pwd) + {config[bin_singularity][samtools]} view -F 2308 {input.bam} | {config[bin_singularity][python]} {config[bin_singularity][strique]} count {input.index} {params.model} {input.config} {params.mod_model} --t {threads} --log_level debug > {output} """ rule strique_merge_batches: @@ -113,10 +118,10 
@@ rule strique_merge_batches: for i, f in enumerate(input): with open(f, 'r') as fp_in: if i == 0: - print(fp_in.read(), file=fp_out) + print(fp_in.read(), file=fp_out, end='') else: next(fp_in) - print(fp_in.read(), file=fp_out) + print(fp_in.read(), file=fp_out, end='') rule strique_merge_tag: input: @@ -128,7 +133,7 @@ rule strique_merge_tag: for i, f in enumerate(input): with open(f, 'r') as fp_in: if i == 0: - print(fp_in.read(), file=fp_out) + print(fp_in.read(), file=fp_out, end='') else: next(fp_in) - print(fp_in.read(), file=fp_out) + print(fp_in.read(), file=fp_out, end='') From 138d3da416736e5c86844750615b668e5f825cc0 Mon Sep 17 00:00:00 2001 From: Pay Giesselmann Date: Thu, 25 Apr 2019 10:39:34 +0200 Subject: [PATCH 09/19] use generic demux from config --- rules/utils/get_file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rules/utils/get_file.py b/rules/utils/get_file.py index 0c20bef..91d5e71 100755 --- a/rules/utils/get_file.py +++ b/rules/utils/get_file.py @@ -68,7 +68,7 @@ def get_signal_batch(wildcards, config): if hasattr(wildcards, 'tag'): tag_barcode = get_tag_barcode(wildcards.tag, wildcards.runname, config) if tag_barcode: - return os.path.join('demux/deepbinner/barcodes', wildcards.runname, tag_barcode, wildcards.batch + '.txt') + return os.path.join('demux', config['demux_default'], 'barcodes', wildcards.runname, tag_barcode, wildcards.batch + '.txt') batch_file = os.path.join(raw_dir, wildcards.runname, 'reads', wildcards.batch) if os.path.isfile(batch_file + '.tar'): return batch_file + '.tar' From 8ddeeb981f0862c52ac4d1fbce32fba5dff8258a Mon Sep 17 00:00:00 2001 From: Pay Giesselmann Date: Thu, 25 Apr 2019 10:40:13 +0200 Subject: [PATCH 10/19] batch output for bulk extraction --- rules/utils/storage_fast5Index.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/rules/utils/storage_fast5Index.py b/rules/utils/storage_fast5Index.py index 68e8309..ca7240a 100755 --- a/rules/utils/storage_fast5Index.py +++ b/rules/utils/storage_fast5Index.py @@ -114,7 +114,6 @@ def __copy_reads_to__(self, read_ids, output): pass def index(input, recursive=False, output_prefix="", tmp_prefix=None): - print(tmp_prefix, file=sys.stderr) if tmp_prefix and not os.path.exists(tmp_prefix): os.makedirs(tmp_prefix) input_files = [] @@ -156,7 +155,7 @@ def index(input, recursive=False, output_prefix="", tmp_prefix=None): except: print("[ERROR] Failed to open {f5}, skip file for indexing".format(f5=input_file), file=sys.stderr) yield '\t'.join([input_relative, ID]) - + def extract(self, input, output, format='single'): if not os.path.exists(output): os.makedirs(output) @@ -191,8 +190,13 @@ def extract(self, input, output, format='single'): with tempfile.TemporaryDirectory(prefix=self.tmp_prefix) as tmpdirname: self.__copy_reads_to__(batch_ids, tmpdirname) f5files = [os.path.join(dirpath, f) for dirpath, _, files in os.walk(tmpdirname) for f in files if f.endswith('.fast5')] - output_bulk_file = os.path.join(output, os.path.basename(batch_name) + '.fast5') - single_to_multi_fast5.create_multi_read_file(f5files, output_bulk_file) + if len(f5files) > 4000: + for i, f5_batch in enumerate(fast5_Index.__chunked__(f5files, 4000)): + output_bulk_file = os.path.join(output, os.path.basename(batch_name) + '_{i}.fast5'.format(i=i)) + single_to_multi_fast5.create_multi_read_file(f5_batch, output_bulk_file) + else: + output_bulk_file = os.path.join(output, os.path.basename(batch_name) + '.fast5') + single_to_multi_fast5.create_multi_read_file(f5files, 
output_bulk_file) else: raise RuntimeError('[ERROR] Raw fast5 batch extension {} not supported.'.format(batch_ext)) From 4063a4068a9ed49cf6d4bef670c805c17d514059 Mon Sep 17 00:00:00 2001 From: Pay Giesselmann Date: Thu, 25 Apr 2019 10:40:47 +0200 Subject: [PATCH 11/19] allow comments in runnames --- Snakefile | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index 760c9f4..7174406 100755 --- a/Snakefile +++ b/Snakefile @@ -59,6 +59,11 @@ nanopype_tag = get_tag() config['version'] = {'tag': nanopype_tag} +# make raw data directory absolute path +if os.path.exists(config['storage_data_raw']): + config['storage_data_raw'] = os.path.abspath(config['storage_data_raw']) + + # append username to shadow prefix if not present if hasattr(workflow, "shadow_prefix") and workflow.shadow_prefix: shadow_prefix = workflow.shadow_prefix @@ -170,7 +175,7 @@ else: # names for multi-run rules runnames = [] if os.path.isfile('runnames.txt'): - runnames = [line.rstrip('\n') for line in open('runnames.txt')] + runnames = [line.rstrip() for line in open('runnames.txt') if line.rstrip() and not line.startswith('#')] config['runnames'] = runnames @@ -183,6 +188,15 @@ if os.path.isfile('barcodes.yaml'): config['barcodes'] = barcodes +# region of interest tags +roi = {} +if os.path.isfile('roi.yaml'): + with open("roi.yaml", 'r') as fp: + roi_map = yaml.load(fp) + roi = roi_map +config['roi'] = roi + + # include modules include : "rules/storage.smk" include : "rules/basecalling.smk" From 0caa53429cacba33f475e5d9c9a95026db548851 Mon Sep 17 00:00:00 2001 From: Pay Giesselmann Date: Thu, 25 Apr 2019 15:36:06 +0200 Subject: [PATCH 12/19] update guppy, pychopper and STRique --- rules/install.smk | 11 ++++++----- singularity/basecalling/Dockerfile | 6 +++--- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/rules/install.smk b/rules/install.smk index f08b07e..f699309 100755 --- a/rules/install.smk +++ b/rules/install.smk @@ -348,7 +348,8 @@ rule guppy: shell: """ # wget https://mirror.oxfordnanoportal.com/software/analysis/ont-guppy-cpu_2.3.1_linux64.tar.gz && - wget https://mirror.oxfordnanoportal.com/software/analysis/ont-guppy-cpu_2.3.5_linux64.tar.gz && \ + # wget https://mirror.oxfordnanoportal.com/software/analysis/ont-guppy-cpu_2.3.5_linux64.tar.gz && + wget https://mirror.oxfordnanoportal.com/software/analysis/ont-guppy-cpu_2.3.7_linux64.tar.gz && \ tar --skip-old-files -xzf ont-guppy-cpu_2.3.5_linux64.tar.gz -C ./ --strip 1 && \ rm ont-guppy-cpu_2.3.5_linux64.tar.gz """ @@ -360,9 +361,9 @@ rule pychopper: """ mkdir -p src && cd src if [ ! -d pychopper ]; then - git clone https://github.com/nanoporetech/pychopper --branch v0.4.0 && cd pychopper + git clone https://github.com/nanoporetech/pychopper --branch v0.5.0 && cd pychopper else - cd pychopper && git fetch --all --tags --prune && git checkout v0.4.0 + cd pychopper && git fetch --all --tags --prune && git checkout v0.5.0 fi # TODO fix for error 'libparasail.so not found' {config[python]} -m pip install --upgrade incremental @@ -432,9 +433,9 @@ rule strique: """ mkdir -p src && cd src if [ ! 
-d STRique ]; then - git clone --recursive https://github.com/giesselmann/STRique --branch v0.2.0 && cd STRique + git clone --recursive https://github.com/giesselmann/STRique --branch v0.3.0 && cd STRique else - cd STRique && git fetch --all --tags --prune && git checkout v0.2.0 + cd STRique && git fetch --all --tags --prune && git checkout v0.3.0 fi {config[python]} -m pip install -r requirements.txt --upgrade {config[python]} setup.py install diff --git a/singularity/basecalling/Dockerfile b/singularity/basecalling/Dockerfile index 886b123..3fcb401 100755 --- a/singularity/basecalling/Dockerfile +++ b/singularity/basecalling/Dockerfile @@ -76,9 +76,9 @@ apt-get install -y --no-install-recommends wget \ RUN update-ca-certificates ## GUPPY -RUN wget https://mirror.oxfordnanoportal.com/software/analysis/ont-guppy-cpu_2.3.5_linux64.tar.gz && \ - tar --skip-old-files -xzf ont-guppy-cpu_2.3.5_linux64.tar.gz -C /usr/ --strip 1 && \ - rm ont-guppy-cpu_2.3.5_linux64.tar.gz +RUN wget https://mirror.oxfordnanoportal.com/software/analysis/ont-guppy-cpu_2.3.7_linux64.tar.gz && \ + tar --skip-old-files -xzf ont-guppy-cpu_2.3.7_linux64.tar.gz -C /usr/ --strip 1 && \ + rm ont-guppy-cpu_2.3.7_linux64.tar.gz RUN mkdir -p /app COPY . /app/ From bbd64af7b693bc901bc30274b59ff2e112d7be30 Mon Sep 17 00:00:00 2001 From: Pay Giesselmann Date: Thu, 25 Apr 2019 15:38:35 +0200 Subject: [PATCH 13/19] update pip in Docker --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index a20f18b..0607cea 100755 --- a/Dockerfile +++ b/Dockerfile @@ -51,6 +51,7 @@ RUN mkdir -p /src WORKDIR /src RUN wget https://bootstrap.pypa.io/get-pip.py RUN python3 get-pip.py +RUN pip3 install --upgrade pip # copy and configure nanopype RUN mkdir -p /app From 7bb851e23f70dd24299cb20537051787c0612fbe Mon Sep 17 00:00:00 2001 From: Pay Giesselmann Date: Thu, 25 Apr 2019 15:39:45 +0200 Subject: [PATCH 14/19] batch output cleanup rules --- Snakefile | 5 ++- rules/clean.smk | 109 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+), 1 deletion(-) create mode 100755 rules/clean.smk diff --git a/Snakefile b/Snakefile index 7174406..b76c9b6 100755 --- a/Snakefile +++ b/Snakefile @@ -48,7 +48,7 @@ def get_tag(): try: version = subprocess.check_output(cmd.split(), cwd=os.path.dirname(workflow.snakefile)).decode().strip() except subprocess.CalledProcessError: - raise RuntimeError('Unable to get version number from git tags') + raise RuntimeError('[ERROR] Unable to get version number from git tags.') if '-' in version: return 'latest' else: @@ -62,6 +62,8 @@ config['version'] = {'tag': nanopype_tag} # make raw data directory absolute path if os.path.exists(config['storage_data_raw']): config['storage_data_raw'] = os.path.abspath(config['storage_data_raw']) +else: + raise RuntimeError("[ERROR] Raw data archive not found.") # append username to shadow prefix if not present @@ -205,3 +207,4 @@ include : "rules/methylation.smk" include : "rules/sv.smk" include : "rules/demux.smk" include : "rules/transcript.smk" +include : "rules/clean.smk" \ No newline at end of file diff --git a/rules/clean.smk b/rules/clean.smk new file mode 100755 index 0000000..2c48171 --- /dev/null +++ b/rules/clean.smk @@ -0,0 +1,109 @@ +# \HEADER\------------------------------------------------------------------------- +# +# CONTENTS : Snakemake nanopore data pipeline +# +# DESCRIPTION : clean up batch data +# +# RESTRICTIONS : none +# +# REQUIRES : none +# +# 
--------------------------------------------------------------------------------- +# Copyright (c) 2018-2019, Pay Giesselmann, Max Planck Institute for Molecular Genetics +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +# Written by Pay Giesselmann +# --------------------------------------------------------------------------------- +# imports +import os, sys, glob +# local rules +localrules: basecaller_merge_batches, basecaller_merge_tag, basecaller_qc, basecalling_clean + + +# remove flag files to force re-run of cleanup +for f in glob.glob('.*.done'): + os.remove(f) + + +# clean up compute batches basecalling +rule basecalling_clean: + input: + [dirpath for dirpath, _, files in os.walk('sequences') if dirpath.endswith('batches')] + output: + touch('.basecalling_clean.done') + shell: + "rm -r {input}" + +# clean up compute batches alignment +rule alignment_clean: + input: + [dirpath for dirpath, _, files in os.walk('alignments') if dirpath.endswith('batches')] + output: + touch('.alignment_clean.done') + shell: + "rm -r {input}" + +# clean up compute batches methylation +rule methylation_clean: + input: + [dirpath for dirpath, _, files in os.walk('methylation') if dirpath.endswith('batches')] + output: + touch('.methylation_clean.done') + shell: + "rm -r {input}" + +# clean up compute batches sv +rule sv_clean: + input: + [dirpath for dirpath, _, files in os.walk('sv') if dirpath.endswith('batches')] + output: + touch('.sv_clean.done') + shell: + "rm -r {input}" + +# clean up compute batches demux +rule demux_clean: + input: + [dirpath for dirpath, _, files in os.walk('demux') if dirpath.endswith('batches')] + output: + touch('.demux_clean.done') + shell: + "rm -r {input}" + +# clean up compute batches transcript +rule transcript_isoforms_clean: + input: + [dirpath for dirpath, _, files in os.walk('transcript_isoforms') if dirpath.endswith('batches')] + output: + touch('.transcript_isoforms_clean.done') + shell: + "rm -r {input}" + +# clean up everything +rule clean: + input: + rules.basecalling_clean.output, + rules.alignment_clean.output, + rules.methylation_clean.output, + rules.sv_clean.output, + rules.demux_clean.output, + rules.transcript_isoforms_clean.output + shell: + "rm {input}" \ No newline at end of file From 4d84eaf27596be2e0411aa64bd908516f4e082c8 Mon Sep 17 00:00:00 2001 From: Pay Giesselmann Date: Thu, 25 Apr 2019 15:42:46 +0200 Subject: [PATCH 15/19] minor fixes --- nanopype.yaml | 3 ++- rules/methylation.smk | 8 +++++--- rules/storage.smk | 8 ++++---- 
rules/sv.smk | 4 ++-- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/nanopype.yaml b/nanopype.yaml index ae29742..cca0c3a 100755 --- a/nanopype.yaml +++ b/nanopype.yaml @@ -73,7 +73,8 @@ deepbinner_models: # command line flags directly passed to the sniffles execution sv_sniffles_flags: '-s 10 -l 30 -r 2000 --genotype' sv_STRique_config: 'STRique.tsv' -sv_STRique_model: 'template_median68pA6mer.model' +sv_STRique_model: 'r9_4_450bps.mpi.model' +sv_STRique_mod_model: 'r9_4_450bps_mCpG.mpi.model' # transcriptome transcript_use_pychopper: True diff --git a/rules/methylation.smk b/rules/methylation.smk index afbbd4b..18bdc81 100755 --- a/rules/methylation.smk +++ b/rules/methylation.smk @@ -74,7 +74,7 @@ def get_min_coverage(wildcards): rule methylation_nanopolish: input: batch = lambda wildcards : get_signal_batch(wildcards, config), - run = lambda wildcards : os.path.join(config['storage_data_raw'], wildcards.runname), + run = lambda wildcards : [os.path.join(config['storage_data_raw'], wildcards.runname)] + [os.path.join(config['storage_data_raw'], wildcards.runname, 'reads.fofn')] if get_signal_batch(wildcards, config).endswith('.txt') else [], sequences = lambda wildcards : get_sequence_batch(wildcards, config), bam = lambda wildcards : get_alignment_batch(wildcards, config), bai = lambda wildcards : get_alignment_batch(wildcards, config) + '.bai', @@ -86,12 +86,14 @@ rule methylation_nanopolish: resources: mem_mb = lambda wildcards, input, threads, attempt: int((1.0 + (0.1 * (attempt - 1))) * (8000 + 500 * threads)), time_min = lambda wildcards, input, threads, attempt: int((960 / threads) * attempt) # 60 min / 16 threads + params: + index = lambda wildcards : os.path.join(config['storage_data_raw'], wildcards.runname, 'reads.fofn') singularity: "docker://nanopype/methylation:{tag}".format(tag=config['version']['tag']) shell: """ mkdir -p raw - {config[bin][python]} {config[sbin][storage_fast5Index.py]} extract {input.batch} raw/ --index {params.index} --output_format single + {config[bin][python]} {config[sbin][storage_fast5Index.py]} extract {input.batch} raw/ --index {params.index} --output_format lazy zcat {input.sequences} > sequences.fastx {config[bin_singularity][nanopolish]} index -d raw/ sequences.fastx {config[bin_singularity][nanopolish]} call-methylation -t {threads} -r sequences.fastx -g {input.reference} -b {input.bam} > {output} @@ -215,7 +217,7 @@ rule methylation_bedGraph: input: "methylation/{methylation_caller}/{aligner}/{sequence_workflow}/{tag}.{reference}.frequencies.tsv" output: - "methylation/{methylation_caller, [^.\/]*}/{aligner, [^.\/]*}/{sequence_workflow, ((?!batches).)*}/{tag, [^.\/]*}.{coverage, [^.\/]*}.{reference, [^.\/]*}.bedGraph" + "methylation/{methylation_caller, [^.\/]*}/{aligner, [^.\/]*}/{sequence_workflow, ((?!batches).)*}/{tag, [^\/]*}.{coverage, [^.\/]*}.{reference, [^.\/]*}.bedGraph" params: methylation_min_coverage = lambda wildcards : get_min_coverage(wildcards) singularity: diff --git a/rules/storage.smk b/rules/storage.smk index 57832c1..b861ab3 100755 --- a/rules/storage.smk +++ b/rules/storage.smk @@ -66,10 +66,10 @@ rule storage_index_run: batches = get_batches_indexing output: fofn = "{data_raw}/{{runname}}/reads.fofn".format(data_raw = config["storage_data_raw"]) - shell: - """ - cat {input} > {output} - """ + run: + with open(output[0], 'w') as fp: + for f in input.batches: + print(open(f, 'r').read(), end='', file=fp) # index multiple runs rule storage_index_runs: diff --git a/rules/sv.smk b/rules/sv.smk index 
94408d7..67069a0 100755 --- a/rules/sv.smk +++ b/rules/sv.smk @@ -95,7 +95,7 @@ rule strique: shadow: "minimal" threads: config['threads_sv'] params: - model = config['sv_STRique_model'], + model = config['sv_STRique_model'] if 'sv_STRique_model' in config else '', mod_model = '--mod_model {}'.format(config['sv_STRique_mod_model']) if 'sv_STRique_mod_model' in config else '' resources: mem_mb = lambda wildcards, threads, attempt: int((1.0 + (0.1 * (attempt - 1))) * (16000 + 2000 * threads)), @@ -105,7 +105,7 @@ rule strique: shell: """ export TMPDIR=$(pwd) - {config[bin_singularity][samtools]} view -F 2308 {input.bam} | {config[bin_singularity][python]} {config[bin_singularity][strique]} count {input.index} {params.model} {input.config} {params.mod_model} --t {threads} --log_level debug > {output} + {config[bin_singularity][samtools]} view -F 2308 {input.bam} | {config[bin_singularity][python]} {config[bin_singularity][strique]} count {input.index} {params.model} {input.config} {params.mod_model} --t {threads} > {output} """ rule strique_merge_batches: From c636667af1d7cd3a57b84e78d6c4d63eaa97ed74 Mon Sep 17 00:00:00 2001 From: Pay Giesselmann Date: Thu, 25 Apr 2019 20:53:09 +0200 Subject: [PATCH 16/19] update docs --- docs/release-notes.md | 28 ++++++++++++++++++++-------- docs/usage/cluster.md | 2 +- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/docs/release-notes.md b/docs/release-notes.md index 958659d..8a785d8 100755 --- a/docs/release-notes.md +++ b/docs/release-notes.md @@ -1,7 +1,20 @@ # Release notes -#### v0.5.0 +#### v0.6.0 - 2019-04-26 +Development release: + +The output of nanopype >= v0.6.0 is **not** backwards compatible due to major changes in the output filesystem structure. +: * Introduce tag-concept to re-run the same workflow under different conditions + * Transparent demultiplexing with 'special' tags + * Rules to clean up batch data + * Enhance documentation on singularity usage + * Include STRique version v0.3.0 + * Update Pychopper to version v0.5.0 + * Update guppy_basecaller to version v2.3.7 + + +#### v0.5.0 - 2019-03-26 Development release: The output of nanopype >= v0.5.0 is **not** backwards compatible due to major changes in the output filesystem structure. @@ -9,14 +22,14 @@ The output of nanopype >= v0.5.0 is **not** backwards compatible due to major ch : * Rework alignment module to support sequence post-processing * Enable Pinfish package for isoform detection -#### v0.4.1 +#### v0.4.1 - 2019-03-15 Development release: : * Update guppy_basecaller to version v2.3.5 -#### v0.4.0 +#### v0.4.0 - 2019-03-13 Development release: Version 0.4.0 is the singularity and bulk-fast5 release. @@ -25,8 +38,8 @@ Version 0.4.0 is the singularity and bulk-fast5 release. * Support both, .tar of single fast5 and bulk-fast5 of current MinKNOW versions * Enhance documentation details on installation and cluster usage -#### v0.3.0 +#### v0.3.0 - 2019-02-05 Development release: The output of nanopype >= v0.3.0 is **not** backwards compatible due to major changes in the output filesystem structure. @@ -34,8 +47,8 @@ The output of nanopype >= v0.3.0 is **not** backwards compatible due to major ch : * Fix guppy installation instructions to not overwrite existing library installations * Apply general output structure of *tool2/tool1/runs/runname.x* to all processing chains (see module documentation/tutorial for details). 
-#### v0.2.1 +#### v0.2.1 - 2019-01-28 Development release: : * Parse experimental Flappie methylation calls to bedGraph/bigWig @@ -45,16 +58,15 @@ Known issues: : * Installing guppy after flappie is overwriting a symlink to *libhdf5.so* causing the flappie basecaller to load a hdf5 library it was not linked to. This will be fixed in a future release by separating the libraries properly. -#### v0.2.0 +#### v0.2.0 - 2019-01-21 Development release: : * Nanopolish v0.11.0 * htslib v1.9 * OpenBLAS v0.3.4 -#### v0.1.0 - +#### v0.1.0 - 2018-12-24 Initial release with tool versions (for development or untagged repositories, the master branch with the tested version in brackets is used): : * Bedtools v2.27.1 diff --git a/docs/usage/cluster.md b/docs/usage/cluster.md index 8c619b0..fd30fc1 100755 --- a/docs/usage/cluster.md +++ b/docs/usage/cluster.md @@ -2,7 +2,7 @@ Snakemake allows both, cloud and cluster execution of workflows. As of now only cluster execution is implemented and tested in Nanopype. The [configuration](../installation/configuration.md) section covers multiple scenarios of integrating supported and custom cluster management systems into the Nanopype workflow. For supported cluster engines please also refer to the [Snakemake](https://snakemake.readthedocs.io/en/stable/executable.html#cluster-execution) documentation. -Following the steps in the [general](general.md) workflow documentation, the usage of a cluster backend requires only a few additional flags and arguments in the Snakemake command. The use of [profiles](../installation/configuration) is therefore recommended. The following command line options of Snakmake are of particular interest: +Following the steps in the [general](general.md) workflow documentation, the usage of a cluster backend requires only a few additional flags and arguments in the Snakemake command. The use of [profiles](../installation/configuration.md) is therefore recommended. The following command line options of Snakmake are of particular interest: **--profile** : Path to profile directory with command line arguments and flags. 
A *config.yaml* is expected to be found in the profile folder From fe79d4d7924002be06eb585349ca0244cbb99671 Mon Sep 17 00:00:00 2001 From: Pay Giesselmann Date: Thu, 25 Apr 2019 21:40:30 +0200 Subject: [PATCH 17/19] fix pychopper install --- rules/install.smk | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/rules/install.smk b/rules/install.smk index f699309..2fb7039 100755 --- a/rules/install.smk +++ b/rules/install.smk @@ -56,11 +56,6 @@ rule analysis: "bin/racon", "bin/cdna_classifier.py" -rule all: - input: - rules.processing.input, - rules.analysis.input - rule alignment: input: "bin/minimap2", @@ -77,6 +72,7 @@ rule methylation: rule transcript: input: + "bin/cdna_classifier.py", "bin/minimap2", "bin/samtools", "bin/racon", @@ -85,6 +81,12 @@ rule transcript: "bin/polish_clusters", "bin/spliced_bam2gff" +rule all: + input: + rules.processing.input, + rules.analysis.input, + rules.transcript.input + # helper functions def find_go(): for path in os.environ["PATH"].split(os.pathsep): @@ -369,6 +371,7 @@ rule pychopper: {config[python]} -m pip install --upgrade incremental {config[python]} -m pip install --upgrade certifi {config[python]} -m pip install parasail==1.1.15 --upgrade + {config[python]} -m pip install "matplotlib<3.1" --upgrade {config[python]} setup.py install cp $(pwd)/scripts/cdna_classifier.py ../../{output.bin} """ @@ -440,4 +443,4 @@ rule strique: {config[python]} -m pip install -r requirements.txt --upgrade {config[python]} setup.py install cp scripts/STRique.py ../../bin/ - """ \ No newline at end of file + """ From b04c2eaf9a4ee8e75371a37111b682606670b213 Mon Sep 17 00:00:00 2001 From: Pay Giesselmann Date: Fri, 26 Apr 2019 11:38:41 +0200 Subject: [PATCH 18/19] fix guppy and STRique install --- rules/install.smk | 4 ++-- singularity/sv/Dockerfile | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/rules/install.smk b/rules/install.smk index 2fb7039..1e60a52 100755 --- a/rules/install.smk +++ b/rules/install.smk @@ -352,8 +352,8 @@ rule guppy: # wget https://mirror.oxfordnanoportal.com/software/analysis/ont-guppy-cpu_2.3.1_linux64.tar.gz && # wget https://mirror.oxfordnanoportal.com/software/analysis/ont-guppy-cpu_2.3.5_linux64.tar.gz && wget https://mirror.oxfordnanoportal.com/software/analysis/ont-guppy-cpu_2.3.7_linux64.tar.gz && \ - tar --skip-old-files -xzf ont-guppy-cpu_2.3.5_linux64.tar.gz -C ./ --strip 1 && \ - rm ont-guppy-cpu_2.3.5_linux64.tar.gz + tar --skip-old-files -xzf ont-guppy-cpu_2.3.7_linux64.tar.gz -C ./ --strip 1 && \ + rm ont-guppy-cpu_2.3.7_linux64.tar.gz """ rule pychopper: diff --git a/singularity/sv/Dockerfile b/singularity/sv/Dockerfile index ba29636..1d379ca 100755 --- a/singularity/sv/Dockerfile +++ b/singularity/sv/Dockerfile @@ -59,7 +59,8 @@ RUN python3 -m pip install -r requirements.txt ## run setup rules RUN mkdir /build -RUN snakemake --snakefile rules/install.smk --config build_generator=Ninja --directory /build sniffles +RUN snakemake --snakefile rules/install.smk --directory /build sniffles +RUN snakemake --snakefile rules/install.smk --directory /build strique # PACKAGE STAGE FROM ubuntu:16.04 @@ -78,6 +79,8 @@ ENV LANG en_US.UTF-8 ENV LANGUAGE en_US:en ENV LC_ALL en_US.UTF-8 +COPY --from=build_stage /usr/local/lib/python3.5/dist-packages /usr/local/lib/python3.5/dist-packages + RUN mkdir -p /app COPY . 
/app/ WORKDIR /app @@ -86,9 +89,6 @@ RUN wget https://bootstrap.pypa.io/get-pip.py RUN python3 get-pip.py RUN python3 -m pip install -r requirements.txt -RUN snakemake --snakefile rules/install.smk --directory /usr strique -RUN rm -rf /usr/src/STRique - COPY --from=build_stage /build/bin/* /usr/bin/ WORKDIR / From 33ec63df5eea7bd709fa37a0f32ee081a4934603 Mon Sep 17 00:00:00 2001 From: Pay Giesselmann Date: Tue, 30 Apr 2019 14:53:08 +0200 Subject: [PATCH 19/19] v0.6.0 candidate --- Snakefile | 9 +-- docs/examples/align.md | 4 +- docs/installation/configuration.md | 93 ++++++++++++++++++++++-------- docs/installation/singularity.md | 2 +- docs/release-notes.md | 4 +- docs/rules/alignment.md | 40 ++++++++----- docs/rules/basecalling.md | 25 +++++--- docs/rules/demux.md | 55 +++++++++++------- docs/rules/methylation.md | 31 ++++++---- docs/rules/storage.md | 5 ++ docs/rules/sv.md | 18 ++++++ docs/rules/transcript.md | 6 +- docs/usage/general.md | 85 +++++++++++++++++---------- env.yaml | 1 - requirements.txt | 2 +- rules/basecalling.smk | 18 +++--- rules/clean.smk | 10 ++-- rules/methylation.smk | 4 +- rules/storage.smk | 34 ++--------- setup.py | 4 +- 20 files changed, 280 insertions(+), 170 deletions(-) diff --git a/Snakefile b/Snakefile index b76c9b6..58d479d 100755 --- a/Snakefile +++ b/Snakefile @@ -88,15 +88,16 @@ if 'references' in nanopype_env: config['references'] = {} for name, values in nanopype_env['references'].items(): genome = values['genome'] - chr_sizes = values['chr_sizes'] + chr_sizes = values['chr_sizes'] if 'chr_sizes' in values else '' if not os.path.isfile(genome): print("[WARNING] Genome for {name} not found in {genome}, skipping entry".format( name=name, genome=genome), file=sys.stderr) continue - if not os.path.isfile(chr_sizes): + if chr_sizes and not os.path.isfile(chr_sizes): print("[WARNING] Chromosome sizes for {name} not found in {chr_sizes}, skipping entry".format( name=name, chr_sizes=chr_sizes), file=sys.stderr) continue + print('add ', name) config['references'][name] = {"genome":genome, "chr_sizes":chr_sizes} @@ -155,7 +156,7 @@ for s in [s for s in os.listdir(os.path.join(os.path.dirname(workflow.snakefile) else: config['sbin_singularity'][s] = config['sbin'][s] - + # helper of submodules are called relative to the pipeline base directory config['sbin']['base'] = os.path.join(os.path.dirname(workflow.snakefile)) if hasattr(workflow, 'use_singularity') and workflow.use_singularity: @@ -207,4 +208,4 @@ include : "rules/methylation.smk" include : "rules/sv.smk" include : "rules/demux.smk" include : "rules/transcript.smk" -include : "rules/clean.smk" \ No newline at end of file +include : "rules/clean.smk" diff --git a/docs/examples/align.md b/docs/examples/align.md index 8c88a16..668a4a4 100755 --- a/docs/examples/align.md +++ b/docs/examples/align.md @@ -21,7 +21,7 @@ The *env.yaml* in the original repository already contains an entry for this tut The easiest way to obtain an alignment is to directly request the output file with: ``` -snakemake --snakefile ~/src/nanopype/Snakefile -j 4 alignments/minimap2/guppy/20170725_FAH14126_FLO-MIN107_SQK-LSK308_human_Hues64.test.bam +snakemake --snakefile ~/src/nanopype/Snakefile -j 4 alignments/minimap2/guppy/batches/tutorial/20170725_FAH14126_FLO-MIN107_SQK-LSK308_human_Hues64.test.bam ``` This will trigger base calling using guppy and an alignment against our test genome with minimap2. 
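A quick programmatic sanity check of the resulting alignment can be helpful before moving on. The following is a minimal sketch, not part of the Nanopype workflow itself, assuming the optional pysam package is installed and the target requested above has been produced:

```python
import pysam

# run-level alignment produced by the snakemake call above (tag 'tutorial', reference 'test')
bam_path = "alignments/minimap2/guppy/batches/tutorial/20170725_FAH14126_FLO-MIN107_SQK-LSK308_human_Hues64.test.bam"

with pysam.AlignmentFile(bam_path, "rb") as bam:
    header = bam.header.to_dict()
    # list read group IDs, if any are present in the header
    read_groups = [rg.get("ID") for rg in header.get("RG", [])]
    # count mapped primary alignments only, i.e. skip unmapped, secondary and supplementary records
    primary = sum(1 for r in bam
                  if not r.is_unmapped and not r.is_secondary and not r.is_supplementary)

print("read groups:", read_groups)
print("primary alignments:", primary)
```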
In order to process multiple runs in parallel and merge the results into a single output file, a runnames.txt in the working directory is used: @@ -49,5 +49,3 @@ Note that the second command only contains jobs running minimap2. ## Batch processing Browsing the working directory after this tutorial you will notice the batch output in form of e.g. *0.fastq.gz* or *0.test.bam* in a specific batches folder. These files are temporary and can be deleted after the processing. They are however required as input for the processing, thus if you delete the sequence batches a subsequent alignment run will include base calling to get the batch-wise input. - -We will target this issue in a future release of Nanopype. The merged fastx filex can be split according to the read name and merged bam files already contain a RG tag with the source batch. diff --git a/docs/installation/configuration.md b/docs/installation/configuration.md index d107d5f..2bc8034 100755 --- a/docs/installation/configuration.md +++ b/docs/installation/configuration.md @@ -1,6 +1,6 @@ # Configuration -Nanopype has two configuration layers: The central **environment** configuration *env.yaml* covers application paths and reference genomes and is set up independent of installation method and operating system once. The environment configuration is stored in the installation directory. +Nanopype has two configuration layers: The central **environment** configuration *env.yaml* covers application paths and reference genomes and is set up independent of installation method and operating system once. The environment configuration is stored in the installation directory. If a compute cluster is available the respective Snakemake configuration is only needed once per Nanopype installation and explained here by the example of a custom scheduler called *mxq*. For a project an additional **[workflow](../usage/general.md)** configuration is required providing data sources, tool flags and parameters. The workflow config file *nanopype.yaml* is expected in the root of each project directory. Configuration files are in .yaml format. @@ -57,6 +57,7 @@ references: bedGraphToBigWig: ~/bin/bedGraphToBigWig ``` +Additional references possibly only needed once in a **[workflow](../usage/general.md)** can be configured in the *nanopype.yaml* of the working directory. ## Profiles @@ -89,31 +90,28 @@ Additional profiles can be found at [https://github.com/snakemake-profiles/doc]( The [cluster configuration](https://snakemake.readthedocs.io/en/stable/snakefiles/configuration.html#cluster-configuration) of Snakemake is separated from the workflow and can be provided in .json or .yaml format. The configuration is composed of default settings mapping to all rules (e.g. basecalling, alignment, etc.) and rule specific settings enabling a fine grained control of resources such as memory and run time. -??? info "example cluster.json for LSF/BSUB ([source](https://snakemake.readthedocs.io/en/stable/snakefiles/configuration.html#cluster-configuration))" +??? 
info "example cluster.json for MXQ" ``` { "__default__" : { - "queue" : "medium_priority", - "nCPUs" : "16", - "memory" : 20000, - "resources" : "\"select[mem>20000] rusage[mem=20000] span[hosts=1]\"", - "name" : "JOBNAME.{rule}.{wildcards}", - "output" : "logs/cluster/{rule}.{wildcards}.out", - "error" : "logs/cluster/{rule}.{wildcards}.err" + "nCPUs" : "8", + "memory" : "20000", + "time" : "60" }, "minimap2" : { - "memory" : 30000, - "resources" : "\"select[mem>30000] rusage[mem=30000] span[hosts=1]\"", + "memory" : "30000", + "time" : "30" + "nCPUs" : "16" } } ``` -Parameters of the cluster config are accessible inside the cluster submission command: +Rule names of Nanopype can be obtained by starting either a dryrun (-n) or directly from the source. Parameters of the cluster config are accessible inside the cluster submission command: - snakemake --cluster-config cluster.json --cluster "sbatch -A {cluster.account} -p {cluster.partition} -n {cluster.n} -t {cluster.time}" + snakemake --cluster-config cluster.json --cluster "mxqsub --threads={cluster.nCPUs} --memory={cluster.memory} --runtime={cluster.time}" This cluster configuration is static per target and needs to be conservative enough to handle any type of input data. For a more fine grained control Nanopype specifies per rule a configurable number of threads and calculates the estimated run time and memory consumption accordingly. Integration and customization of these parameters is described in the following section. @@ -121,8 +119,10 @@ This cluster configuration is static per target and needs to be conservative eno All Nanopype workflows specify **threads** and **time_min** and **mem_mb** resources. Furthermore runtime and memory are dynamically increased if a clsuter job fails or is killed by the cluster management (Due to runtime or memory violation). To restart jobs Snakemake needs to be executed with e.g. --restart-times 3. If supported by the cluster engine you could then use: - snakemake --cluster "qsub {threads} --runtime {resources.time_min} --memory {resources.mem_mb}" - + snakemake --cluster "mxqsub --threads={threads} --runtime={resources.time_min} --memory={resources.mem_mb}" + +Please note that the *threads* resource in this example is now not set per rule as in the previous example, but configured per module in the workflow config file *nanopype.yaml* in the working directory. + If the formats of estimated runtime and memory consumption do not match your cluster system a custom [wrapper](https://snakemake.readthedocs.io/en/stable/executable.html#job-properties) script is easily set up. To convert from time in minutes to a custom string the following snippet is a starting point: ??? 
info "example cluster_wrapper.py format conversion" @@ -132,7 +132,7 @@ If the formats of estimated runtime and memory consumption do not match your clu jobscript = sys.argv[-1] job_properties = read_job_properties(jobscript) - + # default resources threads = '1' runtime = '01:00' @@ -144,21 +144,21 @@ If the formats of estimated runtime and memory consumption do not match your clu resources = job_properties["resources"] if "mem_mb" in resources: memory = str(resources["mem_mb"]) + 'M' if "time_min" in resources: runtime = "{:02d}:{:02d}".format(*divmod(resources["time_min"], 60)) - + # cmd and submission - cmd = 'sub --threads={threads} --memory={memory} --time="{runtime}" {jobscript}'.format(threads=threads, memory=memory, runtime=runtime, jobscript=jobscript) + cmd = 'mxqsub --threads={threads} --memory={memory} --time="{runtime}" {jobscript}'.format(threads=threads, memory=memory, runtime=runtime, jobscript=jobscript) os.system(cmd) ``` The respective Snakemake command line would then be: snakemake --cluster cluster_wrapper.py - -resulting in a cluster submission on the shell similar to: - sub --threads=1 --memory=16000M --time="00:15" ./snakemake/tmp123/job_xy.sh - - +resulting in a cluster submission of the temporary job script on the shell similar to: + + mxqsub --threads=1 --memory=16000M --time="00:15" ./snakemake/tmp123/job_xy.sh + + ## Custom cluster integration A full example on how to enable a not yet supported cluster system is given in *profiles/mxq/* of the git repository. Briefly four components are required: @@ -168,8 +168,53 @@ A full example on how to enable a not yet supported cluster system is given in * * status.py - job status request from cluster management * jobscript.sh - wrapper script for rules to be executed -**Important:** When working with batches of raw nanopore reads, Nanopype makes use of Snakemakes shadow mechanism. A shadow directory is temporary per rule execution and can be placed on per node local storage to reduce network overhead. The shadow prefix e.g. */scratch/local/* is set in the profiles config.yaml. The *--profile* argument tells Snakemake to use a specific profile: +**Important:** When working with batches of raw nanopore reads, Nanopype makes use of Snakemakes shadow mechanism. A shadow directory is temporary per rule execution and can be placed on per node local storage to reduce network overhead. The shadow prefix e.g. */scratch/local/* can be set in the profiles config.yaml. The *--profile* argument tells Snakemake to use a specific profile: snakemake --profile /path/nanopype/profiles/mxq [OPTIONS...] [FILE] When running in an environment with **multiple users** the shadow prefix should contain a user specific part to ensure accessibility and to prevent corrupted data. A profile per user is compelling in this case and enforced by Nanopype if not given. + +## Logging + +Writing log files becomes useful to inspect the output of potentially failing jobs. The actual implementation is system and user specific, two possible scenarios are given here: + +### Log per output file + +Some cluster schedulers support the redirection of stdout and stderr. These could be caught using the cluster configuration of Snakemake: + +??? 
info "example cluster.json for MXQ logging" + ``` + { + "__default__" : + { + "output" : "logs/cluster/{rule}.{wildcards}.out", + "error" : "logs/cluster/{rule}.{wildcards}.err" + } + } + ``` + +Parameters of the cluster config are again accessible inside the cluster submission command: + + snakemake --cluster-config cluster.json --cluster "mxqsub --stdout={cluster.output} --stderr={cluster.error}" + +This type of logging will create one log file per output file of Nanopype. Restarting workflows will overwrite previous logging. + +### Log per job submission + +Logging can also be set up independent of Snakemake by modifying the job script itself. + +??? info "example jobscript.sh for MXQ logging" + ``` + H=${{HOSTNAME}} + J=${{MXQ_JOBID:-${{PID}}}} + DATE=`date +%Y%m%d` + + mkdir -p log + ({exec_job}) &> log/${{DATE}}_${{H}}_${{J}}.log + ``` + +Snakemake will replace the *{exec_job}* wildcard with the temporary job script (Double curly brackets are needed for generic environment variables to escape the Snakemake substitution). Note that the environment variables *MXQ_JOBID* and *PID* are specific to the mxq scheduler. The above wrapper will create log files with timestamp and hostname in the filename without overwriting previous logs. + +The custom job script is included to the Snakemake command line by using: + + snakemake --jobscript jobscript.sh --cluster "mxqsub ..." diff --git a/docs/installation/singularity.md b/docs/installation/singularity.md index bfbf43c..1028058 100644 --- a/docs/installation/singularity.md +++ b/docs/installation/singularity.md @@ -1,6 +1,6 @@ # Singularity -In order to use the Singularity driven version of Nanopype a working python3 with Snakemake and the pipeline repository itself are sufficient. At least python version 3.4 is required and we recommend to use a virtual environment. Additionally Singularity needs to be installed system wide. On Ubuntu the package is called *singularity-container*. The installation requires root privileges and might require help of your IT department. +In order to use the Singularity driven version of Nanopype a working python3 with Snakemake and the pipeline repository itself are sufficient. At least python version 3.4 is required and we recommend to use a virtual environment. Additionally Singularity needs to be installed system wide. On Ubuntu the package is called *singularity-container*. The installation requires root privileges and might require help of your IT department. We currently test workflows with Singularity version 2.4.2. Start with creating a virtual environment: diff --git a/docs/release-notes.md b/docs/release-notes.md index 8a785d8..46937b1 100755 --- a/docs/release-notes.md +++ b/docs/release-notes.md @@ -1,6 +1,6 @@ # Release notes -#### v0.6.0 - 2019-04-26 +#### v0.6.0 - 2019-04-30 Development release: The output of nanopype >= v0.6.0 is **not** backwards compatible due to major changes in the output filesystem structure. @@ -12,7 +12,7 @@ The output of nanopype >= v0.6.0 is **not** backwards compatible due to major ch * Include STRique version v0.3.0 * Update Pychopper to version v0.5.0 * Update guppy_basecaller to version v2.3.7 - + #### v0.5.0 - 2019-03-26 Development release: diff --git a/docs/rules/alignment.md b/docs/rules/alignment.md index 1b54426..a64e75d 100755 --- a/docs/rules/alignment.md +++ b/docs/rules/alignment.md @@ -2,9 +2,9 @@ The alignment step maps basecalled reads against a reference genome. If not already done alignment rules will trigger basecalling of the raw nanopore reads. 
-To align the reads of one flow cell against reference genome hg38 with *flappie* basecalling and aligner *minimap2* run from within your processing directory: +To align the reads of one flow cell against reference genome hg38 with *guppy* basecalling and aligner *minimap2* run from within your processing directory: - snakemake --snakefile /path/to/nanopype/Snakefile alignments/minimap2/flappie/20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01.hg38.bam + snakemake --snakefile /path/to/nanopype/Snakefile alignments/minimap2/guppy/WA01/batches/20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01.hg38.bam This requires an entry *hg38* in your **env.yaml** in the Nanopype installation directory: @@ -15,7 +15,7 @@ This requires an entry *hg38* in your **env.yaml** in the Nanopype installation Providing a *runnames.txt* with one runname per line it is possible to process multiple flow cells at once and merge the output into a single track e.g.: - snakemake --snakefile /path/to/nanopype/Snakefile alignments/minimap2/flappie/WA01.hg38.bam + snakemake --snakefile /path/to/nanopype/Snakefile alignments/minimap2/guppy/WA01.hg38.bam Some aligners require an indexed reference genome, Nanopype will automatically build one uppon first use. An alignment job is always connected with a sam2bam conversion and sorting in an additional thread. The above commands with 3 configured alignment threads will therefore fail if Snakemake is invoked with less than 4 threads (at least *-j 4* is required). The default input sequence format for alignment rules is *fastq*, yet if possible, the alignment module will use already basecalled batches in the sequences/[basecaller]/[runname]/ folder of the working directory. @@ -27,23 +27,33 @@ The alignment module can create the following file structure relative to the wor |--alignments/ |--minimap2/ # Minimap2 alignment |--albacore/ # Using albacore basecalling - |--batches - |--20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01/ - |--0.hg38.bam - |--1.hg38.bam - ... - |--20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01.hg38.bam + |--batches/ + |--WA01/ + |--20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01/ + |--0.hg38.bam + |--1.hg38.bam + ... + |--20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01.hg38.bam |--WA01.hg38.bam |--graphmap/ # GraphMap alignment - |--flappie/ # Using flappie basecalling - |--batches - |--20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01/ - |--0.hg19.bam - ... - |--20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01.hg19.bam + |--guppy/ # Using guppy basecalling + |--batches/ + |--WA01/ + |--20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01/ + |--0.hg19.bam + ... + |--20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01.hg19.bam |--WA01.hg19.bam ``` +## Cleanup + +The batch processing output of the alignment module can be cleaned up by running: + + snakemake --snakefile /path/to/nanopype/Snakefile alignment_clean + +This will delete all files and folders inside of any **batches** directory and should only be used at the very end of an analysis workflow. The methylation module for instance relies on the single batch alignment files. + ## Tools Depending on the application you can choose from one of the listed aligners. 
All alignment rules share a set of global config variables: diff --git a/docs/rules/basecalling.md b/docs/rules/basecalling.md index 9acbce5..d391c51 100755 --- a/docs/rules/basecalling.md +++ b/docs/rules/basecalling.md @@ -4,7 +4,7 @@ The basecaller translates the raw electrical signal from the sequencer into a nu In order to process the output of one flow cell with the basecaller *albacore* run from within your processing directory: - snakemake --snakefile /path/to/nanopype/Snakefile sequences/albacore/20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01.fastq.gz + snakemake --snakefile /path/to/nanopype/Snakefile sequences/albacore/WA01/batches/20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01.fastq.gz Valid file extensions are fasta/fa, fastq/fq in gzipped form. Providing a *runnames.txt* with one run name per line it is possible to process multiple flow cells at once and merge the output into a single file e.g.: @@ -24,20 +24,29 @@ The basecalling module can create the following file structure relative to the w |--sequences/ |--albacore/ # Albacore basecaller |--batches/ - |--20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01/ - |--0.fastq.gz # Sequence batches - |--1.fastq.gz - ... - |--20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01.fastq.gz - |--20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01.fastq.pdf + |--WA01/ + |--20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01/ + |--0.fastq.gz # Sequence batches + |--1.fastq.gz + ... + |--20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01.fastq.gz + |--20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01.fastq.pdf |--WA01.fastq.gz |--WA01.fastq.pdf |--guppy/ # Guppy basecaller |--... ``` +## Cleanup + +The batch processing output of the basecalling module can be cleaned up by running: + + snakemake --snakefile /path/to/nanopype/Snakefile sequences_clean + +This will delete all files and folders inside of any **batches** directory and should only be used at the very end of an analysis workflow. The alignment module for instance relies on the single batch sequence files. + ## Tools -Depending on the application you can choose from one of the following basecallers, listed with their associated configuration options. Downstream applications making use of the basecalling module can either enforce a specific basecaller or used the default configuration: +Depending on the application you can choose from one of the following basecallers, listed with their associated configuration options. : * threads_basecalling: 4 diff --git a/docs/rules/demux.md b/docs/rules/demux.md index 1ece025..ee88496 100755 --- a/docs/rules/demux.md +++ b/docs/rules/demux.md @@ -1,10 +1,28 @@ # Demultiplexing -With barcoded libraries nanopore sequencing allows to pool multiple samples on a single flow cell. Demultiplexing describes the classification of barcodes per read and the assignment of read groups. The output of the demultiplexing module is a tsv file of read ID and detected barcode. In order to process a barcoded flow cell with e.g. Deepbinner run: +With barcoded libraries nanopore sequencing allows to pool multiple samples on a single flow cell. Demultiplexing describes the classification of barcodes per read and the assignment of read groups. The output of the demultiplexing module is batches of read IDs belonging to a detected barcode. In order to process a barcoded flow cell with Deepbinner run: snakemake --snakefile /path/to/nanopype/Snakefile demux/deepbinner/20180101_FAH12345_FLO-MIN106_SQK-RBK004_WA01.tsv -Note the different sequencing kit *SQK-RBK004* used in this example. 
+Note the different sequencing kit *SQK-RBK004* used in this example. Aside from explicit demultiplexing the module supports mapping of previously introduced tags to barcodes. A **barcodes.yaml** in the working directory is detected by Nanopype and can map raw demux output to tags: + +??? info "example barcodes.yaml" + ``` + __default__: + NB01 : '1' + NB02 : '2' + 20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01: + NB01 : '3' + NB02 : '4' + ``` + +The file may contain a `__default__` mapping, applied to any not listed run as well as run specific mappings. Each mapping is a key:value pair of tag and barcode. For different runs, the same tag can map to different barcodes allowing the efficient usage of available barcodes in the sequencing kit. Without running the demultiplexing explicitly, an alignment of only one barcode can be obtained by running: + + snakemake --snakefile /path/to/nanopype/Snakefile alignments/minimap2/guppy/NB01.hg38.bam + +Nanopype will detect *NB01* to be a special tag listed in *barcodes.yaml*, start demultiplexing, create batches of read IDs with barcode 1 and run basecalling and alignment only for this subset of reads. + + ## Folder structure @@ -12,35 +30,32 @@ The demultiplexing module can create the following file structure relative to th ```sh |--demux/ - |--albacore/ # Albacore basecaller - |--20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01/ - |--0.tsv # Demultiplexed batches - |--1.tsv - ... - |--20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01.tsv |--deepbinner/ # Deepbinner neural network - |--20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01/ - |--0.tsv # Demultiplexed batches - |--1.tsv + |--batches/ + |--20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01/ + |--0.tsv # classified batches + |--1.tsv ... - |--20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01.tsv + |--barcodes/ + |--20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01/ + |--1/ # raw barcode for tag:barcode mapping + |--0.tsv # 4k reads of barcode 1 + |--1.tsv + |--2/ ``` ## Tools -The demultiplexing module includes the following tools and their respective configuration: - -### Albacore - -The ONT basecaller directly supports demultiplexing in sequence space. +The demultiplexing module includes the following tools and their respective configuration, shared config variables are: - basecalling_albacore_barcoding: true + threads_demux: 4 + demux_batch_size: 4000 + demux_default: 'deepbinner' ### Deepbinner -Deepbinner: Demultiplexing barcoded Oxford Nanopore Technologies reads with deep convolutional neural networks (CNN). The network is trained to classify barcodes based on the raw nanopore signal. The model for the CNN needs to be copied from the Deepbinner repository to the working directory and depends on the used sequencing kit. The kit is parsed from the run name as described in the [configuration](../installation/configuration.md). +Deepbinner: Demultiplexing barcoded Oxford Nanopore Technologies reads with deep convolutional neural networks (CNN). The network is trained to classify barcodes based on the raw nanopore signal. The model for the CNN needs to be copied from the Deepbinner repository to the working directory and depends on the used sequencing kit. The kit is parsed from the run name as described in the [configuration](../installation/configuration.md), alternatively the *default* mapping can be used to override the kit of the runname. 
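The tag to barcode resolution described above can be illustrated with a short sketch; the helper function is hypothetical and not part of Nanopype. A run specific mapping takes precedence over the `__default__` section of *barcodes.yaml*:

```python
import yaml

def resolve_barcode(barcodes_yaml, runname, tag):
    # hypothetical helper, for illustration only: map a tag to its raw barcode for a given run,
    # falling back to the __default__ section if the run has no specific entry
    with open(barcodes_yaml, 'r') as fp:
        mapping = yaml.safe_load(fp)
    run_map = mapping.get(runname, mapping.get('__default__', {}))
    return run_map.get(tag)

# with the barcodes.yaml shown above:
# resolve_barcode('barcodes.yaml', '20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01', 'NB01')  -> '3'
# resolve_barcode('barcodes.yaml', 'some_other_run', 'NB01')                                -> '1'
```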
- threads_demux: 4 deepbinner_models: default: SQK-RBK004_read_starts EXP-NBD103: EXP-NBD103_read_starts diff --git a/docs/rules/methylation.md b/docs/rules/methylation.md index 699a051..21a38ac 100755 --- a/docs/rules/methylation.md +++ b/docs/rules/methylation.md @@ -18,31 +18,40 @@ If not already present, this will trigger basecalling and alignment of the raw d |--ngmlr/ # NGMLR alignment |--guppy/ # Guppy sequences |--batches/ - |--20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01/ - |--0.hg38.tsv # Single read batches - |--1.hg38.tsv - ... - |--20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01.hg38.tsv.gz + |--WA01/ + |--20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01/ + |--0.hg38.tsv # Single read batches + |--1.hg38.tsv + ... + |--20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01.hg38.tsv.gz |--WA01.1x.hg38.bedGraph # Mean methylation level |--WA01.1x.hg38.bw |--flappie/ # Flappie |--ngmlr/ |--flappie/ |--batches/ - |--20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01/ - |--0.hg38.tsv # Single read batches - |--1.hg38.tsv - |--20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01.hg38.tsv.gz + |--WA01/ + |--20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01/ + |--0.hg38.tsv # Single read batches + |--1.hg38.tsv + |--20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01.hg38.tsv.gz |--WA01.1x.hg38.bedGraph |--WA01.1x.hg38.bw ``` +## Cleanup + +The batch processing output of the methylation module can be cleaned up by running: + + snakemake --snakefile /path/to/nanopype/Snakefile methylation_clean + +This will delete all files and folders inside of any **batches** directory and should only be used at the very end of an analysis workflow. ## Tools The output of nanopore methylation calling tools on a lower level is not consistent. Here we provide the description of the intermediate results, enabling more advanced analysis. All methylation rules share a set of global config variables: -: * threads_methylation: 3 +: * threads_methylation: 4 ### Nanopolish @@ -54,7 +63,7 @@ A log-likelihood ratio greater 2.5 is considered a methylated CpG, a ratio less To directly get the single read methylation levels of a single flow cell you can run: - snakemake --snakefile /path/to/nanopype/Snakefile methylation/nanopolish/ngmlr/guppy/20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01.hg38.tsv.gz + snakemake --snakefile /path/to/nanopype/Snakefile methylation/nanopolish/ngmlr/guppy/batches/WA01/20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01.hg38.tsv.gz Nanopolish specific configuration parameters are: diff --git a/docs/rules/storage.md b/docs/rules/storage.md index e343a84..5df51af 100755 --- a/docs/rules/storage.md +++ b/docs/rules/storage.md @@ -26,6 +26,8 @@ For bulk-fast5 output from recent MinKNOW versions, the batches can be directly |--reads.fofn # Index file ``` +Nanopype expects all batches to be found in the *reads* folder of a run. Restarting an experiment in MinKNOW results in a new raw output folder with batch numbers starting from zero. In current versions of MinKNOW a unique run-ID is part of the batch name, therefore bulk-fast5 files from multiple restarts can be copied into the same directory. After updating MinKNOW the output naming should be verified to avoid overwriting batches with equal names. + ## Import @@ -52,6 +54,9 @@ Together with the import, this is the only rule requiring **write access** to th chmod 555 /data/raw/$run/reads chmod 555 /data/raw/$run +??? tip "Tip" + Internally we use an isolated unix-user and group *mduser* and *mdgrp* owning the raw data. Setting permissions to e.g. 
744 for the batch files and 755 for folders allows any analyst to securely read the raw data without accidentally compromising it. + ## Extraction It is possible to extract a **subset** of fast5 files from the packed and indexed run. Extraction requires a previously indexed run, a list of read IDs and works by requesting a directory from Nanopype: diff --git a/docs/rules/sv.md b/docs/rules/sv.md index 10ebc62..7fd217a 100755 --- a/docs/rules/sv.md +++ b/docs/rules/sv.md @@ -16,6 +16,24 @@ The structural variation module can create the following file structure relative |--WA01.hg38.vcf/ ``` +## Cleanup + +The structural variation module requires a merged alignment file of one or multiple flow cells. Since most of the pipeline is working with batches of reads, the merged files can be deleted to save disk space: + +```sh +|--alignments/ + |--ngmlr/ # NGMLR alignment + |--guppy/ # Using guppy basecalling + |--batches/ + |--WA01/ + |--20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01/ # keep + |--0.hg38.bam # keep + |--1.hg38.bam # keep + ... + |--20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01.hg38.bam # delete + |--WA01.hg38.bam # delete +``` + ## Tools The SV module includes the following tools: diff --git a/docs/rules/transcript.md b/docs/rules/transcript.md index 8df74c4..8b0ae79 100755 --- a/docs/rules/transcript.md +++ b/docs/rules/transcript.md @@ -6,7 +6,7 @@ Another application of the long-read nanopore technology is sequencing of cDNA a The isoform detection in Nanopype is oriented to meet the implementation of the [pinfish-pipeline](https://github.com/nanoporetech/pipeline-pinfish-analysis) from ONT, extended by the basecalling and alignment back-end of Nanopype. To align the reads of one flow cell against reference genome hg38 with *guppy* basecalling and aligner *minimap2* and detect isoforms using pinfish run from within your processing directory: - snakemake --snakefile /path/to/nanopype/Snakefile transcript_isoforms/pinfish/minimap2/guppy/20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01.hg38.gff + snakemake --snakefile /path/to/nanopype/Snakefile transcript_isoforms/pinfish/minimap2/guppy/batches/WA01/20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01.hg38.gff This requires an entry *hg38* in your **env.yaml** in the Nanopype installation directory: @@ -28,7 +28,9 @@ The transcript module can create the following example file structure relative t |--pinfish |--minimap2/ # Minimap2 alignment |--guppy/ # Using guppy basecalling - |--20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01.hg38.gff + |--batches/ + |--WA01/ + |--20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01.hg38.gff |--WA01.hg38.gff ``` diff --git a/docs/usage/general.md b/docs/usage/general.md index 0a5d3ea..157605c 100755 --- a/docs/usage/general.md +++ b/docs/usage/general.md @@ -8,6 +8,36 @@ The default workflow configuration **nanopype.yaml** in the installation directo The parameters in the config file have significant impact on the computed output. In contrast to the incorporated tools the configuration is not controlled or versioned by Nanopype. Changing entries without re-running the pipeline will lead to inconsistency between config and processed data. +## Snakemake basics + +The basic concept of Snakemake and thus also Nanopype is to request an output file which can be generated from one or more input files. Snakemake will recursively build a graph and determine required intermediate or temporary data to be processed. 
In general you can run Snakemake from within your processing directory with + + snakemake --snakefile /path/to/nanopype/Snakefile [OPTIONS...] [FILE] + +or from the installation path with specifying the working directory: + + snakemake --directory /path/to/processing [OPTIONS...] [FILE] + +Nanopype expects a **nanopype.yaml** in the root of the processing directory. A template with default values is found in the Nanopype repository. You can either edit the parameters within the file or override them from the command line: + + snakemake [OPTIONS...] [FILE] --config threads_basecalling=16 + +For all available snakemake command line options please also refer to the [snakemake](https://snakemake.readthedocs.io/en/stable) documentation. Particularly interesting are: + +* -n / --dryrun : Print required steps but do not execute, useful to check before e.g. cluster submissions +* -j / --cores / --jobs : In local mode: number of provided cores; In cluster mode: number of jobs submitted at the same time +* -k / --keep-going : Continue execution of independent jobs, in case of errors + + +## Singularity + +Using Nanopype with the Singularity installation method is recommended. Environment and workflow configuration remain the same as above, the Snakemake command line needs to be extended to: + + snakemake --snakefile /path/to/nanopype/Snakefile --use-singularity [OPTIONS...] [FILE] + +Nanopype is detecting its current version from the cloned repository and will fetch the respective Singularity containers. Per default these images are stored in the hidden *.snakemake* folder in the current working directory. For multiple parallel workflows the flag *--singularity-prefix* can be set to fetch and save images only once. Setting the prefix in a [profile](../installation/configuration.md#profiles) along with other static flags is our suggestion. + + ## Raw data archives Raw nanopore read data is organized in directories per flow cell. Their name can be used to encode important experimental information such as flow cell type, kit and IDs. The name **must** at least contain the flow cell type and the sequencing kit which are used by the basecalling module. Furthermore a *reads* sub folder containing batches of raw nanopore reads in *.tar* or bulk-*.fast5* format is required. @@ -40,38 +70,17 @@ Copying the **nanopype.yaml** from the pipeline repository into the processing d The entry *storage_data_raw* is the common base path of the previously described data directories per flow cell. It can be a relative or absolute path and must for cluster usage be accessible from all computing nodes. With *threads_[module]* the number of threads used per batch and specific module can be controlled. The overall number of available threads is set at run time. -Nanopype supports merging of multiple flow cells into a single track. A **runnames.txt** in the processing directory with one raw data folder name per line is required in that case. +Nanopype supports merging of multiple flow cells into a single track. A **runnames.txt** in the processing directory with one raw data folder name per line is required in that case. Lines starting with # are treated as comments, empty lines are ignored. 
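A minimal sketch of the run list parsing, mirroring the filtering Nanopype applies (the file content in the comments is an example only):

```python
# read runnames.txt, skipping comment lines starting with '#' and empty lines
# example file content:
#   # flow cells of sample WA01
#   20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01
#
#   20180102_FAH67890_FLO-MIN106_SQK-LSK108_WA01
with open('runnames.txt') as fp:
    runnames = [line.rstrip() for line in fp
                if line.rstrip() and not line.startswith('#')]

print(runnames)
# ['20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01', '20180102_FAH67890_FLO-MIN106_SQK-LSK108_WA01']
```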
Whereas the following command would trigger basecalling of a single flow cell: - snakemake --snakefile /path/to/nanopype/Snakefile sequences/guppy/20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01.fastq.gz + snakemake --snakefile /path/to/nanopype/Snakefile sequences/guppy/WA01/batches/20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01.fastq.gz the next will trigger basecalling for all runs listed in *runnames.txt* and merge the results in the end: - snakemake --snakefile /path/to/nanopype/Snakefile sequences/guppy/sample.fastq.gz + snakemake --snakefile /path/to/nanopype/Snakefile sequences/guppy/WA01.fastq.gz -The name of the output file in this case is arbitrary, the extension must be either fasta.gz or fastq.gz. - - -## Snakemake basics - -The basic concept of Snakemake and thus also Nanopype is to request an output file which can be generated from one or more input files. Snakemake will recursively build a graph and determine required intermediate or temporary data to be processed. In general you can run Snakemake from within your processing directory with - - snakemake --snakefile /path/to/nanopype/Snakefile [OPTIONS...] [FILE] - -or from the installation path with specifying the working directory: - - snakemake --directory /path/to/processing [OPTIONS...] [FILE] - -Nanopype expects a **nanopype.yaml** in the root of the processing directory. A template with default values is found in the Nanopype repository. You can either edit the parameters within the file or override them from the command line: - - snakemake [OPTIONS...] [FILE] --config threads_basecalling=16 - -For all available snakemake command line options please also refer to the [snakemake](https://snakemake.readthedocs.io/en/stable) documentation. Particularly interesting are: - -* -n / --dryrun : Print required steps but do not execute, useful to check before e.g. cluster submissions -* -j / --cores / --jobs : In local mode: number of provided cores; In cluster mode: number of jobs submitted at the same time -* -k / --keep-going : Continue execution of independent jobs, in case of errors +The name of the output file is called a **tag** and is in this case is arbitrary. Special tags exist for barcoded runs as described in the [demultiplexing](../rules/demux.md) module. The extension must be either fasta.gz or fastq.gz. ## Processing examples @@ -79,9 +88,9 @@ For all available snakemake command line options please also refer to the [snake If available Nanopype tries to implement and provide multiple, possibly competing tools for the same task. The alignment module for instance allows currently to select from three different long read aligners namely minimap2, graphmap and ngmlr. A key design of Nanopype is that the executed pipeline is reflected in the structure of the output directory. Taking the alignment example, exchanging the aligner in a workflow becomes as simple as: - snakemake --snakefile /path/to/nanopype/Snakefile alignments/minimap2/guppy/sample.hg38.bam - snakemake --snakefile /path/to/nanopype/Snakefile alignments/graphmap/guppy/sample.hg38.bam - snakemake --snakefile /path/to/nanopype/Snakefile alignments/ngmlr/guppy/sample.hg38.bam + snakemake --snakefile /path/to/nanopype/Snakefile alignments/minimap2/guppy/WA01.hg38.bam -j16 + snakemake --snakefile /path/to/nanopype/Snakefile alignments/graphmap/guppy/WA01.hg38.bam -j16 + snakemake --snakefile /path/to/nanopype/Snakefile alignments/ngmlr/guppy/WA01.hg38.bam -j16 All three commands will make use of the same basecalling results but create different .bam files. 
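Because the target path fully encodes the tool chain, target lists can also be generated programmatically and passed to a single Snakemake invocation. A minimal sketch, using the tag and reference from the example above:

```python
# build one target per aligner, all sharing the same guppy basecalling results
aligners = ["minimap2", "graphmap", "ngmlr"]
sequence_workflow = "guppy"
tag, reference = "WA01", "hg38"

targets = ["alignments/{a}/{w}/{t}.{ref}.bam".format(
               a=a, w=sequence_workflow, t=tag, ref=reference)
           for a in aligners]

print(" ".join(targets))
# alignments/minimap2/guppy/WA01.hg38.bam alignments/graphmap/guppy/WA01.hg38.bam alignments/ngmlr/guppy/WA01.hg38.bam
```

The printed targets can be appended together to one snakemake command line instead of running three separate invocations.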
@@ -99,10 +108,24 @@ In the processing stage Nanopype can create a significant amount of temporary da |--0.hg38.bam |--1.hg38.bam ... - |--20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01.hg38.bam + |--20180101_FAH12345_FLO-MIN106_SQK-LSK108_WA01.hg38.bam |--WA01.hg38.bam ``` -Aside from the final output file *WA01.hg38.bam* the individual .bam files per run and the set of batch alignment files in the *batches* sub folder contain the same information. Depending on the overall processing the intermediate outputs can be deleted. +Aside from the final output file *WA01.hg38.bam* the individual .bam files per run and the set of batch alignment files in the *batches* sub folder contain the same information. Since other modules of the pipeline might later require the batch alignment output as input files, the deletion of the intermediate results is left to the user. + +Nanopype provides a set of cleanup rules to delete the batch processing output. Per module these rules delete everything inside of the **batches** folder. The cleanup is invoked by the rule name from within the working directory for e.g. the basecalling module: + + snakemake --snakefile /path/to/nanopype/Snakefile sequences_clean + +!!! error "Caution" + Deleting the batch output of a module impacts the execution of other workflows as well and should be used with caution at the end of an analysis workflow. Running for instance a structural variation detection requires a merged .bam file. Deleting the batch output of the alignment module afterwards would lead in a later processing of e.g. methylation rates to the reprocessing of the alignment in batches. + +The full list of available cleaning rules is given below: -Since other modules of the pipeline might later require the batch alignment output as input files, the deletion of the intermediate results is left to the user. 
+ * sequences_clean + * alignment_clean + * methylation_clean + * sv_clean + * demux_clean + * transcript_isoforms_clean diff --git a/env.yaml b/env.yaml index 2047148..f689f68 100755 --- a/env.yaml +++ b/env.yaml @@ -1,7 +1,6 @@ references: test: genome: references/chr6.fa - chr_sizes: references/chr6.sizes bin: albacore: read_fast5_basecaller.py diff --git a/requirements.txt b/requirements.txt index ea861aa..21452c3 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -snakemake>=5.3.0 +snakemake>=5.4.3 h5py>=2.7.1 numpy watchdog diff --git a/rules/basecalling.smk b/rules/basecalling.smk index b382633..a594a67 100755 --- a/rules/basecalling.smk +++ b/rules/basecalling.smk @@ -58,7 +58,7 @@ def get_batches_basecaller2(wildcards): rule albacore: input: batch = lambda wildcards : get_signal_batch(wildcards, config), - run = lambda wildcards : os.path.join(config['storage_data_raw'], wildcards.runname) + run = lambda wildcards : [os.path.join(config['storage_data_raw'], wildcards.runname)] + [os.path.join(config['storage_data_raw'], wildcards.runname, 'reads.fofn')] if get_signal_batch(wildcards, config).endswith('.txt') else [] output: "sequences/albacore/batches/{tag, [^\/]*}/{runname, [^.\/]*}/{batch, [^.]*}.{format, (fasta|fastq|fa|fq)}.gz" shadow: "minimal" @@ -71,11 +71,11 @@ rule albacore: kit = lambda wildcards: get_kit(wildcards, config), barcoding = lambda wildcards : '--barcoding' if config['basecalling_albacore_barcoding'] else '', filtering = lambda wildcards : '--disable_filtering' if config['basecalling_albacore_disable_filtering'] else '', - index = lambda wildcards : os.path.join(config['storage_data_raw'], wildcards.runname, 'reads.fofn') + index = lambda wildcards : '--index ' + os.path.join(config['storage_data_raw'], wildcards.runname, 'reads.fofn') if get_signal_batch(wildcards, config).endswith('.txt') else '' shell: """ mkdir -p raw - {config[bin][python]} {config[sbin][storage_fast5Index.py]} extract {input.batch} raw/ --index {params.index} --output_format single + {config[bin][python]} {config[sbin][storage_fast5Index.py]} extract {input.batch} raw/ {params.index} --output_format single {config[bin][albacore]} -i raw/ --recursive -t {threads} -s raw/ --flowcell {params.flowcell} --kit {params.kit} --output_format fastq {params.filtering} {params.barcoding} {config[basecalling_albacore_flags]} FASTQ_DIR='raw/workspace/' if [ \'{params.filtering}\' = '' ]; then @@ -93,7 +93,7 @@ rule albacore: rule guppy: input: batch = lambda wildcards : get_signal_batch(wildcards, config), - run = lambda wildcards : os.path.join(config['storage_data_raw'], wildcards.runname) + run = lambda wildcards : [os.path.join(config['storage_data_raw'], wildcards.runname)] + [os.path.join(config['storage_data_raw'], wildcards.runname, 'reads.fofn')] if get_signal_batch(wildcards, config).endswith('.txt') else [] output: "sequences/guppy/batches/{tag, [^\/]*}/{runname, [^.\/]*}/{batch, [^.]*}.{format, (fasta|fastq|fa|fq)}.gz" shadow: "minimal" @@ -106,13 +106,13 @@ rule guppy: kit = lambda wildcards: get_kit(wildcards, config), #barcoding = lambda wildcards : '--barcoding' if config['basecalling_albacore_barcoding'] else '', filtering = lambda wildcards : '--qscore_filtering --min_qscore {score}'.format(score = config['basecalling_guppy_qscore_filter']) if config['basecalling_guppy_qscore_filter'] > 0 else '', - index = lambda wildcards : os.path.join(config['storage_data_raw'], wildcards.runname, 'reads.fofn') + index = lambda wildcards : '--index ' + 
os.path.join(config['storage_data_raw'], wildcards.runname, 'reads.fofn') if get_signal_batch(wildcards, config).endswith('.txt') else '' singularity: "docker://nanopype/basecalling:{tag}".format(tag=config['version']['tag']) shell: """ mkdir -p raw - {config[bin_singularity][python]} {config[sbin_singularity][storage_fast5Index.py]} extract {input.batch} raw/ --index {params.index} --output_format lazy + {config[bin_singularity][python]} {config[sbin_singularity][storage_fast5Index.py]} extract {input.batch} raw/ {params.index} --output_format lazy {config[bin_singularity][guppy]} -i raw/ --recursive --num_callers 1 --cpu_threads_per_caller {threads} -s workspace/ --flowcell {params.flowcell} --kit {params.kit} {params.filtering} {config[basecalling_guppy_flags]} FASTQ_DIR='workspace/pass' if [ \'{params.filtering}\' = '' ]; then @@ -130,7 +130,7 @@ rule guppy: rule flappie: input: batch = lambda wildcards : get_signal_batch(wildcards, config), - run = lambda wildcards : os.path.join(config['storage_data_raw'], wildcards.runname) + run = lambda wildcards : [os.path.join(config['storage_data_raw'], wildcards.runname)] + [os.path.join(config['storage_data_raw'], wildcards.runname, 'reads.fofn')] if get_signal_batch(wildcards, config).endswith('.txt') else [] output: sequence = "sequences/flappie/batches/{tag, [^\/]*}/{runname, [^.\/]*}/{batch, [^.]*}.{format, (fasta|fastq|fa|fq)}.gz", methyl_marks = "sequences/flappie/batches/{tag, [^\/]*}/{runname, [^.\/]*}/{batch, [^.]*}.{format, (fasta|fastq|fa|fq)}.tsv.gz" @@ -140,14 +140,14 @@ rule flappie: mem_mb = lambda wildcards, threads, attempt: int((1.0 + (0.1 * (attempt - 1))) * (4000 + 5000 * threads)), time_min = lambda wildcards, threads, attempt: int((5760 / threads) * attempt) # 360 min / 16 threads params: - index = lambda wildcards : os.path.join(config['storage_data_raw'], wildcards.runname, 'reads.fofn') + index = lambda wildcards : '--index ' + os.path.join(config['storage_data_raw'], wildcards.runname, 'reads.fofn') if get_signal_batch(wildcards, config).endswith('.txt') else '' singularity: "docker://nanopype/basecalling:{tag}".format(tag=config['version']['tag']) shell: """ export OPENBLAS_NUM_THREADS=1 mkdir -p raw - {config[bin_singularity][python]} {config[sbin_singularity][storage_fast5Index.py]} extract {input.batch} raw/ --index {params.index} --output_format single + {config[bin_singularity][python]} {config[sbin_singularity][storage_fast5Index.py]} extract {input.batch} raw/ {params.index} --output_format single find raw/ -regextype posix-extended -regex '^.*fast5' -type f -exec du -h {{}} + | sort -r -h | cut -f2 > raw.fofn split -e -n r/{threads} raw.fofn raw.fofn.part. 
ls raw.fofn.part.* | xargs -n 1 -P {threads} -I {{}} $SHELL -c 'cat {{}} | shuf | xargs -n 1 {config[bin_singularity][flappie]} --model {config[basecalling_flappie_model]} {config[basecalling_flappie_flags]} > raw/{{}}.fastq' diff --git a/rules/clean.smk b/rules/clean.smk index 2c48171..39ea713 100755 --- a/rules/clean.smk +++ b/rules/clean.smk @@ -34,7 +34,7 @@ # imports import os, sys, glob # local rules -localrules: basecaller_merge_batches, basecaller_merge_tag, basecaller_qc, basecalling_clean +localrules: sequences_clean, alignment_clean, methylation_clean, sv_clean, demux_clean, transcript_isoforms_clean, clean # remove flag files to force re-run of cleanup @@ -43,7 +43,7 @@ for f in glob.glob('.*.done'): # clean up compute batches basecalling -rule basecalling_clean: +rule sequences_clean: input: [dirpath for dirpath, _, files in os.walk('sequences') if dirpath.endswith('batches')] output: @@ -68,7 +68,7 @@ rule methylation_clean: touch('.methylation_clean.done') shell: "rm -r {input}" - + # clean up compute batches sv rule sv_clean: input: @@ -99,11 +99,11 @@ rule transcript_isoforms_clean: # clean up everything rule clean: input: - rules.basecalling_clean.output, + rules.sequences_clean.output, rules.alignment_clean.output, rules.methylation_clean.output, rules.sv_clean.output, rules.demux_clean.output, rules.transcript_isoforms_clean.output shell: - "rm {input}" \ No newline at end of file + "rm {input}" diff --git a/rules/methylation.smk b/rules/methylation.smk index 18bdc81..60807fc 100755 --- a/rules/methylation.smk +++ b/rules/methylation.smk @@ -87,13 +87,13 @@ rule methylation_nanopolish: mem_mb = lambda wildcards, input, threads, attempt: int((1.0 + (0.1 * (attempt - 1))) * (8000 + 500 * threads)), time_min = lambda wildcards, input, threads, attempt: int((960 / threads) * attempt) # 60 min / 16 threads params: - index = lambda wildcards : os.path.join(config['storage_data_raw'], wildcards.runname, 'reads.fofn') + index = lambda wildcards : '--index ' + os.path.join(config['storage_data_raw'], wildcards.runname, 'reads.fofn') if get_signal_batch(wildcards, config).endswith('.txt') else '' singularity: "docker://nanopype/methylation:{tag}".format(tag=config['version']['tag']) shell: """ mkdir -p raw - {config[bin][python]} {config[sbin][storage_fast5Index.py]} extract {input.batch} raw/ --index {params.index} --output_format lazy + {config[bin][python]} {config[sbin][storage_fast5Index.py]} extract {input.batch} raw/ {params.index} --output_format lazy zcat {input.sequences} > sequences.fastx {config[bin_singularity][nanopolish]} index -d raw/ sequences.fastx {config[bin_singularity][nanopolish]} call-methylation -t {threads} -r sequences.fastx -g {input.reference} -b {input.bam} > {output} diff --git a/rules/storage.smk b/rules/storage.smk index b861ab3..ebdd5a9 100755 --- a/rules/storage.smk +++ b/rules/storage.smk @@ -84,35 +84,11 @@ rule storage_extract: names = "subset/{tag}.txt" output: directory("subset/{tag, [^.\/]*}/{runname, [^.\/]*}") - run: - import os, itertools, tarfile - # read target names - ids = [] - with open(input.names, 'r') as fp: - for line in fp: - ids.append(line.strip()) - # read index - records = {} - with open(input.index, 'r') as fp: - records = {read_ID:filename for filename, read_ID in [line.rstrip().split('\t')[0:2] for line in fp]} - # lookup file locations per ID - batches = [file.rpartition(".tar/") for file in [records[id] for id in ids if id in records]] - batches.sort(key = lambda x : x[0]) - # group and extract per batch - for 
archive, batch in itertools.groupby(batches, key= lambda x : x[0]): - tarFiles = [os.path.basename(x[2]) for x in batch] - try: - with tarfile.open(os.path.join(config["storage_data_raw"], wildcards.runname, archive + ".tar")) as tar: - tar_members = tar.getmembers() - for tar_member in tar_members: - if any(s in tar_member.name for s in tarFiles): - try: - tar_member.name = os.path.basename(tar_member.name) - tar.extract(tar_member, path=output[0]) - except: - print("Failed to extract " + tar_member.name + 'from batch' + archive) - except: - print("Failed to open batch " + archive) + shell: + """ + mkdir -p {output} + {config[bin][python]} {config[sbin][storage_fast5Index.py]} extract {input.names} {output} --index {input.index} --output_format bulk + """ # extract reads from indexed runs diff --git a/setup.py b/setup.py index a40efe9..6ef9a02 100755 --- a/setup.py +++ b/setup.py @@ -61,7 +61,7 @@ def run(self): setup( name='nanopype', - version='0.4.1', + version='0.6.0', author='Pay Giesselmann', author_email='giesselmann@molgen.mpg.de', description='Nanopore data processing workflows', @@ -69,7 +69,7 @@ def run(self): url='https://github.com/giesselmann/nanopype', license='LICENSE', python_requires='>=3.4', - install_requires=['snakemake>=5.3.0', 'numpy', 'h5py>=2.7.1', 'watchdog'], + install_requires=['snakemake>=5.4.3', 'numpy', 'h5py>=2.7.1', 'watchdog'], include_package_data=True, scripts=['scripts/nanopype_import.py',], zip_safe=False,