From f1364d023ae33f983ec51f2bfc8bed451b5d0024 Mon Sep 17 00:00:00 2001 From: Charlie Simkin Date: Thu, 11 Jan 2024 12:09:46 -0500 Subject: [PATCH] got the setup working --- snakemake/wrangler_by_sample_finish.smk | 196 +++++++++++++++------- snakemake/wrangler_by_sample_setup.smk | 211 ++++++------------------ user_scripts/wrangler_by_sample.sh | 13 +- 3 files changed, 201 insertions(+), 219 deletions(-) diff --git a/snakemake/wrangler_by_sample_finish.smk b/snakemake/wrangler_by_sample_finish.smk index 4f27261..5dc09d4 100644 --- a/snakemake/wrangler_by_sample_finish.smk +++ b/snakemake/wrangler_by_sample_finish.smk @@ -1,81 +1,159 @@ -''' -creates a mip_ids folder and an allMipsSamplesNames.tab.txt file. extracts mips, -corrects mips, and generates files that can be used to determine sample names as -well as sample names that had extractable data. -''' - configfile: 'wrangler_by_sample.yaml' -output=config['output_folder'] +output='/opt/analysis' + +all_samples, all_targets=[],[] + +for line_number, line in enumerate(open(output+'/mip_ids/allMipsSamplesNames.tab.txt')): + if line_number>0: + line=line.rstrip().split('\t') + if len(line)>1 and len(line[1])>0: + all_samples.append(line[1]) + if len(line[0])>0: + all_targets.append(line[0]) + +final_dict={1: expand(output+'/analysis/{sample}/{sample}_mipExtraction/log.txt', sample=all_samples), + 2: expand(output+'/analysis/{sample}/{sample}_mipBarcodeCorrection/barcodeFilterStats.tab.txt', sample=all_samples), + 3: output+'/analysis/logs/mipCorrectForContamWithSameBarcodes_run1.json', + 4: expand(output+'/clustering_status/{sample}_mip_clustering_finished.txt', sample=all_samples), + 5: expand(output+'/analysis/populationClustering/{target}/analysis/log.txt', target=all_targets), + 6: output+'/allInfo.tsv.gz'} +output_choice=config['output_choice'] +final_out=final_dict[output_choice] rule all: input: - setup_finished=output+'/setup_finished.txt', -# good_samples=output+'/successfully_extracted_samples.txt', - output_configfile=output+'/snakemake_params/wrangler_by_sample.yaml' + final_out -rule copy_files: +rule extract_by_arm: input: - setup_snakefile='setup_run.smk', - finish_snakefile='finish_run.smk', - input_configfile='wrangler_by_sample.yaml', - in_scripts='scripts' + params: + output_dir='/opt/analysis/analysis', +# wrangler_dir=output, +# fastq_dir=config['fastq_dir'] + resources: + time_min=240 output: - setup_snakefile=output+'/snakemake_params/setup_run.smk', - finish_snakefile=output+'/snakemake_params/finish_run.smk', - output_configfile=output+'/snakemake_params/wrangler_by_sample.yaml', - out_scripts=directory(output+'/snakemake_params/scripts') + output+'/analysis/{sample}/{sample}_mipExtraction/log.txt' shell: ''' - cp {input.setup_snakefile} {output.setup_snakefile} - cp {input.finish_snakefile} {output.finish_snakefile} - cp {input.input_configfile} {output.output_configfile} - cp -r {input.in_scripts} {output.out_scripts} + MIPWrangler mipExtractByArm --masterDir {params.output_dir} --sample {wildcards.sample} --overWriteDirs --minCaptureLength=30 ''' +if config['downsample_umi_count']<2**32: + rule mip_barcode_correction: + input: + good_samples=expand(output+'/analysis/{sample}/{sample}_mipExtraction/log.txt', sample=all_samples) + params: + output_dir='/opt/analysis/analysis', +# wrangler_dir=output, +# sif_file=config['miptools_sif'], + downsample_seed=config['downsample_seed'], + downsample_amount=config['downsample_umi_count'] + resources: + mem_mb=config['memory_mb_per_step'], + time_min=20 + output: + barcode_corrections_finished=output+'/analysis/{sample}/{sample}_mipBarcodeCorrection/barcodeFilterStats.tab.txt' + shell: + ''' + MIPWrangler mipBarcodeCorrection --masterDir {params.output_dir} \ + --downSampleAmount {params.downsample_amount} --downSampleSeed \ + {params.downsample_seed} --overWriteDirs --sample {wildcards.sample} + ''' +else: + rule mip_barcode_correction: + input: + good_samples=expand(output+'/analysis/{sample}/{sample}_mipExtraction/log.txt', sample=all_samples) + params: + output_dir='/opt/analysis/analysis', +# wrangler_dir=output, +# sif_file=config['miptools_sif'], + downsample_seed=config['downsample_seed'], + resources: + mem_mb=config['memory_mb_per_step'], + time_min=20 + output: + barcode_corrections_finished=output+'/analysis/{sample}/{sample}_mipBarcodeCorrection/barcodeFilterStats.tab.txt' + shell: + ''' + MIPWrangler mipBarcodeCorrection --masterDir {params.output_dir} \ + --doNotDownSample --downSampleSeed \ + {params.downsample_seed} --overWriteDirs --sample {wildcards.sample} + ''' -rule generate_mip_files: - ''' - given that I'm repackaging miptools wrangler (so wrangler.sh is not needed) - and that the existing generate_wrangler_scripts.py seems unnecessarily - convoluted and that only two files are needed by subsequent steps - (mipArms.txt and allMipsSamplesNames.tab.txt) I wrote my own - script for this. Input is an arms file and a sample sheet. Output is an arms - file with rearranged columns and a two column file with names of all mips - and names of all samples (with no pairing between columns of any given row). - ''' + +rule correct_for_same_barcode_contam: input: - arms_file=config['project_resources']+'/mip_ids/mip_arms.txt', - sample_sheet=config['input_sample_sheet'], - fastq_folder=config['fastq_dir'] + all_corrected=expand(output+'/analysis/{sample}/{sample}_mipBarcodeCorrection/barcodeFilterStats.tab.txt', sample=all_samples) params: - sample_set=config['sample_set_used'], - probe_sets=config['probe_sets_used'] + output_dir='/opt/analysis/analysis', +# wrangler_dir=output, +# sif_file=config['miptools_sif'], + resources: + mem_mb=40000, + time_min=1440, + nodes=20 + threads: 20 output: - mip_arms=output+'/mip_ids/mipArms.txt', - sample_file=output+'/mip_ids/allMipsSamplesNames.tab.txt', - sample_sheet=output+'/sample_sheet.tsv' - script: - 'scripts/generate_mip_files.py' + #name is controlled by --logFile + corrected_barcode_marker=output+'/analysis/logs/mipCorrectForContamWithSameBarcodes_run1.json' + shell: + ''' + MIPWrangler mipCorrectForContamWithSameBarcodesMultiple --masterDir {params.output_dir} --numThreads {threads} --overWriteDirs --overWriteLog --logFile mipCorrectForContamWithSameBarcodes_run1 + ''' -rule setup: +rule mip_clustering: input: - mip_arms=output+'/mip_ids/mipArms.txt', - sample_file=output+'/mip_ids/allMipsSamplesNames.tab.txt' + corrected_barcode_marker=output+'/analysis/logs/mipCorrectForContamWithSameBarcodes_run1.json', + #sample_dir=output+'/analysis/{sample}' params: output_dir='/opt/analysis/analysis', - project_resources=config['project_resources'], - wrangler_dir=output, - sif_file=config['miptools_sif'], - fastq_dir=config['fastq_dir'] +# wrangler_dir=output, +# sif_file=config['miptools_sif'] + resources: + mem_mb=config['memory_mb_per_step'], + time_min=60, output: - setup_finished=output+'/setup_finished.txt' - threads: config['cpu_count'] + mip_clustering=output+'/clustering_status/{sample}_mip_clustering_finished.txt' shell: ''' - singularity exec \ - -B {params.project_resources}:/opt/project_resources \ - -B {params.wrangler_dir}:/opt/analysis \ - -B {params.fastq_dir}:/opt/data \ - {params.sif_file} \ - MIPWrangler mipSetup --mipArmsFilename /opt/analysis/mip_ids/mipArms.txt --mipSampleFile /opt/analysis/mip_ids/allMipsSamplesNames.tab.txt --numThreads {threads} --masterDir {params.output_dir} --dir /opt/data --mipServerNumber 1 - touch {output.setup_finished} + MIPWrangler mipClustering --masterDir {params.output_dir} --overWriteDirs --par /opt/resources/clustering_pars/illumina_collapseHomoploymers.pars.txt --countEndGaps --sample {wildcards.sample} + touch {output.mip_clustering} ''' + +rule pop_cluster_target: + input: + mip_cluster_files=expand(output+'/clustering_status/{sample}_mip_clustering_finished.txt', sample=all_samples) + params: + output_dir='/opt/analysis/analysis', +# wrangler_dir=output, +# sif_file=config['miptools_sif'] + resources: + mem_mb=config['memory_mb_per_step'], + time_min=60, + output: + pop_clustering=output+'/analysis/populationClustering/{target}/analysis/log.txt' + shell: + ''' + MIPWrangler mipPopulationClustering --keepIntermediateFiles --masterDir {params.output_dir} --overWriteDirs --cutoff 0 --countEndGaps --fraccutoff 0.005 --mipName {wildcards.target} + touch {output.pop_clustering} + ''' + +rule output_final_table: + ''' + cat together output files of previous step into a final file, do a "natural + sort" to sort things similar to how Nick's are output. gzip it + ''' + input: + pop_clustering=expand(output+'/analysis/populationClustering/{target}/analysis/log.txt', target=all_targets) +# final_sample_outputs=expand('/path/to/sample/outputs/{sample}.something', sample=sample_list) + params: + all_targets=all_targets, + prefix=output+'/analysis/populationClustering/', + suffix='/analysis/selectedClustersInfo.tab.txt.gz' + resources: + mem_mb=20000, + time_min=480 + output: + final_table=output+'/allInfo.tsv.gz' + script: + 'scripts/output_final_table.py' diff --git a/snakemake/wrangler_by_sample_setup.smk b/snakemake/wrangler_by_sample_setup.smk index 58ed542..ee21195 100644 --- a/snakemake/wrangler_by_sample_setup.smk +++ b/snakemake/wrangler_by_sample_setup.smk @@ -1,179 +1,76 @@ -configfile: 'wrangler_by_sample.yaml' -output=config['output_folder'] - -all_samples, all_targets=[],[] +''' +creates a mip_ids folder and an allMipsSamplesNames.tab.txt file. extracts mips, +corrects mips, and generates files that can be used to determine sample names as +well as sample names that had extractable data. +''' -for line_number, line in enumerate(open(output+'/mip_ids/allMipsSamplesNames.tab.txt')): - if line_number>0: - line=line.rstrip().split('\t') - if len(line)>1 and len(line[1])>0: - all_samples.append(line[1]) - if len(line[0])>0: - all_targets.append(line[0]) - -final_dict={1: expand(output+'/analysis/{sample}/{sample}_mipExtraction/log.txt', sample=all_samples), - 2: expand(output+'/analysis/{sample}/{sample}_mipBarcodeCorrection/barcodeFilterStats.tab.txt', sample=all_samples), - 3: output+'/analysis/logs/mipCorrectForContamWithSameBarcodes_run1.json', - 4: expand(output+'/clustering_status/{sample}_mip_clustering_finished.txt', sample=all_samples), - 5: expand(output+'/analysis/populationClustering/{target}/analysis/log.txt', target=all_targets), - 6: output+'/allInfo.tsv.gz'} -output_choice=config['output_choice'] -final_out=final_dict[output_choice] +configfile: 'wrangler_by_sample.yaml' +output='/opt/analysis' rule all: input: - final_out + setup_finished=output+'/setup_finished.txt', +# good_samples=output+'/successfully_extracted_samples.txt', + output_configfile=output+'/snakemake_params/wrangler_by_sample.yaml' -rule extract_by_arm: +rule copy_files: input: - params: - output_dir='/opt/analysis/analysis', - wrangler_dir=output, - sif_file=config['miptools_sif'], - fastq_dir=config['fastq_dir'] - resources: - time_min=240 - output: - output+'/analysis/{sample}/{sample}_mipExtraction/log.txt' - shell: - ''' - singularity exec \ - -B {params.fastq_dir}:/opt/data \ - -B {params.wrangler_dir}:/opt/analysis \ - {params.sif_file} \ - MIPWrangler mipExtractByArm --masterDir {params.output_dir} --sample {wildcards.sample} --overWriteDirs --minCaptureLength=30 - ''' -if config['downsample_umi_count']<2**32: - rule mip_barcode_correction: - input: - good_samples=expand(output+'/analysis/{sample}/{sample}_mipExtraction/log.txt', sample=all_samples) - params: - output_dir='/opt/analysis/analysis', - wrangler_dir=output, - sif_file=config['miptools_sif'], - downsample_seed=config['downsample_seed'], - downsample_amount=config['downsample_umi_count'] - resources: - mem_mb=config['memory_mb_per_step'], - time_min=20 - output: - barcode_corrections_finished=output+'/analysis/{sample}/{sample}_mipBarcodeCorrection/barcodeFilterStats.tab.txt' - shell: - ''' - singularity exec \ - -B {params.wrangler_dir}:/opt/analysis \ - {params.sif_file} \ - MIPWrangler mipBarcodeCorrection --masterDir {params.output_dir} \ - --downSampleAmount {params.downsample_amount} --downSampleSeed \ - {params.downsample_seed} --overWriteDirs --sample {wildcards.sample} - ''' -else: - rule mip_barcode_correction: - input: - good_samples=expand(output+'/analysis/{sample}/{sample}_mipExtraction/log.txt', sample=all_samples) - params: - output_dir='/opt/analysis/analysis', - wrangler_dir=output, - sif_file=config['miptools_sif'], - downsample_seed=config['downsample_seed'], - resources: - mem_mb=config['memory_mb_per_step'], - time_min=20 - output: - barcode_corrections_finished=output+'/analysis/{sample}/{sample}_mipBarcodeCorrection/barcodeFilterStats.tab.txt' - shell: - ''' - singularity exec \ - -B {params.wrangler_dir}:/opt/analysis \ - {params.sif_file} \ - MIPWrangler mipBarcodeCorrection --masterDir {params.output_dir} \ - --doNotDownSample --downSampleSeed \ - {params.downsample_seed} --overWriteDirs --sample {wildcards.sample} - ''' - - -rule correct_for_same_barcode_contam: - input: - all_corrected=expand(output+'/analysis/{sample}/{sample}_mipBarcodeCorrection/barcodeFilterStats.tab.txt', sample=all_samples) - params: - output_dir='/opt/analysis/analysis', - wrangler_dir=output, - sif_file=config['miptools_sif'], - resources: - mem_mb=40000, - time_min=1440, - nodes=20 - threads: 20 + setup_snakefile='/opt/snakemake/wrangler_by_sample_setup.smk', + finish_snakefile='/opt/snakemake/wrangler_by_sample_finish.smk', + input_configfile='wrangler_by_sample.yaml', + in_scripts='/opt/snakemake/scripts' output: - #name is controlled by --logFile - corrected_barcode_marker=output+'/analysis/logs/mipCorrectForContamWithSameBarcodes_run1.json' + setup_snakefile=output+'/snakemake_params/setup_run.smk', + finish_snakefile=output+'/snakemake_params/finish_run.smk', + output_configfile=output+'/snakemake_params/wrangler_by_sample.yaml', + out_scripts=directory(output+'/snakemake_params/scripts') shell: ''' - singularity exec \ - -B {params.wrangler_dir}:/opt/analysis \ - {params.sif_file} \ - MIPWrangler mipCorrectForContamWithSameBarcodesMultiple --masterDir {params.output_dir} --numThreads {threads} --overWriteDirs --overWriteLog --logFile mipCorrectForContamWithSameBarcodes_run1 + cp {input.setup_snakefile} {output.setup_snakefile} + cp {input.finish_snakefile} {output.finish_snakefile} + cp {input.input_configfile} {output.output_configfile} + cp -r {input.in_scripts} {output.out_scripts} ''' -rule mip_clustering: +rule generate_mip_files: + ''' + given that I'm repackaging miptools wrangler (so wrangler.sh is not needed) + and that the existing generate_wrangler_scripts.py seems unnecessarily + convoluted and that only two files are needed by subsequent steps + (mipArms.txt and allMipsSamplesNames.tab.txt) I wrote my own + script for this. Input is an arms file and a sample sheet. Output is an arms + file with rearranged columns and a two column file with names of all mips + and names of all samples (with no pairing between columns of any given row). + ''' input: - corrected_barcode_marker=output+'/analysis/logs/mipCorrectForContamWithSameBarcodes_run1.json', - #sample_dir=output+'/analysis/{sample}' + arms_file='/opt/project_resources/mip_ids/mip_arms.txt', + sample_sheet='/opt/input_sample_sheet_directory/'+config['input_sample_sheet_name'], + fastq_folder='/opt/data' params: - output_dir='/opt/analysis/analysis', - wrangler_dir=output, - sif_file=config['miptools_sif'] - resources: - mem_mb=config['memory_mb_per_step'], - time_min=60, + sample_set=config['sample_set_used'], + probe_sets=config['probe_sets_used'] output: - mip_clustering=output+'/clustering_status/{sample}_mip_clustering_finished.txt' - shell: - ''' - singularity exec \ - -B {params.wrangler_dir}:/opt/analysis \ - {params.sif_file} \ - MIPWrangler mipClustering --masterDir {params.output_dir} --overWriteDirs --par /opt/resources/clustering_pars/illumina_collapseHomoploymers.pars.txt --countEndGaps --sample {wildcards.sample} - touch {output.mip_clustering} - ''' + mip_arms=output+'/mip_ids/mipArms.txt', + sample_file=output+'/mip_ids/allMipsSamplesNames.tab.txt', + sample_sheet=output+'/sample_sheet.tsv' + script: + 'scripts/generate_mip_files.py' -rule pop_cluster_target: +rule setup: input: - mip_cluster_files=expand(output+'/clustering_status/{sample}_mip_clustering_finished.txt', sample=all_samples) + mip_arms=output+'/mip_ids/mipArms.txt', + sample_file=output+'/mip_ids/allMipsSamplesNames.tab.txt' params: output_dir='/opt/analysis/analysis', - wrangler_dir=output, - sif_file=config['miptools_sif'] - resources: - mem_mb=config['memory_mb_per_step'], - time_min=60, + project_resources='/opt/project_resources', +# wrangler_dir=output, +# sif_file=config['miptools_sif'], + fastq_dir='/opt/data' output: - pop_clustering=output+'/analysis/populationClustering/{target}/analysis/log.txt' + setup_finished=output+'/setup_finished.txt' + threads: config['cpu_count'] shell: ''' - singularity exec \ - -B {params.wrangler_dir}:/opt/analysis \ - {params.sif_file} \ - MIPWrangler mipPopulationClustering --keepIntermediateFiles --masterDir {params.output_dir} --overWriteDirs --cutoff 0 --countEndGaps --fraccutoff 0.005 --mipName {wildcards.target} - touch {output.pop_clustering} + MIPWrangler mipSetup --mipArmsFilename /opt/analysis/mip_ids/mipArms.txt --mipSampleFile /opt/analysis/mip_ids/allMipsSamplesNames.tab.txt --numThreads {threads} --masterDir {params.output_dir} --dir /opt/data --mipServerNumber 1 + touch {output.setup_finished} ''' - -rule output_final_table: - ''' - cat together output files of previous step into a final file, do a "natural - sort" to sort things similar to how Nick's are output. gzip it - ''' - input: - pop_clustering=expand(output+'/analysis/populationClustering/{target}/analysis/log.txt', target=all_targets) -# final_sample_outputs=expand('/path/to/sample/outputs/{sample}.something', sample=sample_list) - params: - all_targets=all_targets, - prefix=output+'/analysis/populationClustering/', - suffix='/analysis/selectedClustersInfo.tab.txt.gz' - resources: - mem_mb=20000, - time_min=480 - output: - final_table=output+'/allInfo.tsv.gz' - script: - 'scripts/output_final_table.py' diff --git a/user_scripts/wrangler_by_sample.sh b/user_scripts/wrangler_by_sample.sh index 597a606..c37a5ee 100644 --- a/user_scripts/wrangler_by_sample.sh +++ b/user_scripts/wrangler_by_sample.sh @@ -36,14 +36,21 @@ eval $(parse_yaml wrangler_by_sample.yaml) ########################## # create output directory if it doesn't exist -mkdir -p $output_directory +mkdir -p $output_folder # define singularity bindings and snakemake arguments to be used each time snakemake is called singularity_bindings="-B $project_resources:/opt/project_resources -B $output_folder:/opt/analysis -B $input_sample_sheet_directory:/opt/input_sample_sheet_directory - -B $fastq_dir:/opt/fastq_dir + -B $fastq_dir:/opt/data -B /home/charlie/projects/MIPTools_wrangler_in_sif/snakemake:/opt/snakemake -H $newhome" -snakemake_args="--cores $cpu_count --keep-going --rerun-incomplete --latency-wait 60" \ No newline at end of file +snakemake_args="--cores $cpu_count --keep-going --rerun-incomplete --latency-wait 60" + +################################## +# Step 1: Check Run Stats +################################# +singularity exec \ + $singularity_bindings \ + $miptools_sif snakemake -s /opt/snakemake/wrangler_by_sample_setup.smk $snakemake_args \ No newline at end of file