Skip to content

Commit

Permalink
got the setup working
Browse files Browse the repository at this point in the history
  • Loading branch information
charliesimkin authored and alfredsimkin committed Jan 16, 2024
1 parent b23bec2 commit f1364d0
Show file tree
Hide file tree
Showing 3 changed files with 201 additions and 219 deletions.
196 changes: 137 additions & 59 deletions snakemake/wrangler_by_sample_finish.smk
Original file line number Diff line number Diff line change
@@ -1,81 +1,159 @@
'''
Finishing stage of the per-sample MIPWrangler pipeline. Reads the
mip_ids/allMipsSamplesNames.tab.txt manifest produced by the setup stage,
then (depending on the configured output_choice) extracts mips, corrects
mip barcodes, clusters reads, and emits a final combined table.
'''
# NOTE(review): this text appears to be a rendered diff — indentation has been
# stripped and some adjacent lines below look like the old and new sides of
# the same hunk; confirm against the applied file before relying on it.

configfile: 'wrangler_by_sample.yaml'
output=config['output_folder']
# NOTE(review): the reassignment below overrides the config value on the line
# above — these look like both sides of a diff; only one should remain.
output='/opt/analysis'

all_samples, all_targets=[],[]

# Parse the two-column manifest written by the setup stage; line 0 is a
# header. The columns are NOT paired row-by-row: column 0 accumulates target
# (mip) names and column 1 accumulates sample names independently.
for line_number, line in enumerate(open(output+'/mip_ids/allMipsSamplesNames.tab.txt')):
if line_number>0:
line=line.rstrip().split('\t')
if len(line)>1 and len(line[1])>0:
all_samples.append(line[1])
if len(line[0])>0:
all_targets.append(line[0])

# Map each output_choice (1-6) to the terminal file(s) that choice requires,
# so `rule all` can stop the DAG at any intermediate pipeline stage.
final_dict={1: expand(output+'/analysis/{sample}/{sample}_mipExtraction/log.txt', sample=all_samples),
2: expand(output+'/analysis/{sample}/{sample}_mipBarcodeCorrection/barcodeFilterStats.tab.txt', sample=all_samples),
3: output+'/analysis/logs/mipCorrectForContamWithSameBarcodes_run1.json',
4: expand(output+'/clustering_status/{sample}_mip_clustering_finished.txt', sample=all_samples),
5: expand(output+'/analysis/populationClustering/{target}/analysis/log.txt', target=all_targets),
6: output+'/allInfo.tsv.gz'}
output_choice=config['output_choice']
final_out=final_dict[output_choice]

# Top-level target: requests whichever final file(s) output_choice selected.
# NOTE(review): the named inputs (setup_finished / output_configfile) and the
# bare `final_out` line appear to be the old and new sides of a diff hunk;
# as written together this would not be valid Snakemake — confirm that the
# applied version keeps only `final_out`.
rule all:
input:
setup_finished=output+'/setup_finished.txt',
# good_samples=output+'/successfully_extracted_samples.txt',
output_configfile=output+'/snakemake_params/wrangler_by_sample.yaml'
final_out

# NOTE(review): two rule headers appear here — the old `rule copy_files:` and
# the new `rule extract_by_arm:` — merged by the diff render; only one should
# exist in the applied file. The input section below belongs to the old
# copy_files rule (snakefiles/config/scripts to archive into the run folder).
rule copy_files:
rule extract_by_arm:
input:
setup_snakefile='setup_run.smk',
finish_snakefile='finish_run.smk',
input_configfile='wrangler_by_sample.yaml',
in_scripts='scripts'

This comment has been minimized.

Copy link
@alfredsimkin

alfredsimkin Jan 16, 2024

Contributor

To do: add the mip_ids folder (created in the setup_run step) as an official input to this rule, so that it checks whether the necessary input mip arms and sampleNames.txt files exist and alerts the user if the input is not there.

# NOTE(review): continuation of the merged copy_files/extract_by_arm hunk —
# the output section mixes the old snakemake_params copies with the new
# per-sample mipExtraction log, and the shell mixes the old `cp` commands
# with the new MIPWrangler mipExtractByArm call; confirm against the applied
# file, which should keep only the extract_by_arm lines.
params:
output_dir='/opt/analysis/analysis',
# wrangler_dir=output,
# fastq_dir=config['fastq_dir']
resources:
time_min=240
output:
setup_snakefile=output+'/snakemake_params/setup_run.smk',
finish_snakefile=output+'/snakemake_params/finish_run.smk',
output_configfile=output+'/snakemake_params/wrangler_by_sample.yaml',
out_scripts=directory(output+'/snakemake_params/scripts')
output+'/analysis/{sample}/{sample}_mipExtraction/log.txt'
shell:
'''
cp {input.setup_snakefile} {output.setup_snakefile}
cp {input.finish_snakefile} {output.finish_snakefile}
cp {input.input_configfile} {output.output_configfile}
cp -r {input.in_scripts} {output.out_scripts}
MIPWrangler mipExtractByArm --masterDir {params.output_dir} --sample {wildcards.sample} --overWriteDirs --minCaptureLength=30
'''
# Barcode (UMI) correction. One of two rule variants is defined at parse
# time: if the configured UMI count is below 2**32 it is treated as a real
# downsampling limit (--downSampleAmount); otherwise downsampling is skipped
# (--doNotDownSample). Both variants share the same inputs and outputs, so
# downstream rules are unaffected by which branch is chosen.
if config['downsample_umi_count']<2**32:
rule mip_barcode_correction:
input:
# require extraction to have finished for every sample first
good_samples=expand(output+'/analysis/{sample}/{sample}_mipExtraction/log.txt', sample=all_samples)
params:
output_dir='/opt/analysis/analysis',
# wrangler_dir=output,
# sif_file=config['miptools_sif'],
downsample_seed=config['downsample_seed'],
downsample_amount=config['downsample_umi_count']
resources:
mem_mb=config['memory_mb_per_step'],
time_min=20
output:
barcode_corrections_finished=output+'/analysis/{sample}/{sample}_mipBarcodeCorrection/barcodeFilterStats.tab.txt'
shell:
'''
MIPWrangler mipBarcodeCorrection --masterDir {params.output_dir} \
--downSampleAmount {params.downsample_amount} --downSampleSeed \
{params.downsample_seed} --overWriteDirs --sample {wildcards.sample}
'''
else:
# downsample_umi_count >= 2**32 means "no limit": run without downsampling.
rule mip_barcode_correction:
input:
good_samples=expand(output+'/analysis/{sample}/{sample}_mipExtraction/log.txt', sample=all_samples)
params:
output_dir='/opt/analysis/analysis',
# wrangler_dir=output,
# sif_file=config['miptools_sif'],
# seed is still passed even though downsampling is disabled
downsample_seed=config['downsample_seed'],
resources:
mem_mb=config['memory_mb_per_step'],
time_min=20
output:
barcode_corrections_finished=output+'/analysis/{sample}/{sample}_mipBarcodeCorrection/barcodeFilterStats.tab.txt'
shell:
'''
MIPWrangler mipBarcodeCorrection --masterDir {params.output_dir} \
--doNotDownSample --downSampleSeed \
{params.downsample_seed} --overWriteDirs --sample {wildcards.sample}
'''

# NOTE(review): two rules appear merged here by the diff render — the old
# `rule generate_mip_files:` (script-based, producing mipArms.txt and
# allMipsSamplesNames.tab.txt) and the new `rule correct_for_same_barcode_contam:`
# (shell-based MIPWrangler contamination correction). Confirm the applied
# file keeps only the correct_for_same_barcode_contam lines.
rule generate_mip_files:
'''
Given that miptools wrangler is being repackaged (so wrangler.sh is not
needed), that the existing generate_wrangler_scripts.py seems unnecessarily
convoluted, and that only two files are needed by subsequent steps
(mipArms.txt and allMipsSamplesNames.tab.txt), this uses a custom
script. Input is an arms file and a sample sheet. Output is an arms
file with rearranged columns and a two-column file with names of all mips
and names of all samples (with no pairing between columns of any given row).
'''

rule correct_for_same_barcode_contam:
input:
arms_file=config['project_resources']+'/mip_ids/mip_arms.txt',
sample_sheet=config['input_sample_sheet'],
fastq_folder=config['fastq_dir']
all_corrected=expand(output+'/analysis/{sample}/{sample}_mipBarcodeCorrection/barcodeFilterStats.tab.txt', sample=all_samples)
params:
sample_set=config['sample_set_used'],
probe_sets=config['probe_sets_used']
output_dir='/opt/analysis/analysis',
# wrangler_dir=output,
# sif_file=config['miptools_sif'],
resources:
mem_mb=40000,
time_min=1440,
nodes=20
threads: 20
output:
mip_arms=output+'/mip_ids/mipArms.txt',
sample_file=output+'/mip_ids/allMipsSamplesNames.tab.txt',
sample_sheet=output+'/sample_sheet.tsv'
script:
'scripts/generate_mip_files.py'
# the output JSON file name is controlled by --logFile in the shell below
corrected_barcode_marker=output+'/analysis/logs/mipCorrectForContamWithSameBarcodes_run1.json'
shell:
'''
MIPWrangler mipCorrectForContamWithSameBarcodesMultiple --masterDir {params.output_dir} --numThreads {threads} --overWriteDirs --overWriteLog --logFile mipCorrectForContamWithSameBarcodes_run1
'''

# NOTE(review): another merged diff hunk — the old `rule setup:` (singularity
# exec of MIPWrangler mipSetup, touching setup_finished.txt) and the new
# `rule mip_clustering:` (per-sample MIPWrangler mipClustering, touching a
# clustering_status marker) appear interleaved below. Confirm the applied
# file keeps only the mip_clustering lines.
rule setup:
rule mip_clustering:
input:
mip_arms=output+'/mip_ids/mipArms.txt',
sample_file=output+'/mip_ids/allMipsSamplesNames.tab.txt'
# contamination correction must have finished before clustering
corrected_barcode_marker=output+'/analysis/logs/mipCorrectForContamWithSameBarcodes_run1.json',
#sample_dir=output+'/analysis/{sample}'
params:
output_dir='/opt/analysis/analysis',
project_resources=config['project_resources'],
wrangler_dir=output,
sif_file=config['miptools_sif'],
fastq_dir=config['fastq_dir']
# wrangler_dir=output,
# sif_file=config['miptools_sif']
resources:
mem_mb=config['memory_mb_per_step'],
time_min=60,
output:
setup_finished=output+'/setup_finished.txt'
threads: config['cpu_count']
# marker file touched at the end of the shell to record completion
mip_clustering=output+'/clustering_status/{sample}_mip_clustering_finished.txt'
shell:
'''
singularity exec \
-B {params.project_resources}:/opt/project_resources \
-B {params.wrangler_dir}:/opt/analysis \
-B {params.fastq_dir}:/opt/data \
{params.sif_file} \
MIPWrangler mipSetup --mipArmsFilename /opt/analysis/mip_ids/mipArms.txt --mipSampleFile /opt/analysis/mip_ids/allMipsSamplesNames.tab.txt --numThreads {threads} --masterDir {params.output_dir} --dir /opt/data --mipServerNumber 1
touch {output.setup_finished}
MIPWrangler mipClustering --masterDir {params.output_dir} --overWriteDirs --par /opt/resources/clustering_pars/illumina_collapseHomoploymers.pars.txt --countEndGaps --sample {wildcards.sample}
touch {output.mip_clustering}
'''

# Population-level clustering: runs MIPWrangler mipPopulationClustering once
# per {target} wildcard, after every sample's per-sample clustering is done.
rule pop_cluster_target:
input:
# completion markers from mip_clustering for all samples
mip_cluster_files=expand(output+'/clustering_status/{sample}_mip_clustering_finished.txt', sample=all_samples)
params:
output_dir='/opt/analysis/analysis',
# wrangler_dir=output,
# sif_file=config['miptools_sif']
resources:
mem_mb=config['memory_mb_per_step'],
time_min=60,
output:
# touched at the end of the shell command to mark completion
pop_clustering=output+'/analysis/populationClustering/{target}/analysis/log.txt'
shell:
'''
MIPWrangler mipPopulationClustering --keepIntermediateFiles --masterDir {params.output_dir} --overWriteDirs --cutoff 0 --countEndGaps --fraccutoff 0.005 --mipName {wildcards.target}
touch {output.pop_clustering}
'''

rule output_final_table:
'''
Concatenate the per-target outputs of the population-clustering step
(prefix + target + suffix, i.e. each target's selectedClustersInfo table)
into one final table, apply a "natural sort" so rows are ordered similarly
to how Nick's are output, and gzip the result.
'''
input:
# per-target logs mark that population clustering finished for each target
pop_clustering=expand(output+'/analysis/populationClustering/{target}/analysis/log.txt', target=all_targets)
# final_sample_outputs=expand('/path/to/sample/outputs/{sample}.something', sample=sample_list)
params:
all_targets=all_targets,
# each target's table lives at: prefix + target + suffix
prefix=output+'/analysis/populationClustering/',
suffix='/analysis/selectedClustersInfo.tab.txt.gz'
resources:
mem_mb=20000,
time_min=480
output:
final_table=output+'/allInfo.tsv.gz'
script:
'scripts/output_final_table.py'
Loading

0 comments on commit f1364d0

Please sign in to comment.