Wrangler in sif #54

Merged · 12 commits · Jan 16, 2024
49 changes: 49 additions & 0 deletions snakemake/scripts/generate_mip_files.py
@@ -0,0 +1,49 @@
import pandas as pd
import subprocess
import os

arms_file=snakemake.input.arms_file
input_fastq_folder=snakemake.input.fastq_folder
input_sample_sheet=snakemake.input.sample_sheet
desired_sample_set=snakemake.params.sample_set
desired_probe_sets=snakemake.params.probe_sets.replace(' ', '').strip().split(',')
mip_arms=snakemake.output.mip_arms
sample_file=open(snakemake.output.sample_file, 'w')
output_sample_sheet=snakemake.output.sample_sheet
subprocess.call(f'cp {input_sample_sheet} {output_sample_sheet}', shell=True)

#grab only selected columns from original arms file and output them to new arms file
arms_df=pd.read_table(arms_file)
arms_df=arms_df[['mip_id', 'mip_family', 'extension_arm', 'ligation_arm', 'extension_barcode_length', 'ligation_barcode_length', 'gene_name', 'mipset']]
arms_df.to_csv(mip_arms, index=False, sep='\t')
sequenced_samples=[sample.split('_')[0] for sample in os.listdir(input_fastq_folder)]

print('sequenced samples are', sequenced_samples)

samples_used=set([])
for line_number, line in enumerate(open(input_sample_sheet)):
line=line.strip().split('\t')
if line_number==0:
sample_name_c, sample_set_c, replicate_c, probe_set_c=line.index('sample_name'), line.index('sample_set'), line.index('replicate'), line.index('probe_set')
else:
probe_sets=line[probe_set_c].replace(' ', '').strip().split(',')
probe_sets=[entry.upper() for entry in probe_sets]
sample_set=line[sample_set_c].replace(' ', '').strip()
for desired_probe_set in desired_probe_sets:
new_sample_name=f'{line[sample_name_c]}-{line[sample_set_c]}-{line[replicate_c]}'
if new_sample_name in sequenced_samples:
if desired_probe_set.upper() in probe_sets and sample_set.upper()==desired_sample_set.upper():
samples_used.add(new_sample_name)

family_list=sorted(arms_df['mip_family'].tolist())
sample_list=sorted(list(samples_used))

bigger_size=max(len(sample_list), len(family_list))
sample_list=sample_list+['']*(bigger_size-len(sample_list))
family_list=family_list+['']*(bigger_size-len(family_list))

sample_file.write('mips\tsamples\n')
for entry in range(bigger_size):
sample_file.write(f'{family_list[entry]}\t{sample_list[entry]}\n')
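For reference, the two columns of the resulting allMipsSamplesNames.tab.txt are independent lists padded with empty strings to equal length, not paired rows. A hypothetical output with three mip families and two samples (names are illustrative):

mips	samples
mip_family_1	D10-JJJ-2
mip_family_2	D10-JJJ-28
mip_family_3	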
26 changes: 26 additions & 0 deletions snakemake/scripts/get_good_samples.py
@@ -0,0 +1,26 @@
sample_file=snakemake.input.sample_file
good_samples=snakemake.output.good_samples
nested_output=snakemake.params.nested_output

samples=[]
for line in open(sample_file):
line=line.strip().split('\t')
if len(line)>1:
samples.append(line[1])

samples=samples[1:]
summary_dict={}
for sample in samples:
summary_file=nested_output+f'/analysis/{sample}/{sample}_mipExtraction/extractInfoSummary.txt'
for line_number, line in enumerate(open(summary_file)):
if line_number>0:
good_read_count=int(line.strip().split('\t')[6].split('(')[0])
summary_dict[sample]=good_read_count

if len(summary_dict)==len(samples):
#only make output file if all extractions gave data - double checks that all
#extractions really did complete for all samples
good_samples=open(good_samples, 'w')
for sample in summary_dict:
if summary_dict[sample]>0:
good_samples.write(sample+'\n')
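The good-read count comes from the seventh tab-separated column of extractInfoSummary.txt; assuming MIPWrangler's usual count(percent) formatting for that field, a minimal sketch of the parsing above (the example value is hypothetical):

field='1234(56.7%)'
good_read_count=int(field.strip().split('(')[0]) #-> 1234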
19 changes: 19 additions & 0 deletions snakemake/scripts/mip_barcode_correction.py
@@ -0,0 +1,19 @@
import subprocess

sample_fastq=snakemake.input.sample_fastq
sample_output=snakemake.output.sample_output
sample=snakemake.wildcards.sample
#output_dir and threads are assumed to be supplied by the calling rule's params and threads
output_dir=snakemake.params.output_dir
threads=snakemake.threads

#see /nfs/jbailey5/baileyweb/asimkin/miptools/miptools_by_sample_prototyping/output/analysis/analysis/D10-JJJ-28/D10-JJJ-28_mipExtraction/k13_S0_Sub0_mip3_ref/k13_S0_Sub0_mip3_ref.fastq.gz for example UMIs
#wrangler_downsample_umi(wrangler_downsample_list)

#barcode correction. marker file:
#/nfs/jbailey5/baileyweb/asimkin/miptools/miptools_by_sample_prototyping/output/analysis/analysis/D10-JJJ-2/D10-JJJ-2_mipBarcodeCorrection/barcodeFilterStats.tab.txt
subprocess.call(f'MIPWrangler mipBarcodeCorrection --keepIntermediateFiles '
                f'--masterDir {output_dir} --numThreads {threads} '
                f'--overWriteDirs --sample {sample}', shell=True)

#remaining steps are still stubs:
#mip_correct_for_contam_with_same_barcodes(correct_for_contam_list)
#mip_clustering(mip_clustering)

7 changes: 7 additions & 0 deletions snakemake/scripts/mip_clustering.py
@@ -0,0 +1,7 @@
import subprocess

corrected_barcode_marker=snakemake.input.corrected_barcode_marker
mip_cluster_finished=snakemake.output.mip_cluster_finished
sample=snakemake.wildcards.sample
#output_dir and threads are assumed to be supplied by the calling rule's params and threads
output_dir=snakemake.params.output_dir
threads=snakemake.threads

subprocess.call(f'MIPWrangler mipClustering --masterDir {output_dir} --numThreads {threads} '
                f'--overWriteDirs --par /opt/resources/clustering_pars/illumina_collapseHomoploymers.pars.txt '
                f'--countEndGaps --sample {sample}', shell=True)
subprocess.call(f'touch {mip_cluster_finished}', shell=True)

29 changes: 29 additions & 0 deletions snakemake/scripts/output_final_table.py
@@ -0,0 +1,29 @@
'''
concatenates the per-target selectedClustersInfo tables produced by population
clustering into a single gzipped final table
'''
import os
import gzip
#from natsort import natsorted
all_targets=snakemake.params.all_targets
prefix=snakemake.params.prefix
suffix=snakemake.params.suffix
final_table=gzip.open(snakemake.output.final_table, mode='wt')

full_list=[]
header=''
for target in all_targets:
file_path=prefix+target+suffix
if os.path.exists(file_path):
for line_number, line in enumerate(gzip.open(file_path, mode='rt')):
if line_number>0:
line=line.strip().split('\t')
# line[18]='not_shown'
full_list.append(line)
elif len(header)==0:
header=line
#sorted_list=natsorted(full_list)
full_list.sort()

final_table.write(header)
for line in full_list:
final_table.write('\t'.join(line)+'\n')
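If the commented-out natural sort is restored (per the "natural sort" note in the output_final_table rule below), a minimal sketch assuming the natsort package is installed:

from natsort import natsorted
#orders e.g. mip2 before mip10, unlike the plain lexicographic full_list.sort()
full_list=natsorted(full_list)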
19 changes: 19 additions & 0 deletions snakemake/scripts/wrangle_sample.py
@@ -0,0 +1,19 @@
import subprocess

sample_fastq=snakemake.input.sample_fastq
sample_output=snakemake.output.sample_output
sample=snakemake.wildcards.sample
#output_dir and threads are assumed to be supplied by the calling rule's params and threads
output_dir=snakemake.params.output_dir
threads=snakemake.threads

#see /nfs/jbailey5/baileyweb/asimkin/miptools/miptools_by_sample_prototyping/output/analysis/analysis/D10-JJJ-28/D10-JJJ-28_mipExtraction/k13_S0_Sub0_mip3_ref/k13_S0_Sub0_mip3_ref.fastq.gz for example UMIs
#wrangler_downsample_umi(wrangler_downsample_list)

#barcode correction. marker file:
#/nfs/jbailey5/baileyweb/asimkin/miptools/miptools_by_sample_prototyping/output/analysis/analysis/D10-JJJ-2/D10-JJJ-2_mipBarcodeCorrection/barcodeFilterStats.tab.txt
subprocess.call(f'MIPWrangler mipBarcodeCorrection --keepIntermediateFiles '
                f'--masterDir {output_dir} --numThreads {threads} '
                f'--overWriteDirs --sample {sample}', shell=True)

#remaining steps are still stubs:
#mip_correct_for_contam_with_same_barcodes(correct_for_contam_list)
#mip_clustering(mip_clustering)

159 changes: 159 additions & 0 deletions snakemake/wrangler_by_sample_finish.smk
@@ -0,0 +1,159 @@
configfile: 'wrangler_by_sample.yaml'
output='/opt/analysis'

all_samples, all_targets=[],[]

for line_number, line in enumerate(open(output+'/mip_ids/allMipsSamplesNames.tab.txt')):
if line_number>0:
line=line.rstrip().split('\t')
if len(line)>1 and len(line[1])>0:
all_samples.append(line[1])
if len(line[0])>0:
all_targets.append(line[0])

final_dict={1: expand(output+'/analysis/{sample}/{sample}_mipExtraction/log.txt', sample=all_samples),
2: expand(output+'/analysis/{sample}/{sample}_mipBarcodeCorrection/barcodeFilterStats.tab.txt', sample=all_samples),
3: output+'/analysis/logs/mipCorrectForContamWithSameBarcodes_run1.json',
4: expand(output+'/clustering_status/{sample}_mip_clustering_finished.txt', sample=all_samples),
5: expand(output+'/analysis/populationClustering/{target}/analysis/log.txt', target=all_targets),
6: output+'/allInfo.tsv.gz'}
output_choice=config['output_choice']
final_out=final_dict[output_choice]

rule all:
input:
final_out

rule extract_by_arm:
input:
params:
output_dir='/opt/analysis/analysis',
# wrangler_dir=output,
# fastq_dir=config['fastq_dir']
resources:
time_min=240
output:
output+'/analysis/{sample}/{sample}_mipExtraction/log.txt'
shell:
'''
MIPWrangler mipExtractByArm --masterDir {params.output_dir} --sample {wildcards.sample} --overWriteDirs --minCaptureLength=30
'''
if config['downsample_umi_count']<2**32:
rule mip_barcode_correction:
input:
good_samples=expand(output+'/analysis/{sample}/{sample}_mipExtraction/log.txt', sample=all_samples)
params:
output_dir='/opt/analysis/analysis',
# wrangler_dir=output,
# sif_file=config['miptools_sif'],
downsample_seed=config['downsample_seed'],
downsample_amount=config['downsample_umi_count']
resources:
mem_mb=config['memory_mb_per_step'],
time_min=20
output:
barcode_corrections_finished=output+'/analysis/{sample}/{sample}_mipBarcodeCorrection/barcodeFilterStats.tab.txt'
shell:
'''
MIPWrangler mipBarcodeCorrection --masterDir {params.output_dir} \
--downSampleAmount {params.downsample_amount} --downSampleSeed \
{params.downsample_seed} --overWriteDirs --sample {wildcards.sample}
'''
else:
rule mip_barcode_correction:
input:
good_samples=expand(output+'/analysis/{sample}/{sample}_mipExtraction/log.txt', sample=all_samples)
params:
output_dir='/opt/analysis/analysis',
# wrangler_dir=output,
# sif_file=config['miptools_sif'],
downsample_seed=config['downsample_seed'],
resources:
mem_mb=config['memory_mb_per_step'],
time_min=20
output:
barcode_corrections_finished=output+'/analysis/{sample}/{sample}_mipBarcodeCorrection/barcodeFilterStats.tab.txt'
shell:
'''
MIPWrangler mipBarcodeCorrection --masterDir {params.output_dir} \
--doNotDownSample --downSampleSeed \
{params.downsample_seed} --overWriteDirs --sample {wildcards.sample}
'''


rule correct_for_same_barcode_contam:
input:
all_corrected=expand(output+'/analysis/{sample}/{sample}_mipBarcodeCorrection/barcodeFilterStats.tab.txt', sample=all_samples)
params:
output_dir='/opt/analysis/analysis',
# wrangler_dir=output,
# sif_file=config['miptools_sif'],
resources:
mem_mb=40000,
time_min=1440,
nodes=20
threads: 20
output:
#name is controlled by --logFile
corrected_barcode_marker=output+'/analysis/logs/mipCorrectForContamWithSameBarcodes_run1.json'
shell:
'''
MIPWrangler mipCorrectForContamWithSameBarcodesMultiple --masterDir {params.output_dir} --numThreads {threads} --overWriteDirs --overWriteLog --logFile mipCorrectForContamWithSameBarcodes_run1
'''

rule mip_clustering:
input:
corrected_barcode_marker=output+'/analysis/logs/mipCorrectForContamWithSameBarcodes_run1.json',
#sample_dir=output+'/analysis/{sample}'
params:
output_dir='/opt/analysis/analysis',
# wrangler_dir=output,
# sif_file=config['miptools_sif']
resources:
mem_mb=config['memory_mb_per_step'],
time_min=60,
output:
mip_clustering=output+'/clustering_status/{sample}_mip_clustering_finished.txt'
shell:
'''
MIPWrangler mipClustering --masterDir {params.output_dir} --overWriteDirs --par /opt/resources/clustering_pars/illumina_collapseHomoploymers.pars.txt --countEndGaps --sample {wildcards.sample}
touch {output.mip_clustering}
'''

rule pop_cluster_target:
input:
mip_cluster_files=expand(output+'/clustering_status/{sample}_mip_clustering_finished.txt', sample=all_samples)
params:
output_dir='/opt/analysis/analysis',
# wrangler_dir=output,
# sif_file=config['miptools_sif']
resources:
mem_mb=config['memory_mb_per_step'],
time_min=60,
output:
pop_clustering=output+'/analysis/populationClustering/{target}/analysis/log.txt'
shell:
'''
MIPWrangler mipPopulationClustering --keepIntermediateFiles --masterDir {params.output_dir} --overWriteDirs --cutoff 0 --countEndGaps --fraccutoff 0.005 --mipName {wildcards.target}
touch {output.pop_clustering}
'''

rule output_final_table:
'''
concatenate the output files of the previous step into one final file, using a
"natural sort" to order rows similarly to how Nick's are output, then gzip it
'''
input:
pop_clustering=expand(output+'/analysis/populationClustering/{target}/analysis/log.txt', target=all_targets)
# final_sample_outputs=expand('/path/to/sample/outputs/{sample}.something', sample=sample_list)
params:
all_targets=all_targets,
prefix=output+'/analysis/populationClustering/',
suffix='/analysis/selectedClustersInfo.tab.txt.gz'
resources:
mem_mb=20000,
time_min=480
output:
final_table=output+'/allInfo.tsv.gz'
script:
'scripts/output_final_table.py'
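For reference, a hypothetical wrangler_by_sample.yaml covering the keys read by the setup and finish Snakefiles (all values are illustrative, not shipped defaults):

output_choice: 6 # which final_dict entry to build (1-6)
downsample_umi_count: 4294967296 # values >= 2**32 select the --doNotDownSample branch
downsample_seed: 42
memory_mb_per_step: 20000
cpu_count: 8
input_sample_sheet_name: sample_sheet.tsv
sample_set_used: JJJ
probe_sets_used: DR1,VAR4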
76 changes: 76 additions & 0 deletions snakemake/wrangler_by_sample_setup.smk
@@ -0,0 +1,76 @@
'''
creates a mip_ids folder and an allMipsSamplesNames.tab.txt file. extracts
mips, corrects mips, and generates files that can be used to determine sample
names, as well as which samples had extractable data.
'''

configfile: 'wrangler_by_sample.yaml'
output='/opt/analysis'

rule all:
input:
setup_finished=output+'/setup_finished.txt',
# good_samples=output+'/successfully_extracted_samples.txt',
output_configfile=output+'/snakemake_params/wrangler_by_sample.yaml'

rule copy_files:
input:
setup_snakefile='/opt/snakemake/wrangler_by_sample_setup.smk',
finish_snakefile='/opt/snakemake/wrangler_by_sample_finish.smk',
input_configfile='wrangler_by_sample.yaml',
in_scripts='/opt/snakemake/scripts'
output:
setup_snakefile=output+'/snakemake_params/setup_run.smk',
finish_snakefile=output+'/snakemake_params/finish_run.smk',
output_configfile=output+'/snakemake_params/wrangler_by_sample.yaml',
out_scripts=directory(output+'/snakemake_params/scripts')
shell:
'''
cp {input.setup_snakefile} {output.setup_snakefile}
cp {input.finish_snakefile} {output.finish_snakefile}
cp {input.input_configfile} {output.output_configfile}
cp -r {input.in_scripts} {output.out_scripts}
'''

rule generate_mip_files:
'''
given that I'm repackaging miptools wrangler (so wrangler.sh is not needed),
that the existing generate_wrangler_scripts.py seems unnecessarily convoluted,
and that only two files (mipArms.txt and allMipsSamplesNames.tab.txt) are
needed by subsequent steps, I wrote my own script for this. Input is an arms
file and a sample sheet. Output is an arms file with rearranged columns and a
two-column file with the names of all mips and the names of all samples (with
no pairing between the columns of any given row).
'''
input:
arms_file='/opt/project_resources/mip_ids/mip_arms.txt',
sample_sheet='/opt/input_sample_sheet_directory/'+config['input_sample_sheet_name'],
fastq_folder='/opt/data'
params:
sample_set=config['sample_set_used'],
probe_sets=config['probe_sets_used']
output:
mip_arms=output+'/mip_ids/mipArms.txt',
sample_file=output+'/mip_ids/allMipsSamplesNames.tab.txt',
sample_sheet=output+'/sample_sheet.tsv'
script:
'scripts/generate_mip_files.py'

rule setup:
input:
mip_arms=output+'/mip_ids/mipArms.txt',
sample_file=output+'/mip_ids/allMipsSamplesNames.tab.txt'
params:
output_dir='/opt/analysis/analysis',
project_resources='/opt/project_resources',
# wrangler_dir=output,
# sif_file=config['miptools_sif'],
fastq_dir='/opt/data'
output:
setup_finished=output+'/setup_finished.txt'
threads: config['cpu_count']
shell:
'''
MIPWrangler mipSetup --mipArmsFilename /opt/analysis/mip_ids/mipArms.txt --mipSampleFile /opt/analysis/mip_ids/allMipsSamplesNames.tab.txt --numThreads {threads} --masterDir {params.output_dir} --dir /opt/data --mipServerNumber 1
touch {output.setup_finished}
'''
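For reference, a hypothetical two-phase invocation from a directory containing wrangler_by_sample.yaml (core count is illustrative; the actual entry point inside the sif container may differ):

snakemake -s /opt/snakemake/wrangler_by_sample_setup.smk --cores 8
snakemake -s /opt/snakemake/wrangler_by_sample_finish.smk --cores 8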
1 change: 1 addition & 0 deletions user_scripts/variant_calling.sh
@@ -7,6 +7,7 @@ ulimit -n $(ulimit -Hn)
# set the home directory as the current working directory
#################################################
newhome=$(pwd -P)

###############################################
# function to parse the yaml file edited by the user
# pulls out the location of the sif file, output directory, etc.