diff --git a/CHANGELOG.md b/CHANGELOG.md index ec2d167b..2c9b7821 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,17 +8,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Enhancements & fixes - Updated pipeline template to [nf-core/tools 2.7.2](https://github.com/nf-core/tools/releases/tag/2.7.2) -- [[#317](https://github.com/nf-core/chipseq/issues/317)] Added metro map -- [[#288](https://github.com/nf-core/chipseq/issues/291)] Bump `chromap` version 2 and enable all the steps below chromap again when paired-end data is processed. -- [[#311](https://github.com/nf-core/chipseq/issues/311)] Add back `--skip_spp` parameter which was unintentionally removed from the code. +- [[#317](https://github.com/nf-core/chipseq/issues/317)] - Added metro map +- [[#288](https://github.com/nf-core/chipseq/issues/291)] - Bump `chromap` version 2 and enable all the steps below chromap again when paired-end data is processed. +- [[#311](https://github.com/nf-core/chipseq/issues/311)] - Add back `--skip_spp` parameter which was unintentionally removed from the code. - Install available nf-core subworkflows and refactor code accordingly -- [[#318](https://github.com/nf-core/chipseq/issues/318)] Update `bowtie2/align` module to fix issue when downloading its singularity image. -- [[#320](https://github.com/nf-core/chipseq/issues/320)] Fix samplesheet control column in documentation examples. -- [[#328](https://github.com/nf-core/chipseq/issues/328)] Modify documentation to clarify that is necessary to provide the `--read_length` when `--genome` is set and `--macs_gsize` has not provided. +- [[#318](https://github.com/nf-core/chipseq/issues/318)] - Update `bowtie2/align` module to fix issue when downloading its singularity image. +- [[#320](https://github.com/nf-core/chipseq/issues/320)] - Fix samplesheet control column in documentation examples. 
+- [[#328](https://github.com/nf-core/chipseq/issues/328)] - Modify documentation to clarify that it is necessary to provide the `--read_length` when `--genome` is set and `--macs_gsize` has not been provided. - Remove `enable_conda` param from local modules. - Fix the path where `chromap` index is stored when `--save_reference` is set. - Fix untar of `chromap` index when using `--chromap_index` param. -- [nf-core/tools#2286](https://github.com/nf-core/tools/issues/2286) Set default container registry outside profile scope. +- [nf-core/tools#2286](https://github.com/nf-core/tools/issues/2286) - Set default container registry outside profile scope. +- [[#343](https://github.com/nf-core/chipseq/issues/343)] - Provide replicate information explicitly in samplesheet. ### Software dependencies diff --git a/assets/samplesheet_pe.csv b/assets/samplesheet_pe.csv index 3a304f12..17ea2e86 100644 --- a/assets/samplesheet_pe.csv +++ b/assets/samplesheet_pe.csv @@ -1,21 +1,21 @@ -sample,fastq_1,fastq_2,antibody,control -WT_BCATENIN_IP_REP1,BLA203A1_S27_L006_R1_001.fastq.gz,BLA203A1_S27_L006_R2_001.fastq.gz,BCATENIN,WT_INPUT -WT_BCATENIN_IP_REP2,BLA203A25_S16_L001_R1_001.fastq.gz,BLA203A25_S16_L001_R2_001.fastq.gz,BCATENIN,WT_INPUT -WT_BCATENIN_IP_REP2,BLA203A25_S16_L002_R1_001.fastq.gz,BLA203A25_S16_L002_R2_001.fastq.gz,BCATENIN,WT_INPUT -WT_BCATENIN_IP_REP3,BLA203A49_S40_L001_R1_001.fastq.gz,BLA203A49_S40_L001_R2_001.fastq.gz,BCATENIN,WT_INPUT -NAIVE_BCATENIN_IP_REP1,BLA203A7_S60_L001_R1_001.fastq.gz,BLA203A7_S60_L001_R2_001.fastq.gz,BCATENIN,NAIVE_INPUT -NAIVE_BCATENIN_IP_REP2,BLA203A43_S34_L001_R1_001.fastq.gz,BLA203A43_S34_L001_R2_001.fastq.gz,BCATENIN,NAIVE_INPUT -NAIVE_BCATENIN_IP_REP2,BLA203A43_S34_L002_R1_001.fastq.gz,BLA203A43_S34_L002_R2_001.fastq.gz,BCATENIN,NAIVE_INPUT -NAIVE_BCATENIN_IP_REP3,BLA203A64_S55_L001_R1_001.fastq.gz,BLA203A64_S55_L001_R2_001.fastq.gz,BCATENIN,NAIVE_INPUT
-WT_TCF4_IP_REP1,BLA203A3_S29_L006_R1_001.fastq.gz,BLA203A3_S29_L006_R2_001.fastq.gz,TCF4,WT_INPUT -WT_TCF4_IP_REP2,BLA203A27_S18_L001_R1_001.fastq.gz,BLA203A27_S18_L001_R2_001.fastq.gz,TCF4,WT_INPUT -WT_TCF4_IP_REP2,BLA203A51_S42_L001_R1_001.fastq.gz,BLA203A51_S42_L001_R2_001.fastq.gz,TCF4,WT_INPUT -NAIVE_TCF4_IP_REP1,BLA203A9_S62_L001_R1_001.fastq.gz,BLA203A9_S62_L001_R2_001.fastq.gz,TCF4,NAIVE_INPUT -NAIVE_TCF4_IP_REP2,BLA203A45_S36_L001_R1_001.fastq.gz,BLA203A45_S36_L001_R2_001.fastq.gz,TCF4,NAIVE_INPUT -NAIVE_TCF4_IP_REP3,BLA203A66_S57_L001_R1_001.fastq.gz,BLA203A66_S57_L001_R2_001.fastq.gz,TCF4,NAIVE_INPUT -WT_INPUT_REP1,BLA203A6_S32_L006_R1_001.fastq.gz,BLA203A6_S32_L006_R2_001.fastq.gz,, -WT_INPUT_REP2,BLA203A30_S21_L001_R1_001.fastq.gz,BLA203A30_S21_L001_R2_001.fastq.gz,, -WT_INPUT_REP3,BLA203A31_S21_L003_R1_001.fastq.gz,BLA203A31_S21_L003_R2_001.fastq.gz,, -NAIVE_INPUT_REP1,BLA203A12_S3_L001_R1_001.fastq.gz,BLA203A12_S3_L001_R2_001.fastq.gz,, -NAIVE_INPUT_REP2,BLA203A48_S39_L001_R1_001.fastq.gz,BLA203A48_S39_L001_R2_001.fastq.gz,, -NAIVE_INPUT_REP3,BLA203A49_S1_L006_R1_001.fastq.gz,BLA203A49_S1_L006_R2_001.fastq.gz,, +sample,fastq_1,fastq_2,replicate,antibody,control,control_replicate +WT_BCATENIN_IP,BLA203A1_S27_L006_R1_001.fastq.gz,BLA203A1_S27_L006_R2_001.fastq.gz,1,BCATENIN,WT_INPUT,1 +WT_BCATENIN_IP,BLA203A25_S16_L001_R1_001.fastq.gz,BLA203A25_S16_L001_R2_001.fastq.gz,2,BCATENIN,WT_INPUT,2 +WT_BCATENIN_IP,BLA203A25_S16_L002_R1_001.fastq.gz,BLA203A25_S16_L002_R2_001.fastq.gz,2,BCATENIN,WT_INPUT,2 +WT_BCATENIN_IP,BLA203A49_S40_L001_R1_001.fastq.gz,BLA203A49_S40_L001_R2_001.fastq.gz,3,BCATENIN,WT_INPUT,3 +NAIVE_BCATENIN_IP,BLA203A7_S60_L001_R1_001.fastq.gz,BLA203A7_S60_L001_R2_001.fastq.gz,1,BCATENIN,NAIVE_INPUT,1 +NAIVE_BCATENIN_IP,BLA203A43_S34_L001_R1_001.fastq.gz,BLA203A43_S34_L001_R2_001.fastq.gz,2,BCATENIN,NAIVE_INPUT,2 +NAIVE_BCATENIN_IP,BLA203A43_S34_L002_R1_001.fastq.gz,BLA203A43_S34_L002_R2_001.fastq.gz,2,BCATENIN,NAIVE_INPUT,2 
+NAIVE_BCATENIN_IP,BLA203A64_S55_L001_R1_001.fastq.gz,BLA203A64_S55_L001_R2_001.fastq.gz,3,BCATENIN,NAIVE_INPUT,3 +WT_TCF4_IP,BLA203A3_S29_L006_R1_001.fastq.gz,BLA203A3_S29_L006_R2_001.fastq.gz,1,TCF4,WT_INPUT,1 +WT_TCF4_IP,BLA203A27_S18_L001_R1_001.fastq.gz,BLA203A27_S18_L001_R2_001.fastq.gz,2,TCF4,WT_INPUT,2 +WT_TCF4_IP,BLA203A51_S42_L001_R1_001.fastq.gz,BLA203A51_S42_L001_R2_001.fastq.gz,2,TCF4,WT_INPUT,2 +NAIVE_TCF4_IP,BLA203A9_S62_L001_R1_001.fastq.gz,BLA203A9_S62_L001_R2_001.fastq.gz,1,TCF4,NAIVE_INPUT,1 +NAIVE_TCF4_IP,BLA203A45_S36_L001_R1_001.fastq.gz,BLA203A45_S36_L001_R2_001.fastq.gz,2,TCF4,NAIVE_INPUT,2 +NAIVE_TCF4_IP,BLA203A66_S57_L001_R1_001.fastq.gz,BLA203A66_S57_L001_R2_001.fastq.gz,3,TCF4,NAIVE_INPUT,3 +WT_INPUT,BLA203A6_S32_L006_R1_001.fastq.gz,BLA203A6_S32_L006_R2_001.fastq.gz,1,,, +WT_INPUT,BLA203A30_S21_L001_R1_001.fastq.gz,BLA203A30_S21_L001_R2_001.fastq.gz,2,,, +WT_INPUT,BLA203A31_S21_L003_R1_001.fastq.gz,BLA203A31_S21_L003_R2_001.fastq.gz,3,,, +NAIVE_INPUT,BLA203A12_S3_L001_R1_001.fastq.gz,BLA203A12_S3_L001_R2_001.fastq.gz,1,,, +NAIVE_INPUT,BLA203A48_S39_L001_R1_001.fastq.gz,BLA203A48_S39_L001_R2_001.fastq.gz,2,,, +NAIVE_INPUT,BLA203A49_S1_L006_R1_001.fastq.gz,BLA203A49_S1_L006_R2_001.fastq.gz,3,,, diff --git a/assets/samplesheet_se.csv b/assets/samplesheet_se.csv index a9581d6e..8a0297d7 100644 --- a/assets/samplesheet_se.csv +++ b/assets/samplesheet_se.csv @@ -1,21 +1,21 @@ -sample,fastq_1,fastq_2,antibody,control -WT_BCATENIN_IP_REP1,BLA203A1_S27_L006_R1_001.fastq.gz,,BCATENIN,WT_INPUT -WT_BCATENIN_IP_REP2,BLA203A25_S16_L001_R1_001.fastq.gz,,BCATENIN,WT_INPUT -WT_BCATENIN_IP_REP2,BLA203A25_S16_L002_R1_001.fastq.gz,,BCATENIN,WT_INPUT -WT_BCATENIN_IP_REP3,BLA203A49_S40_L001_R1_001.fastq.gz,,BCATENIN,WT_INPUT -NAIVE_BCATENIN_IP_REP1,BLA203A7_S60_L001_R1_001.fastq.gz,,BCATENIN,NAIVE_INPUT -NAIVE_BCATENIN_IP_REP2,BLA203A43_S34_L001_R1_001.fastq.gz,,BCATENIN,NAIVE_INPUT 
-NAIVE_BCATENIN_IP_REP2,BLA203A43_S34_L002_R1_001.fastq.gz,,BCATENIN,NAIVE_INPUT -NAIVE_BCATENIN_IP_REP3,BLA203A64_S55_L001_R1_001.fastq.gz,,BCATENIN,NAIVE_INPUT -WT_TCF4_IP_REP1,BLA203A3_S29_L006_R1_001.fastq.gz,,TCF4,WT_INPUT -WT_TCF4_IP_REP2,BLA203A27_S18_L001_R1_001.fastq.gz,,TCF4,WT_INPUT -WT_TCF4_IP_REP3,BLA203A51_S42_L001_R1_001.fastq.gz,,TCF4,WT_INPUT -NAIVE_TCF4_IP_REP1,BLA203A9_S62_L001_R1_001.fastq.gz,,TCF4,NAIVE_INPUT -NAIVE_TCF4_IP_REP2,BLA203A45_S36_L001_R1_001.fastq.gz,,TCF4,NAIVE_INPUT -NAIVE_TCF4_IP_REP3,BLA203A66_S57_L001_R1_001.fastq.gz,,TCF4,NAIVE_INPUT -WT_INPUT_REP1,BLA203A6_S32_L006_R1_001.fastq.gz,,, -WT_INPUT_REP2,BLA203A30_S21_L001_R1_001.fastq.gz,,, -WT_INPUT_REP3,BLA203A31_S21_L003_R1_001.fastq.gz,,, -NAIVE_INPUT_REP1,BLA203A12_S3_L001_R1_001.fastq.gz,,, -NAIVE_INPUT_REP2,BLA203A48_S39_L001_R1_001.fastq.gz,,, -NAIVE_INPUT_REP3,BLA203A49_S1_L006_R1_001.fastq.gz,,, +sample,fastq_1,fastq_2,replicate,antibody,control,control_replicate +WT_BCATENIN_IP,BLA203A1_S27_L006_R1_001.fastq.gz,,1,BCATENIN,WT_INPUT,1 +WT_BCATENIN_IP,BLA203A25_S16_L001_R1_001.fastq.gz,,2,BCATENIN,WT_INPUT,2 +WT_BCATENIN_IP,BLA203A25_S16_L002_R1_001.fastq.gz,,2,BCATENIN,WT_INPUT,2 +WT_BCATENIN_IP,BLA203A49_S40_L001_R1_001.fastq.gz,,3,BCATENIN,WT_INPUT,3 +NAIVE_BCATENIN_IP,BLA203A7_S60_L001_R1_001.fastq.gz,,1,BCATENIN,NAIVE_INPUT,1 +NAIVE_BCATENIN_IP,BLA203A43_S34_L001_R1_001.fastq.gz,,2,BCATENIN,NAIVE_INPUT,2 +NAIVE_BCATENIN_IP,BLA203A43_S34_L002_R1_001.fastq.gz,,2,BCATENIN,NAIVE_INPUT,2 +NAIVE_BCATENIN_IP,BLA203A64_S55_L001_R1_001.fastq.gz,,3,BCATENIN,NAIVE_INPUT,3 +WT_TCF4_IP,BLA203A3_S29_L006_R1_001.fastq.gz,,1,TCF4,WT_INPUT,1 +WT_TCF4_IP,BLA203A27_S18_L001_R1_001.fastq.gz,,2,TCF4,WT_INPUT,2 +WT_TCF4_IP,BLA203A51_S42_L001_R1_001.fastq.gz,,3,TCF4,WT_INPUT,3 +NAIVE_TCF4_IP,BLA203A9_S62_L001_R1_001.fastq.gz,,1,TCF4,NAIVE_INPUT,1 +NAIVE_TCF4_IP,BLA203A45_S36_L001_R1_001.fastq.gz,,2,TCF4,NAIVE_INPUT,2 +NAIVE_TCF4_IP,BLA203A66_S57_L001_R1_001.fastq.gz,,3,TCF4,NAIVE_INPUT,3 
+WT_INPUT,BLA203A6_S32_L006_R1_001.fastq.gz,,1,,, +WT_INPUT,BLA203A30_S21_L001_R1_001.fastq.gz,,2,,, +WT_INPUT,BLA203A31_S21_L003_R1_001.fastq.gz,,3,,, +NAIVE_INPUT,BLA203A12_S3_L001_R1_001.fastq.gz,,1,,, +NAIVE_INPUT,BLA203A48_S39_L001_R1_001.fastq.gz,,2,,, +NAIVE_INPUT,BLA203A49_S1_L006_R1_001.fastq.gz,,3,,, diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 3fd6ff8a..b7b285ab 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -38,20 +38,20 @@ def print_error(error, context="Line", context_str=""): def check_samplesheet(file_in, file_out): """ This function checks that the samplesheet follows the following structure: - sample,fastq_1,fastq_2,antibody,control - SPT5_T0_REP1,SRR1822153_1.fastq.gz,SRR1822153_2.fastq.gz,SPT5,SPT5_INPUT_REP1 - SPT5_T0_REP2,SRR1822154_1.fastq.gz,SRR1822154_2.fastq.gz,SPT5,SPT5_INPUT_REP2 - SPT5_INPUT_REP1,SRR5204809_Spt5-ChIP_Input1_SacCer_ChIP-Seq_ss100k_R1.fastq.gz,SRR5204809_Spt5-ChIP_Input1_SacCer_ChIP-Seq_ss100k_R2.fastq.gz,, - SPT5_INPUT_REP2,SRR5204810_Spt5-ChIP_Input2_SacCer_ChIP-Seq_ss100k_R1.fastq.gz,SRR5204810_Spt5-ChIP_Input2_SacCer_ChIP-Seq_ss100k_R2.fastq.gz,, + sample,fastq_1,fastq_2,replicate,antibody,control,control_replicate + SPT5_T0,SRR1822153_1.fastq.gz,SRR1822153_2.fastq.gz,1,SPT5,SPT5_INPUT,1 + SPT5_T0,SRR1822154_1.fastq.gz,SRR1822154_2.fastq.gz,2,SPT5,SPT5_INPUT,2 + SPT5_INPUT,SRR5204809_Spt5-ChIP_Input1_SacCer_ChIP-Seq_ss100k_R1.fastq.gz,SRR5204809_Spt5-ChIP_Input1_SacCer_ChIP-Seq_ss100k_R2.fastq.gz,1,,, + SPT5_INPUT,SRR5204810_Spt5-ChIP_Input2_SacCer_ChIP-Seq_ss100k_R1.fastq.gz,SRR5204810_Spt5-ChIP_Input2_SacCer_ChIP-Seq_ss100k_R2.fastq.gz,2,,, For an example see: - https://raw.githubusercontent.com/nf-core/test-datasets/chipseq/samplesheet/v2.0/samplesheet_test.csv + https://raw.githubusercontent.com/nf-core/test-datasets/chipseq/samplesheet/v2.1/samplesheet_test.csv """ sample_mapping_dict = {} with open(file_in, "r", encoding="utf-8-sig") as fin: ## Check header -
MIN_COLS = 2 - HEADER = ["sample", "fastq_1", "fastq_2", "antibody", "control"] + MIN_COLS = 3 + HEADER = ["sample", "fastq_1", "fastq_2", "replicate", "antibody", "control", "control_replicate"] header = [x.strip('"') for x in fin.readline().strip().split(",")] if header[: len(HEADER)] != HEADER: print(f"ERROR: Please check samplesheet header -> {','.join(header)} != {','.join(HEADER)}") @@ -77,7 +77,7 @@ def check_samplesheet(file_in, file_out): ) ## Check sample name entries - sample, fastq_1, fastq_2, antibody, control = lspl[: len(HEADER)] + sample, fastq_1, fastq_2, replicate, antibody, control, control_replicate = lspl[: len(HEADER)] if sample.find(" ") != -1: print(f"WARNING: Spaces have been replaced by underscores for sample: {sample}") sample = sample.replace(" ", "_") @@ -96,6 +96,11 @@ def check_samplesheet(file_in, file_out): line, ) + ## Check replicate column is integer + if not replicate.isdecimal(): + print_error("Replicate id not an integer!", "Line", line) + sys.exit(1) + ## Check antibody and control columns have valid values if antibody: if antibody.find(" ") != -1: @@ -107,10 +112,15 @@ def check_samplesheet(file_in, file_out): "Line", line, ) + if control: if control.find(" ") != -1: print(f"WARNING: Spaces have been replaced by underscores for control: {control}") control = control.replace(" ", "_") + if not control_replicate.isdecimal(): + print_error("Control replicate id not an integer!", "Line", line) + sys.exit(1) + control = "{}_REP{}".format(control, control_replicate) if not antibody: print_error( "Both antibody and control columns must be specified!", @@ -119,22 +129,26 @@ def check_samplesheet(file_in, file_out): ) ## Auto-detect paired-end/single-end - sample_info = [] ## [single_end, fastq_1, fastq_2, antibody, control] + sample_info = [] ## [single_end, fastq_1, fastq_2, replicate, antibody, control] if sample and fastq_1 and fastq_2: ## Paired-end short reads - sample_info = ["0", fastq_1, fastq_2, antibody, control] + 
sample_info = ["0", fastq_1, fastq_2, replicate, antibody, control] elif sample and fastq_1 and not fastq_2: ## Single-end short reads - sample_info = ["1", fastq_1, fastq_2, antibody, control] + sample_info = ["1", fastq_1, fastq_2, replicate, antibody, control] else: print_error("Invalid combination of columns provided!", "Line", line) - ## Create sample mapping dictionary = {sample: [[ single_end, fastq_1, fastq_2, antibody, control ]]} + ## Create sample mapping dictionary = {sample: [[ single_end, fastq_1, fastq_2, replicate, antibody, control ]]} + replicate = int(replicate) + sample_info = sample_info + lspl[len(HEADER) :] if sample not in sample_mapping_dict: - sample_mapping_dict[sample] = [sample_info] + sample_mapping_dict[sample] = {} + if replicate not in sample_mapping_dict[sample]: + sample_mapping_dict[sample][replicate] = [sample_info] else: - if sample_info in sample_mapping_dict[sample]: + if sample_info in sample_mapping_dict[sample][replicate]: print_error("Samplesheet contains duplicate rows!", "Line", line) else: - sample_mapping_dict[sample].append(sample_info) + sample_mapping_dict[sample][replicate].append(sample_info) ## Write validated samplesheet with appropriate columns if len(sample_mapping_dict) > 0: @@ -148,6 +162,7 @@ def check_samplesheet(file_in, file_out): "single_end", "fastq_1", "fastq_2", + "replicate", "antibody", "control", ] @@ -155,24 +170,60 @@ def check_samplesheet(file_in, file_out): + "\n" ) for sample in sorted(sample_mapping_dict.keys()): - ## Check that multiple runs of the same sample are of the same datatype i.e. single-end / paired-end - if not all(x[0] == sample_mapping_dict[sample][0][0] for x in sample_mapping_dict[sample]): + ## Check that replicate ids are in format 1.. + uniq_rep_ids = sorted(list(set(sample_mapping_dict[sample].keys()))) + if len(uniq_rep_ids) != max(uniq_rep_ids) or 1 != min(uniq_rep_ids): print_error( - f"Multiple runs of a sample must be of the same datatype i.e. 
single-end or paired-end!", + "Replicate ids must start with 1..!", + "Sample", + "{}, replicate ids: {}".format(sample, ",".join([str(x) for x in uniq_rep_ids])), + ) + sys.exit(1) + + ## Check that multiple replicates are of the same datatype i.e. single-end / paired-end + if not all( + x[0][0] == sample_mapping_dict[sample][1][0][0] for x in sample_mapping_dict[sample].values() + ): + print_error( + f"Multiple replicates of a sample must be of the same datatype i.e. single-end or paired-end!", "Sample", sample, ) - for idx, val in enumerate(sample_mapping_dict[sample]): - control = val[-1] - if control and control not in sample_mapping_dict.keys(): + for replicate in sorted(sample_mapping_dict[sample].keys()): + ## Check that multiple runs of the same sample are of the same datatype i.e. single-end / paired-end + if not all( + x[0] == sample_mapping_dict[sample][replicate][0][0] + for x in sample_mapping_dict[sample][replicate] + ): print_error( - f"Control identifier has to match does a provided sample identifier!", - "Control", - control, + f"Multiple runs of a sample must be of the same datatype i.e. 
single-end or paired-end!", + "Sample", + sample, ) - fout.write(",".join([f"{sample}_T{idx+1}"] + val) + "\n") + for idx, val in enumerate(sample_mapping_dict[sample][replicate]): + control = "_REP".join(val[-1].split("_REP")[:-1]) + control_replicate = val[-1].split("_REP")[-1] + if control and ( + control not in sample_mapping_dict.keys() + or int(control_replicate) not in sample_mapping_dict[control].keys() + ): + print_error( + f"Control identifier and replicate has to match a provided sample identifier and replicate!", + "Control", + val[4], + ) + + ## Write to file + for idx in range(len(sample_mapping_dict[sample][replicate])): + fastq_files = sample_mapping_dict[sample][replicate][idx] + sample_id = "{}_REP{}_T{}".format(sample, replicate, idx + 1) + if len(fastq_files) == 1: + fout.write(",".join([sample_id] + fastq_files) + ",\n") + else: + fout.write(",".join([sample_id] + fastq_files) + "\n") + else: print_error(f"No entries to process!", "Samplesheet: {file_in}") diff --git a/conf/test.config b/conf/test.config index 9b24bc9a..f38e12c5 100644 --- a/conf/test.config +++ b/conf/test.config @@ -20,7 +20,7 @@ params { max_time = '6.h' // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/chipseq/samplesheet/v2.0/samplesheet_test.csv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/chipseq/samplesheet/v2.1/samplesheet_test.csv' read_length = 50 // Genome references diff --git a/conf/test_full.config b/conf/test_full.config index 28cb9c98..a9478183 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -17,7 +17,7 @@ params { config_profile_description = 'Full test dataset to check pipeline function' // Input data for full size test - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/chipseq/samplesheet/v2.0/samplesheet_full.csv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/chipseq/samplesheet/v2.1/samplesheet_full.csv' // Used to calculate --macs_gsize read_length 
= 50 diff --git a/docs/usage.md b/docs/usage.md index db940f51..6cab8bfe 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -12,66 +12,82 @@ You will need to create a samplesheet with information about the samples you wou --input '[path to samplesheet file]' ``` -### Multiple runs of the same library +### Multiple replicates + +The `sample` identifier should be identical when you have multiple replicates from the same experimental group, just increment the `replicate` identifier appropriately. The first replicate value for any given experimental group must be 1. + +The `antibody` column is required to separate the downstream consensus peak merging for different antibodies. It is not advisable to generate a consensus peak set across different antibodies especially if their binding patterns are inherently different e.g. narrow transcription factors and broad histone marks. -The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will perform the alignments in parallel, and subsequently merge them before further analysis. Below is an example where the samples called `WT_BCATENIN_IP_REP2` and `WT_INPUT_REP2` have been re-sequenced multiple times: +The `control` column should be the `sample` identifier for the controls for any given IP. This column together with the `control_replicate` column will set the corresponding control for each of the samples in the table. 
```console -sample,fastq_1,fastq_2,antibody,control -WT_BCATENIN_IP_REP1,BLA203A1_S27_L006_R1_001.fastq.gz,,BCATENIN,WT_INPUT_REP1 -WT_BCATENIN_IP_REP2,BLA203A25_S16_L001_R1_001.fastq.gz,,BCATENIN,WT_INPUT_REP2 -WT_BCATENIN_IP_REP2,BLA203A25_S16_L002_R1_001.fastq.gz,,BCATENIN,WT_INPUT_REP2 -WT_BCATENIN_IP_REP2,BLA203A25_S16_L003_R1_001.fastq.gz,,BCATENIN,WT_INPUT_REP2 -WT_BCATENIN_IP_REP3,BLA203A49_S40_L001_R1_001.fastq.gz,,BCATENIN,WT_INPUT_REP3 -WT_INPUT_REP1,BLA203A6_S32_L006_R1_001.fastq.gz,,, -WT_INPUT_REP2,BLA203A30_S21_L001_R1_001.fastq.gz,,, -WT_INPUT_REP2,BLA203A30_S21_L002_R1_001.fastq.gz,,, -WT_INPUT_REP3,BLA203A31_S21_L003_R1_001.fastq.gz,,, +sample,fastq_1,fastq_2,replicate,antibody,control,control_replicate +WT_BCATENIN_IP,BLA203A1_S27_L006_R1_001.fastq.gz,,1,BCATENIN,WT_INPUT,1 +WT_BCATENIN_IP,BLA203A25_S16_L002_R1_001.fastq.gz,,2,BCATENIN,WT_INPUT,2 +WT_BCATENIN_IP,BLA203A49_S40_L001_R1_001.fastq.gz,,3,BCATENIN,WT_INPUT,3 +WT_INPUT,BLA203A6_S32_L006_R1_001.fastq.gz,,1,,, +WT_INPUT,BLA203A30_S21_L002_R1_001.fastq.gz,,2,,, +WT_INPUT,BLA203A31_S21_L003_R1_001.fastq.gz,,3,,, ``` -### Full design +### Multiple runs of the same library -The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 5 columns to match those defined in the table below. +Both the `sample` and `replicate` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will perform the alignments in parallel, and subsequently merge them before further analysis. Below is an example where the samples called `WT_BCATENIN_IP` and `WT_INPUT` have been re-sequenced multiple times: -The `antibody` column is required to separate the downstream consensus peak merging for different antibodies.
Its not advisable to generate a consensus peak set across different antibodies especially if their binding patterns are inherently different e.g. narrow transcription factors and broad histone marks. +```console +sample,fastq_1,fastq_2,replicate,antibody,control,control_replicate +WT_BCATENIN_IP,BLA203A1_S27_L006_R1_001.fastq.gz,,1,BCATENIN,WT_INPUT,1 +WT_BCATENIN_IP,BLA203A25_S16_L001_R1_001.fastq.gz,,2,BCATENIN,WT_INPUT,2 +WT_BCATENIN_IP,BLA203A25_S16_L002_R1_001.fastq.gz,,2,BCATENIN,WT_INPUT,2 +WT_BCATENIN_IP,BLA203A25_S16_L003_R1_001.fastq.gz,,2,BCATENIN,WT_INPUT,2 +WT_BCATENIN_IP,BLA203A49_S40_L001_R1_001.fastq.gz,,3,BCATENIN,WT_INPUT,3 +WT_INPUT,BLA203A6_S32_L006_R1_001.fastq.gz,,1,,, +WT_INPUT,BLA203A30_S21_L001_R1_001.fastq.gz,,2,,, +WT_INPUT,BLA203A30_S21_L002_R1_001.fastq.gz,,2,,, +WT_INPUT,BLA203A31_S21_L003_R1_001.fastq.gz,,3,,, +``` -The `control` column should be the `sample` identifier for the controls for any given IP. +### Full design -A final design file may look something like the one below. This is for two antibodies and associated controls, where the `WT_BCATENIN_IP_REP2` and `NAIVE_BCATENIN_IP_REP2` samples have been sequenced twice: +The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 7 columns to match those defined in the table below. + +A final design file may look something like the one below. 
This is for two antibodies and associated controls, where the second replicate of the `WT_BCATENIN_IP` and `NAIVE_BCATENIN_IP` samples have been sequenced twice: ```console -sample,fastq_1,fastq_2,antibody,control -WT_BCATENIN_IP_REP1,BLA203A1_S27_L006_R1_001.fastq.gz,,BCATENIN,WT_INPUT_REP1 -WT_BCATENIN_IP_REP2,BLA203A25_S16_L001_R1_001.fastq.gz,,BCATENIN,WT_INPUT_REP2 -WT_BCATENIN_IP_REP2,BLA203A25_S16_L002_R1_001.fastq.gz,,BCATENIN,WT_INPUT_REP2 -WT_BCATENIN_IP_REP3,BLA203A49_S40_L001_R1_001.fastq.gz,,BCATENIN,WT_INPUT_REP3 -NAIVE_BCATENIN_IP_REP1,BLA203A7_S60_L001_R1_001.fastq.gz,,BCATENIN,NAIVE_INPUT_REP1 -NAIVE_BCATENIN_IP_REP2,BLA203A43_S34_L001_R1_001.fastq.gz,,BCATENIN,NAIVE_INPUT_REP2 -NAIVE_BCATENIN_IP_REP2,BLA203A43_S34_L002_R1_001.fastq.gz,,BCATENIN,NAIVE_INPUT_REP2 -NAIVE_BCATENIN_IP_REP3,BLA203A64_S55_L001_R1_001.fastq.gz,,BCATENIN,NAIVE_INPUT_REP3 -WT_TCF4_IP_REP1,BLA203A3_S29_L006_R1_001.fastq.gz,,TCF4,WT_INPUT_REP1 -WT_TCF4_IP_REP2,BLA203A27_S18_L001_R1_001.fastq.gz,,TCF4,WT_INPUT_REP2 -WT_TCF4_IP_REP3,BLA203A51_S42_L001_R1_001.fastq.gz,,TCF4,WT_INPUT_REP3 -NAIVE_TCF4_IP_REP1,BLA203A9_S62_L001_R1_001.fastq.gz,,TCF4,NAIVE_INPUT_REP1 -NAIVE_TCF4_IP_REP2,BLA203A45_S36_L001_R1_001.fastq.gz,,TCF4,NAIVE_INPUT_REP2 -NAIVE_TCF4_IP_REP3,BLA203A66_S57_L001_R1_001.fastq.gz,,TCF4,NAIVE_INPUT_REP3 -WT_INPUT_REP1,BLA203A6_S32_L006_R1_001.fastq.gz,,, -WT_INPUT_REP2,BLA203A30_S21_L001_R1_001.fastq.gz,,, -WT_INPUT_REP3,BLA203A31_S21_L003_R1_001.fastq.gz,,, -NAIVE_INPUT_REP1,BLA203A12_S3_L001_R1_001.fastq.gz,,, -NAIVE_INPUT_REP2,BLA203A48_S39_L001_R1_001.fastq.gz,,, -NAIVE_INPUT_REP3,BLA203A49_S1_L006_R1_001.fastq.gz,,, +sample,fastq_1,fastq_2,replicate,antibody,control,control_replicate +WT_BCATENIN_IP,BLA203A1_S27_L006_R1_001.fastq.gz,,1,BCATENIN,WT_INPUT,1 +WT_BCATENIN_IP,BLA203A25_S16_L001_R1_001.fastq.gz,,2,BCATENIN,WT_INPUT,2 +WT_BCATENIN_IP,BLA203A25_S16_L002_R1_001.fastq.gz,,2,BCATENIN,WT_INPUT,2 
+WT_BCATENIN_IP,BLA203A49_S40_L001_R1_001.fastq.gz,,3,BCATENIN,WT_INPUT,3 +NAIVE_BCATENIN_IP,BLA203A7_S60_L001_R1_001.fastq.gz,,1,BCATENIN,NAIVE_INPUT,1 +NAIVE_BCATENIN_IP,BLA203A43_S34_L001_R1_001.fastq.gz,,2,BCATENIN,NAIVE_INPUT,2 +NAIVE_BCATENIN_IP,BLA203A43_S34_L002_R1_001.fastq.gz,,2,BCATENIN,NAIVE_INPUT,2 +NAIVE_BCATENIN_IP,BLA203A64_S55_L001_R1_001.fastq.gz,,3,BCATENIN,NAIVE_INPUT,3 +WT_TCF4_IP,BLA203A3_S29_L006_R1_001.fastq.gz,,1,TCF4,WT_INPUT,1 +WT_TCF4_IP,BLA203A27_S18_L001_R1_001.fastq.gz,,2,TCF4,WT_INPUT,2 +WT_TCF4_IP,BLA203A51_S42_L001_R1_001.fastq.gz,,3,TCF4,WT_INPUT,3 +NAIVE_TCF4_IP,BLA203A9_S62_L001_R1_001.fastq.gz,,1,TCF4,NAIVE_INPUT,1 +NAIVE_TCF4_IP,BLA203A45_S36_L001_R1_001.fastq.gz,,2,TCF4,NAIVE_INPUT,2 +NAIVE_TCF4_IP,BLA203A66_S57_L001_R1_001.fastq.gz,,3,TCF4,NAIVE_INPUT,3 +WT_INPUT,BLA203A6_S32_L006_R1_001.fastq.gz,,1,,, +WT_INPUT,BLA203A30_S21_L001_R1_001.fastq.gz,,2,,, +WT_INPUT,BLA203A31_S21_L003_R1_001.fastq.gz,,3,,, +NAIVE_INPUT,BLA203A12_S3_L001_R1_001.fastq.gz,,1,,, +NAIVE_INPUT,BLA203A48_S39_L001_R1_001.fastq.gz,,2,,, +NAIVE_INPUT,BLA203A49_S1_L006_R1_001.fastq.gz,,3,,, ``` -| Column | Description | -| ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | -| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -| `antibody` | Antibody name. This is required to segregate downstream analysis for different antibodies. Required when `control` is specified. 
| -| `control` | Sample name for control sample. | - -Example design files have been provided with the pipeline for [paired-end](../assets/samplesheet_pe.csv) and [single-end](../assets/samplesheet_se.csv) data. +| Column | Description | +| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | +| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | +| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | +| `replicate` | Integer representing replicate number. This will be identical for re-sequenced libraries. Must start from `1..`. | +| `antibody` | Antibody name. This is required to segregate downstream analysis for different antibodies. Required when `control` is specified. | +| `control` | Sample name for control sample. | +| `control_replicate` | Integer representing replicate number for control sample. | + +Example design files have been provided with the pipeline for [paired-end](../assets/samplesheet_pe.csv) and [single-end](../assets/samplesheet_se.csv) data. > **NB:** The `group` and `replicate` columns were replaced with a single `sample` column as of v2.0 of the pipeline. The `sample` column is essentially a concatenation of the `group` and `replicate` columns. If all values of `sample` have the same number of underscores, fields defined by these underscore-separated names may be used in the PCA plots produced by the pipeline, to regain the ability to represent different groupings.
diff --git a/modules/local/macs2_consensus.nf b/modules/local/macs2_consensus.nf index b8a5abd7..eb373d2c 100644 --- a/modules/local/macs2_consensus.nf +++ b/modules/local/macs2_consensus.nf @@ -12,6 +12,7 @@ process MACS2_CONSENSUS { input: tuple val(meta), path(peaks) + val is_narrow_peak output: tuple val(meta), path("*.bed") , emit: bed @@ -26,11 +27,12 @@ process MACS2_CONSENSUS { task.ext.when == null || task.ext.when script: // This script is bundled with the pipeline, in nf-core/chipseq/bin/ - def prefix = task.ext.prefix ?: "${meta.id}" - def peak_type = params.narrow_peak ? 'narrowPeak' : 'broadPeak' - def mergecols = params.narrow_peak ? (2..10).join(',') : (2..9).join(',') - def collapsecols = params.narrow_peak ? (['collapse']*9).join(',') : (['collapse']*8).join(',') - def expandparam = params.narrow_peak ? '--is_narrow_peak' : '' + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def peak_type = is_narrow_peak ? 'narrowPeak' : 'broadPeak' + def mergecols = is_narrow_peak ? (2..10).join(',') : (2..9).join(',') + def collapsecols = is_narrow_peak ? (['collapse']*9).join(',') : (['collapse']*8).join(',') + def expandparam = is_narrow_peak ? '--is_narrow_peak' : '' """ sort -T '.' 
-k1,1 -k2,2n ${peaks.collect{it.toString()}.sort().join(' ')} \\ | mergeBed -c $mergecols -o $collapsecols > ${prefix}.txt @@ -40,6 +42,7 @@ process MACS2_CONSENSUS { ${peaks.collect{it.toString()}.sort().join(',').replaceAll("_peaks.${peak_type}","")} \\ ${prefix}.boolean.txt \\ --min_replicates $params.min_reps_consensus \\ + $args \\ $expandparam awk -v FS='\t' -v OFS='\t' 'FNR > 1 { print \$1, \$2, \$3, \$4, "0", "+" }' ${prefix}.boolean.txt > ${prefix}.bed diff --git a/workflows/chipseq.nf b/workflows/chipseq.nf index bdf5783f..fc914a51 100644 --- a/workflows/chipseq.nf +++ b/workflows/chipseq.nf @@ -423,11 +423,19 @@ workflow CHIPSEQ { .set { ch_genome_bam_bai } ch_genome_bam_bai - .combine(ch_genome_bam_bai) .map { - meta1, bam1, bai1, meta2, bam2, bai2 -> - meta1.control == meta2.id ? [ meta1, [ bam1, bam2 ], [ bai1, bai2 ] ] : null + meta, bam, bai -> + meta.control ? null : [ meta.id, [ bam ] , [ bai ] ] } + .set { ch_control_bam_bai } + + ch_genome_bam_bai + .map { + meta, bam, bai -> + meta.control ? 
[ meta.control, meta, [ bam ], [ bai ] ] : null + } + .combine(ch_control_bam_bai, by: 0) + .map { it -> [ it[1] , it[2] + it[4], it[3] + it[5] ] } .set { ch_ip_control_bam_bai } // @@ -482,7 +490,10 @@ workflow CHIPSEQ { MACS2_CALLPEAK .out .peak - .filter { meta, peaks -> peaks.size() > 0 } + .filter { + meta, peaks -> + peaks.size() > 0 + } .set { ch_macs2_peaks } // Create channels: [ meta, ip_bam, peaks ] @@ -564,7 +575,7 @@ workflow CHIPSEQ { ch_deseq2_clustering_multiqc = Channel.empty() if (!params.skip_consensus_peaks) { // Create channels: [ meta , [ peaks ] ] - // Where meta = [ id:antibody, multiple_groups:true/false, replicates_exist:true/false ] + // Where meta = [ id:antibody, multiple_groups:true/false, replicates_exist:true/false ] ch_macs2_peaks .map { meta, peak -> @@ -593,7 +604,8 @@ workflow CHIPSEQ { // MODULE: Generate consensus peaks across samples // MACS2_CONSENSUS ( - ch_antibody_peaks + ch_antibody_peaks, + params.narrow_peak ) ch_macs2_consensus_bed_lib = MACS2_CONSENSUS.out.bed ch_macs2_consensus_txt_lib = MACS2_CONSENSUS.out.txt