Merge pull request #105 from CCBR/paired-end

Support paired-end reads and custom genomes
CCBR · Oct 11, 2023 · 509a082 · 509a082
2 parents fb1f244 + fa91bbd
commit 509a082
Show file tree

Hide file tree

Showing 49 changed files with 1,608 additions and 223 deletions.
diff --git a/.gitattributes b/.gitattributes
@@ -2,6 +2,6 @@
 *.nf.test linguist-language=nextflow
 modules/nf-core/** linguist-generated
 subworkflows/nf-core/** linguist-generated
-*.config linguist-language=nextflow
-assets/** linguist-generated
+modules/CCBR/** linguist-generated
+subworkflows/CCBR/** linguist-generated
 docs/**.html linguist-generated
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -9,6 +9,15 @@ on:
     branches:
       - main
       - develop
+  workflow_dispatch:
+    inputs:
+      test_run:
+        type: boolean
+        default: false
+        required: true
+
+env:
+  test_run: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.test_run }}
 
 jobs:
   build:
@@ -29,11 +38,16 @@ jobs:
         run: |
           python -m pip install --upgrade pip setuptools
           pip install .[dev,test]
-      - name: Test stub run
+      - name: Stub run
         run: |
           champagne run -profile ci_stub -stub
+      - name: Test run
+        if: ${{ env.test_run == 'true' }}
+        run: |
+          champagne run -profile ci_test,docker
       - name: "Upload Artifact"
         uses: actions/upload-artifact@v3
+        if: always() # run even if previous steps fail
         with:
           name: nextflow-log
           path: .nextflow.log

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,8 @@
   - Fraction in Peaks (FRiP) (#89)
   - Jaccard index (#92)
   - Histogram of peak widths (#92)
+- Added support for paired-end reads. (#105)
+- Added an option to use a custom reference from a genome fasta, gtf, and blacklist file. (#105)
 
 ### Bug fixes
 

diff --git a/README.md b/README.md
@@ -15,28 +15,35 @@ You can run champagne from the command line.
 The CLI includes helper steps for execution on supported
 high performance computing clusters including Biowulf and FRCE.
 
-Run the test dataset using the test profile:
+Run preview to view processes that will run:
 
 ```sh
-champagne run -profile test,singularity
+champagne run -profile ci_stub -preview
 ```
 
-or explicitly specify the output directory and input:
+Launch a stub run to view processes that will run and download containers:
 
 ```sh
-champagne run -profile singularity --outdir results/test --input assets/samplesheet_test.csv
+champagne run -profile ci_stub,singularity -stub
 ```
 
-Run preview to view that steps that will run without actually executing any code:
+Run the test dataset using the test profile:
 
 ```sh
-champagne run -profile ci_stub -preview
+champagne run -profile test,singularity
+```
+
+or explicitly specify the output directory and input:
+
+```sh
+champagne run -profile singularity --outdir results/test --input assets/samplesheet_test.csv
 ```
 
-Launch a stub run to view the steps that will run and download containers without performing the full analysis.
+Create and use a custom reference genome:
 
 ```sh
-champagne run -profile ci_stub -stub
+nextflow run main.nf -profile test -entry MAKE_REFERENCE
+nextflow run main.nf -profile test -c results/test/genome/custom_genome.config
 ```
 
 ### nextflow pipeline

diff --git a/assets/R64-1-1_ensembl2UCSC.txt b/assets/R64-1-1_ensembl2UCSC.txt
@@ -0,0 +1,17 @@
+I	chrI
+II	chrII
+III	chrIII
+IV	chrIV
+IX	chrIX
+MT	chrM
+V	chrV
+VI	chrVI
+VII	chrVII
+VIII	chrVIII
+X	chrX
+XI	chrXI
+XII	chrXII
+XIII	chrXIII
+XIV	chrXIV
+XV	chrXV
+XVI	chrXVI
diff --git a/assets/dag.png b/assets/dag.png
diff --git a/assets/fastq_screen_ci.conf b/assets/fastq_screen_ci.conf
@@ -0,0 +1,41 @@
+# This is a configuration file for fastq_screen
+
+##############
+## Databases #
+##############
+## This section allows you to configure multiple databases
+## to search against in your screen. For each database
+## you need to provide a database name (which can't contain
+## spaces) and the location of the bowtie indices which
+## you created for that database.
+##
+## The entries shown below are only suggested examples, you
+## can add as many DATABASE sections as required, and you
+## can comment out or remove as many of the existing entries
+## as desired.
+##
+## Either the original bowtie or bowtie2 may be used for the
+## mapping. Specify the aligner to use with the command line
+## flag --aligner with arguments 'bowtie' or
+## 'bowtie2' (default).
+##
+## The configuration file may list paths to both bowtie and
+## bowtie2 indices. FastQ Screen automatically detects whether
+## a specified index is compatible with bowtie or bowtie2.
+##
+## Although the configuration file may list paths to both
+## bowtie and bowtie2 indices, only one aligner will be used
+## for the mapping, as specified by the --aligner flag.
+##
+## The path to the index files SHOULD INCLUDE THE BASENAME of
+## the index, e.g:
+## /data/public/Genomes/Human_Bowtie/GRCh37/Homo_sapiens.GRCh37
+## Thus, the indices (Homo_sapiens.GRCh37.1.bt2, Homo_sapiens.GRCh37.2.bt2, etc.)
+## are found in a folder named 'GRCh37'.
+##
+## If the bowtie AND bowtie2 indices of a given genome reside in the SAME FOLDER,
+## a SINGLE path may be provided to BOTH sets of indices.
+##
+## Human - sequences available from
+## ftp://ftp.ensembl.org/pub/current/fasta/homo_sapiens/dna/
+DATABASE     rRNA  fastq_screen_db/rRNA/rRNA
diff --git a/assets/samplesheet_test.csv b/assets/samplesheet_test.csv
@@ -1,7 +1,7 @@
 sample,fastq_1,fastq_2,antibody,control
-SPT5_T0_REP1,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822153_1.fastq.gz,,SPT5,SPT5_INPUT_REP1
+SPT5_T0_REP1,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822153_1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822153_2.fastq.gz,SPT5,SPT5_INPUT_REP1
 SPT5_T0_REP2,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822154_1.fastq.gz,,SPT5,SPT5_INPUT_REP2
-SPT5_T15_REP1,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822157_1.fastq.gz,,SPT5,SPT5_INPUT_REP1
+SPT5_T15_REP1,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822157_1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822157_2.fastq.gz,SPT5,SPT5_INPUT_REP1
 SPT5_T15_REP2,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822158_1.fastq.gz,,SPT5,SPT5_INPUT_REP2
-SPT5_INPUT_REP1,https://raw.githubusercontent.com/nf-core/test-datasets/chipseq/testdata/SRR5204809_Spt5-ChIP_Input1_SacCer_ChIP-Seq_ss100k_R1.fastq.gz,,,
+SPT5_INPUT_REP1,https://raw.githubusercontent.com/nf-core/test-datasets/chipseq/testdata/SRR5204809_Spt5-ChIP_Input1_SacCer_ChIP-Seq_ss100k_R1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/chipseq/testdata/SRR5204809_Spt5-ChIP_Input1_SacCer_ChIP-Seq_ss100k_R2.fastq.gz,,
 SPT5_INPUT_REP2,https://raw.githubusercontent.com/nf-core/test-datasets/chipseq/testdata/SRR5204810_Spt5-ChIP_Input2_SacCer_ChIP-Seq_ss100k_R1.fastq.gz,,,
diff --git a/bin/bam_filter_by_mapq.py b/bin/bam_filter_by_mapq.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+
+"""
+source https://github.com/CCBR/Pipeliner/blob/86c6ccaa3d58381a0ffd696bbf9c047e4f991f9e/Results-template/Scripts/bam_filter_by_mapq.py
+"""
+
+import pysam, sys
+import argparse
+
+parser = argparse.ArgumentParser(description="filter PE bamfile by mapQ values")
+parser.add_argument("-i", dest="inBam", required=True, help="Input Bam File")
+parser.add_argument("-o", dest="outBam", required=True, help="Output Bam File")
+parser.add_argument(
+    "-q",
+    dest="mapQ",
+    type=int,
+    required=False,
+    help="mapQ value ... default 6",
+    default=6,
+)
+args = parser.parse_args()
+samfile = pysam.AlignmentFile(args.inBam, "rb")
+mapq = dict()
+for read in samfile.fetch():
+    if read.is_unmapped:
+        continue
+    if read.is_supplementary:
+        continue
+    if read.is_secondary:
+        continue
+    if read.is_duplicate:
+        continue
+    if read.is_proper_pair:
+        if read.mapping_quality < args.mapQ and read.query_name in mapq:
+            del mapq[read.query_name]
+        if read.mapping_quality >= args.mapQ and not read.query_name in mapq:
+            mapq[read.query_name] = 1
+samfile.close()
+samfile = pysam.AlignmentFile(args.inBam, "rb")
+pairedreads = pysam.AlignmentFile(args.outBam, "wb", template=samfile)
+for read in samfile.fetch():
+    if read.query_name in mapq:
+        if read.is_supplementary:
+            continue
+        if read.is_secondary:
+            continue
+        if read.is_duplicate:
+            continue
+        pairedreads.write(read)
+samfile.close()
+pairedreads.close()
diff --git a/bin/splitRef.py b/bin/splitRef.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+import os
+import sys
+
+
+def formatSequencelength(seq, stringlen):
+    fseq = ""
+    for i in range(len(seq)):
+        index = i + 1
+        if index % 80 == 0:
+            fseq += "{}{}".format(seq[i], "\n")
+        else:
+            fseq += seq[i]
+    return fseq
+
+
+def parsed(filename):
+    fh = open(filename, "r")
+    sequence = ""
+    chrom = ""
+    seqindex = 0
+    seqlen = 0
+    for line in fh:
+        line = line.strip()
+        if line.startswith(">") and sequence != "":
+            yield chrom, formatSequencelength(sequence, seqlen), len(sequence)
+            chrom = line.split(" ")[0]
+            sequence = ""
+        elif line.startswith(">"):
+            chrom = line.split(" ")[0]
+        else:
+            seqindex += 1
+            sequence += line
+            if seqindex == 1:
+                seqlen = len(line)
+    else:
+        # formatSequencelength(sequence, seqlen)
+        yield chrom, formatSequencelength(sequence, seqlen), len(sequence)
+    fh.close()
+
+
+def main(fasta_fn, chrom_sizes_fn, outdir):
+    os.mkdir(outdir)
+    chromsizesfh = open(chrom_sizes_fn, "w")
+
+    for chrom, seq, chromsize in parsed(fasta_fn):
+        chromsizesfh.write("{}\t{}\n".format(chrom.replace(">", ""), chromsize))
+        outfilename = os.path.join(outdir, chrom.replace(">", "") + ".fa")
+        outfh = open(outfilename, "w")
+        print("{}\n".format(chrom))
+        outfh.write("{}\n{}\n".format(chrom, seq.rstrip()))
+        outfh.close()
+
+    chromsizesfh.close()
+
+
+if __name__ == "__main__":
+    main(sys.argv[1], sys.argv[2], sys.argv[3])
diff --git a/conf/ci_stub.config b/conf/ci_stub.config
@@ -15,22 +15,22 @@ params {
     // CCBR shared resource paths
     index_dir = "tests/data"
     fastq_screen {
-        conf = "assets/fastq_screen_biowulf.conf"
-        db_dir = "tests/data/"
+        conf = "assets/fastq_screen_ci.conf"
+        db_dir = "tests/data/fastq_screen_db"
     }
-    multiqc_config = "assets/multiqc_config.yaml"
     genomes {
         'test' { // blank files for testing stubs on GitHub Actions
-            blacklist = 'test.blacklist'
-            blacklist_files = "tests/data/test.blacklist"
-            reference_files = "tests/data/test/*"
+            blacklist_index = "tests/data/test.blacklist"
+            reference_index = "tests/data/test/*"
             effective_genome_size = 2700000000
             chrom_sizes = "tests/data/test.fa.sizes"
             gene_info = "tests/data/geneinfo.bed"
             chromosomes_dir = "tests/data/chroms/"
         }
     }
-
+    sicer {
+        species = "sacCer1" // supported species https://github.com/zanglab/SICER2/blob/master/sicer/lib/GenomeData.py
+    }
 }
 
 process {

diff --git a/conf/ci_test.config b/conf/ci_test.config
@@ -0,0 +1,37 @@
+params {
+    config_profile_name = 'Test single-end stubs'
+    config_profile_description = 'Minimal test dataset with blank references to run stubs with continuous integration'
+
+    outdir = 'results/test'
+    input = 'assets/samplesheet_test.csv' // adapted from https://github.com/nf-core/test-datasets/blob/chipseq/samplesheet/v2.0/samplesheet_test.csv
+
+    genome = 'custom_genome'
+    read_length = 50
+
+    // Genome references
+    genome_fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/reference/genome.fa'
+    genes_gtf   = 'https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/reference/genes.gtf'
+    blacklist = 'tests/data/test.blacklist'
+    rename_contigs = 'assets/R64-1-1_ensembl2UCSC.txt'
+
+
+    max_cpus = 2        // for GitHub Actions https://docs.github.com/en/actions/using-github-hosted-runners/about-github-hosted-runners#supported-runners-and-hardware-resources
+    max_memory = '6.GB'
+    max_time   = '6.h'
+
+    publish_dir_mode = "symlink"
+
+    // CCBR shared resource paths
+    index_dir = "tests/data"
+    fastq_screen = null
+    sicer.species = "sacCer1" // supported species https://github.com/zanglab/SICER2/blob/master/sicer/lib/GenomeData.py
+
+    deeptools.bin_size = 10000 // this value is only to make bamCoverage run faster. use smaller value for real data.
+    deeptools.excluded_chroms = 'chrM'
+    run.sicer = false // TODO set to true after https://github.com/CCBR/CHAMPAGNE/issues/109
+}
+
+process {
+    cpus = 1
+    memory = '1 GB'
+}
diff --git a/conf/containers.config b/conf/containers.config
@@ -11,6 +11,7 @@ params {
         multiqc = 'nciccbr/ccbr_multiqc_1.15:v1'
         ngsqc = 'nciccbr/ccbr_ngsqc_0.31:v1'
         phantom_peaks = 'quay.io/biocontainers/phantompeakqualtools:1.2.2--hdfd78af_1'
+        picard = 'nciccbr/ccbr_picard_2.27.5:v1'
         preseq = 'nciccbr/ccbr_preseq_v2.0:v1'
         r = 'nciccbr/ccbr_r_4.3.0:v1'
         sicer = 'nciccbr/ccbr_sicer2_1.0.3:v1'

diff --git a/conf/full_mm10.config b/conf/full_mm10.config
@@ -5,4 +5,7 @@ params {
     genome = 'mm10'
     outdir = "results/full_mm10"
     input = "assets/samplesheet_full_mm10.csv"
+    sicer {
+        species = "mm10" // supported species https://github.com/zanglab/SICER2/blob/master/sicer/lib/GenomeData.py
+    }
 }
diff --git a/conf/genomes.config b/conf/genomes.config
@@ -1,22 +1,20 @@
 params {
     genomes {
         'hg38' {
-            blacklist = 'hg38.blacklist'
-            blacklist_files = "${params.index_dir}/hg38_basic/indexes/hg38.blacklist*"
-            reference_files = "${params.index_dir}/hg38_basic/indexes/hg38*"
-            effective_genome_size = 2700000000
+            blacklist_index = "${params.index_dir}/hg38_basic/indexes/blacklist/hg38.blacklist_v3.chrM.chr_rDNA.*"
+            reference_index = "${params.index_dir}/hg38_basic/bwa_index/hg38*"
+            chromosomes_dir = "${params.index_dir}/hg38_basic/Chromsomes/"
             chrom_sizes = "${params.index_dir}/hg38_basic/indexes/hg38.fa.sizes"
             gene_info = "${params.index_dir}/hg38_basic/geneinfo.bed"
-            chromosomes_dir = "${params.index_dir}/hg38_basic/Chromsomes/"
+            effective_genome_size = 2700000000
         }
         'mm10' {
-            blacklist = 'mm10.blacklist'
-            blacklist_files = "${params.index_dir}/mm10_basic/indexes/mm10.blacklist*"
-            reference_files = "${params.index_dir}/mm10_basic/indexes/mm10*"
-            effective_genome_size = 2400000000
+            blacklist_index = "${params.index_dir}/mm10_basic/indexes/blacklist/mm10.blacklist.chrM.chr_rDNA.*"
+            reference_index = "${params.index_dir}/mm10_basic/indexes/reference/mm10*"
+            chromosomes_dir = "${params.index_dir}/mm10_basic/Chromsomes/"
             chrom_sizes = "${params.index_dir}/mm10_basic/indexes/mm10.fa.sizes"
             gene_info = "${params.index_dir}/mm10_basic/geneinfo.bed"
-            chromosomes_dir = "${params.index_dir}/mm10_basic/Chromsomes/"
+            effective_genome_size = 2400000000
         }
     }
 }