Skip to content

Commit

Permalink
Merge pull request #105 from CCBR/paired-end
Browse files Browse the repository at this point in the history
Support paired-end reads and custom genomes
  • Loading branch information
kelly-sovacool authored Oct 11, 2023
2 parents fb1f244 + fa91bbd commit 509a082
Show file tree
Hide file tree
Showing 49 changed files with 1,608 additions and 223 deletions.
4 changes: 2 additions & 2 deletions .gitattributes
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@
*.nf.test linguist-language=nextflow
modules/nf-core/** linguist-generated
subworkflows/nf-core/** linguist-generated
*.config linguist-language=nextflow
assets/** linguist-generated
modules/CCBR/** linguist-generated
subworkflows/CCBR/** linguist-generated
docs/**.html linguist-generated
16 changes: 15 additions & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,15 @@ on:
branches:
- main
- develop
workflow_dispatch:
inputs:
test_run:
type: boolean
default: false
required: true

env:
test_run: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.test_run }}

jobs:
build:
Expand All @@ -29,11 +38,16 @@ jobs:
run: |
python -m pip install --upgrade pip setuptools
pip install .[dev,test]
- name: Test stub run
- name: Stub run
run: |
champagne run -profile ci_stub -stub
- name: Test run
if: ${{ env.test_run == 'true' }}
run: |
champagne run -profile ci_test,docker
- name: "Upload Artifact"
uses: actions/upload-artifact@v3
if: always() # run even if previous steps fail
with:
name: nextflow-log
path: .nextflow.log
Expand Down
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
- Fraction in Peaks (FRiP) (#89)
- Jaccard index (#92)
- Histogram of peak widths (#92)
- Added support for paired-end reads. (#105)
- Added an option to use a custom reference from a genome fasta, gtf, and blacklist file. (#105)

### Bug fixes

Expand Down
23 changes: 15 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,28 +15,35 @@ You can run champagne from the command line.
The CLI includes helper steps for execution on supported
high performance computing clusters including Biowulf and FRCE.

Run the test dataset using the test profile:
Run preview to view processes that will run:

```sh
champagne run -profile test,singularity
champagne run -profile ci_stub -preview
```

or explicitly specify the output directory and input:
Launch a stub run to view processes that will run and download containers:

```sh
champagne run -profile singularity --outdir results/test --input assets/samplesheet_test.csv
champagne run -profile ci_stub,singularity -stub
```

Run preview to view that steps that will run without actually executing any code:
Run the test dataset using the test profile:

```sh
champagne run -profile ci_stub -preview
champagne run -profile test,singularity
```

or explicitly specify the output directory and input:

```sh
champagne run -profile singularity --outdir results/test --input assets/samplesheet_test.csv
```

Launch a stub run to view the steps that will run and download containers without performing the full analysis.
Create and use a custom reference genome:

```sh
champagne run -profile ci_stub -stub
nextflow run main.nf -profile test -entry MAKE_REFERENCE
nextflow run main.nf -profile test -c results/test/genome/custom_genome.config
```

### nextflow pipeline
Expand Down
17 changes: 17 additions & 0 deletions assets/R64-1-1_ensembl2UCSC.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
I chrI
II chrII
III chrIII
IV chrIV
IX chrIX
MT chrM
V chrV
VI chrVI
VII chrVII
VIII chrVIII
X chrX
XI chrXI
XII chrXII
XIII chrXIII
XIV chrXIV
XV chrXV
XVI chrXVI
Binary file modified assets/dag.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
41 changes: 41 additions & 0 deletions assets/fastq_screen_ci.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# This is a configuration file for fastq_screen

##############
## Databases #
##############
## This section allows you to configure multiple databases
## to search against in your screen. For each database
## you need to provide a database name (which can't contain
## spaces) and the location of the bowtie indices which
## you created for that database.
##
## The entries shown below are only suggested examples, you
## can add as many DATABASE sections as required, and you
## can comment out or remove as many of the existing entries
## as desired.
##
## Either the original bowtie or bowtie2 may be used for the
## mapping. Specify the aligner to use with the command line
## flag --aligner with arguments 'bowtie' or
## 'bowtie2' (default).
##
## The configuration file may list paths to both bowtie and
## bowtie2 indices. FastQ Screen automatically detects whether
## a specified index is compatible with bowtie or bowtie2.
##
## Although the configuration file may list paths to both
## bowtie and bowtie2 indices, only one aligner will be used
## for the mapping, as specified by the --aligner flag.
##
## The path to the index files SHOULD INCLUDE THE BASENAME of
## the index, e.g:
## /data/public/Genomes/Human_Bowtie/GRCh37/Homo_sapiens.GRCh37
## Thus, the indices (Homo_sapiens.GRCh37.1.bt2, Homo_sapiens.GRCh37.2.bt2, etc.)
## are found in a folder named 'GRCh37'.
##
## If the bowtie AND bowtie2 indices of a given genome reside in the SAME FOLDER,
## a SINGLE path may be provided to BOTH sets of indices.
##
## Human - sequences available from
## ftp://ftp.ensembl.org/pub/current/fasta/homo_sapiens/dna/
DATABASE rRNA fastq_screen_db/rRNA/rRNA
6 changes: 3 additions & 3 deletions assets/samplesheet_test.csv
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
sample,fastq_1,fastq_2,antibody,control
SPT5_T0_REP1,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822153_1.fastq.gz,,SPT5,SPT5_INPUT_REP1
SPT5_T0_REP1,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822153_1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822153_2.fastq.gz,SPT5,SPT5_INPUT_REP1
SPT5_T0_REP2,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822154_1.fastq.gz,,SPT5,SPT5_INPUT_REP2
SPT5_T15_REP1,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822157_1.fastq.gz,,SPT5,SPT5_INPUT_REP1
SPT5_T15_REP1,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822157_1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822157_2.fastq.gz,SPT5,SPT5_INPUT_REP1
SPT5_T15_REP2,https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/testdata/SRR1822158_1.fastq.gz,,SPT5,SPT5_INPUT_REP2
SPT5_INPUT_REP1,https://raw.githubusercontent.com/nf-core/test-datasets/chipseq/testdata/SRR5204809_Spt5-ChIP_Input1_SacCer_ChIP-Seq_ss100k_R1.fastq.gz,,,
SPT5_INPUT_REP1,https://raw.githubusercontent.com/nf-core/test-datasets/chipseq/testdata/SRR5204809_Spt5-ChIP_Input1_SacCer_ChIP-Seq_ss100k_R1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/chipseq/testdata/SRR5204809_Spt5-ChIP_Input1_SacCer_ChIP-Seq_ss100k_R2.fastq.gz,,
SPT5_INPUT_REP2,https://raw.githubusercontent.com/nf-core/test-datasets/chipseq/testdata/SRR5204810_Spt5-ChIP_Input2_SacCer_ChIP-Seq_ss100k_R1.fastq.gz,,,
51 changes: 51 additions & 0 deletions bin/bam_filter_by_mapq.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/usr/bin/env python

"""
source https://github.com/CCBR/Pipeliner/blob/86c6ccaa3d58381a0ffd696bbf9c047e4f991f9e/Results-template/Scripts/bam_filter_by_mapq.py
"""

import pysam, sys
import argparse

parser = argparse.ArgumentParser(description="filter PE bamfile by mapQ values")
parser.add_argument("-i", dest="inBam", required=True, help="Input Bam File")
parser.add_argument("-o", dest="outBam", required=True, help="Output Bam File")
parser.add_argument(
"-q",
dest="mapQ",
type=int,
required=False,
help="mapQ value ... default 6",
default=6,
)
args = parser.parse_args()
samfile = pysam.AlignmentFile(args.inBam, "rb")
mapq = dict()
for read in samfile.fetch():
if read.is_unmapped:
continue
if read.is_supplementary:
continue
if read.is_secondary:
continue
if read.is_duplicate:
continue
if read.is_proper_pair:
if read.mapping_quality < args.mapQ and read.query_name in mapq:
del mapq[read.query_name]
if read.mapping_quality >= args.mapQ and not read.query_name in mapq:
mapq[read.query_name] = 1
samfile.close()
samfile = pysam.AlignmentFile(args.inBam, "rb")
pairedreads = pysam.AlignmentFile(args.outBam, "wb", template=samfile)
for read in samfile.fetch():
if read.query_name in mapq:
if read.is_supplementary:
continue
if read.is_secondary:
continue
if read.is_duplicate:
continue
pairedreads.write(read)
samfile.close()
pairedreads.close()
60 changes: 60 additions & 0 deletions bin/splitRef.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#!/usr/bin/env python

from __future__ import print_function
import os
import sys


def formatSequencelength(seq, stringlen):
fseq = ""
for i in range(len(seq)):
index = i + 1
if index % 80 == 0:
fseq += "{}{}".format(seq[i], "\n")
else:
fseq += seq[i]
return fseq


def parsed(filename):
fh = open(filename, "r")
sequence = ""
chrom = ""
seqindex = 0
seqlen = 0
for line in fh:
line = line.strip()
if line.startswith(">") and sequence != "":
yield chrom, formatSequencelength(sequence, seqlen), len(sequence)
chrom = line.split(" ")[0]
sequence = ""
elif line.startswith(">"):
chrom = line.split(" ")[0]
else:
seqindex += 1
sequence += line
if seqindex == 1:
seqlen = len(line)
else:
# formatSequencelength(sequence, seqlen)
yield chrom, formatSequencelength(sequence, seqlen), len(sequence)
fh.close()


def main(fasta_fn, chrom_sizes_fn, outdir):
os.mkdir(outdir)
chromsizesfh = open(chrom_sizes_fn, "w")

for chrom, seq, chromsize in parsed(fasta_fn):
chromsizesfh.write("{}\t{}\n".format(chrom.replace(">", ""), chromsize))
outfilename = os.path.join(outdir, chrom.replace(">", "") + ".fa")
outfh = open(outfilename, "w")
print("{}\n".format(chrom))
outfh.write("{}\n{}\n".format(chrom, seq.rstrip()))
outfh.close()

chromsizesfh.close()


if __name__ == "__main__":
main(sys.argv[1], sys.argv[2], sys.argv[3])
14 changes: 7 additions & 7 deletions conf/ci_stub.config
Original file line number Diff line number Diff line change
Expand Up @@ -15,22 +15,22 @@ params {
// CCBR shared resource paths
index_dir = "tests/data"
fastq_screen {
conf = "assets/fastq_screen_biowulf.conf"
db_dir = "tests/data/"
conf = "assets/fastq_screen_ci.conf"
db_dir = "tests/data/fastq_screen_db"
}
multiqc_config = "assets/multiqc_config.yaml"
genomes {
'test' { // blank files for testing stubs on GitHub Actions
blacklist = 'test.blacklist'
blacklist_files = "tests/data/test.blacklist"
reference_files = "tests/data/test/*"
blacklist_index = "tests/data/test.blacklist"
reference_index = "tests/data/test/*"
effective_genome_size = 2700000000
chrom_sizes = "tests/data/test.fa.sizes"
gene_info = "tests/data/geneinfo.bed"
chromosomes_dir = "tests/data/chroms/"
}
}

sicer {
species = "sacCer1" // supported species https://github.com/zanglab/SICER2/blob/master/sicer/lib/GenomeData.py
}
}

process {
Expand Down
37 changes: 37 additions & 0 deletions conf/ci_test.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
params {
config_profile_name = 'Test single-end stubs'
config_profile_description = 'Minimal test dataset with blank references to run stubs with continuous integration'

outdir = 'results/test'
input = 'assets/samplesheet_test.csv' // adapted from https://github.com/nf-core/test-datasets/blob/chipseq/samplesheet/v2.0/samplesheet_test.csv

genome = 'custom_genome'
read_length = 50

// Genome references
genome_fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/reference/genome.fa'
genes_gtf = 'https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/reference/genes.gtf'
blacklist = 'tests/data/test.blacklist'
rename_contigs = 'assets/R64-1-1_ensembl2UCSC.txt'


max_cpus = 2 // for GitHub Actions https://docs.github.com/en/actions/using-github-hosted-runners/about-github-hosted-runners#supported-runners-and-hardware-resources
max_memory = '6.GB'
max_time = '6.h'

publish_dir_mode = "symlink"

// CCBR shared resource paths
index_dir = "tests/data"
fastq_screen = null
sicer.species = "sacCer1" // supported species https://github.com/zanglab/SICER2/blob/master/sicer/lib/GenomeData.py

deeptools.bin_size = 10000 // this value is only to make bamCoverage run faster. use smaller value for real data.
deeptools.excluded_chroms = 'chrM'
run.sicer = false // TODO set to true after https://github.com/CCBR/CHAMPAGNE/issues/109
}

process {
cpus = 1
memory = '1 GB'
}
1 change: 1 addition & 0 deletions conf/containers.config
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ params {
multiqc = 'nciccbr/ccbr_multiqc_1.15:v1'
ngsqc = 'nciccbr/ccbr_ngsqc_0.31:v1'
phantom_peaks = 'quay.io/biocontainers/phantompeakqualtools:1.2.2--hdfd78af_1'
picard = 'nciccbr/ccbr_picard_2.27.5:v1'
preseq = 'nciccbr/ccbr_preseq_v2.0:v1'
r = 'nciccbr/ccbr_r_4.3.0:v1'
sicer = 'nciccbr/ccbr_sicer2_1.0.3:v1'
Expand Down
3 changes: 3 additions & 0 deletions conf/full_mm10.config
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,7 @@ params {
genome = 'mm10'
outdir = "results/full_mm10"
input = "assets/samplesheet_full_mm10.csv"
sicer {
species = "mm10" // supported species https://github.com/zanglab/SICER2/blob/master/sicer/lib/GenomeData.py
}
}
18 changes: 8 additions & 10 deletions conf/genomes.config
Original file line number Diff line number Diff line change
@@ -1,22 +1,20 @@
params {
genomes {
'hg38' {
blacklist = 'hg38.blacklist'
blacklist_files = "${params.index_dir}/hg38_basic/indexes/hg38.blacklist*"
reference_files = "${params.index_dir}/hg38_basic/indexes/hg38*"
effective_genome_size = 2700000000
blacklist_index = "${params.index_dir}/hg38_basic/indexes/blacklist/hg38.blacklist_v3.chrM.chr_rDNA.*"
reference_index = "${params.index_dir}/hg38_basic/bwa_index/hg38*"
chromosomes_dir = "${params.index_dir}/hg38_basic/Chromsomes/"
chrom_sizes = "${params.index_dir}/hg38_basic/indexes/hg38.fa.sizes"
gene_info = "${params.index_dir}/hg38_basic/geneinfo.bed"
chromosomes_dir = "${params.index_dir}/hg38_basic/Chromsomes/"
effective_genome_size = 2700000000
}
'mm10' {
blacklist = 'mm10.blacklist'
blacklist_files = "${params.index_dir}/mm10_basic/indexes/mm10.blacklist*"
reference_files = "${params.index_dir}/mm10_basic/indexes/mm10*"
effective_genome_size = 2400000000
blacklist_index = "${params.index_dir}/mm10_basic/indexes/blacklist/mm10.blacklist.chrM.chr_rDNA.*"
reference_index = "${params.index_dir}/mm10_basic/indexes/reference/mm10*"
chromosomes_dir = "${params.index_dir}/mm10_basic/Chromsomes/"
chrom_sizes = "${params.index_dir}/mm10_basic/indexes/mm10.fa.sizes"
gene_info = "${params.index_dir}/mm10_basic/geneinfo.bed"
chromosomes_dir = "${params.index_dir}/mm10_basic/Chromsomes/"
effective_genome_size = 2400000000
}
}
}
Loading

0 comments on commit 509a082

Please sign in to comment.