nf-core · maxulysse · Apr 29, 2020 · Apr 28, 2020 · Apr 28, 2020 · Apr 28, 2020
@@ -1,6 +1,8 @@
 # Markdownlint configuration file
 default: true  # enable every rule unless overridden below
 line-length: false  # do not enforce a maximum line length
+no-duplicate-header:
+    siblings_only: true  # only flag duplicated headings that share a parent
 no-inline-html:
     allowed_elements:  # inline HTML tags that are permitted
         - a

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -48,7 +48,7 @@ jobs:
           docker pull nfcore/sarek:dev
           docker pull nfcore/sarek${{ matrix.tools }}:dev.${{ matrix.species }}
       - name: Run annotation test
-        run: nextflow run . -profile test_annotation,docker --tools ${{ matrix.tools }}
+        run: nextflow run ${GITHUB_WORKSPACE} -profile test_annotation,docker --tools ${{ matrix.tools }}
 
   germline:
     env:
@@ -69,9 +69,10 @@ jobs:
         run: git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git data
       - name: Run germline test
         run: |
-          nextflow run . -profile test,docker --input data/testdata/tiny/normal
-          nextflow run . -profile test,docker --input=false --step recalibrate -resume
-          nextflow run . -profile test,docker --input=false --step variantCalling
+          nextflow run ${GITHUB_WORKSPACE} -profile test,docker --input data/testdata/tiny/normal
+          nextflow run ${GITHUB_WORKSPACE} -profile test,docker --input=false --step prepare_recalibration -resume
+          nextflow run ${GITHUB_WORKSPACE} -profile test,docker --input=false --step recalibrate -resume
+          nextflow run ${GITHUB_WORKSPACE} -profile test,docker --input=false --step variantCalling
 
   minimal:
     env:
@@ -93,7 +94,7 @@ jobs:
       - name: Pull docker image
         run: docker pull nfcore/sarek:dev
       - name: Run test for minimal genomes
-        run: nextflow run . -profile test,docker --skipQC all --genome ${{ matrix.genome }} ${{ matrix.intervals }} --tools Manta,mpileup,Strelka,FreeBayes
+        run: nextflow run ${GITHUB_WORKSPACE} -profile test,docker --skipQC all --genome ${{ matrix.genome }} ${{ matrix.intervals }} --tools Manta,mpileup,Strelka,FreeBayes
 
   profile:
     env:
@@ -114,7 +115,7 @@ jobs:
       - name: Pull docker image
         run: docker pull nfcore/sarek:dev
       - name: Run ${{ matrix.profile }} test
-        run: nextflow run . -profile ${{ matrix.profile }},docker
+        run: nextflow run ${GITHUB_WORKSPACE} -profile ${{ matrix.profile }},docker
 
   tools:
     env:
@@ -145,4 +146,4 @@ jobs:
       - name: Pull docker image
         run: docker pull nfcore/sarek:dev
       - name: Run ${{ matrix.tool }} test
-        run: nextflow run . -profile test_tool,docker --tools ${{ matrix.tool }} ${{ matrix.intervals }}
+        run: nextflow run ${GITHUB_WORKSPACE} -profile test_tool,docker --tools ${{ matrix.tool }} ${{ matrix.intervals }}
@@ -11,12 +11,12 @@ RUN conda env create -f /environment.yml && conda clean -a
 # Add conda installation dir to PATH (instead of doing 'conda activate')
 ENV PATH /opt/conda/envs/nf-core-sarek-snpeff-2.6dev/bin:$PATH
 
-# Dump the details of the installed packages to a file for posterity
-RUN conda env export --name nf-core-sarek-snpeff-2.6dev > nf-core-sarek-snpeff-2.6dev.yml
-
 # Setup default ARG variables
 ARG GENOME=GRCh38
 ARG SNPEFF_CACHE_VERSION=86
 
 # Download Genome
 RUN snpEff download -v ${GENOME}.${SNPEFF_CACHE_VERSION}
+
+# Dump the details of the installed packages to a file for posterity
+RUN conda env export --name nf-core-sarek-snpeff-2.6dev > nf-core-sarek-snpeff-2.6dev.yml
@@ -7,4 +7,4 @@ channels:
   - defaults
 
 dependencies:
-  - snpeff=4.3.1t
+  - bioconda::snpeff=4.3.1t
@@ -11,9 +11,6 @@ RUN conda env create -f /environment.yml && conda clean -a
 # Add conda installation dir to PATH (instead of doing 'conda activate')
 ENV PATH /opt/conda/envs/nf-core-sarek-vep-2.6dev/bin:$PATH
 
-# Dump the details of the installed packages to a file for posterity
-RUN conda env export --name nf-core-sarek-vep-2.6dev > nf-core-sarek-vep-2.6dev.yml
-
 # Setup default ARG variables
 ARG GENOME=GRCh38
 ARG SPECIES=homo_sapiens
@@ -28,3 +25,6 @@ RUN vep_install \
   --CACHE_VERSION ${VEP_VERSION} \
   --CONVERT \
   --NO_BIOPERL --NO_HTSLIB --NO_TEST --NO_UPDATE
+
+# Dump the details of the installed packages to a file for posterity
+RUN conda env export --name nf-core-sarek-vep-2.6dev > nf-core-sarek-vep-2.6dev.yml
@@ -7,5 +7,5 @@ channels:
   - defaults
 
 dependencies:
-  - ensembl-vep=99.2
-  - genesplicer=1.0
+  - bioconda::ensembl-vep=99.2
+  - bioconda::genesplicer=1.0
@@ -118,6 +118,24 @@ G15511    XX    1    D0ENMT    D0ENM_1    pathToFiles/D0ENMAC_1.bam
 G15511    XX    1    D0ENMT    D0ENM_2    pathToFiles/D0ENMAC_2.bam
 ```
 
+## Starting from the BAM prepare recalibration step
+
+To start from the preparation of the recalibration step (`--step prepare_recalibration`), a TSV file for a normal/tumor pair needs to be given as input containing the paths to the non-recalibrated but already mapped BAM files.
+The TSV needs to contain the following columns:
+
+- `subject sex status sample bam bai`
+
+For example, if you have non-recalibrated BAMs and their indexes, you should use a structure like:
+
+```text
+G15511    XX    0    C09DFN    pathToFiles/G15511.C09DFN.md.bam    pathToFiles/G15511.C09DFN.md.bai
+G15511    XX    1    D0ENMT    pathToFiles/G15511.D0ENMT.md.bam    pathToFiles/G15511.D0ENMT.md.bai
+```
+
+When starting Sarek from the mapping step, a TSV file is generated automatically after the `MarkDuplicates` process. This TSV file is stored under `results/Preprocessing/TSV/duplicates_marked_no_table.tsv` and can be used to restart Sarek from the non-recalibrated BAM files. Setting the step `--step prepare_recalibration` will automatically take this file as input.
+
+Additionally, individual TSV files for each sample (`duplicates_marked_no_table_[SAMPLE].tsv`) can be found in the same directory.
+
 ## Starting from the BAM recalibration step
 
 To start from the recalibration step (`--step recalibrate`), a TSV file for a normal/tumor pair needs to be given as input containing the paths to the non-recalibrated but already mapped BAM files.
@@ -132,9 +150,9 @@ G15511    XX    0    C09DFN    pathToFiles/G15511.C09DFN.md.bam    pathToFiles/G
 G15511    XX    1    D0ENMT    pathToFiles/G15511.D0ENMT.md.bam    pathToFiles/G15511.D0ENMT.md.bai pathToFiles/G15511.D0ENMT.md.recal.table
 ```
 
-When starting Sarek from the mapping step, a TSV file is generated automatically after the `MarkDuplicates` process. This TSV file is stored under `results/Preprocessing/TSV/duplicateMarked.tsv` and can be used to restart Sarek from the non-recalibrated BAM files. Setting the step `--step recalibrate` will automatically take this file as input.
+When starting Sarek from the mapping step, a TSV file is generated automatically after the `MarkDuplicates` process. This TSV file is stored under `results/Preprocessing/TSV/duplicates_marked.tsv` and can be used to restart Sarek from the non-recalibrated BAM files. Setting the step `--step recalibrate` will automatically take this file as input.
 
-Additionally, individual TSV files for each sample (`duplicateMarked_[SAMPLE].tsv`) can be found in the same directory.
+Additionally, individual TSV files for each sample (`duplicates_marked_[SAMPLE].tsv`) can be found in the same directory.
 
 ## Starting from the variant calling step
 

@@ -75,31 +75,35 @@ Such files are intermediate and not kept in the final files delivered to users.
 If the pipeline is run with the option `--no_gatk_spark` then [GATK MarkDuplicates](https://software.broadinstitute.org/gatk/documentation/tooldocs/4.1.4.0/picard_sam_markduplicates_MarkDuplicates.php) is used instead.
 
 This directory is the location for the BAM files delivered to users.
-Besides the duplicate marked BAM files, the recalibration tables (`*.recal.table`) are also stored, and can be used to create base recalibrated files.
+Besides the duplicates marked BAM files, the recalibration tables (`*.recal.table`) are also stored, and can be used to create base recalibrated files.
 
 For further reading and documentation see the [data pre-processing workflow from the GATK best practices](https://software.broadinstitute.org/gatk/best-practices/workflow?id=11165).
 
 For all samples:
-**Output directory: `results/Preprocessing/[SAMPLE]/DuplicateMarked`**
+**Output directory: `results/Preprocessing/[SAMPLE]/DuplicatesMarked`**
 
-- `[SAMPLE].md.bam`, `[SAMPLE].md.bai` and `[SAMPLE].recal.table`
-  - BAM file and index with Recalibration Table
+- `[SAMPLE].md.bam` and `[SAMPLE].md.bai`
+  - BAM file and index
 
 ### Base (Quality Score) Recalibration
 
 #### GATK BaseRecalibrator
 
 [GATK BaseRecalibrator](https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_bqsr_BaseRecalibrator.php) generates a recalibration table based on various covariates.
 
-Such files are intermediate and not kept in the final files delivered to users.
+For all samples:
+**Output directory: `results/Preprocessing/[SAMPLE]/DuplicatesMarked`**
+
+- `[SAMPLE].recal.table`
+  - Recalibration Table associated with the duplicates marked BAMs.
 
 #### GATK ApplyBQSR
 
 [GATK ApplyBQSR](https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_bqsr_ApplyBQSR.php) recalibrates the base qualities of the input reads based on the recalibration table produced by the [`BaseRecalibrator`](#gatk-baserecalibrator) tool.
 
 This directory is usually empty; it is the location for the final recalibrated BAM files.
-Recalibrated BAM files are usually 2-3 times larger than the duplicate marked BAM files.
-To re-generate recalibrated BAM file you have to apply the recalibration table delivered to the `DuplicateMarked` directory either within Sarek, or doing this recalibration step yourself.
+Recalibrated BAM files are usually 2-3 times larger than the duplicates marked BAM files.
+To re-generate the recalibrated BAM files, you have to apply the recalibration table delivered to the `DuplicatesMarked` directory, either within Sarek or by doing this recalibration step yourself.
 
 For further reading and documentation see the [data pre-processing workflow from the GATK best practices](https://software.broadinstitute.org/gatk/best-practices/workflow?id=11165).
 
@@ -118,19 +122,19 @@ For further reading and documentation see the [input documentation](https://gith
 For all samples:
 **Output directory: `results/Preprocessing/TSV`**
 
-- `duplicateMarked.tsv` and `recalibrated.tsv`
-  - TSV files to start Sarek from `recalibration` or `variantcalling` steps.
-- `duplicateMarked_[SAMPLE].tsv` and `recalibrated_[SAMPLE].tsv`
-  - TSV files to start Sarek from `recalibration` or `variantcalling` steps for a specific sample.
+- `duplicates_marked_no_table.tsv`, `duplicates_marked.tsv` and `recalibrated.tsv`
+  - TSV files to start Sarek from `prepare_recalibration`, `recalibrate` or `variantcalling` steps.
+- `duplicates_marked_no_table_[SAMPLE].tsv`, `duplicates_marked_[SAMPLE].tsv` and `recalibrated_[SAMPLE].tsv`
+  - TSV files to start Sarek from `prepare_recalibration`, `recalibrate` or `variantcalling` steps for a specific sample.
 
 > `/!\` Only with [`--sentieon`](usage.md#--sentieon)
 
 For all samples:
 **Output directory: `results/Preprocessing/TSV`**
 
-- `recalibrated_sentieon.tsv`
+- `sentieon_deduped.tsv` and `recalibrated_sentieon.tsv`
   - TSV files to start Sarek from `variantcalling` step.
-- `recalibrated_sentieon_[SAMPLE].tsv`
+- `sentieon_deduped_[SAMPLE].tsv` and `recalibrated_sentieon_[SAMPLE].tsv`
   - TSV files to start Sarek from `variantcalling` step for a specific sample.
 
 ## Variant Calling
@@ -549,7 +553,7 @@ For more information about how to use Qualimap bamqc reports, see [Qualimap bamq
 If the pipeline is run with the option `--no_gatk_spark` then [GATK MarkDuplicates](https://software.broadinstitute.org/gatk/documentation/tooldocs/4.1.4.0/picard_sam_markduplicates_MarkDuplicates.php) is used instead.
 
 Collecting duplicate metrics slows down performance.
-To disable them use `--skipQC MarkDuplicates`.
+To disable them use `--skip_qc MarkDuplicates`.
 
 Duplicates can arise during sample preparation _e.g._ library construction using PCR.
 Duplicate reads can also result from a single amplification cluster, incorrectly detected as multiple clusters by the optical sensor of the sequencing instrument.

@@ -1,4 +1,4 @@
-# Variant calling
+# Variant calling <!-- omit in toc -->
 
 - [Germline variant calling](#germline-variant-calling)
 - [Somatic variant calling with tumor - normal pairs](#somatic-variant-calling-with-tumor---normal-pairs)