From 07f8b2fd86558e5d5dc7edd7b7e79cc30a3af97b Mon Sep 17 00:00:00 2001 From: Nikelle Petrillo <38223776+nikellepetrillo@users.noreply.github.com> Date: Thu, 12 Sep 2024 08:09:20 -0400 Subject: [PATCH] Np loom to h5ad snss2 (#1363) * pin all latest docker version * run with h5ad isntead o floom * run with h5ad isntead o floom * run with h5ad isntead o floom * run with h5ad isntead o floom * new docker * merge h5ad files * new path * redo h5ad merge * add verify task * add verify task * add verify task * no more loom * changing loom to h5ad * try updating all instances of warp tools * Updated pipeline_versions.txt with all pipeline version information * test out markdown * formatting * formatting again * formatting again * changelogs and add SingleNucleusSmartSeq2LoomOutput back in * Updated pipeline_versions.txt with all pipeline version information * changelogs * Updated pipeline_versions.txt with all pipeline version information --------- Co-authored-by: GitHub Action --- pipeline_versions.txt | 16 +-- pipelines/skylab/atac/atac.changelog.md | 5 + pipelines/skylab/atac/atac.wdl | 2 +- .../skylab/multiome/Multiome.changelog.md | 4 + pipelines/skylab/multiome/Multiome.wdl | 2 +- pipelines/skylab/optimus/Optimus.changelog.md | 4 + pipelines/skylab/optimus/Optimus.wdl | 4 +- .../skylab/paired_tag/PairedTag.changelog.md | 4 + pipelines/skylab/paired_tag/PairedTag.wdl | 2 +- .../skylab/slideseq/SlideSeq.changelog.md | 5 + pipelines/skylab/slideseq/SlideSeq.wdl | 4 +- .../MultiSampleSmartSeq2.changelog.md | 5 + .../MultiSampleSmartSeq2.wdl | 2 +- ...iSampleSmartSeq2SingleNucleus.changelog.md | 5 + .../MultiSampleSmartSeq2SingleNucleus.wdl | 28 ++-- .../SmartSeq2SingleSample.changelog.md | 5 + .../SmartSeq2SingleSample.wdl | 2 +- tasks/skylab/FastqProcessing.wdl | 2 +- tasks/skylab/H5adUtils.wdl | 134 ++++++++++++++++++ tasks/skylab/LoomUtils.wdl | 7 +- tasks/skylab/Metrics.wdl | 4 +- ...erifyMultiSampleSmartSeq2SingleNucleus.wdl | 10 +- 
.../TestMultiSampleSmartSeq2SingleNucleus.wdl | 10 +- .../README.md | 46 +++--- 24 files changed, 238 insertions(+), 74 deletions(-) diff --git a/pipeline_versions.txt b/pipeline_versions.txt index c916cdbb41..e675c060e4 100644 --- a/pipeline_versions.txt +++ b/pipeline_versions.txt @@ -1,15 +1,15 @@ Pipeline Name Version Date of Last Commit -MultiSampleSmartSeq2SingleNucleus 1.4.2 2024-08-25-02 -MultiSampleSmartSeq2 2.2.21 2023-04-19 -PairedTag 1.6.0 2024-08-02 -Optimus 7.6.0 2024-08-06 -atac 2.3.0 2024-08-29 +MultiSampleSmartSeq2SingleNucleus 2.0.0 2024-09-11 +MultiSampleSmartSeq2 2.2.22 2024-09-11 +PairedTag 1.6.1 2024-09-11 +Optimus 7.6.1 2024-09-11 +atac 2.3.1 2024-09-11 snm3C 4.0.4 2024-08-06 -SmartSeq2SingleSample 5.1.20 2023-04-19 -Multiome 5.6.0 2024-08-02 +SmartSeq2SingleSample 5.1.21 2024-09-11 +Multiome 5.6.1 2024-09-11 scATAC 1.3.2 2023-08-03 BuildIndices 3.0.0 2023-12-06 -SlideSeq 3.4.0 2024-08-06 +SlideSeq 3.4.1 2024-09-11 BuildCembaReferences 1.0.0 2020-11-15 CEMBA 1.1.7 2024-09-06 GDCWholeGenomeSomaticSingleSample 1.3.3 2024-09-06 diff --git a/pipelines/skylab/atac/atac.changelog.md b/pipelines/skylab/atac/atac.changelog.md index 544fb8ea50..d64620354a 100644 --- a/pipelines/skylab/atac/atac.changelog.md +++ b/pipelines/skylab/atac/atac.changelog.md @@ -1,3 +1,8 @@ +# 2.3.1 +2024-09-11 (Date of Last Commit) + +* Updated warp-tools docker which added create_h5ad_snss2.py to the docker image. 
This change does not affect the atac pipeline + # 2.3.0 2024-08-29 (Date of Last Commit) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index b207e393fb..8918a8d8ad 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -46,7 +46,7 @@ workflow ATAC { String adapter_seq_read3 = "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG" } - String pipeline_version = "2.3.0" + String pipeline_version = "2.3.1" # Determine docker prefix based on cloud provider String gcr_docker_prefix = "us.gcr.io/broad-gotc-prod/" diff --git a/pipelines/skylab/multiome/Multiome.changelog.md b/pipelines/skylab/multiome/Multiome.changelog.md index 98904837e8..843e4baced 100644 --- a/pipelines/skylab/multiome/Multiome.changelog.md +++ b/pipelines/skylab/multiome/Multiome.changelog.md @@ -1,3 +1,7 @@ +# 5.6.1 +2024-09-11 (Date of Last Commit) +* Updated warp-tools docker which added create_h5ad_snss2.py to the docker image. This change does not affect the Multiome pipeline + # 5.6.0 2024-08-02 (Date of Last Commit) diff --git a/pipelines/skylab/multiome/Multiome.wdl b/pipelines/skylab/multiome/Multiome.wdl index d647e82944..3979a4fa7d 100644 --- a/pipelines/skylab/multiome/Multiome.wdl +++ b/pipelines/skylab/multiome/Multiome.wdl @@ -9,7 +9,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils workflow Multiome { - String pipeline_version = "5.6.0" + String pipeline_version = "5.6.1" input { diff --git a/pipelines/skylab/optimus/Optimus.changelog.md b/pipelines/skylab/optimus/Optimus.changelog.md index f64f9eb4bc..9a51ef009d 100644 --- a/pipelines/skylab/optimus/Optimus.changelog.md +++ b/pipelines/skylab/optimus/Optimus.changelog.md @@ -1,3 +1,7 @@ +# 7.6.1 +2024-09-11 (Date of Last Commit) +* Updated warp-tools docker which added create_h5ad_snss2.py to the docker image. 
This change does not affect the Optimus pipeline + # 7.6.0 2024-08-06 (Date of Last Commit) diff --git a/pipelines/skylab/optimus/Optimus.wdl b/pipelines/skylab/optimus/Optimus.wdl index 975439c9f3..eb03dfb30a 100644 --- a/pipelines/skylab/optimus/Optimus.wdl +++ b/pipelines/skylab/optimus/Optimus.wdl @@ -71,7 +71,7 @@ workflow Optimus { # version of this pipeline - String pipeline_version = "7.6.0" + String pipeline_version = "7.6.1" # this is used to scatter matched [r1_fastq, r2_fastq, i1_fastq] arrays @@ -91,7 +91,7 @@ workflow Optimus { String pytools_docker = "pytools:1.0.0-1661263730" String empty_drops_docker = "empty-drops:1.0.1-4.2" String star_docker = "star:1.0.1-2.7.11a-1692706072" - String warp_tools_docker_2_2_0 = "warp-tools:2.2.0" + String warp_tools_docker_2_2_0 = "warp-tools:2.3.0" String star_merge_docker = "star-merge-npz:1.2" #TODO how do we handle these? diff --git a/pipelines/skylab/paired_tag/PairedTag.changelog.md b/pipelines/skylab/paired_tag/PairedTag.changelog.md index ba4a05376c..7ad1571702 100644 --- a/pipelines/skylab/paired_tag/PairedTag.changelog.md +++ b/pipelines/skylab/paired_tag/PairedTag.changelog.md @@ -1,3 +1,7 @@ +# 1.6.1 +2024-09-11 (Date of Last Commit) +* Updated warp-tools docker which added create_h5ad_snss2.py to the docker image. 
This change does not affect the PairedTag pipeline + # 1.6.0 2024-08-02 (Date of Last Commit) diff --git a/pipelines/skylab/paired_tag/PairedTag.wdl b/pipelines/skylab/paired_tag/PairedTag.wdl index 4206f4fabb..c401d25928 100644 --- a/pipelines/skylab/paired_tag/PairedTag.wdl +++ b/pipelines/skylab/paired_tag/PairedTag.wdl @@ -8,7 +8,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils workflow PairedTag { - String pipeline_version = "1.6.0" + String pipeline_version = "1.6.1" input { diff --git a/pipelines/skylab/slideseq/SlideSeq.changelog.md b/pipelines/skylab/slideseq/SlideSeq.changelog.md index 4c49b67467..dbbe866338 100644 --- a/pipelines/skylab/slideseq/SlideSeq.changelog.md +++ b/pipelines/skylab/slideseq/SlideSeq.changelog.md @@ -1,3 +1,8 @@ +# 3.4.1 +2024-09-11 (Date of Last Commit) + +* Updated warp-tools docker which added create_h5ad_snss2.py to the docker image. This change does not affect the SlideSeq pipeline + # 3.4.0 2024-08-06 (Date of Last Commit) diff --git a/pipelines/skylab/slideseq/SlideSeq.wdl b/pipelines/skylab/slideseq/SlideSeq.wdl index c449818881..0ca5d4edc7 100644 --- a/pipelines/skylab/slideseq/SlideSeq.wdl +++ b/pipelines/skylab/slideseq/SlideSeq.wdl @@ -25,7 +25,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils workflow SlideSeq { - String pipeline_version = "3.4.0" + String pipeline_version = "3.4.1" input { Array[File] r1_fastq @@ -48,7 +48,7 @@ workflow SlideSeq { # docker images String pytools_docker = "pytools:1.0.0-1661263730" String picard_cloud_docker = "picard-cloud:2.26.10" - String warp_tools_docker_2_2_0 = "warp-tools:2.2.0" + String warp_tools_docker_2_2_0 = "warp-tools:2.3.0" String star_merge_docker = "star-merge-npz:1.2" String ubuntu_docker = "ubuntu_16_0_4@sha256:025124e2f1cf4d29149958f17270596bffe13fc6acca6252977c572dd5ba01bf" diff --git a/pipelines/skylab/smartseq2_multisample/MultiSampleSmartSeq2.changelog.md b/pipelines/skylab/smartseq2_multisample/MultiSampleSmartSeq2.changelog.md index 
5bc4212b9d..e5960f2ddf 100644 --- a/pipelines/skylab/smartseq2_multisample/MultiSampleSmartSeq2.changelog.md +++ b/pipelines/skylab/smartseq2_multisample/MultiSampleSmartSeq2.changelog.md @@ -1,3 +1,8 @@ +# 2.2.22 +2024-09-11 (Date of Last Commit) + +* Updated warp-tools docker which added create_h5ad_snss2.py to the docker image. This change does not affect the MultiSmartSeq2 pipeline + # 2.2.21 2023-04-19 (Date of Last Commit) diff --git a/pipelines/skylab/smartseq2_multisample/MultiSampleSmartSeq2.wdl b/pipelines/skylab/smartseq2_multisample/MultiSampleSmartSeq2.wdl index 91c9d4f882..0717f23b78 100644 --- a/pipelines/skylab/smartseq2_multisample/MultiSampleSmartSeq2.wdl +++ b/pipelines/skylab/smartseq2_multisample/MultiSampleSmartSeq2.wdl @@ -40,7 +40,7 @@ workflow MultiSampleSmartSeq2 { Boolean paired_end } # Version of this pipeline - String pipeline_version = "2.2.21" + String pipeline_version = "2.2.22" if (false) { String? none = "None" diff --git a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md index 67d38b59b2..f6556b3bbb 100644 --- a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md +++ b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md @@ -1,3 +1,8 @@ +# 2.0.0 +2024-09-11 (Dat of Last Commit) + +* Added h5ad as a format option for the cell by gene matrix output. 
The h5ad has the same layers and global attributes (unstructured data in h5ad) as the previous Loom output + # 1.4.2 2024-08-25-02 (Dat of Last Commit) diff --git a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl index 51d6b6c212..38ae12ff23 100644 --- a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl +++ b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl @@ -5,7 +5,7 @@ import "../../../tasks/skylab/TrimAdapters.wdl" as TrimAdapters import "../../../tasks/skylab/StarAlign.wdl" as StarAlign import "../../../tasks/skylab/Picard.wdl" as Picard import "../../../tasks/skylab/FeatureCounts.wdl" as CountAlignments -import "../../../tasks/skylab/LoomUtils.wdl" as LoomUtils +import "../../../tasks/skylab/H5adUtils.wdl" as H5adUtils import "../../../tasks/broad/Utilities.wdl" as utils workflow MultiSampleSmartSeq2SingleNucleus { @@ -57,7 +57,7 @@ workflow MultiSampleSmartSeq2SingleNucleus { } # Version of this pipeline - String pipeline_version = "1.4.2" + String pipeline_version = "2.0.0" if (false) { String? 
none = "None" @@ -129,7 +129,7 @@ workflow MultiSampleSmartSeq2SingleNucleus { annotation_gtf = annotations_gtf } - call LoomUtils.SingleNucleusSmartSeq2LoomOutput as LoomOutput { + call H5adUtils.SingleNucleusSmartSeq2H5adOutput as H5adOutput { input: input_ids = input_ids, input_names = input_names, @@ -144,28 +144,22 @@ workflow MultiSampleSmartSeq2SingleNucleus { annotation_introns_added_gtf = annotations_gtf } - ### Aggregate the Loom Files Directly ### - call LoomUtils.AggregateSmartSeq2Loom as AggregateLoom { + ### Aggregate the H5ad Files Directly ### + call H5adUtils.AggregateSmartSeq2H5ad as AggregateH5ad { input: - loom_input = LoomOutput.loom_output, - batch_id = batch_id, - batch_name = batch_name, - project_id = if defined(project_id) then select_first([project_id])[0] else none, - project_name = if defined(project_name) then select_first([project_name])[0] else none, - library = if defined(library) then select_first([library])[0] else none, - species = if defined(species) then select_first([species])[0] else none, - organ = if defined(organ) then select_first([organ])[0] else none, - pipeline_version = "MultiSampleSmartSeq2SingleNucleus_v~{pipeline_version}" + h5ad_input = H5adOutput.h5ad_output, + pipeline_version = pipeline_version, + batch_id = batch_id } ### Pipeline output ### output { - # loom output, exon/intron count tsv files and the aligned bam files - File loom_output = AggregateLoom.loom_output_file + # h5ad output, exon/intron count tsv files and the aligned bam files + File h5ad_output = AggregateH5ad.h5ad_output_file File genomic_reference_version = ReferenceCheck.genomic_ref_version - Array[File] exon_intron_count_files = LoomOutput.exon_intron_counts + Array[File] exon_intron_count_files = H5adOutput.exon_intron_counts Array[File] bam_files = RemoveDuplicatesFromBam.output_bam String pipeline_version_out = pipeline_version } diff --git a/pipelines/skylab/smartseq2_single_sample/SmartSeq2SingleSample.changelog.md 
b/pipelines/skylab/smartseq2_single_sample/SmartSeq2SingleSample.changelog.md index 421964d455..b706b6a96e 100644 --- a/pipelines/skylab/smartseq2_single_sample/SmartSeq2SingleSample.changelog.md +++ b/pipelines/skylab/smartseq2_single_sample/SmartSeq2SingleSample.changelog.md @@ -1,3 +1,8 @@ +# 5.1.21 +2024-09-11 (Date of Last Commit) + +* Updated warp-tools docker which added create_h5ad_snss2.py to the docker image. This change does not affect the SmartSeq2SingleSample pipeline + # 5.1.20 2023-04-19 (Date of Last Commit) diff --git a/pipelines/skylab/smartseq2_single_sample/SmartSeq2SingleSample.wdl b/pipelines/skylab/smartseq2_single_sample/SmartSeq2SingleSample.wdl index efec1c4163..b9df384859 100644 --- a/pipelines/skylab/smartseq2_single_sample/SmartSeq2SingleSample.wdl +++ b/pipelines/skylab/smartseq2_single_sample/SmartSeq2SingleSample.wdl @@ -36,7 +36,7 @@ workflow SmartSeq2SingleSample { } # version of this pipeline - String pipeline_version = "5.1.20" + String pipeline_version = "5.1.21" parameter_meta { genome_ref_fasta: "Genome reference in fasta format" diff --git a/tasks/skylab/FastqProcessing.wdl b/tasks/skylab/FastqProcessing.wdl index 20a7169d29..ea7363b738 100644 --- a/tasks/skylab/FastqProcessing.wdl +++ b/tasks/skylab/FastqProcessing.wdl @@ -138,7 +138,7 @@ task FastqProcessingSlidSeq { # Runtime attributes - String docker = "us.gcr.io/broad-gotc-prod/warp-tools:2.0.0" + String docker = "us.gcr.io/broad-gotc-prod/warp-tools:2.3.0" Int cpu = 16 Int machine_mb = 40000 Int disk = ceil(size(r1_fastq, "GiB")*3 + size(r2_fastq, "GiB")*3) + 50 diff --git a/tasks/skylab/H5adUtils.wdl b/tasks/skylab/H5adUtils.wdl index f7cb7338b3..890b044680 100644 --- a/tasks/skylab/H5adUtils.wdl +++ b/tasks/skylab/H5adUtils.wdl @@ -552,4 +552,138 @@ task SingleNucleusSlideseqH5adOutput { output { File h5ad_output = "~{input_id}.h5ad" } +} + +task SingleNucleusSmartSeq2H5adOutput { + input { + #runtime values + String docker = 
"us.gcr.io/broad-gotc-prod/warp-tools:2.3.0" + + Array[File] alignment_summary_metrics + Array[File] dedup_metrics + Array[File] gc_bias_summary_metrics + + # introns counts + Array[File] introns_counts + # exons counts + Array[File] exons_counts + # annotation file + File annotation_introns_added_gtf + # name of the sample + Array[String] input_ids + Array[String]? input_names + String? input_id_metadata_field + String? input_name_metadata_field + + String pipeline_version + Int preemptible = 3 + Int disk = 200 + Int machine_mem_mb = 8000 + Int cpu = 4 + } + + meta { + description: "This task will convert output from the SmartSeq2SingleNucleus pipeline into a loom file. Contrary to the SmartSeq2 single cell where there is only RSEM counts, here we have intronic and exonic counts per gene name" + } + + parameter_meta { + preemptible: "(optional) if non-zero, request a pre-emptible instance and allow for this number of preemptions before running the task on a non preemptible machine" + } + + command <<< + set -euo pipefail + + declare -a introns_counts_files=(~{sep=' ' introns_counts}) + declare -a exons_counts_files=(~{sep=' ' exons_counts}) + declare -a output_prefix=(~{sep=' ' input_ids}) + declare -a alignment_summary_metrics_list=(~{sep=' 'alignment_summary_metrics}) + declare -a dedup_metrics_list=(~{sep=' 'dedup_metrics}) + declare -a gc_bias_summary_metrics_list=(~{sep=' 'gc_bias_summary_metrics}) + + for (( i=0; i<${#introns_counts_files[@]}; ++i)); + do + # creates a table with gene_id, gene_name, intron and exon counts + echo "Running create_snss2_counts_csv." + python /warptools/scripts/create_snss2_counts_csv.py \ + --in-gtf ~{annotation_introns_added_gtf} \ + --intron-counts ${introns_counts_files[$i]} \ + --exon-counts ${exons_counts_files[$i]} \ + -o "${output_prefix[$i]}.exon_intron_counts.tsv" + echo "Success create_snss2_counts_csv." 
+ + # groups the QC file into one file + echo "Running GroupQCs" + GroupQCs -f "${alignment_summary_metrics_list[$i]}" "${dedup_metrics_list[$i]}" "${gc_bias_summary_metrics_list[$i]}" \ + -t Picard -o "${output_prefix[$i]}.Picard_group" + echo "Success GroupQCs" + + # create the loom file + echo "Running create_h5ad_snss2." + python3 /warptools/scripts/create_h5ad_snss2.py \ + --qc_files "${output_prefix[$i]}.Picard_group.csv" \ + --count_results "${output_prefix[$i]}.exon_intron_counts.tsv" \ + --output_h5ad_path "${output_prefix[$i]}" \ + --input_id ${output_prefix[$i]} \ + ~{"--input_id_metadata_field " + input_id_metadata_field} \ + ~{"--input_name_metadata_field " + input_name_metadata_field} \ + --pipeline_version ~{pipeline_version} + + echo "Success create_h5ad_snss2" + done; + >>> + + runtime { + docker: docker + cpu: cpu + memory: "~{machine_mem_mb} MiB" + disks: "local-disk ~{disk} HDD" + disk: disk + " GB" # TES + preemptible: preemptible + } + + output { + Array[File] h5ad_output = glob("*.h5ad") + Array[File] exon_intron_counts = glob("*exon_intron_counts.tsv") + } +} + +task AggregateSmartSeq2H5ad { + input { + Array[File] h5ad_input + String batch_id + String pipeline_version + String docker = "us.gcr.io/broad-gotc-prod/warp-tools:2.3.0" + Int disk = 200 + Int machine_mem_mb = 4000 + Int cpu = 1 + } + + meta { + description: "aggregate the H5AD output" + } + + command { + set -e + + # Merge the h5ad files + python3 /warptools/scripts/ss2_h5ad_merge.py \ + --input-h5ad-files ~{sep=' ' h5ad_input} \ + --output-h5ad-file "~{batch_id}.h5ad" \ + --batch_id ~{batch_id} \ + --pipeline_version ~{pipeline_version} + } + + output { + File h5ad_output_file = "~{batch_id}.h5ad" + } + + runtime { + docker: docker + cpu: cpu + memory: "~{machine_mem_mb} MiB" + disks: "local-disk ~{disk} HDD" + disk: disk + " GB" # TES + preemptible: 3 + maxRetries: 1 + } } \ No newline at end of file diff --git a/tasks/skylab/LoomUtils.wdl b/tasks/skylab/LoomUtils.wdl index 
145b2ebc77..1c13ad7994 100644 --- a/tasks/skylab/LoomUtils.wdl +++ b/tasks/skylab/LoomUtils.wdl @@ -62,7 +62,7 @@ task OptimusLoomGeneration { input { #runtime values - String docker = "us.gcr.io/broad-gotc-prod/warp-tools:1.0.1-1681406657" + String docker = "us.gcr.io/broad-gotc-prod/warp-tools:2.3.0" # name of the sample String input_id # user provided id @@ -214,11 +214,12 @@ task AggregateSmartSeq2Loom { } + task SingleNucleusOptimusLoomOutput { input { #runtime values - String docker = "us.gcr.io/broad-gotc-prod/warp-tools:1.0.1-1681406657" + String docker = "us.gcr.io/broad-gotc-prod/warp-tools:2.3.0" # name of the sample String input_id # user provided id @@ -401,7 +402,7 @@ task SlideSeqLoomOutput { String input_id String pipeline_version - String docker = "us.gcr.io/broad-gotc-prod/warp-tools:1.0.1-1681406657" + String docker = "us.gcr.io/broad-gotc-prod/warp-tools:2.3.0" Int disk_size_gb = 200 Int memory_mb = 18000 Int cpu = 4 diff --git a/tasks/skylab/Metrics.wdl b/tasks/skylab/Metrics.wdl index 1523712912..2f759071dc 100644 --- a/tasks/skylab/Metrics.wdl +++ b/tasks/skylab/Metrics.wdl @@ -165,7 +165,7 @@ task CalculateUMIsMetrics { # runtime values # Did not update docker image as this task uses loom which does not play nice with the changes - String docker = "us.gcr.io/broad-gotc-prod/warp-tools:2.0.1" + String docker = "us.gcr.io/broad-gotc-prod/warp-tools:2.3.0" Int machine_mem_mb = 16000 Int cpu = 8 Int disk = ceil(size(bam_input, "Gi") * 4) @@ -240,7 +240,7 @@ task FastqMetricsSlideSeq { # Runtime attributes - String docker = "us.gcr.io/broad-gotc-prod/warp-tools:2.0.1" + String docker = "us.gcr.io/broad-gotc-prod/warp-tools:2.3.0" Int cpu = 16 Int machine_mb = 40000 Int disk = ceil(size(r1_fastq, "GiB")*3) + 50 diff --git a/verification/VerifyMultiSampleSmartSeq2SingleNucleus.wdl b/verification/VerifyMultiSampleSmartSeq2SingleNucleus.wdl index a5d9b8618e..8d7917f60b 100644 --- a/verification/VerifyMultiSampleSmartSeq2SingleNucleus.wdl +++ 
b/verification/VerifyMultiSampleSmartSeq2SingleNucleus.wdl @@ -6,8 +6,8 @@ workflow VerifyMultiSampleSmartSeq2SingleNucleus { input { Array[File] truth_bams Array[File] test_bams - File truth_loom - File test_loom + File truth_h5ad + File test_h5ad Boolean? done } @@ -21,10 +21,10 @@ workflow VerifyMultiSampleSmartSeq2SingleNucleus { } } - call VerifyTasks.CompareLooms as CompareLooms { + call VerifyTasks.CompareH5adFilesGEX as CompareH5adFiles { input: - test_loom = test_loom, - truth_loom = truth_loom + test_h5ad = test_h5ad, + truth_h5ad = truth_h5ad } output{} diff --git a/verification/test-wdls/TestMultiSampleSmartSeq2SingleNucleus.wdl b/verification/test-wdls/TestMultiSampleSmartSeq2SingleNucleus.wdl index 228b6b1f41..3c98269a4b 100644 --- a/verification/test-wdls/TestMultiSampleSmartSeq2SingleNucleus.wdl +++ b/verification/test-wdls/TestMultiSampleSmartSeq2SingleNucleus.wdl @@ -68,7 +68,7 @@ workflow TestMultiSampleSmartSeq2SingleNucleus { # Collect all of the pipeline outputs into single Array[String] Array[String] pipeline_outputs = flatten([ [ # File outputs - MultiSampleSmartSeq2SingleNucleus.loom_output, + MultiSampleSmartSeq2SingleNucleus.h5ad_output, ], # Array[File] outputs MultiSampleSmartSeq2SingleNucleus.bam_files, @@ -106,9 +106,9 @@ workflow TestMultiSampleSmartSeq2SingleNucleus { results_path = results_path, truth_path = truth_path } - call Utilities.GetValidationInputs as GetLoom { + call Utilities.GetValidationInputs as GetH5ad { input: - input_file = MultiSampleSmartSeq2SingleNucleus.loom_output, + input_file = MultiSampleSmartSeq2SingleNucleus.h5ad_output, results_path = results_path, truth_path = truth_path } @@ -117,8 +117,8 @@ workflow TestMultiSampleSmartSeq2SingleNucleus { input: truth_bams = GetBams.truth_files, test_bams = GetBams.results_files, - truth_loom = GetLoom.truth_file, - test_loom = GetLoom.results_file, + truth_h5ad = GetH5ad.truth_file, + test_h5ad = GetH5ad.results_file, done = CopyToTestResults.done } } diff --git 
a/website/docs/Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/README.md b/website/docs/Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/README.md index ea1f81efae..0838117702 100644 --- a/website/docs/Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/README.md +++ b/website/docs/Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/README.md @@ -15,7 +15,7 @@ slug: /Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/README The Smart-seq2 Single Nucleus Multi-Sample (Multi-snSS2) pipeline was developed in collaboration with the [BRAIN Initiative Cell Census Network](https://biccn.org/) (BICCN) to process single-nucleus RNAseq (snRNAseq) data generated by [Smart-seq2 assays](https://www.nature.com/articles/nmeth.2639). The pipeline's workflow is written in WDL, is freely available in the [WARP repository](https://github.com/broadinstitute/warp/blob/master/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl) on GitHub, and can be run by any compliant WDL runner (e.g. [Crowmell](https://github.com/broadinstitute/cromwell)). -The pipeline is designed to process snRNA-seq data from multiple cells. Overall, the workflow trims paired-end FASTQ files, aligns reads to the genome using a modified GTF, [counts intronic and exonic reads](#6-creating-the-loom-cell-by-gene-matrix), and calculates quality control metrics. +The pipeline is designed to process snRNA-seq data from multiple cells. Overall, the workflow trims paired-end FASTQ files, aligns reads to the genome using a modified GTF, [counts intronic and exonic reads](#6-creating-the-h5ad-cell-by-gene-matrix), and calculates quality control metrics. The pipeline has been scientifically validated by the BRAIN Institute. Read more in the [validation section](#validation). 
@@ -27,7 +27,7 @@ You can run the [Smart-seq2 Single Nucleus Multi-Sample workflow](https://github ## Quick start table | Pipeline features | Description | Source | -|-------------------|---------------------------------------------------------------|-----------------------| +|---|---|---| | Assay type | Smart-seq2 Single Nucleus | [Smart-seq2](https://www.nature.com/articles/nprot.2014.006) | Overall workflow | Quality control and transcriptome quantification. | Code available from the [WARP repository](https://github.com/broadinstitute/warp/tree/develop/pipelines/skylab/smartseq2_single_nucleus/SmartSeq2SingleNucleus.wdl) in GitHub | | Workflow language | WDL | [openWDL](https://github.com/openwdl/wdl) | @@ -94,7 +94,7 @@ Overall, the Multi-snSS2 workflow: 1. Removes duplicate reads. 1. Calculates metrics. 1. Quantifies gene counts. -1. Merges exon counts, intron counts, and metrics into a Loom-formatted matrix. +1. Merges exon counts, intron counts, and metrics into a h5ad-formatted matrix. The tools each task employs in the Multi-snSS2 workflow are detailed in the table below. @@ -102,15 +102,15 @@ To see specific tool parameters, select the task WDL link in the table; then vie | Task name and WDL link | Tool | Software | Description | | --- | --- | --- | --- | -| [CheckInputs.checkInputArrays](https://github.com/broadinstitute/warp/blob/master/tasks/skylab/CheckInputs.wdl) | --- | Bash | Checks the inputs and initiates the per cell processing. | +| [CheckInputs.checkInputArrays](https://github.com/broadinstitute/warp/blob/master/tasks/skylab/CheckInputs.wdl) | --- | Bash | Checks the inputs and initiates the per cell processing. | | [StarAlign.STARGenomeRefVersion](https://github.com/broadinstitute/warp/tree/master/tasks/skylab/StarAlign.wdl) | --- | Bash | Reads the `tar_star_reference` file to obtain the genomic reference source, build version, and annotation version. 
| | [TrimAdapters.TrimAdapters](https://github.com/broadinstitute/warp/tree/master/tasks/skylab/TrimAdapters.wdl) | [fastq-mcf](https://github.com/ExpressionAnalysis/ea-utils/tree/master/clipper) | [ea-utils](https://github.com/ExpressionAnalysis/ea-utils) | Trims adapter sequences from the FASTQ inputs | | [StarAlign.StarAlignFastqMultisample](https://github.com/broadinstitute/warp/tree/master/tasks/skylab/StarAlign.wdl) | STAR | [STAR](https://github.com/alexdobin/STAR) | Aligns reads to the genome. | | [Picard.RemoveDuplicatesFromBam](https://github.com/broadinstitute/warp/tree/master/tasks/skylab/Picard.wdl) | MarkDuplicates, AddOrReplaceReadGroups | [Picard](https://broadinstitute.github.io/picard/) | Removes duplicate reads, producing a new BAM output; adds regroups to deduplicated BAM. | | [Picard.CollectMultipleMetricsMultiSample](https://github.com/broadinstitute/warp/tree/master/tasks/skylab/Picard.wdl) | CollectMultipleMetrics | [Picard](https://broadinstitute.github.io/picard/) | Collects QC metrics on the deduplicated BAM files. | | [CountAlignments.CountAlignments](https://github.com/broadinstitute/warp/tree/master/tasks/skylab/FeatureCounts.wdl) | FeatureCounts | [Subread](http://subread.sourceforge.net/), Python 3 | Uses a custom GTF with featureCounts and Python to mark introns, create a BAM that has alignments spanning intron-exon junctions removed, and counts exons using the custom BAM and by excluding intron tags. | -| [LoomUtils.SingleNucleusSmartSeq2LoomOutput](https://github.com/broadinstitute/warp/blob/master/tasks/skylab/LoomUtils.wdl) | Custom script: [ss2_loom_merge.py](https://github.com/broadinstitute/warp-tools/blob/develop/tools/scripts/ss2_loom_merge.py) | Python 3 | Creates the matrix files (Loom format) for each sample. 
| -| [LoomUtils.AggregateSmartSeq2Loom](https://github.com/broadinstitute/warp/blob/master/tasks/skylab/LoomUtils.wdl) | Custom script: [ss2_loom_merge.py](https://github.com/broadinstitute/warp-tools/blob/develop/tools/scripts/ss2_loom_merge.py) | Python 3 | Aggregates the matrix files (Loom format) for each sample to produce one final Loom output. | +| [H5adUtils.SingleNucleusSmartSeq2H5adOutput](https://github.com/broadinstitute/warp/blob/master/tasks/skylab/H5adUtils.wdl) | Custom script: [create_h5ad_snss2.py](https://github.com/broadinstitute/warp-tools/blob/develop/tools/scripts/create_h5ad_snss2.py) | Python 3 | Creates the matrix files (h5ad format) for each sample. | +| [H5adUtils.AggregateSmartSeq2H5ad](https://github.com/broadinstitute/warp/blob/master/tasks/skylab/H5adUtils.wdl) | Custom script: [ss2_h5ad_merge.py](https://github.com/broadinstitute/warp-tools/blob/develop/tools/scripts/ss2_h5ad_merge.py) | Python 3 | Aggregates the matrix files (h5ad format) for each sample to produce one final h5ad output. | #### 1. Trimming adapters The TrimAdapters task uses the adapter list reference file to run the [fastq-mcf](https://github.com/ExpressionAnalysis/ea-utils/tree/master/clipper) tool. This tool identifies the adapters in the input FASTQ files and performs clipping by using a subsampling parameter of 200,000 reads. The task outputs the trimmed FASTQ files which are then used for alignment. @@ -122,7 +122,7 @@ The StarAlignFastq task runs the STAR aligner on the trimmed FASTQ files. The ST The RemoveDuplicatesFromBam task removes multi-mapped reads, optical duplicates, and PCR duplicates from the aligned BAM. It then adds readgroup information and creates a new, coordinate-sorted aligned BAM output. #### 4. Collecting metrics -The CollectMultipleMetrics task uses the Picard tool CollectMultipleMetrics to perform QC on the deduplicated BAM file. These metrics are copied to the final cell-by-gene matrix output (Loom file). 
A detailed list of these metrics can be found in the [Multi-snSS2 Count Matrix Overview](./count-matrix-overview.md). +The CollectMultipleMetrics task uses the Picard tool CollectMultipleMetrics to perform QC on the deduplicated BAM file. These metrics are copied to the final cell-by-gene matrix output (h5ad file). A detailed list of these metrics can be found in the [Multi-snSS2 Count Matrix Overview](./count-matrix-overview.md). #### 5. Counting genes The CountAlignments task uses the featureCounts package to count introns and exons. First, the featureCounts tool counts intronic alignments in the deduplicated BAM using a custom GTF with annotated introns. The tool flags intronic alignments if they overlap an annotated intron by a minimum of 3 bp. @@ -131,28 +131,28 @@ Next, following pipeline processes established by the BICCN Whole Mouse Brain Wo Lastly, featureCounts uses the intermediate BAM with junctions removed to count exons. The final outputs of this step include a cell-by-gene matrix of intronic counts, a cell-by-gene matrix of exonic counts, and summary metrics for the different count types. -#### 6. Creating the Loom cell by gene matrix -The LoomUtils task combines the Picard metrics (alignment_summary_metrics, deduplication metrics, and the G/C bias summary metrics) with the featureCount exon and intron counts to create a Loom formatted cell-by-gene count matrix. +#### 6. Creating the h5ad cell by gene matrix +The H5adUtils task combines the Picard metrics (alignment_summary_metrics, deduplication metrics, and the G/C bias summary metrics) with the featureCount exon and intron counts to create an h5ad formatted cell-by-gene count matrix. Read full details for all the metrics in the [Multi-snSS2 Count Matrix Overview](./count-matrix-overview.md). -The cell-by-gene matrix can be examined using [Loompy software](https://linnarssonlab.org/loompy/index.html). Exonic counts are stored in the main Loom matrix which is unnamed by default. 
They are the default return value of the `loompy.connect()` command. Intronic counts are stored in the Loom as an additional layer which is named `intron_counts`. +The cell-by-gene matrix can be examined using [anndata software](https://anndata.readthedocs.io/en/latest/). Exonic counts are stored in the main h5ad data matrix, which is accessed through the `X` attribute of the AnnData object returned by `anndata.read_h5ad()`. Intronic counts are stored in the h5ad as an additional layer which is named `intron_counts`. -You can also access exonic and intronic counts using Loompy's `layers()` method. For example, `loompy.connect.layers[“”]` will return the exonic counts from the output Loom file. Similarly, `loompy.connect.layers[“intron_counts”]` will return the intronic counts from the output Loom. +You can also access exonic and intronic counts using the AnnData `X` and `layers` attributes. For example, `adata.X` will return the exonic counts from the output h5ad file. Similarly, `adata.layers[“intron_counts”]` will return the intronic counts from the output h5ad. Whole gene counts (which include both intronic and exonic counts) can be accessed by adding the intronic and exonic counts together. -Below is example Loompy code for accessing the Loom's exonic, intronic, and whole gene counts. +Below is example anndata code for accessing the h5ad's exonic, intronic, and whole gene counts. 
```python -import loompy -ds = loompy.connect("/PATH/TO/File.loom") -count_exons = ds[:,:] #geneXcell table for the exonic read counts -count_introns = ds.layer["intron_counts"] #geneXcell table for the intronic read counts -gene_counts = count_exons + count_introns +import anndata +adata = anndata.read_h5ad("/PATH/TO/File.h5ad") +count_exons = adata.X # cell x gene table for the exonic read counts +count_introns = adata.layers["intron_counts"] # cell x gene table for the intronic read counts ``` +If you would like to get the counts for both introns and exons, you can sum the two matrices together (for example, `gene_counts = count_exons + count_introns`). -To read more about the Loom file format and use of layers, see the [Loompy documentation](https://linnarssonlab.org/loompy/index.html). +To read more about the h5ad file format and use of layers, see the [h5ad documentation](https://anndata.readthedocs.io/en/latest/). #### 7. Outputs @@ -160,16 +160,14 @@ The table below details the final outputs of the Multi-snSS2 workflow. | Output variable name | Description | Type | | --- | --- | --- | -| loom_output | Cell-by-gene count matrix that includes the raw exon counts (in matrix), intron counts (in matrix layer), cell metrics (column attributes) and gene IDs (row attributes). | Loom | -| exon_intron_count_files | Array of TXT files (one per cell) that contain intronic and exonic counts. | Array [TXT]| +| h5ad_output | Cell-by-gene count matrix that includes the raw exon counts (in matrix), intron counts (in matrix layer), cell metrics (column attributes) and gene IDs (row attributes). | h5ad | +| exon_intron_count_files | Array of TXT files (one per cell) that contain intronic and exonic counts. | Array [TXT]| | bam_files | Array of genome-aligned BAM files (one for each cell) generated with STAR. | Array [BAM]| | pipeline_version_out | Version of the processing pipeline run on this data. | String | -The Loom matrix is the default output. 
See the [create_loom_snss2.py](https://github.com/broadinstitute/warp-tools/blob/develop/tools/scripts/create_loom_snss2.py) script for the detailed code. This matrix contains the count matrices, as well as the gene and cell metrics detailed in the [Multi-snSS2 Count Matrix Overview](./count-matrix-overview.md). +The h5ad matrix is the default output. See the [create_h5ad_snss2.py](https://github.com/broadinstitute/warp-tools/blob/develop/tools/scripts/create_h5ad_snss2.py) script for the detailed code. This matrix contains the count matrices, as well as the gene and cell metrics detailed in the [Multi-snSS2 Count Matrix Overview](./count-matrix-overview.md). -To facilitate downstream analysis, the output Loom file contains both gene names and gene IDs. - -The output Loom matrix can be converted to an H5AD file using a [custom script](https://github.com/broadinstitute/warp-tools/blob/develop/tools/scripts/loom_to_h5ad.py) available in the [warp-tools GitHub repository](https://github.com/broadinstitute/warp-tools). +To facilitate downstream analysis, the output h5ad file contains both gene names and gene IDs. ## Validation The Multi-snSS2 pipeline was scientifically validated by the BRAIN Initiatives Cell Census Network (BICCN) 2.0 Whole Mouse Brain Working Group.