diff --git a/.gitignore b/.gitignore index 097200a..c8c2326 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ dist/* *.tar.bz2 *.dylib *.tmp +*.egg-info/* diff --git a/.travis.yml b/.travis.yml index 23584ce..3c3e58b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,6 +11,7 @@ jobs: cache: directories: - ~/miniconda + - ~/Data before_install: # Set conda path info @@ -29,6 +30,49 @@ before_install: wget https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O $HOME/download/miniconda.sh; fi; fi; + # download test data for scatac + - TEST_DATA_PATH=$HOME/Data; + - TEST_DATA_SCATAC_PATH=$TEST_DATA_PATH/atac_pbmc_500_v1_fastqs_sampling; + - TEST_DATA_GIGGLE_PATH=$TEST_DATA_PATH/giggle.all; + - TEST_DATA_SCATAC_REFERENCE_PATH=$TEST_DATA_PATH/Refdata_scATAC_MAESTRO_GRCh38_1.1.0; + - if [[ -d $TEST_DATA_PATH ]]; then + echo "Directory Data already exists"; + else + mkdir -p $TEST_DATA_PATH; + fi; + - if [[ -d $TEST_DATA_SCATAC_PATH ]]; then + echo "scATAC test data already available from cache"; + else + cd $TEST_DATA_PATH; + echo "downloading scatac test data"; + wget http://cistrome.org/~chenfei/MAESTRO/atac_pbmc_500_v1_fastqs_sampling.tar.gz; + echo "decompressing scatac test data"; + tar -xvzf atac_pbmc_500_v1_fastqs_sampling.tar.gz; + rm atac_pbmc_500_v1_fastqs_sampling.tar.gz; + cd ../; + fi; + - if [[ -d $TEST_DATA_GIGGLE_PATH ]]; then + echo "scATAC giggle index data already available from cache"; + else + cd $TEST_DATA_PATH; + echo "downloading giggle index"; + wget http://cistrome.org/~chenfei/MAESTRO/giggle.all.tar.gz; + echo "decompressing giggle index"; + tar -xvzf giggle.all.tar.gz; + rm giggle.all.tar.gz; + cd ../; + fi; + - if [[ -d $TEST_DATA_SCATAC_REFERENCE_PATH ]]; then + echo "scATAC reference genome data already available from cache"; + else + cd $TEST_DATA_PATH; + echo "downloading reference genome"; + wget http://cistrome.org/~chenfei/MAESTRO/Refdata_scATAC_MAESTRO_GRCh38_1.1.0.tar.gz; + echo "decompressing reference genome"; + tar -xvzf Refdata_scATAC_MAESTRO_GRCh38_1.1.0.tar.gz; + rm Refdata_scATAC_MAESTRO_GRCh38_1.1.0.tar.gz; + cd ../; + fi; # install the package and dependencies: install: @@ -84,6 +128,10 @@ script: - MAESTRO -v - R -e "library(MAESTRO);library(Seurat)" - R -e "library(org.Hs.eg.db);library(org.Mm.eg.db)" + # test python utility scripts + - cd test + - bash test.sh + - cd .. # We also need more testing here! # the following codes will upload when all the above is successful and @@ -102,7 +150,7 @@ after_success: - mamba install anaconda-client - | # Only upload builds from tags - if [[ $TRAVIS_PULL_REQUEST == false && $TRAVIS_REPO_SLUG == "liulab-dfci/MAESTRO" + if [[ $TRAVIS_PULL_REQUEST == false && $TRAVIS_REPO_SLUG == "dongqingsun/MAESTRO" && $TRAVIS_BRANCH == $TRAVIS_TAG && $TRAVIS_TAG != '' ]]; then export ANACONDA_API_TOKEN=$CONDA_UPLOAD_TOKEN anaconda upload bld-dir/**/PACKAGENAME-*.tar.bz2 diff --git a/MAESTRO.egg-info/PKG-INFO b/MAESTRO.egg-info/PKG-INFO new file mode 100644 index 0000000..a60d2f5 --- /dev/null +++ b/MAESTRO.egg-info/PKG-INFO @@ -0,0 +1,17 @@ +Metadata-Version: 1.1 +Name: MAESTRO +Version: 1.2.0 +Summary: MAESTRO(Model-based AnalysEs of Single-cell Transcriptome and RegulOme) is a comprehensive single-cell RNA-seq and ATAC-seq analysis suit built using snakemake. +Home-page: https://github.com/chenfeiwang/MAESTRO +Author: Chenfei Wang, Dongqing Sun +Author-email: UNKNOWN +License: GPL-3.0 +Description: UNKNOWN +Platform: UNKNOWN +Classifier: Development Status :: 4 - Beta +Classifier: Environment :: Console +Classifier: Intended Audience :: Science/Research +Classifier: License :: OSI Approved :: GPL License +Classifier: Natural Language :: English +Classifier: Programming Language :: Python :: 3 +Classifier: Topic :: Scientific/Engineering :: Bio-Informatics diff --git a/MAESTRO.egg-info/SOURCES.txt b/MAESTRO.egg-info/SOURCES.txt new file mode 100644 index 0000000..7dc7e76 --- /dev/null +++ b/MAESTRO.egg-info/SOURCES.txt @@ -0,0 +1,27 @@ +README.md +setup.py +MAESTRO/MAESTRO +MAESTRO/MAESTRO_PipeInit.py +MAESTRO/__init__.py +MAESTRO/integrate_HTMLReport.py +MAESTRO/scATAC_10x_BarcodeCorrect.py +MAESTRO/scATAC_10x_PeakCount.py +MAESTRO/scATAC_BamAddTag.py +MAESTRO/scATAC_FragmentCorrect.py +MAESTRO/scATAC_FragmentGenerate.py +MAESTRO/scATAC_Genescore.py +MAESTRO/scATAC_H5Process.py +MAESTRO/scATAC_HTMLReport.py +MAESTRO/scATAC_QC.py +MAESTRO/scATAC_microfluidic_PeakCount.py +MAESTRO/scATAC_microfluidic_QC.py +MAESTRO/scATAC_sci_BarcodeExtract.py +MAESTRO/scATAC_utility.py +MAESTRO/scRNA_AnalysisPipeline.py +MAESTRO/scRNA_HTMLReport.py +MAESTRO/scRNA_QC.py +MAESTRO/scRNA_utility.py +MAESTRO.egg-info/PKG-INFO +MAESTRO.egg-info/SOURCES.txt +MAESTRO.egg-info/dependency_links.txt +MAESTRO.egg-info/top_level.txt \ No newline at end of file diff --git a/MAESTRO.egg-info/dependency_links.txt b/MAESTRO.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/MAESTRO.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/MAESTRO.egg-info/top_level.txt b/MAESTRO.egg-info/top_level.txt new file mode 100644 index 0000000..4767e77 --- /dev/null +++ b/MAESTRO.egg-info/top_level.txt @@ -0,0 +1 @@ +MAESTRO diff --git a/MAESTRO/MAESTRO b/MAESTRO/MAESTRO index cf7d1ba..818ccfa 100644 --- a/MAESTRO/MAESTRO +++ b/MAESTRO/MAESTRO @@ -5,7 +5,7 @@ # @Last Modified by: Dongqing Sun # @Last Modified time: 2020-02-28 13:58:10 -version = "1.1.0" +version = "1.2.1" import logging import sys, os @@ -13,6 +13,7 @@ import shutil import argparse as ap from MAESTRO.MAESTRO_PipeInit import * +from MAESTRO.MAESTRO_ParameterValidate import * from MAESTRO.scATAC_H5Process import * from MAESTRO.scATAC_Genescore import genescore_parser, genescore from MAESTRO.scATAC_10x_PeakCount import peakcount_parser, peakcount @@ -47,7 +48,7 @@ def main(): scrna_analysis_parser(subparsers) - logging.basicConfig(format="%(message)s", level=logging.INFO, stream=sys.stderr) + logging.basicConfig(format="%(levelname)s: %(message)s", stream=sys.stderr) args = parser.parse_args() @@ -55,9 +56,11 @@ def main(): print(version) exit(0) elif args.subcommand == "scatac-init": + scatac_validator(args) scatac_config(args) elif args.subcommand == "scrna-init": + scrna_validator(args) scrna_config(args) elif args.subcommand == "integrate-init": diff --git a/MAESTRO/MAESTRO_ParameterValidate.py b/MAESTRO/MAESTRO_ParameterValidate.py new file mode 100644 index 0000000..295c13f --- /dev/null +++ b/MAESTRO/MAESTRO_ParameterValidate.py @@ -0,0 +1,138 @@ +# -*- coding: utf-8 -*- +# @Author: Dongqing Sun +# @E-mail: Dongqingsun96@gmail.com +# @Date: 2020-07-19 17:20:32 +# @Last Modified by: Dongqing Sun +# @Last Modified time: 2020-07-20 13:49:05 + + +import sys +import os +import re +import logging +from argparse import ArgumentError + + +def scatac_validator(args): + """ + Validate parameters from scatac-init argument parsers. + """ + if args.platform == "10x-genomics": + if args.format == "fastq": + if args.fastq_dir == "": + logging.error("--fastq-dir is required. Please specify the directory where fastq files are stored!") + exit(1) + if args.fastq_prefix == "": + logging.error("--fastq-prefix is required. Please provide the sample name of fastq files!") + exit(1) + if args.fasta == "": + logging.error("--fasta is required if fastq files are provided!") + exit(1) + if args.whitelist == "": + logging.error("--whitelist is required for 10x-genomics data!") + exit(1) + if args.format == "bam": + if args.bam == "": + logging.error("--bam is required. Please provide the bam file with CB tag!") + exit(1) + if args.format == "fragments": + if args.frag == "": + logging.error("--frag is required. Please provide the fragment file generated by CellRanger ATAC!") + exit(1) + + if args.platform == "sci-ATAC-seq": + if args.format == "fastq": + if args.fastq_dir == "": + logging.error("--fastq-dir is required. Please specify the directory where fastq files are stored!") + exit(1) + if args.fastq_prefix == "": + logging.error("--fastq-prefix is required. Please provide the sample name of fastq files!") + exit(1) + if args.fasta == "": + logging.error("--fasta is required if fastq files are provided!") + exit(1) + if args.format == "bam": + if args.bam == "": + logging.error("--bam is required. Please provide the bam file with CB tag!") + exit(1) + if args.format == "fragments": + logging.error("Format of 'fragments' is supported only when the platform is '10x-genomics'.") + exit(1) + + if args.platform == "microfluidic": + if args.format == "fastq": + if args.fastq_dir == "": + logging.error("--fastq-dir is required. Please specify the directory where fastq files are stored!") + exit(1) + if args.fasta == "": + logging.error("--fasta is required if fastq files are provided!") + exit(1) + if args.format == "bam": + logging.error("Format of 'bam' is supported when the platform is '10x-genomics' or 'sci-ATAC-seq'.") + exit(1) + if args.format == "fragments": + logging.error("Format of 'fragments' is supported only when the platform is '10x-genomics'.") + exit(1) + + if args.signature not in ['human.immune.CIBERSORT', 'mouce.brain.ALLEN', 'mouse.all.facs.TabulaMuris', 'mouse.all.droplet.TabulaMuris']: + if os.path.exists(args.signature): + pass + else: + logging.error("Please specify the signature built in MAESTRO or provide customized signature file. See --signature help for more details!") + exit(1) + + +def scrna_validator(args): + """ + Validate parameters from scrna-init argument parsers. + """ + + if args.platform == "10x-genomics": + if args.fastq_dir == "": + logging.error("--fastq-dir is required. Please specify the directory where fastq files are stored!") + exit(1) + if args.fastq_prefix == "": + logging.error("--fastq-prefix is required. Please provide the sample name of fastq files!") + exit(1) + if args.whitelist == "": + logging.error("--whitelist is required for 10x-genomics data!") + exit(1) + + if args.platform == "Dropseq": + if args.fastq_dir == "": + logging.error("--fastq-dir is required for Dropsea data. Please specify the directory where fastq files are stored!") + exit(1) + if args.fastq_barcode == "": + logging.error("--fastq-barcode is required for Dropsea data. Please specify the barcode fastq file!") + exit(1) + if args.fastq_transcript == "": + logging.error("--fastq-transcript is required for Dropsea data. Please specify the transcript fastq file!") + exit(1) + if args.whitelist == "": + logging.error("--whitelist is required for Dropsea data. Please provide the barcode whitelist.") + exit(1) + + if args.platform == "Smartseq2": + if args.fastq_dir == "": + logging.error("--fastq-dir is required. Please specify the directory where fastq files are stored!") + exit(1) + if args.rsem == "": + logging.error("--rsem is required. Please provide the prefix of transcript references for RSEM. See --rsem help for more details.") + exit(1) + + if args.lisamode == "local": + if args.lisaenv == "": + logging.error("--lisaenv is required when lisamode is 'local'. Please specify the name of LISA environment!") + exit(1) + if args.condadir == "": + logging.error("--condadir is required when lisamode is 'local'. Please specify the directory where miniconda or anaconda is installed!") + exit(1) + + if args.signature not in ['human.immune.CIBERSORT', 'mouce.brain.ALLEN', 'mouse.all.facs.TabulaMuris', 'mouse.all.droplet.TabulaMuris']: + if os.path.exists(args.signature): + pass + else: + logging.error("Please specify the signature built in MAESTRO or provide customized signature file. See --signature help for more details!") + exit(1) + + diff --git a/MAESTRO/MAESTRO_PipeInit.py b/MAESTRO/MAESTRO_PipeInit.py index 58f37f5..e14a4f7 100644 --- a/MAESTRO/MAESTRO_PipeInit.py +++ b/MAESTRO/MAESTRO_PipeInit.py @@ -3,7 +3,7 @@ # @E-mail: Dongqingsun96@gmail.com # @Date: 2020-02-23 19:40:27 # @Last Modified by: Dongqing Sun -# @Last Modified time: 2020-06-11 22:19:45 +# @Last Modified time: 2020-07-20 16:47:23 import os @@ -27,9 +27,20 @@ def scatac_parser(subparsers): group_input.add_argument("--platform", dest = "platform", default = "10x-genomics", choices = ["10x-genomics", "sci-ATAC-seq", "microfluidic"], help = "Platform of single cell ATAC-seq. DEFAULT: 10x-genomics.") - group_input.add_argument("--fastq-dir", dest = "fastq_dir", type = str, required = True, + group_input.add_argument("--format", dest = "format", default = "fastq", + choices = ["fastq", "bam", "fragments"], + help = "The format of input files. Users can start with sequencing fastq files, " + "bam files with CB tag or fragments.tsv.gz file generated by CellRanger ATAC. " + "If the platform is 10x-genomics, users can start with sequencing fastq files, bam files with CB tag or fragments.tsv.gz file generated by CellRanger ATAC. " + "If the platform is sci-ATAC-seq, users can start with sequencing fastq files, bam files with CB tag. " + "If the platform is microfluidic, users can only start with sequencing fastq files. " + "If the format is 'fastq', --fastq-dir and --fastq-prefix need to be set. " + "If the format is 'bam', --bam needs to be set. " + "If the format is 'fragments', --frag needs to be set. " + "DEFAULT: fastq.") + group_input.add_argument("--fastq-dir", dest = "fastq_dir", type = str, default = "", help = "Directory where fastq files are stored.") - group_input.add_argument("--fastq-prefix", dest = "fastq_prefix", type = str, default = "", + group_input.add_argument("--fastq-prefix", dest = "fastq_prefix", type = str, default = "", help = "Sample name of fastq file (required for the platform of '10x-genomics' or 'sci-ATAC-seq'). " "When the platform is '10x-genomics', if there is a file named pbmc_1k_v2_S1_L001_I1_001.fastq.gz, the prefix is 'pbmc_1k_v2'. " "If the platform is 'sci-ATAC-seq', there are two ways to provide fastq files. " @@ -38,6 +49,11 @@ def scatac_parser(subparsers): "in the format of '@ReadName:CellBarcode:OtherInformation'. For example, @rd.1:TCTCCCGCCGAGGCTGACTGCATAAGGCGAAT:SHEN-MISEQ02:1:1101:15311:1341. " "The other way is to provide 10x-like fastq files which should contain three fastq files -- prefix_R1.fastq, prefix_R2.fastq and prefix_R3.fastq. " "In this way, read1, barcode and read2 are associated with R1, R2, R3, respectively.") + group_input.add_argument("--bam", dest = "bam", type = str, default = "", + help = "The bam file with cell barcode (CB) tag (required when the format is set as 'bam').") + group_input.add_argument("--frag", dest = "frag", type = str, default = "", + help = "The fragment file generated by CellRanger ATAC (required when the format is set as 'fragments'). " + "For example, fragments.tsv.gz.") group_input.add_argument("--species", dest = "species", default = "GRCh38", choices = ["GRCh38", "GRCm38"], type = str, help = "Specify the genome assembly (GRCh38 for human and GRCm38 for mouse). DEFAULT: GRCh38.") @@ -70,7 +86,7 @@ def scatac_parser(subparsers): "Please download the annotation file from " "http://cistrome.org/~chenfei/MAESTRO/giggle.tar.gz and decompress it.") group_reference.add_argument("--fasta", dest = "fasta", - required = True, + default = "", help = "Genome fasta file for minimap2. " "Users can just download the fasta file for huamn and mouse from " "http://cistrome.org/~chenfei/MAESTRO/Refdata_scATAC_MAESTRO_GRCh38_1.1.0.tar.gz and " @@ -78,7 +94,7 @@ def scatac_parser(subparsers): "For example, 'Refdata_scATAC_MAESTRO_GRCh38_1.1.0/GRCh38_genome.fa'.") # Barcode library arguments - group_barcode = workflow.add_argument_group("Barcode library arguments, only for platform of 'sci-ATAC-seq'") + group_barcode = workflow.add_argument_group("Barcode library arguments, only for platform of 'sci-ATAC-seq' and '10x-genomics'") group_barcode.add_argument("--whitelist", dest = "whitelist", type = str, default = "", help = "If the platform is 'sci-ATAC-seq' or '10x-genomics', please specify the barcode library (whitelist) " @@ -124,13 +140,27 @@ def scatac_parser(subparsers): "to 10kb (enhancer-based regulation). DEFAULT: 10000.") # Signature file arguments - group_signature = workflow.add_argument_group("Cell signature arguments") - group_signature.add_argument("--signature", dest = "signature", type = str, default = "human.immune.CIBERSORT", - help = "Cell signature file used to annotate cell types. MAESTRO provides several sets of built-in cell signatures. " + group_annotation = workflow.add_argument_group("Cell-type annotation arguments") + group_annotation.add_argument("--annotation", dest = "annotation", action = "store_true", + help = "Whether or not to perform ccell-type annotation. " + "By default (not set), MAESTRO will skip the step of cell-type annotation. " + "If set, please specify the method of cell-type annotation through --method. ") + group_annotation.add_argument("--method", dest = "method", type = str, default = "RP-based", + choices = ["RP-based", "peak-based", "both"], + help = "Method to annotate cell types. MAESTRO provides two strategies to annotate cell types for scATAC-seq data. " + "Users can choose from 'RP-based' and 'peak-based', or choose to run both of them. " + "One is based on gene regulatory potential predicted by RP model. Another is based on the bulk chromatin accessibility data from Cistrome database. " + "If 'RP-based' is set, MAESTRO performs the cell-type annotation using the gene regulatory potential to represent gene expression, " + "and the logFC of gene regulatory potential between one cluster and all the other cells is used to calculate the gene signature scores. " + "If 'peak-based' is set, MAESTRO utilizes GIGGLE to evaluate the enrichment of bulk chromatin accessibility peaks on cluster-specific peaks from scATAC-seq data, " + "and then transfers the Cistrome cluster identity from the most enriched bulk chromatin accessibility data as the cell-type annotation for the scATAC-seq cluster. " + "See the MAESTRO paper for more details. DEFAULT: RP-based. ") + group_annotation.add_argument("--signature", dest = "signature", type = str, default = "human.immune.CIBERSORT", + help = "Cell signature file used to annotate cell types (required when method is set as 'RP-based'). MAESTRO provides several sets of built-in cell signatures. " "Users can choose from ['human.immune.CIBERSORT', 'mouce.brain.ALLEN', 'mouse.all.facs.TabulaMuris', 'mouse.all.droplet.TabulaMuris']. " "Custom cell signatures are also supported. In this situation, users need to provide the file location of cell signatures, " "and the signature file is tab-seperated without header. The first column is cell type, and the second column is signature gene. " - "DEFAULT: human.immune.CIBERSORT.") + "DEFAULT: human.immune.CIBERSORT. ") return @@ -205,7 +235,7 @@ def scrna_parser(subparsers): # Barcode arguments group_barcode = workflow.add_argument_group("Barcode arguments, for platform of 'Dropseq' or '10x-genomics'") - group_barcode.add_argument("--whitelist", dest = "whitelist", type = str, + group_barcode.add_argument("--whitelist", dest = "whitelist", type = str, default = "", help = "If the platform is 'Dropseq' or '10x-genomics', please specify the barcode library (whitelist) " "so that STARsolo can do the error correction and demultiplexing of cell barcodes. " "The 10X Chromium whitelist file can be found inside the CellRanger distribution. " @@ -228,9 +258,9 @@ def scrna_parser(subparsers): # group_regulator.add_argument("--method", dest = "method", type = str, # choices = ["RABIT", "LISA"], default = "LISA", # help = "Method to predict driver regulators.") - group_regulator.add_argument("--method", dest = "method", type = str, - choices = ["LISA"], default = "LISA", - help = "Method to predict driver regulators.") + # group_regulator.add_argument("--method", dest = "method", type = str, + # choices = ["LISA"], default = "LISA", + # help = "Method to predict driver regulators.") # group_regulator.add_argument("--rabitlib", dest = "rabitlib", type = str, # help = "Path of the rabit annotation file required for regulator identification (only required if method is set to RABIT). " # "Please download the annotation file from " @@ -242,11 +272,11 @@ def scrna_parser(subparsers): "the 'local' mode is recommended to run the whole MAESTRO pipeline. " "If the mode is 'local', please provide the name of LISA environment through --lisaenv " "and specify the directory where miniconda or anaconda is installed through --condadir. DEFAULT: local.") - group_regulator.add_argument("--lisaenv", dest = "lisaenv", type = str, default = "lisa", - help = "Name of LISA environment (required if method is set to lisa and lisamode is set to local). DEFAULT: lisa.") - group_regulator.add_argument("--condadir", dest = "condadir", type = str, required = True, + group_regulator.add_argument("--lisaenv", dest = "lisaenv", type = str, default = "", + help = "Name of LISA environment (required if lisamode is set to local). For example, lisa.") + group_regulator.add_argument("--condadir", dest = "condadir", type = str, default = "", help = "Directory where miniconda or anaconda is installed " - "(required if method is set to lisa and lisamode is set to local). For example, '/home/user/miniconda3'.") + "(required if lisamode is set to local). For example, '/home/user/miniconda3'.") # Signature file arguments group_signature = workflow.add_argument_group("Cell signature arguments") @@ -310,10 +340,13 @@ def scatac_config(args): with open(configfile, "w") as configout: configout.write(config_template.render( + platform = args.platform, + format = args.format, fastqdir = os.path.abspath(args.fastq_dir), fastqprefix = args.fastq_prefix, + bam = os.path.abspath(args.bam), + frag = os.path.abspath(args.frag), species = args.species, - platform = args.platform, outprefix = args.outprefix, whitelist = os.path.abspath(args.whitelist), cores = args.cores, @@ -321,6 +354,8 @@ def scatac_config(args): count = args.count_cutoff, frip = args.frip_cutoff, cell = args.cell_cutoff, + annotation = args.annotation, + method = args.method, signature = signature, custompeaks = args.custompeak, custompeaksloc = os.path.abspath(args.custompeak_file), @@ -370,7 +405,6 @@ def scrna_config(args): gene = args.gene_cutoff, cell = args.cell_cutoff, signature = signature, - method = args.method, # rabitlib = os.path.abspath(args.rabitlib), lisamode = args.lisamode, lisaenv = args.lisaenv, diff --git a/MAESTRO/R/scATACseq_pipe.R b/MAESTRO/R/scATACseq_pipe.R index bc098d7..06356ae 100644 --- a/MAESTRO/R/scATACseq_pipe.R +++ b/MAESTRO/R/scATACseq_pipe.R @@ -24,6 +24,12 @@ option_list = list( make_option(c("--species"), type = "character", default = "GRCh38", action = "store", help = "The platform of scRNA-seq." ), + make_option(c("--annotation"), type = "character", default = "False", + action = "store", help = "Whether or not to annotate cell types. Default is False." + ), + make_option(c("--method"), type = "character", default = "RP-based", + action = "store", help = "Method to annotate cell types. ['RP-based', 'peak-based', 'both']" + ), make_option(c("--signature"), type = "character", default = "", action = "store", help = "The cell signature file for celltype annotation. Default is built-in CIBERSORT immune cell signature." ), @@ -49,6 +55,8 @@ sigfile = argue$signature gigglelib = argue$gigglelib species = argue$species fragment = argue$fragment +annotation = argue$annotation +method = argue$method # countmatrix = read.table(argue[1], sep = '\t', header = TRUE, row.names = 1, check.names = FALSE) # RPmatrix = read.table(argue[2], sep = '\t', header = TRUE, row.names = 1, check.names = FALSE) @@ -65,23 +73,41 @@ if(sigfile %in% c("human.immune.CIBERSORT", "mouce.brain.ALLEN", "mouse.all.facs } result = ATACRunSeurat(inputMat = countmatrix, project = prefix, method = "LSI") result$ATAC = ATACAttachGenescore(ATAC = result$ATAC, RPmatrix = RPmatrix) -result$ATAC = ATACAnnotateCelltype(result$ATAC, signatures = signatures) saveRDS(result, paste0(prefix, "_scATAC_Object.rds")) +if (annotation == "True") { + if (method == "RP-based") { + result$ATAC = ATACAnnotateCelltype(result$ATAC, signatures = signatures) + + } + if (method == "peak-based") { + result$ATAC <- ATACAnnotateChromatinAccessibility(ATAC = result$ATAC, + peaks = result$peaks, + project = prefix, + giggle.path = gigglelib, + organism = species) + } + if (method == "both") { + result$ATAC <- ATACAnnotateChromatinAccessibility(ATAC = result$ATAC, + peaks = result$peaks, + project = prefix, + giggle.path = gigglelib, + organism = species) + result$ATAC = ATACAnnotateCelltype(result$ATAC, signatures = signatures) + + } + if ("assign.biological_resource" %in% colnames(result$ATAC@meta.data)) { + p1 <- DimPlot(result$ATAC, label = TRUE, reduction = "umap", group.by = "assign.biological_resource", repel=T, pt.size = 0.5, label.size = 2.5) + ggsave(file.path(paste0(result$ATAC@project.name, "_CistromeTop_annotated.png")), p1, width=7.5, height=4) + } +} + result.tfs = ATACAnnotateTranscriptionFactor(ATAC = result$ATAC, peaks = result$peaks, project = prefix, giggle.path = gigglelib, organism = species) -result$ATAC <- ATACAnnotateChromatinAccessibility(ATAC = result$ATAC, - peaks = result$peaks, - project = prefix, - giggle.path = gigglelib, - organism = species) -if ("biological_resource" %in% colnames(result$ATAC@meta.data)) { - p1 <- DimPlot(result$ATAC, label = TRUE, reduction = "umap", group.by = "biological_resource", repel=T, pt.size = 0.5, label.size = 2.5) - ggsave(file.path(paste0(result$ATAC@project.name, "_CistromeTop_annotated.png")), p1, width=7.5, height=4) -} +saveRDS(result, paste0(prefix, "_scATAC_Object.rds")) if(species == "GRCh38"){ library(TxDb.Hsapiens.UCSC.hg38.knownGene) diff --git a/MAESTRO/R/scATACseq_qc.R b/MAESTRO/R/scATACseq_qc.R index a7252db..b8b7df6 100644 --- a/MAESTRO/R/scATACseq_qc.R +++ b/MAESTRO/R/scATACseq_qc.R @@ -8,7 +8,7 @@ option_list = list( make_option(c("--outdir"), type = "character", default = "MAESTRO", action = "store", help = "The directory where the output files are stored." ), - make_option(c("--bulkstat"), type = "character", default = "flagstat.txt", + make_option(c("--bulkstat"), type = "character", default = "", action = "store", help = "The result of samtools flagstat." ), make_option(c("--singlestat"), type = "character", default = "singlecell.txt", @@ -34,7 +34,9 @@ count_cutoff = argue$countcutoff frip_cutoff = argue$fripcutoff prefix = argue$prefix -ATACReadDistrPlot(stat.filepath = bulkstat_file, name = prefix) +if(bulkstat_file != "") { + ATACReadDistrPlot(stat.filepath = bulkstat_file, name = prefix) +} ATACFragmentSizePlot(filepath = fragment_file, name = prefix) ATACFilteringPlot(filepath = singlestat_file, name = prefix, reads.cutoff = count_cutoff, frip.cutoff = frip_cutoff) diff --git a/MAESTRO/R/scRNAseq_pipe.R b/MAESTRO/R/scRNAseq_pipe.R index d4c5087..4d7a8fb 100644 --- a/MAESTRO/R/scRNAseq_pipe.R +++ b/MAESTRO/R/scRNAseq_pipe.R @@ -25,9 +25,6 @@ option_list = list( make_option(c("--signature"), type = "character", default = "", action = "store", help = "The cell signature file for celltype annotation. Default is built-in CIBERSORT immune cell signature." ), - make_option(c("--rabitlib"), type = "character", default = "", - action = "store", help = "Annotation to run rabit (only if method is set to rabit)." - ), make_option(c("--lisamode"), type = "character", default = "", action = "store", help = "Mode to run LISA (web or local)." ), @@ -50,7 +47,6 @@ prefix = argue$prefix thread = argue$thread method = argue$method sigfile = argue$signature -rabitlib = argue$rabitlib lisamode = argue$lisamode condadir = argue$condadir lisaenv = argue$lisaenv @@ -87,6 +83,6 @@ RNA.res$RNA <- RNAAnnotateCelltype(RNA = RNA.res$RNA, genes = RNA.res$genes, signatures = signatures, min.score = 0.05) saveRDS(RNA.res, paste0(prefix, "_scRNA_Object.rds")) RNA.tfs <- RNAAnnotateTranscriptionFactor(RNA = RNA.res$RNA, genes = RNA.res$genes, project = prefix, - method = method, rabit.path = rabitlib, lisa.mode = lisamode, + method = method, lisa.mode = lisamode, conda.dir = condadir, lisa.envname = lisaenv, organism = species, top.tf = 10) diff --git a/MAESTRO/Snakemake/scATAC/Snakefile b/MAESTRO/Snakemake/scATAC/Snakefile index 256a553..c1df0b4 100644 --- a/MAESTRO/Snakemake/scATAC/Snakefile +++ b/MAESTRO/Snakemake/scATAC/Snakefile @@ -8,7 +8,7 @@ import yaml import sys import os -from MAESTRO.scATAC_utility import get_fastqlist, ENV_PATH, SCRIPT_PATH, RSCRIPT_PATH, getfastq_10x +from MAESTRO.scATAC_utility import is_gzip, get_fastqlist, ENV_PATH, SCRIPT_PATH, RSCRIPT_PATH, getfastq_10x # def qcplot_input(wildcards): # checkpoint_output = checkpoints.scatac_samsplit.get(**wildcards).output[0] @@ -21,12 +21,25 @@ def all_input(wildcards): cluster=glob_wildcards(os.path.join(checkpoint_output, "{cluster}.bam")).cluster) # return checkpoint_output +def all_input_frag(wildcards): + checkpoint_output = checkpoints.scatac_fragcluster.get().output[0] + return expand("Result/Analysis/Cluster/{cluster}_treat_pileup.bdg", + cluster=glob_wildcards(os.path.join(checkpoint_output, "{cluster}.bed")).cluster) + + +macs2_genome = "hs" if config["species"] == "GRCh38" else "mm" + if config["clusterpeaks"]: - rule all: - input: - summaryreport = "Result/" + config["outprefix"] + "_scATAC_report.html", - peakcluster = all_input - # tfscore = "Result/Analysis/" + config["outprefix"] + "_tfscore.txt" + if config["format"] != "fragments": + rule all: + input: + summaryreport = "Result/" + config["outprefix"] + "_scATAC_report.html", + peakcluster = all_input + else: + rule all: + input: + summaryreport = "Result/" + config["outprefix"] + "_scATAC_report.html", + peakcluster = all_input_frag else: rule all: input: @@ -195,13 +208,14 @@ if config["platform"] == "microfluidic": peak = "Result/Analysis/%s_all_peaks.narrowPeak" %(config["outprefix"]), bdg = "Result/Analysis/%s_all_treat_pileup.bdg" %(config["outprefix"]), params: - name = "%s_all" %(config["outprefix"]) + name = "%s_all" %(config["outprefix"]), + genome = macs2_genome log: "Result/Log/%s_macs2_allpeak.log" %(config["outprefix"]) benchmark: "Result/Benchmark/%s_AllPeakCall.benchmark" %(config["outprefix"]) shell: - "macs2 callpeak -g hs --outdir Result/Analysis/ -n {params.name} -B -q 0.05 --nomodel --extsize=50 --SPMR -t {input.bam}" + "macs2 callpeak -f BAMPE -g {params.genome} --outdir Result/Analysis/ -n {params.name} -B -q 0.05 --nomodel --extsize=50 --SPMR --keep-dup all -t {input.bam}" if config["shortpeaks"]: rule scatac_shortfragment: @@ -224,13 +238,14 @@ if config["platform"] == "microfluidic": output: bed = "Result/Analysis/%s_150bp_peaks.narrowPeak" %(config["outprefix"]) params: - name = "%s_150bp" %(config["outprefix"]) + name = "%s_150bp" %(config["outprefix"]), + genome = macs2_genome log: "Result/Log/" + config["outprefix"] + "_macs2_shortpeak.log" benchmark: "Result/Benchmark/%s_ShortPeakCall.benchmark" %(config["outprefix"]) shell: - "macs2 callpeak -g hs --outdir Result/Analysis -n {params.name} -B -q 0.05 --nomodel --extsize=50 --SPMR -t {input.shortbam}" + "macs2 callpeak -f BAMPE -g {params.genome} --outdir Result/Analysis -n {params.name} -B -q 0.05 --nomodel --extsize=50 --SPMR --keep-dup all -t {input.shortbam}" if config["custompeaks"] and config["shortpeaks"]: rule scatac_mergepeak: @@ -317,28 +332,30 @@ if config["platform"] == "microfluidic": "--bam-dir {params.bamdir} --directory {params.outdir} --outprefix {params.outpre} --cores {threads} --species {params.species}" if config["platform"] == "10x-genomics" or config["platform"] == "sci-ATAC-seq": - if config["platform"] == "10x-genomics": - rule scatac_preprocess: - input: - fastqs = config["fastqdir"], - fasta = config["genome"]["fasta"] - output: - r1cat = temp(os.path.join(config["fastqdir"], "%s_R1.fastq" %(config["fastqprefix"]))), - r2cat = temp(os.path.join(config["fastqdir"], "%s_R2.fastq" %(config["fastqprefix"]))), - r3cat = temp(os.path.join(config["fastqdir"], "%s_R3.fastq" %(config["fastqprefix"]))), - params: - r1 = getfastq_10x(config["fastqdir"], config["fastqprefix"])["r1"], - r2 = getfastq_10x(config["fastqdir"], config["fastqprefix"])["barcode"], - r3 = getfastq_10x(config["fastqdir"], config["fastqprefix"])["r3"], - cmd = getfastq_10x(config["fastqdir"], config["fastqprefix"])["command"], - threads: - config["cores"] - benchmark: - "Result/Benchmark/%s_Preprocess.benchmark" %(config["outprefix"]) - shell: - "{params.cmd} {params.r1} > {output.r1cat};" - "{params.cmd} {params.r2} > {output.r2cat};" - "{params.cmd} {params.r3} > {output.r3cat};" + if config["format"] == "fastq" or config["format"] == "bam": + if config["format"] == "fastq": + if config["platform"] == "10x-genomics": + rule scatac_preprocess: + input: + fastqs = config["fastqdir"], + fasta = config["genome"]["fasta"] + output: + r1cat = temp(os.path.join("Result/Tmp", "%s_R1.fastq" %(config["fastqprefix"]))), + r2cat = temp(os.path.join("Result/Tmp", "%s_R2.fastq" %(config["fastqprefix"]))), + r3cat = temp(os.path.join("Result/Tmp", "%s_R3.fastq" %(config["fastqprefix"]))), + params: + r1 = getfastq_10x(config["fastqdir"], config["fastqprefix"])["r1"], + r2 = getfastq_10x(config["fastqdir"], config["fastqprefix"])["barcode"], + r3 = getfastq_10x(config["fastqdir"], config["fastqprefix"])["r3"], + cmd = getfastq_10x(config["fastqdir"], config["fastqprefix"])["command"], + threads: + config["cores"] + benchmark: + "Result/Benchmark/%s_Preprocess.benchmark" %(config["outprefix"]) + shell: + "{params.cmd} {params.r1} > {output.r1cat};" + "{params.cmd} {params.r2} > {output.r2cat};" + "{params.cmd} {params.r3} > {output.r3cat};" # # rule scatac_bulk_unique: @@ -368,114 +385,375 @@ if config["platform"] == "10x-genomics" or config["platform"] == "sci-ATAC-seq": # # "cut -f 1-3 {output.bed} | sort -k1,1 -k2,2n | uniq | grep -v 'chrM' | intersectBed -wa -a - -b {input.promoter} -u | wc -l >> {output.bulkqc};" # # "cut -f 1-3 {output.bed} | sort -k1,1 -k2,2n | uniq | grep -v 'chrM' | intersectBed -wa -a - -b {input.peak} -u | wc -l >> {output.bulkqc};" - if config["platform"] == "sci-ATAC-seq": - rule scatac_preprocess: + if config["platform"] == "sci-ATAC-seq": + # support pair-end sequencing fastqs (_1.fastq, _2.fastq) + rule scatac_preprocess: + input: + fasta = config["genome"]["fasta"], + fastq1 = os.path.join(config["fastqdir"], "%s_1.fastq" %(config["fastqprefix"])), + fastq2 = os.path.join(config["fastqdir"], "%s_2.fastq" %(config["fastqprefix"])), + output: + r1 = temp(os.path.join("Result/Tmp", "%s_R1.fastq" %(config["fastqprefix"]))), + r2 = temp(os.path.join("Result/Tmp", "%s_R2.fastq" %(config["fastqprefix"]))), + r3 = temp(os.path.join("Result/Tmp", "%s_R3.fastq" %(config["fastqprefix"]))), + params: + outdir = "Result/Tmp" + benchmark: + "Result/Benchmark/%s_Preprocess.benchmark" %(config["outprefix"]) + shell: + "python " + SCRIPT_PATH + "/scATAC_sci_BarcodeExtract.py --R1 {input.fastq1} --R2 {input.fastq2} -O {params.outdir}" + + # support 10-like input (_R1.fatsq, _R2.fastq, _R3.fastq) + rule scatac_preprocess_fastqrename: + input: + fasta = config["genome"]["fasta"], + r1 = os.path.join(config["fastqdir"], "%s_R1.fastq" %(config["fastqprefix"])), + r2 = os.path.join(config["fastqdir"], "%s_R2.fastq" %(config["fastqprefix"])), + r3 = os.path.join(config["fastqdir"], "%s_R3.fastq" %(config["fastqprefix"])), + output: + r1 = temp(os.path.join("Result/Tmp", "%s_R1.fastq" %(config["fastqprefix"]))), + r2 = temp(os.path.join("Result/Tmp", "%s_R2.fastq" %(config["fastqprefix"]))), + r3 = temp(os.path.join("Result/Tmp", "%s_R3.fastq" %(config["fastqprefix"]))), + params: + outdir = "Result/Tmp" + benchmark: + "Result/Benchmark/%s_Preprocess.benchmark" %(config["outprefix"]) + shell: + "cp {input.r1} {input.r2} {input.r3} {params.outdir};" + + rule scatac_fqaddbarcode: + input: + r1 = os.path.join("Result/Tmp", "%s_R1.fastq" %(config["fastqprefix"])), + r2 = os.path.join("Result/Tmp", "%s_R2.fastq" %(config["fastqprefix"])), + r3 = os.path.join("Result/Tmp", "%s_R3.fastq" %(config["fastqprefix"])), + output: + r1 = temp(os.path.join("Result/Tmp", "%s_R1.barcoded.fastq" %(config["fastqprefix"]))), + r3 = temp(os.path.join("Result/Tmp", "%s_R3.barcoded.fastq" %(config["fastqprefix"]))), + # r1 = "%s/%s_R1.barcoded.fastq" %(config["fastqdir"], config["fastqprefix"]), + # r3 = "%s/%s_R3.barcoded.fastq" %(config["fastqdir"], config["fastqprefix"]), + benchmark: + "Result/Benchmark/%s_FqAddbarcode.benchmark" %(config["outprefix"]) + shell: + "base=`head -n 2 {input.r2} | tail -n 1 | wc -L`;" + "sinto barcode --barcode_fastq {input.r2} --read1 {input.r1} --read2 {input.r3} -b $base;" + + rule scatac_map: + input: + fasta = config["genome"]["fasta"], + r1 = os.path.join("Result/Tmp", "%s_R1.barcoded.fastq" %(config["fastqprefix"])), + r3 = os.path.join("Result/Tmp", "%s_R3.barcoded.fastq" %(config["fastqprefix"])), + output: + bam = temp("Result/minimap2/%s.sortedByPos.bam" %(config["outprefix"])), + threads: + config["cores"] + benchmark: + "Result/Benchmark/%s_Minimap2.benchmark" %(config["outprefix"]) + shell: + "minimap2 -ax sr -t {threads} {input.fasta} {input.r1} {input.r3} " + "| samtools view --threads {threads} -b " + "| samtools sort --threads {threads} -o {output.bam};" + + rule scatac_fragmentgenerate: + input: + bam = "Result/minimap2/%s.sortedByPos.bam" %(config["outprefix"]), + output: + fragments = "Result/minimap2/fragments.tsv", + bam = "Result/minimap2/%s.sortedByPos.CRadded.bam" %(config["outprefix"]), + params: + outdir = "Result/minimap2" + benchmark: + "Result/Benchmark/%s_FragGenerate.benchmark" %(config["outprefix"]) + shell: + "python " + SCRIPT_PATH + "/scATAC_FragmentGenerate.py -B {input.bam} -O {params.outdir} --addtag CR" + + if config["whitelist"]: + rule scatac_barcodecorrect: + input: + r2 = os.path.join("Result/Tmp", "%s_R2.fastq" %(config["fastqprefix"])), + whitelist = config["whitelist"], + output: + bc_correct = "Result/minimap2/barcode_correct.txt", + bc_correct_uniq = "Result/minimap2/barcode_correct_uniq.txt", + params: + outdir = "Result/minimap2" + benchmark: + "Result/Benchmark/%s_BarcodeCorrect.benchmark" %(config["outprefix"]) + shell: + "python " + SCRIPT_PATH + "/scATAC_10x_BarcodeCorrect.py -b {input.r2} -B {input.whitelist} -O {params.outdir};" + "sort -k1,1 -k3,3 {output.bc_correct} | uniq > {output.bc_correct_uniq}" + else: + rule scatac_barcodecorrect: + input: + r2 = os.path.join("Result/Tmp", "%s_R2.fastq" %(config["fastqprefix"])), + output: + bc_correct = "Result/minimap2/barcode_correct.txt", + bc_correct_uniq = "Result/minimap2/barcode_correct_uniq.txt", + params: + outdir = "Result/minimap2" + benchmark: + "Result/Benchmark/%s_BarcodeCorrect.benchmark" %(config["outprefix"]) + shell: + "python " + SCRIPT_PATH + "/scATAC_10x_BarcodeCorrect.py -b {input.r2} -O {params.outdir};" + "sort -k1,1 -k3,3 {output.bc_correct} | uniq > {output.bc_correct_uniq}" + + rule scatac_fragmentcorrect: + input: + fragments = "Result/minimap2/fragments.tsv", + bc_correct = "Result/minimap2/barcode_correct.txt" + output: + frag_count = "Result/minimap2/fragments_corrected_count.tsv", + frag_sort = "Result/minimap2/fragments_corrected_sorted.tsv" + params: + outdir = "Result/minimap2", + frag_correct = "Result/minimap2/fragments_corrected.tsv", + benchmark: + "Result/Benchmark/%s_FragCorrect.benchmark" %(config["outprefix"]) + shell: + "python " + SCRIPT_PATH + "/scATAC_FragmentCorrect.py -F {input.fragments} -C {input.bc_correct} -O {params.outdir};" + "sort -k1,1 -k2,2 -k3,3 -k4,4 -V {params.frag_correct} > {output.frag_sort};" + "bedtools groupby -i {output.frag_sort} -g 1,2,3,4 -c 4 -o count > {output.frag_count}" + + rule scatac_rmdp: + input: + bam = "Result/minimap2/%s.sortedByPos.CRadded.bam" %(config["outprefix"]), + output: + bam = "Result/minimap2/%s.sortedByPos.CRadded.rmdp.bam" %(config["outprefix"]), + metric = "Result/minimap2/%s.rmdp.txt" %(config["outprefix"]), + fragbed = "Result/QC/%s_frag.bed" %(config["outprefix"]), + params: + sam = "Result/minimap2/%s.sortedByPos.CRadded.rmdp.sample.sam" %(config["outprefix"]), + tmp = "Result/Tmp", + threads: + int(config["cores"])-2 + benchmark: + "Result/Benchmark/%s_Rmdp.benchmark" %(config["outprefix"]) + shell: + "picard MarkDuplicates INPUT={input.bam} OUTPUT={output.bam} METRICS_FILE={output.metric} TMP_DIR={params.tmp};" + "samtools view -@ {threads} -s 0.01 -o {params.sam} {input.bam};" + "awk '{{if ($9>0) print $9}}' {params.sam} > {output.fragbed};" + + rule scatac_bamaddCB: + input: + bam = "Result/minimap2/%s.sortedByPos.CRadded.rmdp.bam" %(config["outprefix"]), + bc_correct = "Result/minimap2/barcode_correct_uniq.txt" + output: + bam = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.bam" %(config["outprefix"]), + params: + outdir = "Result/minimap2", + outprefix = "%s.sortedByPos.rmdp.CBadded" %(config["outprefix"]) + threads: + int(config["cores"])-2 + benchmark: + "Result/Benchmark/%s_BamAddCB.benchmark" %(config["outprefix"]) + shell: + "python " + SCRIPT_PATH + "/scATAC_BamAddTag.py -B {input.bam} -T {input.bc_correct} -C CR " + "-O {params.outdir} -P {params.outprefix};" + + if config["format"] == "bam": + rule scatac_fragmentgenerate: + input: + bam = config["bam"], + output: + fragments = "Result/minimap2/fragments_corrected_count.tsv", + params: + outdir = "Result/minimap2", + benchmark: + "Result/Benchmark/%s_FragGenerate.benchmark" %(config["outprefix"]) + shell: + "python " + SCRIPT_PATH + "/scATAC_FragmentGenerate.py -B {input.bam} -O {params.outdir} --CBtag CB --count;" + + rule scatac_rmdp: + input: + bam = config["bam"], + output: + bam = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.bam" %(config["outprefix"]), + metric = "Result/minimap2/%s.rmdp.txt" %(config["outprefix"]), + fragbed = "Result/QC/%s_frag.bed" %(config["outprefix"]), + params: + sam = "Result/minimap2/%s.sortedByPos.CRadded.rmdp.sample.sam" %(config["outprefix"]), + tmp = "Result/Tmp", + threads: + int(config["cores"])-2 + benchmark: + "Result/Benchmark/%s_Rmdp.benchmark" %(config["outprefix"]) + shell: + "picard MarkDuplicates INPUT={input.bam} OUTPUT={output.bam} METRICS_FILE={output.metric} TMP_DIR={params.tmp};" + "samtools view -@ {threads} -s 0.01 -o {params.sam} {input.bam};" + "awk '{{if ($9>0) print $9}}' {params.sam} > {output.fragbed};" + + rule scatac_allpeakcall: input: - fasta = config["genome"]["fasta"], - fastq1 = os.path.join(config["fastqdir"], "%s_1.fastq" %(config["fastqprefix"])), - fastq2 = os.path.join(config["fastqdir"], "%s_2.fastq" %(config["fastqprefix"])), + bam = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.bam" %(config["outprefix"]) output: - r1 = os.path.join(config["fastqdir"], "%s_R1.fastq" %(config["fastqprefix"])), - r2 = os.path.join(config["fastqdir"], "%s_R2.fastq" %(config["fastqprefix"])), - r3 = os.path.join(config["fastqdir"], "%s_R3.fastq" %(config["fastqprefix"])), + peak = "Result/Analysis/%s_all_peaks.narrowPeak" %(config["outprefix"]), + bdg = "Result/Analysis/%s_all_treat_pileup.bdg" %(config["outprefix"]), params: - outdir = config["fastqdir"] + name = "%s_all" %(config["outprefix"]), + genome = macs2_genome + log: + "Result/Log/%s_macs2_allpeak.log" %(config["outprefix"]) benchmark: - "Result/Benchmark/%s_Preprocess.benchmark" %(config["outprefix"]) + "Result/Benchmark/%s_AllPeakCall.benchmark" %(config["outprefix"]) shell: - "python " + SCRIPT_PATH + "/scATAC_sci_BarcodeExtract.py --R1 {input.fastq1} --R2 {input.fastq2} -O {params.outdir}" + "macs2 callpeak -f BAMPE -g {params.genome} --outdir Result/Analysis -n {params.name} -B -q 0.05 --nomodel --extsize=50 --SPMR --keep-dup all -t {input.bam}" - rule scatac_fqaddbarcode: - input: - r1 = os.path.join(config["fastqdir"], "%s_R1.fastq" %(config["fastqprefix"])), - r2 = os.path.join(config["fastqdir"], "%s_R2.fastq" %(config["fastqprefix"])), - r3 = os.path.join(config["fastqdir"], "%s_R3.fastq" %(config["fastqprefix"])), - output: - r1 = temp(os.path.join(config["fastqdir"], "%s_R1.barcoded.fastq" %(config["fastqprefix"]))), - r3 = temp(os.path.join(config["fastqdir"], "%s_R3.barcoded.fastq" %(config["fastqprefix"]))), - # r1 = "%s/%s_R1.barcoded.fastq" %(config["fastqdir"], config["fastqprefix"]), - # r3 = "%s/%s_R3.barcoded.fastq" %(config["fastqdir"], config["fastqprefix"]), - benchmark: - "Result/Benchmark/%s_FqAddbarcode.benchmark" %(config["outprefix"]) - shell: - "base=`head -n 2 {input.r2} | tail -n 1 | wc -L`;" - "sinto barcode --barcode_fastq {input.r2} --read1 {input.r1} --read2 {input.r3} -b $base;" + rule scatac_qcstat_bulk: + input: + bam = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.bam" %(config["outprefix"]), + promoter = "%s/annotations/%s_promoter.bed" %(SCRIPT_PATH, config["species"]), + peak = "Result/Analysis/%s_all_peaks.narrowPeak" %(config["outprefix"]), + output: + bulk_stat = "Result/QC/flagstat.txt", + bam = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.unique.bam" %(config["outprefix"]), + bed = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.unique.bed" %(config["outprefix"]) + threads: + config["cores"] + benchmark: + "Result/Benchmark/%s_BulkQCStat.benchmark" %(config["outprefix"]) + shell: + "samtools flagstat --threads {threads} {input.bam} > {output.bulk_stat};" + "samtools view -F 2316 -f 0x2 -q 30 -b -o {output.bam} {input.bam};" + "samtools view {output.bam} -c >> {output.bulk_stat};" + "bedtools bamtobed -i {output.bam} > {output.bed};" + "grep 'chrM' {output.bed} -c >> {output.bulk_stat} || true;" + "grep -v 'chrM' {output.bed} | bedtools intersect -wa -a - -b {input.promoter} -u | wc -l >> {output.bulk_stat} || true;" + "grep -v 'chrM' {output.bed} | bedtools intersect -wa -a - -b {input.peak} -u | wc -l >> {output.bulk_stat} || true ;" - rule scatac_map: - input: - fasta = config["genome"]["fasta"], - r1 = os.path.join(config["fastqdir"], "%s_R1.barcoded.fastq" %(config["fastqprefix"])), - r3 = os.path.join(config["fastqdir"], "%s_R3.barcoded.fastq" %(config["fastqprefix"])), - output: - bam = temp("Result/minimap2/%s.sortedByPos.bam" %(config["outprefix"])), - threads: - config["cores"] - benchmark: - "Result/Benchmark/%s_Minimap2.benchmark" %(config["outprefix"]) - shell: - "minimap2 -ax sr -t {threads} {input.fasta} {input.r1} {input.r3} " - "| samtools view --threads {threads} -b " - "| samtools sort --threads {threads} -o {output.bam};" + if config["shortpeaks"]: + rule scatac_shortfragment: + input: + bam = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.bam" %(config["outprefix"]) + output: + shortbam = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.150bp.bam" %(config["outprefix"]) + threads: + config["cores"] + benchmark: + "Result/Benchmark/%s_ShortFrag.benchmark" %(config["outprefix"]) + shell: + "samtools view -@ {threads} -h {input.bam} | " + "awk -F'\\t' 'function abs(x){{return ((x < 0.0) ? -x : x)}} {{if (abs($9)<=150) print}}' | " + "samtools view -@ {threads} -b -o {output.shortbam}" + + rule scatac_shortpeakcall: + input: + shortbam = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.150bp.bam" %(config["outprefix"]) + output: + bed = "Result/Analysis/%s_150bp_peaks.narrowPeak" %(config["outprefix"]) + params: + name = "%s_150bp" %(config["outprefix"]), + genome = macs2_genome + log: + "Result/Log/%s_macs2_shortpeak.log" %(config["outprefix"]) + benchmark: + "Result/Benchmark/%s_ShortPeakCall.benchmark" %(config["outprefix"]) + shell: + "macs2 callpeak -f BAMPE -g {params.genome} --outdir Result/Analysis -n {params.name} -B -q 0.05 --nomodel --extsize=50 --SPMR --keep-dup all -t {input.shortbam}" - rule scatac_fragmentgenerate: - input: - bam = "Result/minimap2/%s.sortedByPos.bam" %(config["outprefix"]), - output: - fragments = "Result/minimap2/fragments.tsv", - bam = "Result/minimap2/%s.sortedByPos.CRadded.bam" %(config["outprefix"]), - params: - outdir = "Result/minimap2" - benchmark: - "Result/Benchmark/%s_FragGenerate.benchmark" %(config["outprefix"]) - shell: - "python " + SCRIPT_PATH + "/scATAC_FragmentGenerate.py -B {input.bam} -O {params.outdir} --addtag CR" + else: + if is_gzip(config["frag"]): + rule scatac_fragmentcp: + input: + frag = config["frag"] + output: + frag = "Result/minimap2/fragments_corrected_count.tsv" + benchmark: + "Result/Benchmark/%s_FragCP.benchmark" %(config["outprefix"]) + shell: + "gzip -c -d {input.frag} > {output.frag}" + else: + rule scatac_fragmentcp: + input: + frag = config["frag"] + output: + frag = "Result/minimap2/fragments_corrected_count.tsv" + benchmark: + "Result/Benchmark/%s_FragCP.benchmark" %(config["outprefix"]) + shell: + "cp -T {input.frag} {output.frag}" - if config["whitelist"]: - rule scatac_barcodecorrect: + rule scatac_fragmentreshape: input: - r2 = os.path.join(config["fastqdir"], "%s_R2.fastq" %(config["fastqprefix"])), - whitelist = config["whitelist"], + frag = "Result/minimap2/fragments_corrected_count.tsv" output: - bc_correct = "Result/minimap2/barcode_correct.txt", - bc_correct_uniq = "Result/minimap2/barcode_correct_uniq.txt", + frag = "Result/minimap2/fragments_BEDPE.bed" params: outdir = "Result/minimap2" benchmark: - "Result/Benchmark/%s_BarcodeCorrect.benchmark" %(config["outprefix"]) + "Result/Benchmark/%s_FragReshape.benchmark" %(config["outprefix"]) shell: - "python " + SCRIPT_PATH + "/scATAC_10x_BarcodeCorrect.py -b {input.r2} -B {input.whitelist} -O {params.outdir};" - "sort -k1,1 -k3,3 {output.bc_correct} | uniq > {output.bc_correct_uniq}" - else: - rule scatac_barcodecorrect: + "python " + SCRIPT_PATH + "/scATAC_FragmentReshape.py -F {input.frag} -O {params.outdir}" + + rule scatac_fragmentsample: + input: + frag = "Result/minimap2/fragments_BEDPE.bed" + output: + fragbed = "Result/QC/%s_frag.bed" %(config["outprefix"]), + benchmark: + "Result/Benchmark/%s_FragSample.benchmark" %(config["outprefix"]) + shell: + "frag_num=`wc -l {input.frag} | awk '{{print $1}}'`;" + "frag_num_sample=`expr $frag_num / 100`;" + "shuf -n $frag_num_sample {input.frag} | awk '{{print ($3-$2)}}' > {output.fragbed}" + + rule scatac_allpeakcall: input: - r2 = os.path.join(config["fastqdir"], "%s_R2.fastq" %(config["fastqprefix"])), + frag = "Result/minimap2/fragments_BEDPE.bed" output: - bc_correct = "Result/minimap2/barcode_correct.txt", - bc_correct_uniq = "Result/minimap2/barcode_correct_uniq.txt", + peak = "Result/Analysis/%s_all_peaks.narrowPeak" %(config["outprefix"]), + bdg = "Result/Analysis/%s_all_treat_pileup.bdg" %(config["outprefix"]), params: - outdir = "Result/minimap2" + name = "%s_all" %(config["outprefix"]), + genome = macs2_genome + log: + "Result/Log/%s_macs2_allpeak.log" %(config["outprefix"]) benchmark: - "Result/Benchmark/%s_BarcodeCorrect.benchmark" %(config["outprefix"]) + "Result/Benchmark/%s_AllPeakCall.benchmark" %(config["outprefix"]) shell: - "python " + SCRIPT_PATH + "/scATAC_10x_BarcodeCorrect.py -b {input.r2} -O {params.outdir};" - "sort -k1,1 -k3,3 {output.bc_correct} | uniq > {output.bc_correct_uniq}" + "macs2 callpeak -f BEDPE -g {params.genome} --outdir Result/Analysis -n {params.name} -B -q 0.05 --nomodel --extsize=50 --SPMR --keep-dup all -t {input.frag}" - rule scatac_fragmentcorrect: - input: - fragments = "Result/minimap2/fragments.tsv", - bc_correct = "Result/minimap2/barcode_correct.txt" - output: - frag_count = "Result/minimap2/fragments_corrected_count.tsv", - frag_sort = "Result/minimap2/fragments_corrected_sorted.tsv" - params: - outdir = "Result/minimap2", - frag_correct = "Result/minimap2/fragments_corrected.tsv", - benchmark: - "Result/Benchmark/%s_FragCorrect.benchmark" %(config["outprefix"]) - shell: - "python " + SCRIPT_PATH + "/scATAC_FragmentCorrect.py -F {input.fragments} -C {input.bc_correct} -O {params.outdir};" - "sort -k1,1 -k2,2 -k3,3 -k4,4 -V {params.frag_correct} > {output.frag_sort};" - "bedtools groupby -i {output.frag_sort} -g 1,2,3,4 -c 4 -o count > {output.frag_count}" + rule scatac_qcstat_bulk: + input: + frag = "Result/minimap2/fragments_BEDPE.bed", + promoter = "%s/annotations/%s_promoter.bed" %(SCRIPT_PATH, config["species"]), + peak = "Result/Analysis/%s_all_peaks.narrowPeak" %(config["outprefix"]), + output: + bulk_stat = "Result/QC/flagstat.txt", + threads: + config["cores"] + benchmark: + "Result/Benchmark/%s_BulkQCStat.benchmark" %(config["outprefix"]) + shell: + "grep 'chrM' {input.frag} -c >> {output.bulk_stat} || true;" + "grep -v 'chrM' {input.frag} | bedtools intersect -wa -a - -b {input.promoter} -u | wc -l >> {output.bulk_stat} || true;" + "grep -v 'chrM' {input.frag} | bedtools intersect -wa -a - -b {input.peak} -u | wc -l >> {output.bulk_stat} || true ;" + + if config["shortpeaks"]: + rule scatac_shortfragment: + input: + frag = "Result/minimap2/fragments_BEDPE.bed" + output: + shortfrag = "Result/minimap2/fragments_BEDPE_150bp.bed" + threads: + config["cores"] + benchmark: + "Result/Benchmark/%s_ShortFrag.benchmark" %(config["outprefix"]) + shell: + "awk -F'\\t' '{{if (($3-$2)<=150) print}}' {input.frag} > {output.shortfrag}" + + rule scatac_shortpeakcall: + input: + shortfrag = "Result/minimap2/fragments_BEDPE_150bp.bed" + output: + bed = "Result/Analysis/%s_150bp_peaks.narrowPeak" %(config["outprefix"]) + params: + name = "%s_150bp" %(config["outprefix"]), + genome = macs2_genome + log: + "Result/Log/%s_macs2_shortpeak.log" %(config["outprefix"]) + benchmark: + "Result/Benchmark/%s_ShortPeakCall.benchmark" %(config["outprefix"]) + shell: + "macs2 callpeak -f BEDPE -g {params.genome} --outdir Result/Analysis -n {params.name} -B -q 0.05 --nomodel --extsize=50 --SPMR --keep-dup all -t {input.shortfrag}" rule scatac_qcstat_promoter: input: @@ -522,108 +800,6 @@ if config["platform"] == "10x-genomics" or config["platform"] == "sci-ATAC-seq": "sort -k1,1 {input.promoter_stat} > {params.promoter_stat_sort};" "join --nocheck-order -t $'\t' -a1 -e'0' -o'1.1 1.2 2.2' -1 1 -2 1 {params.mapped_stat_sort} {params.promoter_stat_sort} > {output.single_stat}" - rule scatac_rmdp: - input: - bam = "Result/minimap2/%s.sortedByPos.CRadded.bam" %(config["outprefix"]), - output: - bam = "Result/minimap2/%s.sortedByPos.CRadded.rmdp.bam" %(config["outprefix"]), - metric = "Result/minimap2/%s.rmdp.txt" %(config["outprefix"]), - fragbed = "Result/QC/%s_frag.bed" %(config["outprefix"]), - tmp = temp(directory("Result/Tmp")) - params: - sam = "Result/minimap2/%s.sortedByPos.CRadded.rmdp.sample.sam" %(config["outprefix"]) - threads: - int(config["cores"])-2 - benchmark: - "Result/Benchmark/%s_Rmdp.benchmark" %(config["outprefix"]) - shell: - "picard MarkDuplicates INPUT={input.bam} OUTPUT={output.bam} METRICS_FILE={output.metric} TMP_DIR={output.tmp};" - "samtools view -@ {threads} -s 0.01 -o {params.sam} {input.bam};" - "awk '{{if ($9>0) print $9}}' {params.sam} > {output.fragbed};" - - rule scatac_bamaddCB: - input: - bam = "Result/minimap2/%s.sortedByPos.CRadded.rmdp.bam" %(config["outprefix"]), - bc_correct = "Result/minimap2/barcode_correct_uniq.txt" - output: - bam = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.bam" %(config["outprefix"]), - params: - outdir = "Result/minimap2", - outprefix = "%s.sortedByPos.rmdp.CBadded" %(config["outprefix"]) - threads: - int(config["cores"])-2 - benchmark: - "Result/Benchmark/%s_BamAddCB.benchmark" %(config["outprefix"]) - shell: - "python " + SCRIPT_PATH + "/scATAC_BamAddTag.py -B {input.bam} -T {input.bc_correct} -C CR " - "-O {params.outdir} -P {params.outprefix};" - - rule scatac_allpeakcall: - input: - bam = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.bam" %(config["outprefix"]) - output: - peak = "Result/Analysis/%s_all_peaks.narrowPeak" %(config["outprefix"]), - bdg = "Result/Analysis/%s_all_treat_pileup.bdg" %(config["outprefix"]), - params: - name = "%s_all" %(config["outprefix"]) - log: - "Result/Log/%s_macs2_allpeak.log" %(config["outprefix"]) - benchmark: - "Result/Benchmark/%s_AllPeakCall.benchmark" %(config["outprefix"]) - shell: - "macs2 callpeak -g hs --outdir Result/Analysis -n {params.name} -B -q 0.05 --nomodel --extsize=50 --SPMR -t {input.bam}" - - rule scatac_qcstat_bulk: - input: - bam = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.bam" %(config["outprefix"]), - promoter = "%s/annotations/%s_promoter.bed" %(SCRIPT_PATH, config["species"]), - peak = "Result/Analysis/%s_all_peaks.narrowPeak" %(config["outprefix"]), - output: - bulk_stat = "Result/QC/flagstat.txt", - bam = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.unique.bam" %(config["outprefix"]), - bed = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.unique.bed" %(config["outprefix"]) - threads: - config["cores"] - benchmark: - "Result/Benchmark/%s_BulkQCStat.benchmark" %(config["outprefix"]) - shell: - "samtools flagstat --threads {threads} {input.bam} > {output.bulk_stat};" - "samtools view -F 2316 -f 0x2 -q 30 -b -o {output.bam} {input.bam};" - "samtools view {output.bam} -c >> {output.bulk_stat};" - "bedtools bamtobed -i {output.bam} > {output.bed};" - "grep 'chrM' {output.bed} -c >> {output.bulk_stat} || true;" - "grep -v 'chrM' {output.bed} | bedtools intersect -wa -a - -b {input.promoter} -u | wc -l >> {output.bulk_stat} || true;" - "grep -v 'chrM' {output.bed} | bedtools intersect -wa -a - -b {input.peak} -u | wc -l >> {output.bulk_stat} || true ;" - - if config["shortpeaks"]: - rule scatac_shortfragment: - input: - bam = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.bam" %(config["outprefix"]) - output: - shortbam = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.150bp.bam" %(config["outprefix"]) - threads: - config["cores"] - benchmark: - "Result/Benchmark/%s_ShortFrag.benchmark" %(config["outprefix"]) - shell: - "samtools view -@ {threads} -h {input.bam} | " - "awk -F'\\t' 'function abs(x){{return ((x < 0.0) ? -x : x)}} {{if (abs($9)<=150) print}}' | " - "samtools view -@ {threads} -b -o {output.shortbam}" - - rule scatac_shortpeakcall: - input: - shortbam = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.150bp.bam" %(config["outprefix"]) - output: - bed = "Result/Analysis/%s_150bp_peaks.narrowPeak" %(config["outprefix"]) - params: - name = "%s_150bp" %(config["outprefix"]) - log: - "Result/Log/%s_macs2_shortpeak.log" %(config["outprefix"]) - benchmark: - "Result/Benchmark/%s_ShortPeakCall.benchmark" %(config["outprefix"]) - shell: - "macs2 callpeak -g hs --outdir Result/Analysis -n {params.name} -B -q 0.05 --nomodel --extsize=50 --SPMR -t {input.shortbam}" - if config["custompeaks"] and config["shortpeaks"]: rule scatac_mergepeak: input: @@ -688,7 +864,7 @@ if config["platform"] == "10x-genomics" or config["platform"] == "sci-ATAC-seq": "| sort -k1,1 -k2,2n | cut -f 1-4 > {params.catpeaksort};" "mergeBed -i {params.catpeaksort} | grep -v '_' | grep -v 'chrEBV' > {output.finalpeak};" "rm {params.catpeaksort}" - + rule scatac_countpeak: input: finalpeak = "Result/Analysis/%s_final_peaks.bed" %(config["outprefix"]), @@ -914,31 +1090,55 @@ if config["platform"] == "10x-genomics" or config["platform"] == "sci-ATAC-seq": # # shell: # # "python " + SCRIPT_PATH + "/scATAC_microfluidic_count.py {input.finalpeak} {input.validbarcode} {params.bamdir} {output.count} {threads} {params.species}" -rule scatac_qcplot: - input: - fragbed = "Result/QC/%s_frag.bed" %(config["outprefix"]), - single_stat = "Result/QC/singlecell.txt", - bulk_stat = "Result/QC/flagstat.txt" - output: - readdistr = "Result/QC/" + config["outprefix"] + "_scATAC_read_distr.png", - qcfrag = "Result/QC/" + config["outprefix"] + "_scATAC_fragment_size.png", - qcfrip = "Result/QC/" + config["outprefix"] + "_scATAC_cell_filtering.png", - validbarcode = "Result/QC/" + config["outprefix"] + "_scATAC_validcells.txt", - params: - outdir = "Result/QC", - outpre = config["outprefix"], - fragbed = "%s_frag.bed" %(config["outprefix"]), - single_stat = "singlecell.txt", - bulk_stat = "flagstat.txt", - count = config["cutoff"]["count"], - frip = config["cutoff"]["frip"] - threads: - config["cores"] - benchmark: - "Result/Benchmark/%s_QCPlot.benchmark" %(config["outprefix"]) - shell: - "Rscript " + RSCRIPT_PATH + "/scATACseq_qc.R --bulkstat {params.bulk_stat} --fragment {params.fragbed} --singlestat {params.single_stat} " - "--countcutoff {params.count} --fripcutoff {params.frip} --prefix {params.outpre} --outdir {params.outdir}" +if config["format"] != "fragments": + rule scatac_qcplot: + input: + fragbed = "Result/QC/%s_frag.bed" %(config["outprefix"]), + single_stat = "Result/QC/singlecell.txt", + bulk_stat = "Result/QC/flagstat.txt" + output: + readdistr = "Result/QC/" + config["outprefix"] + "_scATAC_read_distr.png", + qcfrag = "Result/QC/" + config["outprefix"] + "_scATAC_fragment_size.png", + qcfrip = "Result/QC/" + config["outprefix"] + "_scATAC_cell_filtering.png", + validbarcode = "Result/QC/" + config["outprefix"] + "_scATAC_validcells.txt", + params: + outdir = "Result/QC", + outpre = config["outprefix"], + fragbed = "%s_frag.bed" %(config["outprefix"]), + single_stat = "singlecell.txt", + bulk_stat = "flagstat.txt", + count = config["cutoff"]["count"], + frip = config["cutoff"]["frip"] + threads: + config["cores"] + benchmark: + "Result/Benchmark/%s_QCPlot.benchmark" %(config["outprefix"]) + shell: + "Rscript " + RSCRIPT_PATH + "/scATACseq_qc.R --bulkstat {params.bulk_stat} --fragment {params.fragbed} --singlestat {params.single_stat} " + "--countcutoff {params.count} --fripcutoff {params.frip} --prefix {params.outpre} --outdir {params.outdir}" +else: + rule scatac_qcplot: + input: + fragbed = "Result/QC/%s_frag.bed" %(config["outprefix"]), + single_stat = "Result/QC/singlecell.txt", + output: + qcfrag = "Result/QC/" + config["outprefix"] + "_scATAC_fragment_size.png", + qcfrip = "Result/QC/" + config["outprefix"] + "_scATAC_cell_filtering.png", + validbarcode = "Result/QC/" + config["outprefix"] + "_scATAC_validcells.txt", + params: + outdir = "Result/QC", + outpre = config["outprefix"], + fragbed = "%s_frag.bed" %(config["outprefix"]), + single_stat = "singlecell.txt", + count = config["cutoff"]["count"], + frip = config["cutoff"]["frip"] + threads: + config["cores"] + benchmark: + "Result/Benchmark/%s_QCPlot.benchmark" %(config["outprefix"]) + shell: + "Rscript " + RSCRIPT_PATH + "/scATACseq_qc.R --fragment {params.fragbed} --singlestat {params.single_stat} " + "--countcutoff {params.count} --fripcutoff {params.frip} --prefix {params.outpre} --outdir {params.outdir}" rule scatac_qcfilter: input: @@ -1007,84 +1207,148 @@ rule scatac_analysis: fraggz = "../minimap2/fragments_corrected_count.tsv.gz", giggleannotation = config["giggleannotation"], species = config["species"], - signature = config["signature"] + signature = config["signature"], + method = config["method"], + annotation = config["annotation"], threads: config["cores"] benchmark: "Result/Benchmark/%s_Analysis.benchmark" %(config["outprefix"]) shell: "Rscript " + RSCRIPT_PATH + "/scATACseq_pipe.R --peakcount {params.count} --rpmatrix {params.genescore} " - "--species {params.species} --prefix {params.outpre} --signature {params.signature} " + "--species {params.species} --prefix {params.outpre} --annotation {params.annotation} --method {params.method} --signature {params.signature} " "--gigglelib {params.giggleannotation} --fragment {params.fraggz} --outdir {params.outdir} --thread {threads}" -rule scatac_bamindex: - input: - bam = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.bam" %(config["outprefix"]), - output: - bai = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.bam.bai" %(config["outprefix"]), - threads: - config["cores"] - benchmark: - "Result/Benchmark/%s_BamIndex.benchmark" %(config["outprefix"]) - shell: - "samtools index -@ {threads} {input.bam}" +if config["format"] != "fragments": + rule scatac_bamindex: + input: + bam = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.bam" %(config["outprefix"]), + output: + bai = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.bam.bai" %(config["outprefix"]), + threads: + config["cores"] + benchmark: + "Result/Benchmark/%s_BamIndex.benchmark" %(config["outprefix"]) + shell: + "samtools index -@ {threads} {input.bam}" -checkpoint scatac_bamcluster: - input: - bam = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.bam" %(config["outprefix"]), - bai = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.bam.bai" %(config["outprefix"]), - cellcluster = "Result/Analysis/%s_cell_cluster.txt" %(config["outprefix"]), - output: - directory("Result/Analysis/Cluster/") - threads: - config["cores"] - benchmark: - "Result/Benchmark/%s_BamCluster.benchmark" %(config["outprefix"]) - shell: - "sinto filterbarcodes -b {input.bam} -c {input.cellcluster} -p {threads} --barcodetag CB;" - "mv Cluster_*.bam {output}" + checkpoint scatac_bamcluster: + input: + bam = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.bam" %(config["outprefix"]), + bai = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.bam.bai" %(config["outprefix"]), + cellcluster = "Result/Analysis/%s_cell_cluster.txt" %(config["outprefix"]), + output: + directory("Result/Analysis/Cluster/") + threads: + config["cores"] + benchmark: + "Result/Benchmark/%s_BamCluster.benchmark" %(config["outprefix"]) + shell: + "sinto filterbarcodes -b {input.bam} -c {input.cellcluster} -p {threads} --barcodetag CB;" + "mv Cluster_*.bam {output}" -rule scatac_clusterpeakcall: - input: - bam = "Result/Analysis/Cluster/{cluster}.bam" - output: - peak = "Result/Analysis/Cluster/{cluster}_peaks.narrowPeak", - bdg = "Result/Analysis/Cluster/{cluster}_treat_pileup.bdg", - params: - name = "{cluster}", - outdir = "Result/Analysis/Cluster" - log: - "Result/Log/{cluster}_macs2_allpeak.log" - benchmark: - "Result/Benchmark/{cluster}_AllPeakCall.benchmark" - shell: - "macs2 callpeak -g hs --outdir {params.outdir} -n {params.name} -B -q 0.05 --nomodel --extsize=50 --SPMR -t {input.bam}" + rule scatac_clusterpeakcall: + input: + bam = "Result/Analysis/Cluster/{cluster}.bam" + output: + peak = "Result/Analysis/Cluster/{cluster}_peaks.narrowPeak", + bdg = "Result/Analysis/Cluster/{cluster}_treat_pileup.bdg", + params: + name = "{cluster}", + outdir = "Result/Analysis/Cluster", + genome = macs2_genome + log: + "Result/Log/{cluster}_macs2_allpeak.log" + benchmark: + "Result/Benchmark/{cluster}_AllPeakCall.benchmark" + shell: + "macs2 callpeak -f BAMPE -g {params.genome} --outdir {params.outdir} -n {params.name} -B -q 0.05 --nomodel --extsize=50 --SPMR --keep-dup all -t {input.bam}" -rule scatac_report: - input: - # bulkqc = "Result/QC/" + config["outprefix"] + "_bam_stat.txt", - readdistr = "Result/QC/%s_scATAC_read_distr.png" %(config["outprefix"]), - qcfrag = "Result/QC/%s_scATAC_fragment_size.png" %(config["outprefix"]), - qcfrip = "Result/QC/%s_scATAC_cell_filtering.png" %(config["outprefix"]), - count = "Result/QC/%s_filtered_peak_count.h5" %(config["outprefix"]), - clusterplot = "Result/Analysis/%s_cluster.png" %(config["outprefix"]), - annotateplot = "Result/Analysis/%s_annotated.png" %(config["outprefix"]), - genescore = "Result/Analysis/%s_gene_score.h5" %(config["outprefix"]), - tflist = "Result/Analysis/%s.PredictedTFTop10.txt" %(config["outprefix"]), - # acannotateplot = "Result/Analysis/%s_CistromeTop_annotated.png" %(config["outprefix"]), - # ms4a1trackplot = "Result/Analysis/%s_MS4A1_genetrack.png" %(config["outprefix"]), - # cd3dtrackplot = "Result/Analysis/%s_CD3D_genetrack.png" %(config["outprefix"]), - output: - summaryreport = "Result/%s_scATAC_report.html" %(config["outprefix"]), - params: - outpre = config["outprefix"], - fastqdir = config["fastqdir"], - species = config["species"], - platform = config["platform"], - outdir = "Result" - benchmark: - "Result/Benchmark/%s_Report.benchmark" %(config["outprefix"]) - shell: - # "cp {input.readdistr} {input.qcmap} {input.qcfrag} {input.qcfrip} {input.clusterplot} {input.annotateplot} {output.outdir};" - "python " + SCRIPT_PATH + "/scATAC_HTMLReport.py --directory {params.outdir} --outprefix {params.outpre} " - "--fastq-dir {params.fastqdir} --species {params.species} --platform {params.platform}" + rule scatac_report: + input: + # bulkqc = "Result/QC/" + config["outprefix"] + "_bam_stat.txt", + readdistr = "Result/QC/%s_scATAC_read_distr.png" %(config["outprefix"]), + qcfrag = "Result/QC/%s_scATAC_fragment_size.png" %(config["outprefix"]), + qcfrip = "Result/QC/%s_scATAC_cell_filtering.png" %(config["outprefix"]), + count = "Result/QC/%s_filtered_peak_count.h5" %(config["outprefix"]), + clusterplot = "Result/Analysis/%s_cluster.png" %(config["outprefix"]), + annotateplot = "Result/Analysis/%s_annotated.png" %(config["outprefix"]), + genescore = "Result/Analysis/%s_gene_score.h5" %(config["outprefix"]), + tflist = "Result/Analysis/%s.PredictedTFTop10.txt" %(config["outprefix"]), + # acannotateplot = "Result/Analysis/%s_CistromeTop_annotated.png" %(config["outprefix"]), + # ms4a1trackplot = "Result/Analysis/%s_MS4A1_genetrack.png" %(config["outprefix"]), + # cd3dtrackplot = "Result/Analysis/%s_CD3D_genetrack.png" %(config["outprefix"]), + output: + summaryreport = "Result/%s_scATAC_report.html" %(config["outprefix"]), + params: + outpre = config["outprefix"], + inputpath = config["fastqdir"] if config["format"] == "fastq" else config["bam"], + species = config["species"], + platform = config["platform"], + inputformat = config["format"], + outdir = "Result" + benchmark: + "Result/Benchmark/%s_Report.benchmark" %(config["outprefix"]) + shell: + # "cp {input.readdistr} {input.qcmap} {input.qcfrag} {input.qcfrip} {input.clusterplot} {input.annotateplot} {output.outdir};" + "python " + SCRIPT_PATH + "/scATAC_HTMLReport.py --directory {params.outdir} --outprefix {params.outpre} " + "--input-path {params.inputpath} --species {params.species} --platform {params.platform} --input-format {params.inputformat}" +else: + checkpoint scatac_fragcluster: + input: + frag = "Result/minimap2/fragments_corrected_count.tsv", + cellcluster = "Result/Analysis/%s_cell_cluster.txt" %(config["outprefix"]), + output: + directory("Result/Analysis/Cluster/") + threads: + config["cores"] + benchmark: + "Result/Benchmark/%s_FragCluster.benchmark" %(config["outprefix"]) + shell: + "python " + SCRIPT_PATH + "/scATAC_FragmentSplit.py --frag {input.frag} --cluster {input.cellcluster} --outdir {output}" + + rule scatac_clusterpeakcall: + input: + frag = "Result/Analysis/Cluster/{cluster}.bed" + output: + peak = "Result/Analysis/Cluster/{cluster}_peaks.narrowPeak", + bdg = "Result/Analysis/Cluster/{cluster}_treat_pileup.bdg", + params: + name = "{cluster}", + outdir = "Result/Analysis/Cluster", + genome = macs2_genome + log: + "Result/Log/{cluster}_macs2_allpeak.log" + benchmark: + "Result/Benchmark/{cluster}_AllPeakCall.benchmark" + shell: + "macs2 callpeak -f BEDPE -g {params.genome} --outdir {params.outdir} -n {params.name} -B -q 0.05 --nomodel --extsize=50 --SPMR --keep-dup all -t {input.frag}" + + rule scatac_report: + input: + # bulkqc = "Result/QC/" + config["outprefix"] + "_bam_stat.txt", + qcfrag = "Result/QC/%s_scATAC_fragment_size.png" %(config["outprefix"]), + qcfrip = "Result/QC/%s_scATAC_cell_filtering.png" %(config["outprefix"]), + count = "Result/QC/%s_filtered_peak_count.h5" %(config["outprefix"]), + clusterplot = "Result/Analysis/%s_cluster.png" %(config["outprefix"]), + annotateplot = "Result/Analysis/%s_annotated.png" %(config["outprefix"]), + genescore = "Result/Analysis/%s_gene_score.h5" %(config["outprefix"]), + tflist = "Result/Analysis/%s.PredictedTFTop10.txt" %(config["outprefix"]), + # acannotateplot = "Result/Analysis/%s_CistromeTop_annotated.png" %(config["outprefix"]), + # ms4a1trackplot = "Result/Analysis/%s_MS4A1_genetrack.png" %(config["outprefix"]), + # cd3dtrackplot = "Result/Analysis/%s_CD3D_genetrack.png" %(config["outprefix"]), + output: + summaryreport = "Result/%s_scATAC_report.html" %(config["outprefix"]), + params: + outpre = config["outprefix"], + inputpath = config["frag"], + species = config["species"], + platform = config["platform"], + inputformat = config["format"], + outdir = "Result", + benchmark: + "Result/Benchmark/%s_Report.benchmark" %(config["outprefix"]) + shell: + # "cp {input.readdistr} {input.qcmap} {input.qcfrag} {input.qcfrip} {input.clusterplot} {input.annotateplot} {output.outdir};" + "python " + SCRIPT_PATH + "/scATAC_HTMLReport.py --directory {params.outdir} --outprefix {params.outpre} " + "--input-path {params.inputpath} --species {params.species} --platform {params.platform} --input-format {params.inputformat}" diff --git a/MAESTRO/Snakemake/scATAC/config_template.yaml b/MAESTRO/Snakemake/scATAC/config_template.yaml index 8ff7741..81976ee 100644 --- a/MAESTRO/Snakemake/scATAC/config_template.yaml +++ b/MAESTRO/Snakemake/scATAC/config_template.yaml @@ -1,3 +1,15 @@ +# Platform of single cell ATAC-seq [10x-genomics, sci-ATAC-seq, microfluidic] +platform: {{ platform }} + +# The format of input files [fastq, bam, fragments] +# If the platform is 10x-genomics, users can start with sequencing fastq files, bam files with CB tag or fragments.tsv.gz file generated by CellRanger ATAC. +# If the platform is sci-ATAC-seq, users can start with sequencing fastq files, bam files with CB tag. +# If the platform is microfluidic, users can only start with sequencing fastq files. +# If the format is 'fastq', fastqdir and fastqprefix need to be set. +# If the format is 'bam', bam needs to be set. +# If the format is 'fragments', frag needs to be set. +format: {{ format }} + # Directory where fastq files are stored fastqdir: {{ fastqdir }} @@ -11,12 +23,16 @@ fastqdir: {{ fastqdir }} # In this way, read1, barcode and read2 are associated with R1, R2, R3, respectively. fastqprefix: {{ fastqprefix }} +# The bam file with cell barcode (CB) tag (required when the format is set as 'bam'). +bam: {{ bam }} + +# The fragment file generated by CellRanger ATAC (required when the format is set as 'fragments'). +# For example, fragments.tsv.gz. +frag: {{ frag }} + # Species to use [GRCh38, GRCm38] (GRCh38 for human and GRCm38 for mouse) species: {{ species }} -# Platform of single cell ATAC-seq [10x-genomics, sci-ATAC-seq, microfluidic] -platform: {{ platform }} - # The prefix of output files. DEFAULT: MAESTRO. outprefix: {{ outprefix }} @@ -30,6 +46,22 @@ whitelist: {{ whitelist }} # The core number cores: {{ cores }} +# Whether or not to perform ccell-type annotation. +# By default (not set), MAESTRO will skip the step of cell-type annotation. +# If set, please specify the method of cell-type annotation through method. +annotation: {{ annotation }} + +# Method to annotate cell types [RP-based, peak-based, both]. +# MAESTRO provides two strategies to annotate cell types for scATAC-seq data. +# Users can choose from 'RP-based' and 'peak-based', or choose to run both of them. +# One is based on gene regulatory potential predicted by RP model. Another is based on the bulk chromatin accessibility data from Cistrome database. +# If 'RP-based' is set, MAESTRO performs the cell-type annotation using the gene regulatory potential to represent gene expression, +# and the logFC of gene regulatory potential between one cluster and all the other cells is used to calculate the gene signature scores. +# If 'peak-based' is set, MAESTRO utilizes GIGGLE to evaluate the enrichment of bulk chromatin accessibility peaks on cluster-specific peaks from scATAC-seq data, +# and then transfers the Cistrome cluster identity from the most enriched bulk chromatin accessibility data as the cell-type annotation for the scATAC-seq cluster. +# See the MAESTRO paper for more details. DEFAULT: RP-based. +method: {{ method }} + # Cell signature file used to annotate cell types. MAESTRO provides several sets of built-in cell signatures. # Users can choose from ['human.immune.CIBERSORT', 'mouce.brain.ALLEN', 'mouse.all.facs.TabulaMuris', 'mouse.all.droplet.TabulaMuris']. # Custom cell signatures are also supported. In this situation, users need to provide the file location of cell signatures, diff --git a/MAESTRO/Snakemake/scRNA/Snakefile b/MAESTRO/Snakemake/scRNA/Snakefile index 6de4253..c2fba90 100644 --- a/MAESTRO/Snakemake/scRNA/Snakefile +++ b/MAESTRO/Snakemake/scRNA/Snakefile @@ -434,8 +434,7 @@ rule scrna_analysis: species = config["species"], outpre = config["outprefix"], outdir = "Result/Analysis", - method = config["method"], - rabitlib = config["rabitlib"], + method = "LISA", lisamode = config["lisamode"], lisaenv = config["lisaenv"], condadir = config["condadir"], @@ -446,7 +445,7 @@ rule scrna_analysis: config["cores"] shell: "Rscript " + RSCRIPT_PATH + "/scRNAseq_pipe.R --expression {params.expression} --species {params.species} " - "--prefix {params.outpre} --method {params.method} --signature {params.signature} --rabitlib {params.rabitlib} " + "--prefix {params.outpre} --method {params.method} --signature {params.signature} " "--lisamode {params.lisamode} --condadir {params.condadir} --lisaenv {params.lisaenv} --outdir {params.outdir} --thread {threads}" if config["rseqc"]: diff --git a/MAESTRO/Snakemake/scRNA/config_template.yaml b/MAESTRO/Snakemake/scRNA/config_template.yaml index c2d22f7..f33db40 100644 --- a/MAESTRO/Snakemake/scRNA/config_template.yaml +++ b/MAESTRO/Snakemake/scRNA/config_template.yaml @@ -28,13 +28,6 @@ cores: {{ cores }} # and the signature file is tab-seperated without header. The first column is cell type, and the second column is signature gene. signature: {{ signature }} -# The method to predict driver regulators [RABIT, LISA] -method: {{ method }} - -# Path of the rabit annotation file required for regulator identification (only required if method is set to RABIT). -# Please download the annotation file from http://cistrome.org/~chenfei/MAESTRO/rabit.tar.gz and decompress it. -rabitlib: {{ rabitlib }} - # Mode to Run LISA, 'local' or 'web'. If the mode is set as 'local', # please install LISA (https://github.com/qinqian/lisa) and download pre-computed datasets following the instructions. # The 'web' mode is to run online version of LISA. In consideration of the connection issue and size of datasets, diff --git a/MAESTRO/html/scATAC_nomappability_template.html b/MAESTRO/html/scATAC_nomappability_template.html new file mode 100644 index 0000000..8ea5ba7 --- /dev/null +++ b/MAESTRO/html/scATAC_nomappability_template.html @@ -0,0 +1,302 @@ + + + +
+ + + + + + + +Sample ID | +%(outprefix)s | +
---|---|
%(inputformat)s | +%(inputpath)s | +
Species | +%(species)s | +
Platform | +%(platform)s | +
+ Fragment size distribution for 1%% reads sampled from scATAC-seq. Only fragments with insert size less than 1,000 are considered. Y-axis represents the number of fragments. The fragment size distribution should show a periodicity of approximately 200bp due to nucleosome protection of the chromatin to transposase cutting. + +
+ ++ Driver transcription regulators identified based on cluster-specific peaks. By default, the regulators are ranked by the enrichment score in each cluster. +
+Cluster | +Celltype Annotation | +Transcription Factor | +log10(Giggle score) | +
---|