diff --git a/.gitignore b/.gitignore index 097200a..c8c2326 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ dist/* *.tar.bz2 *.dylib *.tmp +*.egg-info/* diff --git a/.travis.yml b/.travis.yml index 23584ce..3c3e58b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,6 +11,7 @@ jobs: cache: directories: - ~/miniconda + - ~/Data before_install: # Set conda path info @@ -29,6 +30,49 @@ before_install: wget https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O $HOME/download/miniconda.sh; fi; fi; + # download test data for scatac + - TEST_DATA_PATH=$HOME/Data; + - TEST_DATA_SCATAC_PATH=$TEST_DATA_PATH/atac_pbmc_500_v1_fastqs_sampling; + - TEST_DATA_GIGGLE_PATH=$TEST_DATA_PATH/giggle.all; + - TEST_DATA_SCATAC_REFERENCE_PATH=$TEST_DATA_PATH/Refdata_scATAC_MAESTRO_GRCh38_1.1.0; + - if [[ -d $TEST_DATA_PATH ]]; then + echo "Directory Data already exists"; + else + mkdir -p $TEST_DATA_PATH; + fi; + - if [[ -d $TEST_DATA_SCATAC_PATH ]]; then + echo "scATAC test data already available from cache"; + else + cd $TEST_DATA_PATH; + echo "downloading scatac test data"; + wget http://cistrome.org/~chenfei/MAESTRO/atac_pbmc_500_v1_fastqs_sampling.tar.gz; + echo "decompressing scatac test data"; + tar -xvzf atac_pbmc_500_v1_fastqs_sampling.tar.gz; + rm atac_pbmc_500_v1_fastqs_sampling.tar.gz; + cd ../; + fi; + - if [[ -d $TEST_DATA_GIGGLE_PATH ]]; then + echo "scATAC giggle index data already available from cache"; + else + cd $TEST_DATA_PATH; + echo "downloading giggle index"; + wget http://cistrome.org/~chenfei/MAESTRO/giggle.all.tar.gz; + echo "decompressing giggle index"; + tar -xvzf giggle.all.tar.gz; + rm giggle.all.tar.gz; + cd ../; + fi; + - if [[ -d $TEST_DATA_SCATAC_REFERENCE_PATH ]]; then + echo "scATAC reference genome data already available from cache"; + else + cd $TEST_DATA_PATH; + echo "downloading reference genome"; + wget http://cistrome.org/~chenfei/MAESTRO/Refdata_scATAC_MAESTRO_GRCh38_1.1.0.tar.gz; + echo "decompressing reference genome"; + tar -xvzf Refdata_scATAC_MAESTRO_GRCh38_1.1.0.tar.gz; + rm Refdata_scATAC_MAESTRO_GRCh38_1.1.0.tar.gz; + cd ../; + fi; # install the package and dependencies: install: @@ -84,6 +128,10 @@ script: - MAESTRO -v - R -e "library(MAESTRO);library(Seurat)" - R -e "library(org.Hs.eg.db);library(org.Mm.eg.db)" + # test python utility scripts + - cd test + - bash test.sh + - cd .. # We also need more testing here! # the following codes will upload when all the above is successful and @@ -102,7 +150,7 @@ after_success: - mamba install anaconda-client - | # Only upload builds from tags - if [[ $TRAVIS_PULL_REQUEST == false && $TRAVIS_REPO_SLUG == "liulab-dfci/MAESTRO" + if [[ $TRAVIS_PULL_REQUEST == false && $TRAVIS_REPO_SLUG == "dongqingsun/MAESTRO" && $TRAVIS_BRANCH == $TRAVIS_TAG && $TRAVIS_TAG != '' ]]; then export ANACONDA_API_TOKEN=$CONDA_UPLOAD_TOKEN anaconda upload bld-dir/**/PACKAGENAME-*.tar.bz2 diff --git a/MAESTRO.egg-info/PKG-INFO b/MAESTRO.egg-info/PKG-INFO new file mode 100644 index 0000000..a60d2f5 --- /dev/null +++ b/MAESTRO.egg-info/PKG-INFO @@ -0,0 +1,17 @@ +Metadata-Version: 1.1 +Name: MAESTRO +Version: 1.2.0 +Summary: MAESTRO(Model-based AnalysEs of Single-cell Transcriptome and RegulOme) is a comprehensive single-cell RNA-seq and ATAC-seq analysis suit built using snakemake. +Home-page: https://github.com/chenfeiwang/MAESTRO +Author: Chenfei Wang, Dongqing Sun +Author-email: UNKNOWN +License: GPL-3.0 +Description: UNKNOWN +Platform: UNKNOWN +Classifier: Development Status :: 4 - Beta +Classifier: Environment :: Console +Classifier: Intended Audience :: Science/Research +Classifier: License :: OSI Approved :: GPL License +Classifier: Natural Language :: English +Classifier: Programming Language :: Python :: 3 +Classifier: Topic :: Scientific/Engineering :: Bio-Informatics diff --git a/MAESTRO.egg-info/SOURCES.txt b/MAESTRO.egg-info/SOURCES.txt new file mode 100644 index 0000000..7dc7e76 --- /dev/null +++ b/MAESTRO.egg-info/SOURCES.txt @@ -0,0 +1,27 @@ +README.md +setup.py +MAESTRO/MAESTRO +MAESTRO/MAESTRO_PipeInit.py +MAESTRO/__init__.py +MAESTRO/integrate_HTMLReport.py +MAESTRO/scATAC_10x_BarcodeCorrect.py +MAESTRO/scATAC_10x_PeakCount.py +MAESTRO/scATAC_BamAddTag.py +MAESTRO/scATAC_FragmentCorrect.py +MAESTRO/scATAC_FragmentGenerate.py +MAESTRO/scATAC_Genescore.py +MAESTRO/scATAC_H5Process.py +MAESTRO/scATAC_HTMLReport.py +MAESTRO/scATAC_QC.py +MAESTRO/scATAC_microfluidic_PeakCount.py +MAESTRO/scATAC_microfluidic_QC.py +MAESTRO/scATAC_sci_BarcodeExtract.py +MAESTRO/scATAC_utility.py +MAESTRO/scRNA_AnalysisPipeline.py +MAESTRO/scRNA_HTMLReport.py +MAESTRO/scRNA_QC.py +MAESTRO/scRNA_utility.py +MAESTRO.egg-info/PKG-INFO +MAESTRO.egg-info/SOURCES.txt +MAESTRO.egg-info/dependency_links.txt +MAESTRO.egg-info/top_level.txt \ No newline at end of file diff --git a/MAESTRO.egg-info/dependency_links.txt b/MAESTRO.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/MAESTRO.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/MAESTRO.egg-info/top_level.txt b/MAESTRO.egg-info/top_level.txt new file mode 100644 index 0000000..4767e77 --- /dev/null +++ b/MAESTRO.egg-info/top_level.txt @@ -0,0 +1 @@ +MAESTRO diff --git a/MAESTRO/MAESTRO b/MAESTRO/MAESTRO index cf7d1ba..818ccfa 100644 --- a/MAESTRO/MAESTRO +++ b/MAESTRO/MAESTRO @@ -5,7 +5,7 @@ # @Last Modified by: Dongqing Sun # @Last Modified time: 2020-02-28 13:58:10 -version = "1.1.0" +version = "1.2.1" import logging import sys, os @@ -13,6 +13,7 @@ import shutil import argparse as ap from MAESTRO.MAESTRO_PipeInit import * +from MAESTRO.MAESTRO_ParameterValidate import * from MAESTRO.scATAC_H5Process import * from MAESTRO.scATAC_Genescore import genescore_parser, genescore from MAESTRO.scATAC_10x_PeakCount import peakcount_parser, peakcount @@ -47,7 +48,7 @@ def main(): scrna_analysis_parser(subparsers) - logging.basicConfig(format="%(message)s", level=logging.INFO, stream=sys.stderr) + logging.basicConfig(format="%(levelname)s: %(message)s", stream=sys.stderr) args = parser.parse_args() @@ -55,9 +56,11 @@ def main(): print(version) exit(0) elif args.subcommand == "scatac-init": + scatac_validator(args) scatac_config(args) elif args.subcommand == "scrna-init": + scrna_validator(args) scrna_config(args) elif args.subcommand == "integrate-init": diff --git a/MAESTRO/MAESTRO_ParameterValidate.py b/MAESTRO/MAESTRO_ParameterValidate.py new file mode 100644 index 0000000..295c13f --- /dev/null +++ b/MAESTRO/MAESTRO_ParameterValidate.py @@ -0,0 +1,138 @@ +# -*- coding: utf-8 -*- +# @Author: Dongqing Sun +# @E-mail: Dongqingsun96@gmail.com +# @Date: 2020-07-19 17:20:32 +# @Last Modified by: Dongqing Sun +# @Last Modified time: 2020-07-20 13:49:05 + + +import sys +import os +import re +import logging +from argparse import ArgumentError + + +def scatac_validator(args): + """ + Validate parameters from scatac-init argument parsers. + """ + if args.platform == "10x-genomics": + if args.format == "fastq": + if args.fastq_dir == "": + logging.error("--fastq-dir is required. Please specify the directory where fastq files are stored!") + exit(1) + if args.fastq_prefix == "": + logging.error("--fastq-prefix is required. Please provide the sample name of fastq files!") + exit(1) + if args.fasta == "": + logging.error("--fasta is required if fastq files are provided!") + exit(1) + if args.whitelist == "": + logging.error("--whitelist is required for 10x-genomics data!") + exit(1) + if args.format == "bam": + if args.bam == "": + logging.error("--bam is required. Please provide the bam file with CB tag!") + exit(1) + if args.format == "fragments": + if args.frag == "": + logging.error("--frag is required. Please provide the fragment file generated by CellRanger ATAC!") + exit(1) + + if args.platform == "sci-ATAC-seq": + if args.format == "fastq": + if args.fastq_dir == "": + logging.error("--fastq-dir is required. Please specify the directory where fastq files are stored!") + exit(1) + if args.fastq_prefix == "": + logging.error("--fastq-prefix is required. Please provide the sample name of fastq files!") + exit(1) + if args.fasta == "": + logging.error("--fasta is required if fastq files are provided!") + exit(1) + if args.format == "bam": + if args.bam == "": + logging.error("--bam is required. Please provide the bam file with CB tag!") + exit(1) + if args.format == "fragments": + logging.error("Format of 'fragments' is supported only when the platform is '10x-genomics'.") + exit(1) + + if args.platform == "microfluidic": + if args.format == "fastq": + if args.fastq_dir == "": + logging.error("--fastq-dir is required. Please specify the directory where fastq files are stored!") + exit(1) + if args.fasta == "": + logging.error("--fasta is required if fastq files are provided!") + exit(1) + if args.format == "bam": + logging.error("Format of 'bam' is supported when the platform is '10x-genomics' or 'sci-ATAC-seq'.") + exit(1) + if args.format == "fragments": + logging.error("Format of 'fragments' is supported only when the platform is '10x-genomics'.") + exit(1) + + if args.signature not in ['human.immune.CIBERSORT', 'mouce.brain.ALLEN', 'mouse.all.facs.TabulaMuris', 'mouse.all.droplet.TabulaMuris']: + if os.path.exists(args.signature): + pass + else: + logging.error("Please specify the signature built in MAESTRO or provide customized signature file. See --signature help for more details!") + exit(1) + + +def scrna_validator(args): + """ + Validate parameters from scrna-init argument parsers. + """ + + if args.platform == "10x-genomics": + if args.fastq_dir == "": + logging.error("--fastq-dir is required. Please specify the directory where fastq files are stored!") + exit(1) + if args.fastq_prefix == "": + logging.error("--fastq-prefix is required. Please provide the sample name of fastq files!") + exit(1) + if args.whitelist == "": + logging.error("--whitelist is required for 10x-genomics data!") + exit(1) + + if args.platform == "Dropseq": + if args.fastq_dir == "": + logging.error("--fastq-dir is required for Dropsea data. Please specify the directory where fastq files are stored!") + exit(1) + if args.fastq_barcode == "": + logging.error("--fastq-barcode is required for Dropsea data. Please specify the barcode fastq file!") + exit(1) + if args.fastq_transcript == "": + logging.error("--fastq-transcript is required for Dropsea data. Please specify the transcript fastq file!") + exit(1) + if args.whitelist == "": + logging.error("--whitelist is required for Dropsea data. Please provide the barcode whitelist.") + exit(1) + + if args.platform == "Smartseq2": + if args.fastq_dir == "": + logging.error("--fastq-dir is required. Please specify the directory where fastq files are stored!") + exit(1) + if args.rsem == "": + logging.error("--rsem is required. Please provide the prefix of transcript references for RSEM. See --rsem help for more details.") + exit(1) + + if args.lisamode == "local": + if args.lisaenv == "": + logging.error("--lisaenv is required when lisamode is 'local'. Please specify the name of LISA environment!") + exit(1) + if args.condadir == "": + logging.error("--condadir is required when lisamode is 'local'. Please specify the directory where miniconda or anaconda is installed!") + exit(1) + + if args.signature not in ['human.immune.CIBERSORT', 'mouce.brain.ALLEN', 'mouse.all.facs.TabulaMuris', 'mouse.all.droplet.TabulaMuris']: + if os.path.exists(args.signature): + pass + else: + logging.error("Please specify the signature built in MAESTRO or provide customized signature file. See --signature help for more details!") + exit(1) + + diff --git a/MAESTRO/MAESTRO_PipeInit.py b/MAESTRO/MAESTRO_PipeInit.py index 58f37f5..e14a4f7 100644 --- a/MAESTRO/MAESTRO_PipeInit.py +++ b/MAESTRO/MAESTRO_PipeInit.py @@ -3,7 +3,7 @@ # @E-mail: Dongqingsun96@gmail.com # @Date: 2020-02-23 19:40:27 # @Last Modified by: Dongqing Sun -# @Last Modified time: 2020-06-11 22:19:45 +# @Last Modified time: 2020-07-20 16:47:23 import os @@ -27,9 +27,20 @@ def scatac_parser(subparsers): group_input.add_argument("--platform", dest = "platform", default = "10x-genomics", choices = ["10x-genomics", "sci-ATAC-seq", "microfluidic"], help = "Platform of single cell ATAC-seq. DEFAULT: 10x-genomics.") - group_input.add_argument("--fastq-dir", dest = "fastq_dir", type = str, required = True, + group_input.add_argument("--format", dest = "format", default = "fastq", + choices = ["fastq", "bam", "fragments"], + help = "The format of input files. Users can start with sequencing fastq files, " + "bam files with CB tag or fragments.tsv.gz file generated by CellRanger ATAC. " + "If the platform is 10x-genomics, users can start with sequencing fastq files, bam files with CB tag or fragments.tsv.gz file generated by CellRanger ATAC. " + "If the platform is sci-ATAC-seq, users can start with sequencing fastq files, bam files with CB tag. " + "If the platform is microfluidic, users can only start with sequencing fastq files. " + "If the format is 'fastq', --fastq-dir and --fastq-prefix need to be set. " + "If the format is 'bam', --bam needs to be set. " + "If the format is 'fragments', --frag needs to be set. " + "DEFAULT: fastq.") + group_input.add_argument("--fastq-dir", dest = "fastq_dir", type = str, default = "", help = "Directory where fastq files are stored.") - group_input.add_argument("--fastq-prefix", dest = "fastq_prefix", type = str, default = "", + group_input.add_argument("--fastq-prefix", dest = "fastq_prefix", type = str, default = "", help = "Sample name of fastq file (required for the platform of '10x-genomics' or 'sci-ATAC-seq'). " "When the platform is '10x-genomics', if there is a file named pbmc_1k_v2_S1_L001_I1_001.fastq.gz, the prefix is 'pbmc_1k_v2'. " "If the platform is 'sci-ATAC-seq', there are two ways to provide fastq files. " @@ -38,6 +49,11 @@ def scatac_parser(subparsers): "in the format of '@ReadName:CellBarcode:OtherInformation'. For example, @rd.1:TCTCCCGCCGAGGCTGACTGCATAAGGCGAAT:SHEN-MISEQ02:1:1101:15311:1341. " "The other way is to provide 10x-like fastq files which should contain three fastq files -- prefix_R1.fastq, prefix_R2.fastq and prefix_R3.fastq. " "In this way, read1, barcode and read2 are associated with R1, R2, R3, respectively.") + group_input.add_argument("--bam", dest = "bam", type = str, default = "", + help = "The bam file with cell barcode (CB) tag (required when the format is set as 'bam').") + group_input.add_argument("--frag", dest = "frag", type = str, default = "", + help = "The fragment file generated by CellRanger ATAC (required when the format is set as 'fragments'). " + "For example, fragments.tsv.gz.") group_input.add_argument("--species", dest = "species", default = "GRCh38", choices = ["GRCh38", "GRCm38"], type = str, help = "Specify the genome assembly (GRCh38 for human and GRCm38 for mouse). DEFAULT: GRCh38.") @@ -70,7 +86,7 @@ def scatac_parser(subparsers): "Please download the annotation file from " "http://cistrome.org/~chenfei/MAESTRO/giggle.tar.gz and decompress it.") group_reference.add_argument("--fasta", dest = "fasta", - required = True, + default = "", help = "Genome fasta file for minimap2. " "Users can just download the fasta file for huamn and mouse from " "http://cistrome.org/~chenfei/MAESTRO/Refdata_scATAC_MAESTRO_GRCh38_1.1.0.tar.gz and " @@ -78,7 +94,7 @@ def scatac_parser(subparsers): "For example, 'Refdata_scATAC_MAESTRO_GRCh38_1.1.0/GRCh38_genome.fa'.") # Barcode library arguments - group_barcode = workflow.add_argument_group("Barcode library arguments, only for platform of 'sci-ATAC-seq'") + group_barcode = workflow.add_argument_group("Barcode library arguments, only for platform of 'sci-ATAC-seq' and '10x-genomics'") group_barcode.add_argument("--whitelist", dest = "whitelist", type = str, default = "", help = "If the platform is 'sci-ATAC-seq' or '10x-genomics', please specify the barcode library (whitelist) " @@ -124,13 +140,27 @@ def scatac_parser(subparsers): "to 10kb (enhancer-based regulation). DEFAULT: 10000.") # Signature file arguments - group_signature = workflow.add_argument_group("Cell signature arguments") - group_signature.add_argument("--signature", dest = "signature", type = str, default = "human.immune.CIBERSORT", - help = "Cell signature file used to annotate cell types. MAESTRO provides several sets of built-in cell signatures. " + group_annotation = workflow.add_argument_group("Cell-type annotation arguments") + group_annotation.add_argument("--annotation", dest = "annotation", action = "store_true", + help = "Whether or not to perform ccell-type annotation. " + "By default (not set), MAESTRO will skip the step of cell-type annotation. " + "If set, please specify the method of cell-type annotation through --method. ") + group_annotation.add_argument("--method", dest = "method", type = str, default = "RP-based", + choices = ["RP-based", "peak-based", "both"], + help = "Method to annotate cell types. MAESTRO provides two strategies to annotate cell types for scATAC-seq data. " + "Users can choose from 'RP-based' and 'peak-based', or choose to run both of them. " + "One is based on gene regulatory potential predicted by RP model. Another is based on the bulk chromatin accessibility data from Cistrome database. " + "If 'RP-based' is set, MAESTRO performs the cell-type annotation using the gene regulatory potential to represent gene expression, " + "and the logFC of gene regulatory potential between one cluster and all the other cells is used to calculate the gene signature scores. " + "If 'peak-based' is set, MAESTRO utilizes GIGGLE to evaluate the enrichment of bulk chromatin accessibility peaks on cluster-specific peaks from scATAC-seq data, " + "and then transfers the Cistrome cluster identity from the most enriched bulk chromatin accessibility data as the cell-type annotation for the scATAC-seq cluster. " + "See the MAESTRO paper for more details. DEFAULT: RP-based. ") + group_annotation.add_argument("--signature", dest = "signature", type = str, default = "human.immune.CIBERSORT", + help = "Cell signature file used to annotate cell types (required when method is set as 'RP-based'). MAESTRO provides several sets of built-in cell signatures. " "Users can choose from ['human.immune.CIBERSORT', 'mouce.brain.ALLEN', 'mouse.all.facs.TabulaMuris', 'mouse.all.droplet.TabulaMuris']. " "Custom cell signatures are also supported. In this situation, users need to provide the file location of cell signatures, " "and the signature file is tab-seperated without header. The first column is cell type, and the second column is signature gene. " - "DEFAULT: human.immune.CIBERSORT.") + "DEFAULT: human.immune.CIBERSORT. ") return @@ -205,7 +235,7 @@ def scrna_parser(subparsers): # Barcode arguments group_barcode = workflow.add_argument_group("Barcode arguments, for platform of 'Dropseq' or '10x-genomics'") - group_barcode.add_argument("--whitelist", dest = "whitelist", type = str, + group_barcode.add_argument("--whitelist", dest = "whitelist", type = str, default = "", help = "If the platform is 'Dropseq' or '10x-genomics', please specify the barcode library (whitelist) " "so that STARsolo can do the error correction and demultiplexing of cell barcodes. " "The 10X Chromium whitelist file can be found inside the CellRanger distribution. " @@ -228,9 +258,9 @@ def scrna_parser(subparsers): # group_regulator.add_argument("--method", dest = "method", type = str, # choices = ["RABIT", "LISA"], default = "LISA", # help = "Method to predict driver regulators.") - group_regulator.add_argument("--method", dest = "method", type = str, - choices = ["LISA"], default = "LISA", - help = "Method to predict driver regulators.") + # group_regulator.add_argument("--method", dest = "method", type = str, + # choices = ["LISA"], default = "LISA", + # help = "Method to predict driver regulators.") # group_regulator.add_argument("--rabitlib", dest = "rabitlib", type = str, # help = "Path of the rabit annotation file required for regulator identification (only required if method is set to RABIT). " # "Please download the annotation file from " @@ -242,11 +272,11 @@ def scrna_parser(subparsers): "the 'local' mode is recommended to run the whole MAESTRO pipeline. " "If the mode is 'local', please provide the name of LISA environment through --lisaenv " "and specify the directory where miniconda or anaconda is installed through --condadir. DEFAULT: local.") - group_regulator.add_argument("--lisaenv", dest = "lisaenv", type = str, default = "lisa", - help = "Name of LISA environment (required if method is set to lisa and lisamode is set to local). DEFAULT: lisa.") - group_regulator.add_argument("--condadir", dest = "condadir", type = str, required = True, + group_regulator.add_argument("--lisaenv", dest = "lisaenv", type = str, default = "", + help = "Name of LISA environment (required if lisamode is set to local). For example, lisa.") + group_regulator.add_argument("--condadir", dest = "condadir", type = str, default = "", help = "Directory where miniconda or anaconda is installed " - "(required if method is set to lisa and lisamode is set to local). For example, '/home/user/miniconda3'.") + "(required if lisamode is set to local). For example, '/home/user/miniconda3'.") # Signature file arguments group_signature = workflow.add_argument_group("Cell signature arguments") @@ -310,10 +340,13 @@ def scatac_config(args): with open(configfile, "w") as configout: configout.write(config_template.render( + platform = args.platform, + format = args.format, fastqdir = os.path.abspath(args.fastq_dir), fastqprefix = args.fastq_prefix, + bam = os.path.abspath(args.bam), + frag = os.path.abspath(args.frag), species = args.species, - platform = args.platform, outprefix = args.outprefix, whitelist = os.path.abspath(args.whitelist), cores = args.cores, @@ -321,6 +354,8 @@ def scatac_config(args): count = args.count_cutoff, frip = args.frip_cutoff, cell = args.cell_cutoff, + annotation = args.annotation, + method = args.method, signature = signature, custompeaks = args.custompeak, custompeaksloc = os.path.abspath(args.custompeak_file), @@ -370,7 +405,6 @@ def scrna_config(args): gene = args.gene_cutoff, cell = args.cell_cutoff, signature = signature, - method = args.method, # rabitlib = os.path.abspath(args.rabitlib), lisamode = args.lisamode, lisaenv = args.lisaenv, diff --git a/MAESTRO/R/scATACseq_pipe.R b/MAESTRO/R/scATACseq_pipe.R index bc098d7..06356ae 100644 --- a/MAESTRO/R/scATACseq_pipe.R +++ b/MAESTRO/R/scATACseq_pipe.R @@ -24,6 +24,12 @@ option_list = list( make_option(c("--species"), type = "character", default = "GRCh38", action = "store", help = "The platform of scRNA-seq." ), + make_option(c("--annotation"), type = "character", default = "False", + action = "store", help = "Whether or not to annotate cell types. Default is False." + ), + make_option(c("--method"), type = "character", default = "RP-based", + action = "store", help = "Method to annotate cell types. ['RP-based', 'peak-based', 'both']" + ), make_option(c("--signature"), type = "character", default = "", action = "store", help = "The cell signature file for celltype annotation. Default is built-in CIBERSORT immune cell signature." ), @@ -49,6 +55,8 @@ sigfile = argue$signature gigglelib = argue$gigglelib species = argue$species fragment = argue$fragment +annotation = argue$annotation +method = argue$method # countmatrix = read.table(argue[1], sep = '\t', header = TRUE, row.names = 1, check.names = FALSE) # RPmatrix = read.table(argue[2], sep = '\t', header = TRUE, row.names = 1, check.names = FALSE) @@ -65,23 +73,41 @@ if(sigfile %in% c("human.immune.CIBERSORT", "mouce.brain.ALLEN", "mouse.all.facs } result = ATACRunSeurat(inputMat = countmatrix, project = prefix, method = "LSI") result$ATAC = ATACAttachGenescore(ATAC = result$ATAC, RPmatrix = RPmatrix) -result$ATAC = ATACAnnotateCelltype(result$ATAC, signatures = signatures) saveRDS(result, paste0(prefix, "_scATAC_Object.rds")) +if (annotation == "True") { + if (method == "RP-based") { + result$ATAC = ATACAnnotateCelltype(result$ATAC, signatures = signatures) + + } + if (method == "peak-based") { + result$ATAC <- ATACAnnotateChromatinAccessibility(ATAC = result$ATAC, + peaks = result$peaks, + project = prefix, + giggle.path = gigglelib, + organism = species) + } + if (method == "both") { + result$ATAC <- ATACAnnotateChromatinAccessibility(ATAC = result$ATAC, + peaks = result$peaks, + project = prefix, + giggle.path = gigglelib, + organism = species) + result$ATAC = ATACAnnotateCelltype(result$ATAC, signatures = signatures) + + } + if ("assign.biological_resource" %in% colnames(result$ATAC@meta.data)) { + p1 <- DimPlot(result$ATAC, label = TRUE, reduction = "umap", group.by = "assign.biological_resource", repel=T, pt.size = 0.5, label.size = 2.5) + ggsave(file.path(paste0(result$ATAC@project.name, "_CistromeTop_annotated.png")), p1, width=7.5, height=4) + } +} + result.tfs = ATACAnnotateTranscriptionFactor(ATAC = result$ATAC, peaks = result$peaks, project = prefix, giggle.path = gigglelib, organism = species) -result$ATAC <- ATACAnnotateChromatinAccessibility(ATAC = result$ATAC, - peaks = result$peaks, - project = prefix, - giggle.path = gigglelib, - organism = species) -if ("biological_resource" %in% colnames(result$ATAC@meta.data)) { - p1 <- DimPlot(result$ATAC, label = TRUE, reduction = "umap", group.by = "biological_resource", repel=T, pt.size = 0.5, label.size = 2.5) - ggsave(file.path(paste0(result$ATAC@project.name, "_CistromeTop_annotated.png")), p1, width=7.5, height=4) -} +saveRDS(result, paste0(prefix, "_scATAC_Object.rds")) if(species == "GRCh38"){ library(TxDb.Hsapiens.UCSC.hg38.knownGene) diff --git a/MAESTRO/R/scATACseq_qc.R b/MAESTRO/R/scATACseq_qc.R index a7252db..b8b7df6 100644 --- a/MAESTRO/R/scATACseq_qc.R +++ b/MAESTRO/R/scATACseq_qc.R @@ -8,7 +8,7 @@ option_list = list( make_option(c("--outdir"), type = "character", default = "MAESTRO", action = "store", help = "The directory where the output files are stored." ), - make_option(c("--bulkstat"), type = "character", default = "flagstat.txt", + make_option(c("--bulkstat"), type = "character", default = "", action = "store", help = "The result of samtools flagstat." ), make_option(c("--singlestat"), type = "character", default = "singlecell.txt", @@ -34,7 +34,9 @@ count_cutoff = argue$countcutoff frip_cutoff = argue$fripcutoff prefix = argue$prefix -ATACReadDistrPlot(stat.filepath = bulkstat_file, name = prefix) +if(bulkstat_file != "") { + ATACReadDistrPlot(stat.filepath = bulkstat_file, name = prefix) +} ATACFragmentSizePlot(filepath = fragment_file, name = prefix) ATACFilteringPlot(filepath = singlestat_file, name = prefix, reads.cutoff = count_cutoff, frip.cutoff = frip_cutoff) diff --git a/MAESTRO/R/scRNAseq_pipe.R b/MAESTRO/R/scRNAseq_pipe.R index d4c5087..4d7a8fb 100644 --- a/MAESTRO/R/scRNAseq_pipe.R +++ b/MAESTRO/R/scRNAseq_pipe.R @@ -25,9 +25,6 @@ option_list = list( make_option(c("--signature"), type = "character", default = "", action = "store", help = "The cell signature file for celltype annotation. Default is built-in CIBERSORT immune cell signature." ), - make_option(c("--rabitlib"), type = "character", default = "", - action = "store", help = "Annotation to run rabit (only if method is set to rabit)." - ), make_option(c("--lisamode"), type = "character", default = "", action = "store", help = "Mode to run LISA (web or local)." ), @@ -50,7 +47,6 @@ prefix = argue$prefix thread = argue$thread method = argue$method sigfile = argue$signature -rabitlib = argue$rabitlib lisamode = argue$lisamode condadir = argue$condadir lisaenv = argue$lisaenv @@ -87,6 +83,6 @@ RNA.res$RNA <- RNAAnnotateCelltype(RNA = RNA.res$RNA, genes = RNA.res$genes, signatures = signatures, min.score = 0.05) saveRDS(RNA.res, paste0(prefix, "_scRNA_Object.rds")) RNA.tfs <- RNAAnnotateTranscriptionFactor(RNA = RNA.res$RNA, genes = RNA.res$genes, project = prefix, - method = method, rabit.path = rabitlib, lisa.mode = lisamode, + method = method, lisa.mode = lisamode, conda.dir = condadir, lisa.envname = lisaenv, organism = species, top.tf = 10) diff --git a/MAESTRO/Snakemake/scATAC/Snakefile b/MAESTRO/Snakemake/scATAC/Snakefile index 256a553..c1df0b4 100644 --- a/MAESTRO/Snakemake/scATAC/Snakefile +++ b/MAESTRO/Snakemake/scATAC/Snakefile @@ -8,7 +8,7 @@ import yaml import sys import os -from MAESTRO.scATAC_utility import get_fastqlist, ENV_PATH, SCRIPT_PATH, RSCRIPT_PATH, getfastq_10x +from MAESTRO.scATAC_utility import is_gzip, get_fastqlist, ENV_PATH, SCRIPT_PATH, RSCRIPT_PATH, getfastq_10x # def qcplot_input(wildcards): # checkpoint_output = checkpoints.scatac_samsplit.get(**wildcards).output[0] @@ -21,12 +21,25 @@ def all_input(wildcards): cluster=glob_wildcards(os.path.join(checkpoint_output, "{cluster}.bam")).cluster) # return checkpoint_output +def all_input_frag(wildcards): + checkpoint_output = checkpoints.scatac_fragcluster.get().output[0] + return expand("Result/Analysis/Cluster/{cluster}_treat_pileup.bdg", + cluster=glob_wildcards(os.path.join(checkpoint_output, "{cluster}.bed")).cluster) + + +macs2_genome = "hs" if config["species"] == "GRCh38" else "mm" + if config["clusterpeaks"]: - rule all: - input: - summaryreport = "Result/" + config["outprefix"] + "_scATAC_report.html", - peakcluster = all_input - # tfscore = "Result/Analysis/" + config["outprefix"] + "_tfscore.txt" + if config["format"] != "fragments": + rule all: + input: + summaryreport = "Result/" + config["outprefix"] + "_scATAC_report.html", + peakcluster = all_input + else: + rule all: + input: + summaryreport = "Result/" + config["outprefix"] + "_scATAC_report.html", + peakcluster = all_input_frag else: rule all: input: @@ -195,13 +208,14 @@ if config["platform"] == "microfluidic": peak = "Result/Analysis/%s_all_peaks.narrowPeak" %(config["outprefix"]), bdg = "Result/Analysis/%s_all_treat_pileup.bdg" %(config["outprefix"]), params: - name = "%s_all" %(config["outprefix"]) + name = "%s_all" %(config["outprefix"]), + genome = macs2_genome log: "Result/Log/%s_macs2_allpeak.log" %(config["outprefix"]) benchmark: "Result/Benchmark/%s_AllPeakCall.benchmark" %(config["outprefix"]) shell: - "macs2 callpeak -g hs --outdir Result/Analysis/ -n {params.name} -B -q 0.05 --nomodel --extsize=50 --SPMR -t {input.bam}" + "macs2 callpeak -f BAMPE -g {params.genome} --outdir Result/Analysis/ -n {params.name} -B -q 0.05 --nomodel --extsize=50 --SPMR --keep-dup all -t {input.bam}" if config["shortpeaks"]: rule scatac_shortfragment: @@ -224,13 +238,14 @@ if config["platform"] == "microfluidic": output: bed = "Result/Analysis/%s_150bp_peaks.narrowPeak" %(config["outprefix"]) params: - name = "%s_150bp" %(config["outprefix"]) + name = "%s_150bp" %(config["outprefix"]), + genome = macs2_genome log: "Result/Log/" + config["outprefix"] + "_macs2_shortpeak.log" benchmark: "Result/Benchmark/%s_ShortPeakCall.benchmark" %(config["outprefix"]) shell: - "macs2 callpeak -g hs --outdir Result/Analysis -n {params.name} -B -q 0.05 --nomodel --extsize=50 --SPMR -t {input.shortbam}" + "macs2 callpeak -f BAMPE -g {params.genome} --outdir Result/Analysis -n {params.name} -B -q 0.05 --nomodel --extsize=50 --SPMR --keep-dup all -t {input.shortbam}" if config["custompeaks"] and config["shortpeaks"]: rule scatac_mergepeak: @@ -317,28 +332,30 @@ if config["platform"] == "microfluidic": "--bam-dir {params.bamdir} --directory {params.outdir} --outprefix {params.outpre} --cores {threads} --species {params.species}" if config["platform"] == "10x-genomics" or config["platform"] == "sci-ATAC-seq": - if config["platform"] == "10x-genomics": - rule scatac_preprocess: - input: - fastqs = config["fastqdir"], - fasta = config["genome"]["fasta"] - output: - r1cat = temp(os.path.join(config["fastqdir"], "%s_R1.fastq" %(config["fastqprefix"]))), - r2cat = temp(os.path.join(config["fastqdir"], "%s_R2.fastq" %(config["fastqprefix"]))), - r3cat = temp(os.path.join(config["fastqdir"], "%s_R3.fastq" %(config["fastqprefix"]))), - params: - r1 = getfastq_10x(config["fastqdir"], config["fastqprefix"])["r1"], - r2 = getfastq_10x(config["fastqdir"], config["fastqprefix"])["barcode"], - r3 = getfastq_10x(config["fastqdir"], config["fastqprefix"])["r3"], - cmd = getfastq_10x(config["fastqdir"], config["fastqprefix"])["command"], - threads: - config["cores"] - benchmark: - "Result/Benchmark/%s_Preprocess.benchmark" %(config["outprefix"]) - shell: - "{params.cmd} {params.r1} > {output.r1cat};" - "{params.cmd} {params.r2} > {output.r2cat};" - "{params.cmd} {params.r3} > {output.r3cat};" + if config["format"] == "fastq" or config["format"] == "bam": + if config["format"] == "fastq": + if config["platform"] == "10x-genomics": + rule scatac_preprocess: + input: + fastqs = config["fastqdir"], + fasta = config["genome"]["fasta"] + output: + r1cat = temp(os.path.join("Result/Tmp", "%s_R1.fastq" %(config["fastqprefix"]))), + r2cat = temp(os.path.join("Result/Tmp", "%s_R2.fastq" %(config["fastqprefix"]))), + r3cat = temp(os.path.join("Result/Tmp", "%s_R3.fastq" %(config["fastqprefix"]))), + params: + r1 = getfastq_10x(config["fastqdir"], config["fastqprefix"])["r1"], + r2 = getfastq_10x(config["fastqdir"], config["fastqprefix"])["barcode"], + r3 = getfastq_10x(config["fastqdir"], config["fastqprefix"])["r3"], + cmd = getfastq_10x(config["fastqdir"], config["fastqprefix"])["command"], + threads: + config["cores"] + benchmark: + "Result/Benchmark/%s_Preprocess.benchmark" %(config["outprefix"]) + shell: + "{params.cmd} {params.r1} > {output.r1cat};" + "{params.cmd} {params.r2} > {output.r2cat};" + "{params.cmd} {params.r3} > {output.r3cat};" # # rule scatac_bulk_unique: @@ -368,114 +385,375 @@ if config["platform"] == "10x-genomics" or config["platform"] == "sci-ATAC-seq": # # "cut -f 1-3 {output.bed} | sort -k1,1 -k2,2n | uniq | grep -v 'chrM' | intersectBed -wa -a - -b {input.promoter} -u | wc -l >> {output.bulkqc};" # # "cut -f 1-3 {output.bed} | sort -k1,1 -k2,2n | uniq | grep -v 'chrM' | intersectBed -wa -a - -b {input.peak} -u | wc -l >> {output.bulkqc};" - if config["platform"] == "sci-ATAC-seq": - rule scatac_preprocess: + if config["platform"] == "sci-ATAC-seq": + # support pair-end sequencing fastqs (_1.fastq, _2.fastq) + rule scatac_preprocess: + input: + fasta = config["genome"]["fasta"], + fastq1 = os.path.join(config["fastqdir"], "%s_1.fastq" %(config["fastqprefix"])), + fastq2 = os.path.join(config["fastqdir"], "%s_2.fastq" %(config["fastqprefix"])), + output: + r1 = temp(os.path.join("Result/Tmp", "%s_R1.fastq" %(config["fastqprefix"]))), + r2 = temp(os.path.join("Result/Tmp", "%s_R2.fastq" %(config["fastqprefix"]))), + r3 = temp(os.path.join("Result/Tmp", "%s_R3.fastq" %(config["fastqprefix"]))), + params: + outdir = "Result/Tmp" + benchmark: + "Result/Benchmark/%s_Preprocess.benchmark" %(config["outprefix"]) + shell: + "python " + SCRIPT_PATH + "/scATAC_sci_BarcodeExtract.py --R1 {input.fastq1} --R2 {input.fastq2} -O {params.outdir}" + + # support 10-like input (_R1.fatsq, _R2.fastq, _R3.fastq) + rule scatac_preprocess_fastqrename: + input: + fasta = config["genome"]["fasta"], + r1 = os.path.join(config["fastqdir"], "%s_R1.fastq" %(config["fastqprefix"])), + r2 = os.path.join(config["fastqdir"], "%s_R2.fastq" %(config["fastqprefix"])), + r3 = os.path.join(config["fastqdir"], "%s_R3.fastq" %(config["fastqprefix"])), + output: + r1 = temp(os.path.join("Result/Tmp", "%s_R1.fastq" %(config["fastqprefix"]))), + r2 = temp(os.path.join("Result/Tmp", "%s_R2.fastq" %(config["fastqprefix"]))), + r3 = temp(os.path.join("Result/Tmp", "%s_R3.fastq" %(config["fastqprefix"]))), + params: + outdir = "Result/Tmp" + benchmark: + "Result/Benchmark/%s_Preprocess.benchmark" %(config["outprefix"]) + shell: + "cp {input.r1} {input.r2} {input.r3} {params.outdir};" + + rule scatac_fqaddbarcode: + input: + r1 = os.path.join("Result/Tmp", "%s_R1.fastq" %(config["fastqprefix"])), + r2 = os.path.join("Result/Tmp", "%s_R2.fastq" %(config["fastqprefix"])), + r3 = os.path.join("Result/Tmp", "%s_R3.fastq" %(config["fastqprefix"])), + output: + r1 = temp(os.path.join("Result/Tmp", "%s_R1.barcoded.fastq" %(config["fastqprefix"]))), + r3 = temp(os.path.join("Result/Tmp", "%s_R3.barcoded.fastq" %(config["fastqprefix"]))), + # r1 = "%s/%s_R1.barcoded.fastq" %(config["fastqdir"], config["fastqprefix"]), + # r3 = "%s/%s_R3.barcoded.fastq" %(config["fastqdir"], config["fastqprefix"]), + benchmark: + "Result/Benchmark/%s_FqAddbarcode.benchmark" %(config["outprefix"]) + shell: + "base=`head -n 2 {input.r2} | tail -n 1 | wc -L`;" + "sinto barcode --barcode_fastq {input.r2} --read1 {input.r1} --read2 {input.r3} -b $base;" + + rule scatac_map: + input: + fasta = config["genome"]["fasta"], + r1 = os.path.join("Result/Tmp", "%s_R1.barcoded.fastq" %(config["fastqprefix"])), + r3 = os.path.join("Result/Tmp", "%s_R3.barcoded.fastq" %(config["fastqprefix"])), + output: + bam = temp("Result/minimap2/%s.sortedByPos.bam" %(config["outprefix"])), + threads: + config["cores"] + benchmark: + "Result/Benchmark/%s_Minimap2.benchmark" %(config["outprefix"]) + shell: + "minimap2 -ax sr -t {threads} {input.fasta} {input.r1} {input.r3} " + "| samtools view --threads {threads} -b " + "| samtools sort --threads {threads} -o {output.bam};" + + rule scatac_fragmentgenerate: + input: + bam = "Result/minimap2/%s.sortedByPos.bam" %(config["outprefix"]), + output: + fragments = "Result/minimap2/fragments.tsv", + bam = "Result/minimap2/%s.sortedByPos.CRadded.bam" %(config["outprefix"]), + params: + outdir = "Result/minimap2" + benchmark: + "Result/Benchmark/%s_FragGenerate.benchmark" %(config["outprefix"]) + shell: + "python " + SCRIPT_PATH + "/scATAC_FragmentGenerate.py -B {input.bam} -O {params.outdir} --addtag CR" + + if config["whitelist"]: + rule scatac_barcodecorrect: + input: + r2 = os.path.join("Result/Tmp", "%s_R2.fastq" %(config["fastqprefix"])), + whitelist = config["whitelist"], + output: + bc_correct = "Result/minimap2/barcode_correct.txt", + bc_correct_uniq = "Result/minimap2/barcode_correct_uniq.txt", + params: + outdir = "Result/minimap2" + benchmark: + "Result/Benchmark/%s_BarcodeCorrect.benchmark" %(config["outprefix"]) + shell: + "python " + SCRIPT_PATH + "/scATAC_10x_BarcodeCorrect.py -b {input.r2} -B {input.whitelist} -O {params.outdir};" + "sort -k1,1 -k3,3 {output.bc_correct} | uniq > {output.bc_correct_uniq}" + else: + rule scatac_barcodecorrect: + input: + r2 = os.path.join("Result/Tmp", "%s_R2.fastq" %(config["fastqprefix"])), + output: + bc_correct = "Result/minimap2/barcode_correct.txt", + bc_correct_uniq = "Result/minimap2/barcode_correct_uniq.txt", + params: + outdir = "Result/minimap2" + benchmark: + "Result/Benchmark/%s_BarcodeCorrect.benchmark" %(config["outprefix"]) + shell: + "python " + SCRIPT_PATH + "/scATAC_10x_BarcodeCorrect.py -b {input.r2} -O {params.outdir};" + "sort -k1,1 -k3,3 {output.bc_correct} | uniq > {output.bc_correct_uniq}" + + rule scatac_fragmentcorrect: + input: + fragments = "Result/minimap2/fragments.tsv", + bc_correct = "Result/minimap2/barcode_correct.txt" + output: + frag_count = "Result/minimap2/fragments_corrected_count.tsv", + frag_sort = "Result/minimap2/fragments_corrected_sorted.tsv" + params: + outdir = "Result/minimap2", + frag_correct = "Result/minimap2/fragments_corrected.tsv", + benchmark: + "Result/Benchmark/%s_FragCorrect.benchmark" %(config["outprefix"]) + shell: + "python " + SCRIPT_PATH + "/scATAC_FragmentCorrect.py -F {input.fragments} -C {input.bc_correct} -O {params.outdir};" + "sort -k1,1 -k2,2 -k3,3 -k4,4 -V {params.frag_correct} > {output.frag_sort};" + "bedtools groupby -i {output.frag_sort} -g 1,2,3,4 -c 4 -o count > {output.frag_count}" + + rule scatac_rmdp: + input: + bam = "Result/minimap2/%s.sortedByPos.CRadded.bam" %(config["outprefix"]), + output: + bam = "Result/minimap2/%s.sortedByPos.CRadded.rmdp.bam" %(config["outprefix"]), + metric = "Result/minimap2/%s.rmdp.txt" %(config["outprefix"]), + fragbed = "Result/QC/%s_frag.bed" %(config["outprefix"]), + params: + sam = "Result/minimap2/%s.sortedByPos.CRadded.rmdp.sample.sam" %(config["outprefix"]), + tmp = "Result/Tmp", + threads: + int(config["cores"])-2 + benchmark: + "Result/Benchmark/%s_Rmdp.benchmark" %(config["outprefix"]) + shell: + "picard MarkDuplicates INPUT={input.bam} OUTPUT={output.bam} METRICS_FILE={output.metric} TMP_DIR={params.tmp};" + "samtools view -@ {threads} -s 0.01 -o {params.sam} {input.bam};" + "awk '{{if ($9>0) print $9}}' {params.sam} > {output.fragbed};" + + rule scatac_bamaddCB: + input: + bam = "Result/minimap2/%s.sortedByPos.CRadded.rmdp.bam" %(config["outprefix"]), + bc_correct = "Result/minimap2/barcode_correct_uniq.txt" + output: + bam = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.bam" %(config["outprefix"]), + params: + outdir = "Result/minimap2", + outprefix = "%s.sortedByPos.rmdp.CBadded" %(config["outprefix"]) + threads: + int(config["cores"])-2 + benchmark: + "Result/Benchmark/%s_BamAddCB.benchmark" %(config["outprefix"]) + shell: + "python " + SCRIPT_PATH + "/scATAC_BamAddTag.py -B {input.bam} -T {input.bc_correct} -C CR " + "-O {params.outdir} -P {params.outprefix};" + + if config["format"] == "bam": + rule scatac_fragmentgenerate: + input: + bam = config["bam"], + output: + fragments = "Result/minimap2/fragments_corrected_count.tsv", + params: + outdir = "Result/minimap2", + benchmark: + "Result/Benchmark/%s_FragGenerate.benchmark" %(config["outprefix"]) + shell: + "python " + SCRIPT_PATH + "/scATAC_FragmentGenerate.py -B {input.bam} -O {params.outdir} --CBtag CB --count;" + + rule scatac_rmdp: + input: + bam = config["bam"], + output: + bam = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.bam" %(config["outprefix"]), + metric = "Result/minimap2/%s.rmdp.txt" %(config["outprefix"]), + fragbed = "Result/QC/%s_frag.bed" %(config["outprefix"]), + params: + sam = "Result/minimap2/%s.sortedByPos.CRadded.rmdp.sample.sam" %(config["outprefix"]), + tmp = "Result/Tmp", + threads: + int(config["cores"])-2 + benchmark: + "Result/Benchmark/%s_Rmdp.benchmark" %(config["outprefix"]) + shell: + "picard MarkDuplicates INPUT={input.bam} OUTPUT={output.bam} METRICS_FILE={output.metric} TMP_DIR={params.tmp};" + "samtools view -@ {threads} -s 0.01 -o {params.sam} {input.bam};" + "awk '{{if ($9>0) print $9}}' {params.sam} > {output.fragbed};" + + rule scatac_allpeakcall: input: - fasta = config["genome"]["fasta"], - fastq1 = os.path.join(config["fastqdir"], "%s_1.fastq" %(config["fastqprefix"])), - fastq2 = os.path.join(config["fastqdir"], "%s_2.fastq" %(config["fastqprefix"])), + bam = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.bam" %(config["outprefix"]) output: - r1 = os.path.join(config["fastqdir"], "%s_R1.fastq" %(config["fastqprefix"])), - r2 = os.path.join(config["fastqdir"], "%s_R2.fastq" %(config["fastqprefix"])), - r3 = os.path.join(config["fastqdir"], "%s_R3.fastq" %(config["fastqprefix"])), + peak = "Result/Analysis/%s_all_peaks.narrowPeak" %(config["outprefix"]), + bdg = "Result/Analysis/%s_all_treat_pileup.bdg" %(config["outprefix"]), params: - outdir = config["fastqdir"] + name = "%s_all" %(config["outprefix"]), + genome = macs2_genome + log: + "Result/Log/%s_macs2_allpeak.log" %(config["outprefix"]) benchmark: - "Result/Benchmark/%s_Preprocess.benchmark" %(config["outprefix"]) + "Result/Benchmark/%s_AllPeakCall.benchmark" %(config["outprefix"]) shell: - "python " + SCRIPT_PATH + "/scATAC_sci_BarcodeExtract.py --R1 {input.fastq1} --R2 {input.fastq2} -O {params.outdir}" + "macs2 callpeak -f BAMPE -g {params.genome} --outdir Result/Analysis -n {params.name} -B -q 0.05 --nomodel --extsize=50 --SPMR --keep-dup all -t {input.bam}" - rule scatac_fqaddbarcode: - input: - r1 = os.path.join(config["fastqdir"], "%s_R1.fastq" %(config["fastqprefix"])), - r2 = os.path.join(config["fastqdir"], "%s_R2.fastq" %(config["fastqprefix"])), - r3 = os.path.join(config["fastqdir"], "%s_R3.fastq" %(config["fastqprefix"])), - output: - r1 = temp(os.path.join(config["fastqdir"], "%s_R1.barcoded.fastq" %(config["fastqprefix"]))), - r3 = temp(os.path.join(config["fastqdir"], "%s_R3.barcoded.fastq" %(config["fastqprefix"]))), - # r1 = "%s/%s_R1.barcoded.fastq" %(config["fastqdir"], config["fastqprefix"]), - # r3 = "%s/%s_R3.barcoded.fastq" %(config["fastqdir"], config["fastqprefix"]), - benchmark: - "Result/Benchmark/%s_FqAddbarcode.benchmark" %(config["outprefix"]) - shell: - "base=`head -n 2 {input.r2} | tail -n 1 | wc -L`;" - "sinto barcode --barcode_fastq {input.r2} --read1 {input.r1} --read2 {input.r3} -b $base;" + rule scatac_qcstat_bulk: + input: + bam = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.bam" %(config["outprefix"]), + promoter = "%s/annotations/%s_promoter.bed" %(SCRIPT_PATH, config["species"]), + peak = "Result/Analysis/%s_all_peaks.narrowPeak" %(config["outprefix"]), + output: + bulk_stat = "Result/QC/flagstat.txt", + bam = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.unique.bam" %(config["outprefix"]), + bed = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.unique.bed" %(config["outprefix"]) + threads: + config["cores"] + benchmark: + "Result/Benchmark/%s_BulkQCStat.benchmark" %(config["outprefix"]) + shell: + "samtools flagstat --threads {threads} {input.bam} > {output.bulk_stat};" + "samtools view -F 2316 -f 0x2 -q 30 -b -o {output.bam} {input.bam};" + "samtools view {output.bam} -c >> {output.bulk_stat};" + "bedtools bamtobed -i {output.bam} > {output.bed};" + "grep 'chrM' {output.bed} -c >> {output.bulk_stat} || true;" + "grep -v 'chrM' {output.bed} | bedtools intersect -wa -a - -b {input.promoter} -u | wc -l >> {output.bulk_stat} || true;" + "grep -v 'chrM' {output.bed} | bedtools intersect -wa -a - -b {input.peak} -u | wc -l >> {output.bulk_stat} || true ;" - rule scatac_map: - input: - fasta = config["genome"]["fasta"], - r1 = os.path.join(config["fastqdir"], "%s_R1.barcoded.fastq" %(config["fastqprefix"])), - r3 = os.path.join(config["fastqdir"], "%s_R3.barcoded.fastq" %(config["fastqprefix"])), - output: - bam = temp("Result/minimap2/%s.sortedByPos.bam" %(config["outprefix"])), - threads: - config["cores"] - benchmark: - "Result/Benchmark/%s_Minimap2.benchmark" %(config["outprefix"]) - shell: - "minimap2 -ax sr -t {threads} {input.fasta} {input.r1} {input.r3} " - "| samtools view --threads {threads} -b " - "| samtools sort --threads {threads} -o {output.bam};" + if config["shortpeaks"]: + rule scatac_shortfragment: + input: + bam = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.bam" %(config["outprefix"]) + output: + shortbam = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.150bp.bam" %(config["outprefix"]) + threads: + config["cores"] + benchmark: + "Result/Benchmark/%s_ShortFrag.benchmark" %(config["outprefix"]) + shell: + "samtools view -@ {threads} -h {input.bam} | " + "awk -F'\\t' 'function abs(x){{return ((x < 0.0) ? -x : x)}} {{if (abs($9)<=150) print}}' | " + "samtools view -@ {threads} -b -o {output.shortbam}" + + rule scatac_shortpeakcall: + input: + shortbam = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.150bp.bam" %(config["outprefix"]) + output: + bed = "Result/Analysis/%s_150bp_peaks.narrowPeak" %(config["outprefix"]) + params: + name = "%s_150bp" %(config["outprefix"]), + genome = macs2_genome + log: + "Result/Log/%s_macs2_shortpeak.log" %(config["outprefix"]) + benchmark: + "Result/Benchmark/%s_ShortPeakCall.benchmark" %(config["outprefix"]) + shell: + "macs2 callpeak -f BAMPE -g {params.genome} --outdir Result/Analysis -n {params.name} -B -q 0.05 --nomodel --extsize=50 --SPMR --keep-dup all -t {input.shortbam}" - rule scatac_fragmentgenerate: - input: - bam = "Result/minimap2/%s.sortedByPos.bam" %(config["outprefix"]), - output: - fragments = "Result/minimap2/fragments.tsv", - bam = "Result/minimap2/%s.sortedByPos.CRadded.bam" %(config["outprefix"]), - params: - outdir = "Result/minimap2" - benchmark: - "Result/Benchmark/%s_FragGenerate.benchmark" %(config["outprefix"]) - shell: - "python " + SCRIPT_PATH + "/scATAC_FragmentGenerate.py -B {input.bam} -O {params.outdir} --addtag CR" + else: + if is_gzip(config["frag"]): + rule scatac_fragmentcp: + input: + frag = config["frag"] + output: + frag = "Result/minimap2/fragments_corrected_count.tsv" + benchmark: + "Result/Benchmark/%s_FragCP.benchmark" %(config["outprefix"]) + shell: + "gzip -c -d {input.frag} > {output.frag}" + else: + rule scatac_fragmentcp: + input: + frag = config["frag"] + output: + frag = "Result/minimap2/fragments_corrected_count.tsv" + benchmark: + "Result/Benchmark/%s_FragCP.benchmark" %(config["outprefix"]) + shell: + "cp -T {input.frag} {output.frag}" - if config["whitelist"]: - rule scatac_barcodecorrect: + rule scatac_fragmentreshape: input: - r2 = os.path.join(config["fastqdir"], "%s_R2.fastq" %(config["fastqprefix"])), - whitelist = config["whitelist"], + frag = "Result/minimap2/fragments_corrected_count.tsv" output: - bc_correct = "Result/minimap2/barcode_correct.txt", - bc_correct_uniq = "Result/minimap2/barcode_correct_uniq.txt", + frag = "Result/minimap2/fragments_BEDPE.bed" params: outdir = "Result/minimap2" benchmark: - "Result/Benchmark/%s_BarcodeCorrect.benchmark" %(config["outprefix"]) + "Result/Benchmark/%s_FragReshape.benchmark" %(config["outprefix"]) shell: - "python " + SCRIPT_PATH + "/scATAC_10x_BarcodeCorrect.py -b {input.r2} -B {input.whitelist} -O {params.outdir};" - "sort -k1,1 -k3,3 {output.bc_correct} | uniq > {output.bc_correct_uniq}" - else: - rule scatac_barcodecorrect: + "python " + SCRIPT_PATH + "/scATAC_FragmentReshape.py -F {input.frag} -O {params.outdir}" + + rule scatac_fragmentsample: + input: + frag = "Result/minimap2/fragments_BEDPE.bed" + output: + fragbed = "Result/QC/%s_frag.bed" %(config["outprefix"]), + benchmark: + "Result/Benchmark/%s_FragSample.benchmark" %(config["outprefix"]) + shell: + "frag_num=`wc -l {input.frag} | awk '{{print $1}}'`;" + "frag_num_sample=`expr $frag_num / 100`;" + "shuf -n $frag_num_sample {input.frag} | awk '{{print ($3-$2)}}' > {output.fragbed}" + + rule scatac_allpeakcall: input: - r2 = os.path.join(config["fastqdir"], "%s_R2.fastq" %(config["fastqprefix"])), + frag = "Result/minimap2/fragments_BEDPE.bed" output: - bc_correct = "Result/minimap2/barcode_correct.txt", - bc_correct_uniq = "Result/minimap2/barcode_correct_uniq.txt", + peak = "Result/Analysis/%s_all_peaks.narrowPeak" %(config["outprefix"]), + bdg = "Result/Analysis/%s_all_treat_pileup.bdg" %(config["outprefix"]), params: - outdir = "Result/minimap2" + name = "%s_all" %(config["outprefix"]), + genome = macs2_genome + log: + "Result/Log/%s_macs2_allpeak.log" %(config["outprefix"]) benchmark: - "Result/Benchmark/%s_BarcodeCorrect.benchmark" %(config["outprefix"]) + "Result/Benchmark/%s_AllPeakCall.benchmark" %(config["outprefix"]) shell: - "python " + SCRIPT_PATH + "/scATAC_10x_BarcodeCorrect.py -b {input.r2} -O {params.outdir};" - "sort -k1,1 -k3,3 {output.bc_correct} | uniq > {output.bc_correct_uniq}" + "macs2 callpeak -f BEDPE -g {params.genome} --outdir Result/Analysis -n {params.name} -B -q 0.05 --nomodel --extsize=50 --SPMR --keep-dup all -t {input.frag}" - rule scatac_fragmentcorrect: - input: - fragments = "Result/minimap2/fragments.tsv", - bc_correct = "Result/minimap2/barcode_correct.txt" - output: - frag_count = "Result/minimap2/fragments_corrected_count.tsv", - frag_sort = "Result/minimap2/fragments_corrected_sorted.tsv" - params: - outdir = "Result/minimap2", - frag_correct = "Result/minimap2/fragments_corrected.tsv", - benchmark: - "Result/Benchmark/%s_FragCorrect.benchmark" %(config["outprefix"]) - shell: - "python " + SCRIPT_PATH + "/scATAC_FragmentCorrect.py -F {input.fragments} -C {input.bc_correct} -O {params.outdir};" - "sort -k1,1 -k2,2 -k3,3 -k4,4 -V {params.frag_correct} > {output.frag_sort};" - "bedtools groupby -i {output.frag_sort} -g 1,2,3,4 -c 4 -o count > {output.frag_count}" + rule scatac_qcstat_bulk: + input: + frag = "Result/minimap2/fragments_BEDPE.bed", + promoter = "%s/annotations/%s_promoter.bed" %(SCRIPT_PATH, config["species"]), + peak = "Result/Analysis/%s_all_peaks.narrowPeak" %(config["outprefix"]), + output: + bulk_stat = "Result/QC/flagstat.txt", + threads: + config["cores"] + benchmark: + "Result/Benchmark/%s_BulkQCStat.benchmark" %(config["outprefix"]) + shell: + "grep 'chrM' {input.frag} -c >> {output.bulk_stat} || true;" + "grep -v 'chrM' {input.frag} | bedtools intersect -wa -a - -b {input.promoter} -u | wc -l >> {output.bulk_stat} || true;" + "grep -v 'chrM' {input.frag} | bedtools intersect -wa -a - -b {input.peak} -u | wc -l >> {output.bulk_stat} || true ;" + + if config["shortpeaks"]: + rule scatac_shortfragment: + input: + frag = "Result/minimap2/fragments_BEDPE.bed" + output: + shortfrag = "Result/minimap2/fragments_BEDPE_150bp.bed" + threads: + config["cores"] + benchmark: + "Result/Benchmark/%s_ShortFrag.benchmark" %(config["outprefix"]) + shell: + "awk -F'\\t' '{{if (($3-$2)<=150) print}}' {input.frag} > {output.shortfrag}" + + rule scatac_shortpeakcall: + input: + shortfrag = "Result/minimap2/fragments_BEDPE_150bp.bed" + output: + bed = "Result/Analysis/%s_150bp_peaks.narrowPeak" %(config["outprefix"]) + params: + name = "%s_150bp" %(config["outprefix"]), + genome = macs2_genome + log: + "Result/Log/%s_macs2_shortpeak.log" %(config["outprefix"]) + benchmark: + "Result/Benchmark/%s_ShortPeakCall.benchmark" %(config["outprefix"]) + shell: + "macs2 callpeak -f BEDPE -g {params.genome} --outdir Result/Analysis -n {params.name} -B -q 0.05 --nomodel --extsize=50 --SPMR --keep-dup all -t {input.shortfrag}" rule scatac_qcstat_promoter: input: @@ -522,108 +800,6 @@ if config["platform"] == "10x-genomics" or config["platform"] == "sci-ATAC-seq": "sort -k1,1 {input.promoter_stat} > {params.promoter_stat_sort};" "join --nocheck-order -t $'\t' -a1 -e'0' -o'1.1 1.2 2.2' -1 1 -2 1 {params.mapped_stat_sort} {params.promoter_stat_sort} > {output.single_stat}" - rule scatac_rmdp: - input: - bam = "Result/minimap2/%s.sortedByPos.CRadded.bam" %(config["outprefix"]), - output: - bam = "Result/minimap2/%s.sortedByPos.CRadded.rmdp.bam" %(config["outprefix"]), - metric = "Result/minimap2/%s.rmdp.txt" %(config["outprefix"]), - fragbed = "Result/QC/%s_frag.bed" %(config["outprefix"]), - tmp = temp(directory("Result/Tmp")) - params: - sam = "Result/minimap2/%s.sortedByPos.CRadded.rmdp.sample.sam" %(config["outprefix"]) - threads: - int(config["cores"])-2 - benchmark: - "Result/Benchmark/%s_Rmdp.benchmark" %(config["outprefix"]) - shell: - "picard MarkDuplicates INPUT={input.bam} OUTPUT={output.bam} METRICS_FILE={output.metric} TMP_DIR={output.tmp};" - "samtools view -@ {threads} -s 0.01 -o {params.sam} {input.bam};" - "awk '{{if ($9>0) print $9}}' {params.sam} > {output.fragbed};" - - rule scatac_bamaddCB: - input: - bam = "Result/minimap2/%s.sortedByPos.CRadded.rmdp.bam" %(config["outprefix"]), - bc_correct = "Result/minimap2/barcode_correct_uniq.txt" - output: - bam = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.bam" %(config["outprefix"]), - params: - outdir = "Result/minimap2", - outprefix = "%s.sortedByPos.rmdp.CBadded" %(config["outprefix"]) - threads: - int(config["cores"])-2 - benchmark: - "Result/Benchmark/%s_BamAddCB.benchmark" %(config["outprefix"]) - shell: - "python " + SCRIPT_PATH + "/scATAC_BamAddTag.py -B {input.bam} -T {input.bc_correct} -C CR " - "-O {params.outdir} -P {params.outprefix};" - - rule scatac_allpeakcall: - input: - bam = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.bam" %(config["outprefix"]) - output: - peak = "Result/Analysis/%s_all_peaks.narrowPeak" %(config["outprefix"]), - bdg = "Result/Analysis/%s_all_treat_pileup.bdg" %(config["outprefix"]), - params: - name = "%s_all" %(config["outprefix"]) - log: - "Result/Log/%s_macs2_allpeak.log" %(config["outprefix"]) - benchmark: - "Result/Benchmark/%s_AllPeakCall.benchmark" %(config["outprefix"]) - shell: - "macs2 callpeak -g hs --outdir Result/Analysis -n {params.name} -B -q 0.05 --nomodel --extsize=50 --SPMR -t {input.bam}" - - rule scatac_qcstat_bulk: - input: - bam = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.bam" %(config["outprefix"]), - promoter = "%s/annotations/%s_promoter.bed" %(SCRIPT_PATH, config["species"]), - peak = "Result/Analysis/%s_all_peaks.narrowPeak" %(config["outprefix"]), - output: - bulk_stat = "Result/QC/flagstat.txt", - bam = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.unique.bam" %(config["outprefix"]), - bed = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.unique.bed" %(config["outprefix"]) - threads: - config["cores"] - benchmark: - "Result/Benchmark/%s_BulkQCStat.benchmark" %(config["outprefix"]) - shell: - "samtools flagstat --threads {threads} {input.bam} > {output.bulk_stat};" - "samtools view -F 2316 -f 0x2 -q 30 -b -o {output.bam} {input.bam};" - "samtools view {output.bam} -c >> {output.bulk_stat};" - "bedtools bamtobed -i {output.bam} > {output.bed};" - "grep 'chrM' {output.bed} -c >> {output.bulk_stat} || true;" - "grep -v 'chrM' {output.bed} | bedtools intersect -wa -a - -b {input.promoter} -u | wc -l >> {output.bulk_stat} || true;" - "grep -v 'chrM' {output.bed} | bedtools intersect -wa -a - -b {input.peak} -u | wc -l >> {output.bulk_stat} || true ;" - - if config["shortpeaks"]: - rule scatac_shortfragment: - input: - bam = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.bam" %(config["outprefix"]) - output: - shortbam = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.150bp.bam" %(config["outprefix"]) - threads: - config["cores"] - benchmark: - "Result/Benchmark/%s_ShortFrag.benchmark" %(config["outprefix"]) - shell: - "samtools view -@ {threads} -h {input.bam} | " - "awk -F'\\t' 'function abs(x){{return ((x < 0.0) ? -x : x)}} {{if (abs($9)<=150) print}}' | " - "samtools view -@ {threads} -b -o {output.shortbam}" - - rule scatac_shortpeakcall: - input: - shortbam = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.150bp.bam" %(config["outprefix"]) - output: - bed = "Result/Analysis/%s_150bp_peaks.narrowPeak" %(config["outprefix"]) - params: - name = "%s_150bp" %(config["outprefix"]) - log: - "Result/Log/%s_macs2_shortpeak.log" %(config["outprefix"]) - benchmark: - "Result/Benchmark/%s_ShortPeakCall.benchmark" %(config["outprefix"]) - shell: - "macs2 callpeak -g hs --outdir Result/Analysis -n {params.name} -B -q 0.05 --nomodel --extsize=50 --SPMR -t {input.shortbam}" - if config["custompeaks"] and config["shortpeaks"]: rule scatac_mergepeak: input: @@ -688,7 +864,7 @@ if config["platform"] == "10x-genomics" or config["platform"] == "sci-ATAC-seq": "| sort -k1,1 -k2,2n | cut -f 1-4 > {params.catpeaksort};" "mergeBed -i {params.catpeaksort} | grep -v '_' | grep -v 'chrEBV' > {output.finalpeak};" "rm {params.catpeaksort}" - + rule scatac_countpeak: input: finalpeak = "Result/Analysis/%s_final_peaks.bed" %(config["outprefix"]), @@ -914,31 +1090,55 @@ if config["platform"] == "10x-genomics" or config["platform"] == "sci-ATAC-seq": # # shell: # # "python " + SCRIPT_PATH + "/scATAC_microfluidic_count.py {input.finalpeak} {input.validbarcode} {params.bamdir} {output.count} {threads} {params.species}" -rule scatac_qcplot: - input: - fragbed = "Result/QC/%s_frag.bed" %(config["outprefix"]), - single_stat = "Result/QC/singlecell.txt", - bulk_stat = "Result/QC/flagstat.txt" - output: - readdistr = "Result/QC/" + config["outprefix"] + "_scATAC_read_distr.png", - qcfrag = "Result/QC/" + config["outprefix"] + "_scATAC_fragment_size.png", - qcfrip = "Result/QC/" + config["outprefix"] + "_scATAC_cell_filtering.png", - validbarcode = "Result/QC/" + config["outprefix"] + "_scATAC_validcells.txt", - params: - outdir = "Result/QC", - outpre = config["outprefix"], - fragbed = "%s_frag.bed" %(config["outprefix"]), - single_stat = "singlecell.txt", - bulk_stat = "flagstat.txt", - count = config["cutoff"]["count"], - frip = config["cutoff"]["frip"] - threads: - config["cores"] - benchmark: - "Result/Benchmark/%s_QCPlot.benchmark" %(config["outprefix"]) - shell: - "Rscript " + RSCRIPT_PATH + "/scATACseq_qc.R --bulkstat {params.bulk_stat} --fragment {params.fragbed} --singlestat {params.single_stat} " - "--countcutoff {params.count} --fripcutoff {params.frip} --prefix {params.outpre} --outdir {params.outdir}" +if config["format"] != "fragments": + rule scatac_qcplot: + input: + fragbed = "Result/QC/%s_frag.bed" %(config["outprefix"]), + single_stat = "Result/QC/singlecell.txt", + bulk_stat = "Result/QC/flagstat.txt" + output: + readdistr = "Result/QC/" + config["outprefix"] + "_scATAC_read_distr.png", + qcfrag = "Result/QC/" + config["outprefix"] + "_scATAC_fragment_size.png", + qcfrip = "Result/QC/" + config["outprefix"] + "_scATAC_cell_filtering.png", + validbarcode = "Result/QC/" + config["outprefix"] + "_scATAC_validcells.txt", + params: + outdir = "Result/QC", + outpre = config["outprefix"], + fragbed = "%s_frag.bed" %(config["outprefix"]), + single_stat = "singlecell.txt", + bulk_stat = "flagstat.txt", + count = config["cutoff"]["count"], + frip = config["cutoff"]["frip"] + threads: + config["cores"] + benchmark: + "Result/Benchmark/%s_QCPlot.benchmark" %(config["outprefix"]) + shell: + "Rscript " + RSCRIPT_PATH + "/scATACseq_qc.R --bulkstat {params.bulk_stat} --fragment {params.fragbed} --singlestat {params.single_stat} " + "--countcutoff {params.count} --fripcutoff {params.frip} --prefix {params.outpre} --outdir {params.outdir}" +else: + rule scatac_qcplot: + input: + fragbed = "Result/QC/%s_frag.bed" %(config["outprefix"]), + single_stat = "Result/QC/singlecell.txt", + output: + qcfrag = "Result/QC/" + config["outprefix"] + "_scATAC_fragment_size.png", + qcfrip = "Result/QC/" + config["outprefix"] + "_scATAC_cell_filtering.png", + validbarcode = "Result/QC/" + config["outprefix"] + "_scATAC_validcells.txt", + params: + outdir = "Result/QC", + outpre = config["outprefix"], + fragbed = "%s_frag.bed" %(config["outprefix"]), + single_stat = "singlecell.txt", + count = config["cutoff"]["count"], + frip = config["cutoff"]["frip"] + threads: + config["cores"] + benchmark: + "Result/Benchmark/%s_QCPlot.benchmark" %(config["outprefix"]) + shell: + "Rscript " + RSCRIPT_PATH + "/scATACseq_qc.R --fragment {params.fragbed} --singlestat {params.single_stat} " + "--countcutoff {params.count} --fripcutoff {params.frip} --prefix {params.outpre} --outdir {params.outdir}" rule scatac_qcfilter: input: @@ -1007,84 +1207,148 @@ rule scatac_analysis: fraggz = "../minimap2/fragments_corrected_count.tsv.gz", giggleannotation = config["giggleannotation"], species = config["species"], - signature = config["signature"] + signature = config["signature"], + method = config["method"], + annotation = config["annotation"], threads: config["cores"] benchmark: "Result/Benchmark/%s_Analysis.benchmark" %(config["outprefix"]) shell: "Rscript " + RSCRIPT_PATH + "/scATACseq_pipe.R --peakcount {params.count} --rpmatrix {params.genescore} " - "--species {params.species} --prefix {params.outpre} --signature {params.signature} " + "--species {params.species} --prefix {params.outpre} --annotation {params.annotation} --method {params.method} --signature {params.signature} " "--gigglelib {params.giggleannotation} --fragment {params.fraggz} --outdir {params.outdir} --thread {threads}" -rule scatac_bamindex: - input: - bam = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.bam" %(config["outprefix"]), - output: - bai = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.bam.bai" %(config["outprefix"]), - threads: - config["cores"] - benchmark: - "Result/Benchmark/%s_BamIndex.benchmark" %(config["outprefix"]) - shell: - "samtools index -@ {threads} {input.bam}" +if config["format"] != "fragments": + rule scatac_bamindex: + input: + bam = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.bam" %(config["outprefix"]), + output: + bai = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.bam.bai" %(config["outprefix"]), + threads: + config["cores"] + benchmark: + "Result/Benchmark/%s_BamIndex.benchmark" %(config["outprefix"]) + shell: + "samtools index -@ {threads} {input.bam}" -checkpoint scatac_bamcluster: - input: - bam = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.bam" %(config["outprefix"]), - bai = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.bam.bai" %(config["outprefix"]), - cellcluster = "Result/Analysis/%s_cell_cluster.txt" %(config["outprefix"]), - output: - directory("Result/Analysis/Cluster/") - threads: - config["cores"] - benchmark: - "Result/Benchmark/%s_BamCluster.benchmark" %(config["outprefix"]) - shell: - "sinto filterbarcodes -b {input.bam} -c {input.cellcluster} -p {threads} --barcodetag CB;" - "mv Cluster_*.bam {output}" + checkpoint scatac_bamcluster: + input: + bam = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.bam" %(config["outprefix"]), + bai = "Result/minimap2/%s.sortedByPos.rmdp.CBadded.bam.bai" %(config["outprefix"]), + cellcluster = "Result/Analysis/%s_cell_cluster.txt" %(config["outprefix"]), + output: + directory("Result/Analysis/Cluster/") + threads: + config["cores"] + benchmark: + "Result/Benchmark/%s_BamCluster.benchmark" %(config["outprefix"]) + shell: + "sinto filterbarcodes -b {input.bam} -c {input.cellcluster} -p {threads} --barcodetag CB;" + "mv Cluster_*.bam {output}" -rule scatac_clusterpeakcall: - input: - bam = "Result/Analysis/Cluster/{cluster}.bam" - output: - peak = "Result/Analysis/Cluster/{cluster}_peaks.narrowPeak", - bdg = "Result/Analysis/Cluster/{cluster}_treat_pileup.bdg", - params: - name = "{cluster}", - outdir = "Result/Analysis/Cluster" - log: - "Result/Log/{cluster}_macs2_allpeak.log" - benchmark: - "Result/Benchmark/{cluster}_AllPeakCall.benchmark" - shell: - "macs2 callpeak -g hs --outdir {params.outdir} -n {params.name} -B -q 0.05 --nomodel --extsize=50 --SPMR -t {input.bam}" + rule scatac_clusterpeakcall: + input: + bam = "Result/Analysis/Cluster/{cluster}.bam" + output: + peak = "Result/Analysis/Cluster/{cluster}_peaks.narrowPeak", + bdg = "Result/Analysis/Cluster/{cluster}_treat_pileup.bdg", + params: + name = "{cluster}", + outdir = "Result/Analysis/Cluster", + genome = macs2_genome + log: + "Result/Log/{cluster}_macs2_allpeak.log" + benchmark: + "Result/Benchmark/{cluster}_AllPeakCall.benchmark" + shell: + "macs2 callpeak -f BAMPE -g {params.genome} --outdir {params.outdir} -n {params.name} -B -q 0.05 --nomodel --extsize=50 --SPMR --keep-dup all -t {input.bam}" -rule scatac_report: - input: - # bulkqc = "Result/QC/" + config["outprefix"] + "_bam_stat.txt", - readdistr = "Result/QC/%s_scATAC_read_distr.png" %(config["outprefix"]), - qcfrag = "Result/QC/%s_scATAC_fragment_size.png" %(config["outprefix"]), - qcfrip = "Result/QC/%s_scATAC_cell_filtering.png" %(config["outprefix"]), - count = "Result/QC/%s_filtered_peak_count.h5" %(config["outprefix"]), - clusterplot = "Result/Analysis/%s_cluster.png" %(config["outprefix"]), - annotateplot = "Result/Analysis/%s_annotated.png" %(config["outprefix"]), - genescore = "Result/Analysis/%s_gene_score.h5" %(config["outprefix"]), - tflist = "Result/Analysis/%s.PredictedTFTop10.txt" %(config["outprefix"]), - # acannotateplot = "Result/Analysis/%s_CistromeTop_annotated.png" %(config["outprefix"]), - # ms4a1trackplot = "Result/Analysis/%s_MS4A1_genetrack.png" %(config["outprefix"]), - # cd3dtrackplot = "Result/Analysis/%s_CD3D_genetrack.png" %(config["outprefix"]), - output: - summaryreport = "Result/%s_scATAC_report.html" %(config["outprefix"]), - params: - outpre = config["outprefix"], - fastqdir = config["fastqdir"], - species = config["species"], - platform = config["platform"], - outdir = "Result" - benchmark: - "Result/Benchmark/%s_Report.benchmark" %(config["outprefix"]) - shell: - # "cp {input.readdistr} {input.qcmap} {input.qcfrag} {input.qcfrip} {input.clusterplot} {input.annotateplot} {output.outdir};" - "python " + SCRIPT_PATH + "/scATAC_HTMLReport.py --directory {params.outdir} --outprefix {params.outpre} " - "--fastq-dir {params.fastqdir} --species {params.species} --platform {params.platform}" + rule scatac_report: + input: + # bulkqc = "Result/QC/" + config["outprefix"] + "_bam_stat.txt", + readdistr = "Result/QC/%s_scATAC_read_distr.png" %(config["outprefix"]), + qcfrag = "Result/QC/%s_scATAC_fragment_size.png" %(config["outprefix"]), + qcfrip = "Result/QC/%s_scATAC_cell_filtering.png" %(config["outprefix"]), + count = "Result/QC/%s_filtered_peak_count.h5" %(config["outprefix"]), + clusterplot = "Result/Analysis/%s_cluster.png" %(config["outprefix"]), + annotateplot = "Result/Analysis/%s_annotated.png" %(config["outprefix"]), + genescore = "Result/Analysis/%s_gene_score.h5" %(config["outprefix"]), + tflist = "Result/Analysis/%s.PredictedTFTop10.txt" %(config["outprefix"]), + # acannotateplot = "Result/Analysis/%s_CistromeTop_annotated.png" %(config["outprefix"]), + # ms4a1trackplot = "Result/Analysis/%s_MS4A1_genetrack.png" %(config["outprefix"]), + # cd3dtrackplot = "Result/Analysis/%s_CD3D_genetrack.png" %(config["outprefix"]), + output: + summaryreport = "Result/%s_scATAC_report.html" %(config["outprefix"]), + params: + outpre = config["outprefix"], + inputpath = config["fastqdir"] if config["format"] == "fastq" else config["bam"], + species = config["species"], + platform = config["platform"], + inputformat = config["format"], + outdir = "Result" + benchmark: + "Result/Benchmark/%s_Report.benchmark" %(config["outprefix"]) + shell: + # "cp {input.readdistr} {input.qcmap} {input.qcfrag} {input.qcfrip} {input.clusterplot} {input.annotateplot} {output.outdir};" + "python " + SCRIPT_PATH + "/scATAC_HTMLReport.py --directory {params.outdir} --outprefix {params.outpre} " + "--input-path {params.inputpath} --species {params.species} --platform {params.platform} --input-format {params.inputformat}" +else: + checkpoint scatac_fragcluster: + input: + frag = "Result/minimap2/fragments_corrected_count.tsv", + cellcluster = "Result/Analysis/%s_cell_cluster.txt" %(config["outprefix"]), + output: + directory("Result/Analysis/Cluster/") + threads: + config["cores"] + benchmark: + "Result/Benchmark/%s_FragCluster.benchmark" %(config["outprefix"]) + shell: + "python " + SCRIPT_PATH + "/scATAC_FragmentSplit.py --frag {input.frag} --cluster {input.cellcluster} --outdir {output}" + + rule scatac_clusterpeakcall: + input: + frag = "Result/Analysis/Cluster/{cluster}.bed" + output: + peak = "Result/Analysis/Cluster/{cluster}_peaks.narrowPeak", + bdg = "Result/Analysis/Cluster/{cluster}_treat_pileup.bdg", + params: + name = "{cluster}", + outdir = "Result/Analysis/Cluster", + genome = macs2_genome + log: + "Result/Log/{cluster}_macs2_allpeak.log" + benchmark: + "Result/Benchmark/{cluster}_AllPeakCall.benchmark" + shell: + "macs2 callpeak -f BEDPE -g {params.genome} --outdir {params.outdir} -n {params.name} -B -q 0.05 --nomodel --extsize=50 --SPMR --keep-dup all -t {input.frag}" + + rule scatac_report: + input: + # bulkqc = "Result/QC/" + config["outprefix"] + "_bam_stat.txt", + qcfrag = "Result/QC/%s_scATAC_fragment_size.png" %(config["outprefix"]), + qcfrip = "Result/QC/%s_scATAC_cell_filtering.png" %(config["outprefix"]), + count = "Result/QC/%s_filtered_peak_count.h5" %(config["outprefix"]), + clusterplot = "Result/Analysis/%s_cluster.png" %(config["outprefix"]), + annotateplot = "Result/Analysis/%s_annotated.png" %(config["outprefix"]), + genescore = "Result/Analysis/%s_gene_score.h5" %(config["outprefix"]), + tflist = "Result/Analysis/%s.PredictedTFTop10.txt" %(config["outprefix"]), + # acannotateplot = "Result/Analysis/%s_CistromeTop_annotated.png" %(config["outprefix"]), + # ms4a1trackplot = "Result/Analysis/%s_MS4A1_genetrack.png" %(config["outprefix"]), + # cd3dtrackplot = "Result/Analysis/%s_CD3D_genetrack.png" %(config["outprefix"]), + output: + summaryreport = "Result/%s_scATAC_report.html" %(config["outprefix"]), + params: + outpre = config["outprefix"], + inputpath = config["frag"], + species = config["species"], + platform = config["platform"], + inputformat = config["format"], + outdir = "Result", + benchmark: + "Result/Benchmark/%s_Report.benchmark" %(config["outprefix"]) + shell: + # "cp {input.readdistr} {input.qcmap} {input.qcfrag} {input.qcfrip} {input.clusterplot} {input.annotateplot} {output.outdir};" + "python " + SCRIPT_PATH + "/scATAC_HTMLReport.py --directory {params.outdir} --outprefix {params.outpre} " + "--input-path {params.inputpath} --species {params.species} --platform {params.platform} --input-format {params.inputformat}" diff --git a/MAESTRO/Snakemake/scATAC/config_template.yaml b/MAESTRO/Snakemake/scATAC/config_template.yaml index 8ff7741..81976ee 100644 --- a/MAESTRO/Snakemake/scATAC/config_template.yaml +++ b/MAESTRO/Snakemake/scATAC/config_template.yaml @@ -1,3 +1,15 @@ +# Platform of single cell ATAC-seq [10x-genomics, sci-ATAC-seq, microfluidic] +platform: {{ platform }} + +# The format of input files [fastq, bam, fragments] +# If the platform is 10x-genomics, users can start with sequencing fastq files, bam files with CB tag or fragments.tsv.gz file generated by CellRanger ATAC. +# If the platform is sci-ATAC-seq, users can start with sequencing fastq files, bam files with CB tag. +# If the platform is microfluidic, users can only start with sequencing fastq files. +# If the format is 'fastq', fastqdir and fastqprefix need to be set. +# If the format is 'bam', bam needs to be set. +# If the format is 'fragments', frag needs to be set. +format: {{ format }} + # Directory where fastq files are stored fastqdir: {{ fastqdir }} @@ -11,12 +23,16 @@ fastqdir: {{ fastqdir }} # In this way, read1, barcode and read2 are associated with R1, R2, R3, respectively. fastqprefix: {{ fastqprefix }} +# The bam file with cell barcode (CB) tag (required when the format is set as 'bam'). +bam: {{ bam }} + +# The fragment file generated by CellRanger ATAC (required when the format is set as 'fragments'). +# For example, fragments.tsv.gz. +frag: {{ frag }} + # Species to use [GRCh38, GRCm38] (GRCh38 for human and GRCm38 for mouse) species: {{ species }} -# Platform of single cell ATAC-seq [10x-genomics, sci-ATAC-seq, microfluidic] -platform: {{ platform }} - # The prefix of output files. DEFAULT: MAESTRO. outprefix: {{ outprefix }} @@ -30,6 +46,22 @@ whitelist: {{ whitelist }} # The core number cores: {{ cores }} +# Whether or not to perform ccell-type annotation. +# By default (not set), MAESTRO will skip the step of cell-type annotation. +# If set, please specify the method of cell-type annotation through method. +annotation: {{ annotation }} + +# Method to annotate cell types [RP-based, peak-based, both]. +# MAESTRO provides two strategies to annotate cell types for scATAC-seq data. +# Users can choose from 'RP-based' and 'peak-based', or choose to run both of them. +# One is based on gene regulatory potential predicted by RP model. Another is based on the bulk chromatin accessibility data from Cistrome database. +# If 'RP-based' is set, MAESTRO performs the cell-type annotation using the gene regulatory potential to represent gene expression, +# and the logFC of gene regulatory potential between one cluster and all the other cells is used to calculate the gene signature scores. +# If 'peak-based' is set, MAESTRO utilizes GIGGLE to evaluate the enrichment of bulk chromatin accessibility peaks on cluster-specific peaks from scATAC-seq data, +# and then transfers the Cistrome cluster identity from the most enriched bulk chromatin accessibility data as the cell-type annotation for the scATAC-seq cluster. +# See the MAESTRO paper for more details. DEFAULT: RP-based. +method: {{ method }} + # Cell signature file used to annotate cell types. MAESTRO provides several sets of built-in cell signatures. # Users can choose from ['human.immune.CIBERSORT', 'mouce.brain.ALLEN', 'mouse.all.facs.TabulaMuris', 'mouse.all.droplet.TabulaMuris']. # Custom cell signatures are also supported. In this situation, users need to provide the file location of cell signatures, diff --git a/MAESTRO/Snakemake/scRNA/Snakefile b/MAESTRO/Snakemake/scRNA/Snakefile index 6de4253..c2fba90 100644 --- a/MAESTRO/Snakemake/scRNA/Snakefile +++ b/MAESTRO/Snakemake/scRNA/Snakefile @@ -434,8 +434,7 @@ rule scrna_analysis: species = config["species"], outpre = config["outprefix"], outdir = "Result/Analysis", - method = config["method"], - rabitlib = config["rabitlib"], + method = "LISA", lisamode = config["lisamode"], lisaenv = config["lisaenv"], condadir = config["condadir"], @@ -446,7 +445,7 @@ rule scrna_analysis: config["cores"] shell: "Rscript " + RSCRIPT_PATH + "/scRNAseq_pipe.R --expression {params.expression} --species {params.species} " - "--prefix {params.outpre} --method {params.method} --signature {params.signature} --rabitlib {params.rabitlib} " + "--prefix {params.outpre} --method {params.method} --signature {params.signature} " "--lisamode {params.lisamode} --condadir {params.condadir} --lisaenv {params.lisaenv} --outdir {params.outdir} --thread {threads}" if config["rseqc"]: diff --git a/MAESTRO/Snakemake/scRNA/config_template.yaml b/MAESTRO/Snakemake/scRNA/config_template.yaml index c2d22f7..f33db40 100644 --- a/MAESTRO/Snakemake/scRNA/config_template.yaml +++ b/MAESTRO/Snakemake/scRNA/config_template.yaml @@ -28,13 +28,6 @@ cores: {{ cores }} # and the signature file is tab-seperated without header. The first column is cell type, and the second column is signature gene. signature: {{ signature }} -# The method to predict driver regulators [RABIT, LISA] -method: {{ method }} - -# Path of the rabit annotation file required for regulator identification (only required if method is set to RABIT). -# Please download the annotation file from http://cistrome.org/~chenfei/MAESTRO/rabit.tar.gz and decompress it. -rabitlib: {{ rabitlib }} - # Mode to Run LISA, 'local' or 'web'. If the mode is set as 'local', # please install LISA (https://github.com/qinqian/lisa) and download pre-computed datasets following the instructions. # The 'web' mode is to run online version of LISA. In consideration of the connection issue and size of datasets, diff --git a/MAESTRO/html/scATAC_nomappability_template.html b/MAESTRO/html/scATAC_nomappability_template.html new file mode 100644 index 0000000..8ea5ba7 --- /dev/null +++ b/MAESTRO/html/scATAC_nomappability_template.html @@ -0,0 +1,302 @@ + + + + + + + + + + + + MAESTRO + + + + +
+
+
+ +
+
+
+
+
+

Sample Information

+
+ + + + + + + + + + + + + + + + + + + +
Sample ID%(outprefix)s
%(inputformat)s%(inputpath)s
Species%(species)s
Platform%(platform)s
+
+
+
+

Quality Control

+
+
+
+

Bulk level

+
+
+

+ Fragment size distribution + + + + +

+

+ Fragment size distribution for 1%% reads sampled from scATAC-seq. Only fragments with insert size less than 1,000 are considered. Y-axis represents the number of fragments. The fragment size distribution should show a periodicity of approximately 200bp due to nucleosome protection of the chromatin to transposase cutting. + +

+ +
+
+
+
+
+
+

Single-cell level

+
+
+

+ Cell filtering based on valid fragments and FRIP + + + + +

+

+ Cell filtering plot of scATAC-seq. The x-axis represents the number of unique reads present in each cell, and the y-axis represents the fraction of reads in promoter regions (defined as 2kb up/downstream of TSS). +

+ +
+
+
+
+
+
+
+

Cell Clustering

+
+
+
+
+
+

+ Cell clustering based on peaks + + + + +

+

+ UMAP visualization of the clustering result. Colors represent different clusters with the cluster ID labeled. +

+ +
+
+
+
+
+
+
+

Annotation

+
+
+
+

Celltype annotation

+
+
+

+ Celltype annotation based on RP and DE genes + + + + +

+

+ UMAP visualization of annotated clusters. Colors represent different cell types. The cell type information for each cluster is annotated using the regulatory potential of marker genes. +

+ +
+
+

+ Celltype annotation based on bulk chromatin accessibility data + + + + +

+

+ UMAP visualization of cells. Colors represent different biological sources. The biological source information for each cluster is inferred based on cluster-specific peaks. +

+ +
+
+
+
+
+
+

Regulator annotation

+
+
+

+ Cluster-specific regulator identified by GIGGLE + + + + +

+

+ Driver transcription regulators identified based on cluster-specific peaks. By default, the regulators are ranked by the enrichment score in each cluster. +

+ + + + + + + + + + +%(regtable)s + +
ClusterCelltype AnnotationTranscription Factorlog10(Giggle score)
+
+
+
+
+
+
+
+

Gene Track

+
+
+
+
+
+

+ Gene track plot for each cluster + + + + +

+

+ Gene track plot for each cluster. Here, CD3D nad MS4A1 are displayed as markers of T cells and B cells, respectively. +

+ + +
+
+
+
+
+
+
+
+
+
+
+ + + + + + + + + + + \ No newline at end of file diff --git a/MAESTRO/html/scATAC_template.html b/MAESTRO/html/scATAC_template.html index 9e35bb5..94af9a5 100644 --- a/MAESTRO/html/scATAC_template.html +++ b/MAESTRO/html/scATAC_template.html @@ -81,8 +81,8 @@

Sample Information

%(outprefix)s - FASTQ Path - %(fastqdir)s + %(inputformat)s + %(inputpath)s Species @@ -203,7 +203,7 @@

Annotation

Celltype annotation

-
+

Celltype annotation based on RP and DE genes @@ -216,15 +216,15 @@

-
+

Celltype annotation based on bulk chromatin accessibility data - +

-

+

UMAP visualization of cells. Colors represent different biological sources. The biological source information for each cluster is inferred based on cluster-specific peaks.

@@ -329,4 +329,4 @@

-ƒ \ No newline at end of file + \ No newline at end of file diff --git a/MAESTRO/scATAC_10x_BarcodeCorrect.py b/MAESTRO/scATAC_10x_BarcodeCorrect.py index 7611b9a..e330cac 100644 --- a/MAESTRO/scATAC_10x_BarcodeCorrect.py +++ b/MAESTRO/scATAC_10x_BarcodeCorrect.py @@ -3,7 +3,7 @@ # @E-mail: Dongqingsun96@gmail.com # @Date: 2020-01-16 19:44:48 # @Last Modified by: Dongqing Sun -# @Last Modified time: 2020-06-07 15:11:46 +# @Last Modified time: 2020-07-21 14:50:25 import argparse @@ -11,15 +11,17 @@ import time, os from collections import defaultdict +from MAESTRO.scATAC_utility import * + def CommandLineParser(): parser=argparse.ArgumentParser(description = "This is a description of input args") - parser.add_argument("-b","--barcodefq",dest = "barcode_fastq",default = "",help = "The barcode fastq file (for 10x genomics, R2)") - parser.add_argument("-B","--barcodelib",dest = "barcode_library",default = "",help = "The barcode library, in which the first column is the valid barcode combinations included in experiment.") + parser.add_argument("-b","--barcodefq",dest = "barcode_fastq",default = "",help = "The barcode fastq file (for 10x genomics, R2). Support gzipped fastq file.") + parser.add_argument("-B","--barcodelib",dest = "barcode_library",default = "",help = "The barcode library, in which the first column is the valid barcode combinations included in experiment. Support gzipped file.") parser.add_argument("-O", "--outdir", dest = "outdir",default = "",help = "The output directory.") return parser.parse_args() -def GenerateMimatch(seq): +def GenerateMismatch(seq): base_list = ["A", "T", "C", "G"] seq_mut_list = [] for i in range(len(seq)): @@ -28,19 +30,22 @@ def GenerateMimatch(seq): seq_mut_list = seq_mut_list + seq_mut return(seq_mut_list) -def GenerateMimatchDict(whitelist): +def GenerateMismatchDict(whitelist): barcode_dict = defaultdict(set) - barcode_list = open(whitelist, "r").readlines() + + filein = universal_open( whitelist, "rt" ) + barcode_list = filein.readlines() + filein.close() + barcode_list = [bc.strip() for bc in barcode_list] - with open(whitelist, "r") as filein: - for line in filein: - barcode = line.strip().split("\t")[0] - barcode_mut_list = GenerateMimatch(barcode) - for seq in barcode_mut_list: - if len(barcode_dict[seq]) == 0: - barcode_dict[seq].add(barcode) - else: - continue + for line in barcode_list: + barcode = line.strip().split("\t")[0] + barcode_mut_list = GenerateMismatch(barcode) + for seq in barcode_mut_list: + if len(barcode_dict[seq]) == 0: + barcode_dict[seq].add(barcode) + else: + continue for bc in barcode_list: barcode_dict[bc] = {bc} return(barcode_dict, barcode_list) @@ -72,7 +77,7 @@ def main(): # Expand the barcode library start_time = time.time() print("Start to read barcode library file",time.strftime("%a %b %d %H:%M:%S %Y", time.localtime())) - barcode_lib_dict, barcode_lib_list = GenerateMimatchDict(barcode_lib) + barcode_lib_dict, barcode_lib_list = GenerateMismatchDict(barcode_lib) end_time = time.time() print("End", end_time-start_time) diff --git a/MAESTRO/scATAC_10x_PeakCount.py b/MAESTRO/scATAC_10x_PeakCount.py index 51ceb6a..f0bcb09 100644 --- a/MAESTRO/scATAC_10x_PeakCount.py +++ b/MAESTRO/scATAC_10x_PeakCount.py @@ -29,25 +29,25 @@ def peakcount_parser(subparsers): workflow = subparsers.add_parser("scatac-peakcount", help = "Generate peak-cell binary count matrix. ") group_input = workflow.add_argument_group("Input arguments") - group_input.add_argument("--peak", dest = "peak", default = "", type = str, - help = "Location of peak file. " + group_input.add_argument("--peak", dest = "peak", type = str, required = True, + help = "Location of peak file. Support gzipped file." "The peak file is BED formatted with tab seperated. " "The first column is chromsome, the second is chromStart, and the third is chromEnd.") - group_input.add_argument("--fragment", dest = "fragment", default = "", type = str, - help = "Location of fragments.tsv file. " + group_input.add_argument("--fragment", dest = "fragment", type = str, required = True, + help = "Location of fragments.tsv file. Support gzipped file." "The fragments.tsv contains one line per unique fragment, with tab-separated fields as described below. " "Each row has 5 columns, representing chrom, chromStart, chromEnd, barcode and duplicateCount, respectively. " "In MAESTOR output, the file should be fragments_corrected_count.tsv. In Cell Ranger ATAC output, the file should be fragments.tsv. ") - group_input.add_argument("--barcode", dest = "barcode", default = "", type = str, - help = "Location of valid cell barcode file (optional). " + group_input.add_argument("--barcode", dest = "barcode", default = "", type = str, required = False, + help = "Location of valid cell barcode file (optional). Support gzipped file." "Each line of the file represents a valid barcode. " "If not set, the barcodes with enough read count (> --count-cutoff) " "in the fragment file will be used to generate peak-cell binary count matrix.") - group_input.add_argument("--count-cutoff", dest = "count_cutoff", default = 1000, type = int, + group_input.add_argument("--count-cutoff", dest = "count_cutoff", default = 1000, type = int, required = False, help = "Cutoff for the number of count in each cell. DEFAULT: 1000.") - group_input.add_argument("--species", dest = "species", default = "GRCh38", + group_input.add_argument("--species", dest = "species", default = "GRCh38", required = False, choices = ["GRCh38", "GRCm38"], type = str, - help = "Species (GRCh38 for human and GRCm38 for mouse). DEFAULT: GRCh38.") + help = "Genome assembly of either 'GRCh38' for human or 'GRCm38' for mouse. DEFAULT: GRCh38.") group_output = workflow.add_argument_group("Output and running arguments") @@ -67,23 +67,36 @@ def filter_fragment_file(barcode_file, frag_file, count_cutoff = 1000): barcode_count = defaultdict(lambda: 0) if barcode_file == "": - for line in open(frag_file, 'r'): + # no barcode file + # read fragment file to get barcode + fhd = universal_open( frag_file, "rt" ) + + for line in fhd: line = line.strip().split('\t') barcode_out[line[3]].append(line[0]+'\t'+line[1]+'\t'+line[2]) barcode_count[line[3]] += int(line[4]) + fhd.close() + for k in barcode_count: if barcode_count[k] >= count_cutoff: barcode_list.append(k) else: - for line in open(barcode_file,'r'): + # read barcode file + fhd = universal_open( barcode_file, "rt" ) + + for line in fhd: barcode_list.append(line.strip()) barcode_out[line.strip()] = [] + fhd.close() - for line in open(frag_file, 'r'): + fhd = universal_open( frag_file, "rt" ) + + for line in fhd: line = line.strip().split('\t') barcode_out[line[3]].append(line[0]+'\t'+line[1]+'\t'+line[2]) + fhd.close() for k in barcode_list: outf = open(tmp+"/"+k,'w') @@ -125,9 +138,12 @@ def merge_binary_file(peak_file, count_list, count_file, cores, genome = 'GRCh38 """Merge the intersectBed result into binary count table.""" peak_list = [] - for line in open(peak_file, 'r'): + + fhd = universal_open( peak_file, "rt" ) + for line in fhd: line = line.strip().split('\t') peak_list.append(line[0]+'_'+line[1]+'_'+line[2]) + fhd.close() peak_list = sorted(peak_list) count_list_split = [] diff --git a/MAESTRO/scATAC_FragmentReshape.py b/MAESTRO/scATAC_FragmentReshape.py new file mode 100644 index 0000000..36508a6 --- /dev/null +++ b/MAESTRO/scATAC_FragmentReshape.py @@ -0,0 +1,48 @@ +# -*- coding: utf-8 -*- +# @Author: Dongqing Sun +# @E-mail: Dongqingsun96@gmail.com +# @Date: 2020-07-17 07:37:07 +# @Last Modified by: Dongqing Sun +# @Last Modified time: 2020-07-17 20:32:22 + + +import argparse +import pysam +import time, os +from collections import defaultdict + +chr_list = ['chr'+str(i) for i in list(range(1,23))] +chr_list = chr_list + ['chrX', 'chrY'] + +def CommandLineParser(): + parser=argparse.ArgumentParser(description = "Reshape the 10X fragment file to long fragment bed file.") + parser.add_argument("-F","--frag", dest = "fragfile", default = "",help = "The fragment file generated by 10X CellRanger ATAC, " + "with barcodes and counts.") + parser.add_argument("-O", "--outdir", dest = "outdir", default = "", help = "The output directory.") + + return parser.parse_args() + + +parser = CommandLineParser() +fragfile = parser.fragfile +outdir = parser.outdir +outfile = os.path.join(outdir, "fragments_BEDPE.bed") + +try: + os.makedirs(outdir) +except OSError: + pass + +start_time = time.time() +print("Start to reshape 10X fragment file.",time.strftime("%a %b %d %H:%M:%S %Y", time.localtime())) +with open(fragfile, "r") as frag_in, open(outfile, "w") as frag_out: + for line in frag_in.readlines(): + items = line.strip().split("\t") + if int(items[4]) == 1: + frag_out.write("\t".join(items[0:3]) + "\n") + else: + for i in range(int(items[4])): + frag_out.write("\t".join(items[0:3]) + "\n") +end_time = time.time() +print("End:", end_time-start_time) + diff --git a/MAESTRO/scATAC_FragmentSplit.py b/MAESTRO/scATAC_FragmentSplit.py new file mode 100644 index 0000000..90d2fcb --- /dev/null +++ b/MAESTRO/scATAC_FragmentSplit.py @@ -0,0 +1,70 @@ +# -*- coding: utf-8 -*- +# @Author: Dongqing Sun +# @E-mail: Dongqingsun96@gmail.com +# @Date: 2020-07-17 18:54:18 +# @Last Modified by: Dongqing Sun +# @Last Modified time: 2020-07-17 20:14:07 + + +import argparse +import pysam +import time, os +from collections import defaultdict +from contextlib import ExitStack + + +chr_list = ['chr'+str(i) for i in list(range(1,23))] +chr_list = chr_list + ['chrX', 'chrY'] + +def CommandLineParser(): + parser=argparse.ArgumentParser(description = "Split the fragement file according to the cluster.") + parser.add_argument("-F","--frag", dest = "fragfile", default = "",help = "The fragment file with barcodes and counts.") + parser.add_argument("-C","--cluster", dest = "clusterfile", default = "",help = "The barcode-cluster file generated in the step of analysis.") + parser.add_argument("-O", "--outdir", dest = "outdir", default = "", help = "The output directory.") + + return parser.parse_args() + + +parser = CommandLineParser() +fragfile = parser.fragfile +clusterfile = parser.clusterfile +outdir = parser.outdir + +try: + os.makedirs(outdir) +except OSError: + pass + +start_time = time.time() +print("Start to read the cell-cluter file file.",time.strftime("%a %b %d %H:%M:%S %Y", time.localtime())) +cell_cluster_fict = {} +with open(clusterfile, "r") as cluster_in: + for line in cluster_in: + items = line.strip().split("\t") + cell_cluster_fict[items[0]] = items[1] +end_time = time.time() +print("End:", end_time-start_time) + +cluster_list = list(set(list(cell_cluster_fict.values()))) +cluster_index_dict = {} +for i in range(len(cluster_list)): + cluster_index_dict[cluster_list[i]] = i + +start_time = time.time() +print("Start to split the fragement file.",time.strftime("%a %b %d %H:%M:%S %Y", time.localtime())) +with ExitStack() as stack: + out_files = [stack.enter_context(open(os.path.join(outdir, cluster + ".bed"), "w")) for cluster in cluster_list] + with open(fragfile, "r") as frag_in: + for line in frag_in.readlines(): + items = line.strip().split("\t") + if items[3] in cell_cluster_fict: + cluster = cell_cluster_fict[items[3]] + index = cluster_index_dict[cluster] + if int(items[4]) == 1: + out_files[index].write("\t".join(items[0:3]) + "\n") + else: + for i in range(int(items[4])): + out_files[index].write("\t".join(items[0:3]) + "\n") +end_time = time.time() +print("End:", end_time-start_time) + diff --git a/MAESTRO/scATAC_Genescore.py b/MAESTRO/scATAC_Genescore.py index 96540c0..2ea3e7a 100644 --- a/MAESTRO/scATAC_Genescore.py +++ b/MAESTRO/scATAC_Genescore.py @@ -21,7 +21,6 @@ from MAESTRO.scATAC_utility import * from MAESTRO.scATAC_H5Process import * - def genescore_parser(subparsers): """ Add main function init-scatac argument parsers. @@ -295,7 +294,8 @@ def calculate_RP_score(peakmatrix, features, barcodes, gene_bed, decay, score_fi # peaks_list = [peak for peak in cell_peaks.index if peak.split("_")[1].isdigit()] # cell_peaks = sp_sparse.csc_matrix(cell_peaks.loc[peaks_list, :].values) if model == "Simple": - for line in open(gene_bed, 'r'): + fhd = universal_open(gene_bed, 'rt') + for line in fhd: line = line.strip().split('\t') if not line[0].startswith('#'): if line[3] == "+": @@ -303,6 +303,7 @@ def calculate_RP_score(peakmatrix, features, barcodes, gene_bed, decay, score_fi else: genes_info.append(("chr" + line[2], int(line[5]), 1, "%s@%s@%s" % (line[12], "chr" + line[2], line[5]))) # gene_info [chrom, tss, 1, gene_unique] + fhd.close() genes_info = list(set(genes_info)) for igene in range(len(genes_info)): tmp_gene = list(genes_info[igene]) diff --git a/MAESTRO/scATAC_HTMLReport.py b/MAESTRO/scATAC_HTMLReport.py index 95aac47..a17d12f 100644 --- a/MAESTRO/scATAC_HTMLReport.py +++ b/MAESTRO/scATAC_HTMLReport.py @@ -3,7 +3,7 @@ # @E-mail: Dongqingsun96@gmail.com # @Date: 2020-02-28 03:15:37 # @Last Modified by: Dongqing Sun -# @Last Modified time: 2020-06-13 16:22:28 +# @Last Modified time: 2020-07-20 20:48:50 import os,sys @@ -18,8 +18,11 @@ def CommandLineParser(): group_input.add_argument("--platform", dest = "platform", default = "10x-genomics", choices = ["10x-genomics", "sci-ATAC-seq", "microfluidic"], help = "Platform of single cell ATAC-seq. DEFAULT: 10x-genomics.") - group_input.add_argument("--fastq-dir", dest = "fastq_dir", type = str, default = "", - help = "Directory where fastq files are stored") + group_input.add_argument("--input-format", dest = "input_format", default = "fastq", + choices = ["fastq", "bam", "fragments"], + help = "The format of input files. DEFAULT: fastq.") + group_input.add_argument("--input-path", dest = "input_path", type = str, default = "", + help = "Directory where input files are stored") group_input.add_argument("--species", dest = "species", default = "GRCh38", choices = ["GRCh38", "GRCm38"], type = str, help = "Species (GRCh38 for human and GRCm38 for mouse). DEFAULT: GRCh38.") @@ -40,9 +43,10 @@ def main(): myparser = CommandLineParser() directory = myparser.directory outpre = myparser.outprefix - fastqdir = myparser.fastq_dir + input_path = myparser.input_path species = myparser.species platform = myparser.platform + input_format = myparser.input_format try: os.makedirs(directory) @@ -51,20 +55,23 @@ def main(): # next step. pass - report_html_tempfile = os.path.join(SCRIPT_PATH, "html", "scATAC_template.html") - report_html_temp = open(report_html_tempfile, "r").read() - cluster_regulator_file = "Result/Analysis/%s.PredictedTFTop10.txt"%outpre fragplot_link = snakemake.report.data_uri_from_file("Result/QC/%s_scATAC_fragment_size.png"%outpre) # mapplot_link = snakemake.report.data_uri_from_file("Result/QC/%s_scATAC_mapping_summary.png"%outpre)[0] fripplot_link = snakemake.report.data_uri_from_file("Result/QC/%s_scATAC_cell_filtering.png"%outpre) peakcluster_link = snakemake.report.data_uri_from_file("Result/Analysis/%s_cluster.png"%outpre) - rpannotate_link = snakemake.report.data_uri_from_file("Result/Analysis/%s_annotated.png"%outpre) - readdistrplot_link = snakemake.report.data_uri_from_file("Result/QC/%s_scATAC_read_distr.png"%outpre) + if os.path.exists("Result/Analysis/%s_annotated.png"%outpre): + rpannodisplay = "inline" + rpannotate_link = snakemake.report.data_uri_from_file("Result/Analysis/%s_annotated.png"%outpre) + else: + rpannodisplay = "none" + rpannotate_link = "" if os.path.exists("Result/Analysis/%s_CistromeTop_annotated.png"%outpre): + caannodisplay = "inline" caannotate_link = snakemake.report.data_uri_from_file("Result/Analysis/%s_CistromeTop_annotated.png"%outpre) else: + caannodisplay = "none" caannotate_link = "" if os.path.exists("Result/Analysis/%s_MS4A1_genetrack.png"%outpre): @@ -86,10 +93,28 @@ def main(): td_list.append(items_str) td_str = "\n".join(td_list) - report_html_instance = report_html_temp % {"outprefix":outpre, "fastqdir":fastqdir, "species":species, - "platform":platform, "readdistr":readdistrplot_link, "fragment":fragplot_link, "frip":fripplot_link, - "peakcluster":peakcluster_link, "rpannotate":rpannotate_link, "regtable":td_str, - "caannotate": caannotate_link, 'cd3dtrack': cd3dtrack_link, "ms4a1track": ms4a1track_link} + if input_format != "fragments": + if input_format == "fastq": + inputformat = "FASTQ Path" + else: + inputformat = "BAM Path" + readdistrplot_link = snakemake.report.data_uri_from_file("Result/QC/%s_scATAC_read_distr.png"%outpre) + report_html_tempfile = os.path.join(SCRIPT_PATH, "html", "scATAC_template.html") + report_html_temp = open(report_html_tempfile, "r").read() + + report_html_instance = report_html_temp % {"outprefix":outpre, "inputformat":inputformat, "inputpath":input_path, "species":species, + "platform":platform, "readdistr":readdistrplot_link, "fragment":fragplot_link, "frip":fripplot_link, + "peakcluster":peakcluster_link, "rpannodisplay": rpannodisplay, "rpannotate":rpannotate_link, "regtable":td_str, + "caannodisplay": caannodisplay, "caannotate": caannotate_link, 'cd3dtrack': cd3dtrack_link, "ms4a1track": ms4a1track_link} + else: + inputformat = "Fragment Path" + report_html_tempfile = os.path.join(SCRIPT_PATH, "html", "scATAC_nomappability_template.html") + report_html_temp = open(report_html_tempfile, "r").read() + + report_html_instance = report_html_temp % {"outprefix":outpre, "inputformat":inputformat, "inputpath":input_path, "species":species, + "platform":platform, "fragment":fragplot_link, "frip":fripplot_link, + "peakcluster":peakcluster_link, "rpannodisplay": rpannodisplay, "rpannotate":rpannotate_link, "regtable":td_str, + "caannodisplay": caannodisplay, "caannotate": caannotate_link, 'cd3dtrack': cd3dtrack_link, "ms4a1track": ms4a1track_link} report_html_instancefile = os.path.join(directory, outpre + "_scATAC_report.html") outf = open(report_html_instancefile,"w") diff --git a/MAESTRO/scATAC_utility.py b/MAESTRO/scATAC_utility.py index 3bd80bc..b9d1c17 100644 --- a/MAESTRO/scATAC_utility.py +++ b/MAESTRO/scATAC_utility.py @@ -11,6 +11,7 @@ import subprocess import random, string import re +import gzip from subprocess import call as subpcall from pkg_resources import resource_filename @@ -65,7 +66,7 @@ def getfastq_10x(fastqdir, fastqprefix): r1 = " ".join(r1_fastq) else: print("Invalid fastq files!") - if r1_fastq[0].endswith("gz"): + if is_gzip(r1_fastq[0]): command = "zcat" else: command = "cat" @@ -108,3 +109,18 @@ def randomString(stringLength=10): letters = string.ascii_lowercase return ''.join(random.choice(letters) for i in range(stringLength)) +def is_gzip ( filename ): + """Check if the file is gzipped. + """ + with gzip.open( filename, 'r' ) as f: + try: + f.read(1) + except OSError: + return False + return True + +def universal_open ( filename, mode ): + if is_gzip( filename ): + return gzip.open( filename, mode ) + else: + return open( filename, mode ) \ No newline at end of file diff --git a/R/ATACAnnotateCelltype.R b/R/ATACAnnotateCelltype.R index bab8e85..e59622c 100644 --- a/R/ATACAnnotateCelltype.R +++ b/R/ATACAnnotateCelltype.R @@ -48,5 +48,6 @@ ATACAnnotateCelltype <- function(ATAC, signatures = "human.immune.CIBERSORT", mi write.table(cluster.genes, paste0(ATAC@project.name, "_RPDiffGenes.tsv"), quote = F, sep = "\t")} ATAC <- RNAAnnotateCelltype(ATAC, cluster.genes, signatures, min.score = min.score, orig.ident = orig.ident) + ATAC@meta.data$assign.celltype = ATAC@meta.data$assign.ident return(ATAC) } \ No newline at end of file diff --git a/R/ATACAnnotateChromatinAccessibility.R b/R/ATACAnnotateChromatinAccessibility.R index 6ee985a..74301ff 100755 --- a/R/ATACAnnotateChromatinAccessibility.R +++ b/R/ATACAnnotateChromatinAccessibility.R @@ -102,8 +102,10 @@ ATACAnnotateChromatinAccessibility <- function(ATAC, peaks, project = ATAC@proje message(paste("Skip identifying enriched CAs for cluster ", icluster, " because peak number is less than", min.peaks,"...")) } } - return(ATAC) - } + ATAC@meta.data$assign.biological_resource = ATAC@meta.data$biological_resource + ATAC@meta.data$assign.ident = ATAC@meta.data$assign.biological_resource + return(ATAC) + } } RunGiggle <- function(peakbed, giggle.path, organism, antFile, type = 'tf'){ diff --git a/R/ATACAnnotateTranscriptionFactor.R b/R/ATACAnnotateTranscriptionFactor.R index 94e86f1..760987e 100644 --- a/R/ATACAnnotateTranscriptionFactor.R +++ b/R/ATACAnnotateTranscriptionFactor.R @@ -32,7 +32,7 @@ #' pbmc.tfs <- ATACAnnotateTranscriptionFactor(ATAC = pbmc.ATAC.res$ATAC, peaks = pbmc.ATAC.res$peaks, project = "PBMC.scATAC.TF", giggle.path = "/home/annotations/giggle") #' pbmc.tfs #' -#' @importFrom Seurat GetAssayData Idents +#' @importFrom Seurat GetAssayData Idents DefaultAssay #' @export ATACAnnotateTranscriptionFactor <- function(ATAC, peaks, cluster = NULL, project = ATAC@project.name, giggle.path, organism = "GRCh38", top.tf = 10, target.genes = FALSE, min.peaks = 10) diff --git a/conda/MAESTRO/build.sh b/conda/MAESTRO/build.sh index 2858618..00b4e96 100644 --- a/conda/MAESTRO/build.sh +++ b/conda/MAESTRO/build.sh @@ -30,6 +30,6 @@ cd ../ # # install MAESTRO/R -$R -e 'devtools::install(".", upgrade="never")' +$R -e 'Sys.setenv(R_REMOTES_NO_ERRORS_FROM_WARNINGS="true");devtools::install(".", upgrade="never")' diff --git a/conda/MAESTRO/meta.yaml b/conda/MAESTRO/meta.yaml index 707ac5b..d29fd2a 100644 --- a/conda/MAESTRO/meta.yaml +++ b/conda/MAESTRO/meta.yaml @@ -27,7 +27,7 @@ requirements: - macs2 =2.2.6 - umap-learn - pysam # this is needed by sinto - - r-base=3.6.3 + - r-base =3.6.2 - r-BiocManager - r-devtools - r-hdf5r @@ -93,7 +93,7 @@ requirements: - macs2 =2.2.6 - umap-learn - pysam # this is needed by sinto - - r-base=3.6.3 + - r-base =3.6.2 - r-BiocManager - r-devtools - r-hdf5r diff --git a/setup.py b/setup.py index b4a69b1..094ff0e 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ def main(): setup( name = "MAESTRO", - version = "1.2.0", + version = "1.2.1", package_dir = {'MAESTRO':'MAESTRO'}, packages = ['MAESTRO'], package_data={'MAESTRO':['Snakemake/scRNA/*', 'Snakemake/scATAC/*', 'Snakemake/integrate/*', 'R/*', 'annotations/*', 'html/*', '']}, diff --git a/test/standard_output/test_peak_count.h5 b/test/standard_output/test_peak_count.h5 new file mode 100644 index 0000000..e520c4e Binary files /dev/null and b/test/standard_output/test_peak_count.h5 differ diff --git a/test/test.barcode.txt.gz b/test/test.barcode.txt.gz new file mode 100644 index 0000000..3147c2b Binary files /dev/null and b/test/test.barcode.txt.gz differ diff --git a/test/test.fragments.tsv.gz b/test/test.fragments.tsv.gz new file mode 100644 index 0000000..2b155e1 Binary files /dev/null and b/test/test.fragments.tsv.gz differ diff --git a/test/test.peak.narrowPeak.gz b/test/test.peak.narrowPeak.gz new file mode 100644 index 0000000..4d9f9b6 Binary files /dev/null and b/test/test.peak.narrowPeak.gz differ diff --git a/test/test.sh b/test/test.sh new file mode 100644 index 0000000..174e16f --- /dev/null +++ b/test/test.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +TEST_PREFIX=test +STD_OUTPUT_DIR=standard_output + +#---------------------- block of test case 1 -------------------- +echo "1. Test support of gzipped file of scATAC_10x_PeakCount.py" +# testing support of gzipped file of scATAC_10x_PeakCount.py + +TEST_PEAK_GZ=test.peak.narrowPeak.gz +TEST_BC_GZ=test.barcode.txt.gz +TEST_FRAG_GZ=test.fragments.tsv.gz +TEST1_OUTPUT_DIR=${TEST_PREFIX}_peakcount_dir +TEST1_OUTPUT_FILE=${TEST1_OUTPUT_DIR}/${TEST_PREFIX}_peak_count.h5 +TEST1_STD_OUTPUT=${STD_OUTPUT_DIR}/${TEST_PREFIX}_peak_count.h5 + +MAESTRO scatac-peakcount --peak $TEST_PEAK_GZ --fragment $TEST_FRAG_GZ --barcode $TEST_BC_GZ --species GRCh38 --cores 1 --outprefix $TEST_PREFIX -d ${TEST1_OUTPUT_DIR} + +echo "1. Check if the output is the same as the standard output" +# Check if the output file is the same as standard output +d=`h5diff ${TEST1_OUTPUT_FILE} ${TEST1_STD_OUTPUT}` +if [ -z "$d" ]; then + echo " ... success!" +else + echo " ... failed! The first 10 lines of difference:" + h5diff ${TEST1_OUTPUT_FILE} ${TEST1_STD_OUTPUT} | head -10 + exit 1 +fi +#---------------------- end of block ----------------------------- + + +#---------------------- block of test case 2 -------------------- +echo "2. Test MAESTRO scATAC-seq pipeline for 10x-genomics data" +# Test MAESTRO scATAC-seq pipeline for 10x-genomics data + +TEST_PREFIX=${TEST_PREFIX}_scatac + +TEST_FASTQ_DIR=$HOME/Data/atac_pbmc_500_v1_fastqs_sampling +TEST_FASTQ_PREFIX=atac_pbmc_500_v1_downsampling_0.4 +TEST_OUTPUT_DIRDIR=${TEST_PREFIX}_pipeline_dir +TEST_GIGGLE_DIR=$HOME/Data/giggle.all +TEST_REFERENCE_PATH=$HOME/Data/Refdata_scATAC_MAESTRO_GRCh38_1.1.0/GRCh38_genome.fa +TEST_WHITELIST_PATH=$HOME/Data/737K-cratac-v1.txt +TEST_PEAK_GZ=test.peak.narrowPeak.gz +TEST_BC_GZ=test.barcode.txt.gz +TEST_FRAG_GZ=test.fragments.tsv.gz +TEST1_OUTPUT_DIR=${TEST_PREFIX}_peakcount_dir +TEST1_OUTPUT_FILE=${TEST1_OUTPUT_DIR}/${TEST_PREFIX}_peak_count.h5 +TEST1_STD_OUTPUT=${STD_OUTPUT_DIR}/${TEST_PREFIX}_peak_count.h5 + +MAESTRO scatac-init --format fastq --platform 10x-genomics --species GRCh38 \ +--fastq-dir $TEST_FASTQ_DIR --fastq-prefix $TEST_FASTQ_PREFIX \ +--cores 16 --directory $TEST_OUTPUT_DIRDIR --outprefix TEST_PREFIX \ +--peak-cutoff 10 --count-cutoff 100 --frip-cutoff 0.1 --cell-cutoff 10 \ +--giggleannotation $TEST_GIGGLE_DIR \ +--fasta $TEST_REFERENCE_PATH \ +--whitelist TEST_WHITELIST_PATH \ +--annotation --method both --signature human.immune.CIBERSORT --clusterpeak --rpmodel Adjusted + +echo "1. Check if the report html exists" +# Check if the report html exists +TEST_HTML_PATH=$TEST_OUTPUT_DIRDIR/Result/${TEST_PREFIX}_scATAC_report.html +if [ -z "$d" ]; then + echo " ... succeed!" +else + echo " ... failed!" + exit 1 +fi +#---------------------- end of block ----------------------------- + + +echo "Done!"