fix descriptions of gsize

macs3-project · Nov 9, 2023 · 4cf615d · 4cf615d
1 parent fa90a46
commit 4cf615d
Show file tree

Hide file tree

Showing 6 changed files with 230 additions and 71 deletions.
diff --git a/bin/macs3 b/bin/macs3
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# Time-stamp: <2023-11-08 12:13:47 Tao Liu>
+# Time-stamp: <2023-11-09 14:47:18 taoliu>
 
 """Description: MACS v3 main executable.
 
@@ -212,7 +212,7 @@ def add_callpeak_parser( subparsers ):
                               help = "Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\" or \"BAMPE\" or \"BEDPE\". The default AUTO option will let MACS decide which format (except for BAMPE and BEDPE which should be implicitly set) the file is. Please check the definition in README. Please note that if the format is set as BAMPE or BEDPE, MACS3 will call its special Paired-end mode to call peaks by piling up the actual ChIPed fragments defined by both aligned ends, instead of predicting the fragment size first and extending reads. Also please note that the BEDPE only contains three columns, and is NOT the same BEDPE format used by BEDTOOLS. DEFAULT: \"AUTO\"",
                               default = "AUTO" )
     group_input.add_argument( "-g", "--gsize", dest = "gsize", type = str, default = "hs",
-                              help = "Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2.9e9), 'mm' for mouse (2.6e9), 'ce' for C. elegans (1e8) and 'dm' for fruitfly (1.4e8), Default:hs. The effective genome size numbers are collected from Deeptools https://deeptools.readthedocs.io/en/develop/content/feature/effectiveGenomeSize.html Please refer to deeptools to define the best genome size you plan to use." )
+                              help = "Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2,913,022,398), 'mm' for mouse (2,652,783,500), 'ce' for C. elegans (100,286,401) and 'dm' for fruitfly (142,573,017), Default:hs. The effective genome size numbers for the above four species are collected from Deeptools https://deeptools.readthedocs.io/en/develop/content/feature/effectiveGenomeSize.html Please refer to deeptools to define the best genome size you plan to use." )
     group_input.add_argument( "-s", "--tsize",  dest = "tsize", type = int, default = None,
                                     help = "Tag size/read length. This will override the auto detected tag size. DEFAULT: Not set")
     group_input.add_argument( "--keep-dup", dest = "keepduplicates", type = str, default = "1",
@@ -382,7 +382,7 @@ def add_filterdup_parser( subparsers ):
                                       help = "Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\" or \"BAMPE\" or \"BEDPE\". The default AUTO option will let '%(prog)s' decide which format the file is. Please check the definition in README file if you choose ELAND/ELANDMULTI/ELANDEXPORT/SAM/BAM/BOWTIE or BAMPE/BEDPE. DEFAULT: \"AUTO\"",
                                       default = "AUTO" )
     argparser_filterdup.add_argument( "-g", "--gsize", dest = "gsize", type = str, default = "hs",
-                                      help = "Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2.7e9), 'mm' for mouse (1.87e9), 'ce' for C. elegans (9e7) and 'dm' for fruitfly (1.2e8), DEFAULT:hs" )
+                                      help = "Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2,913,022,398), 'mm' for mouse (2,652,783,500), 'ce' for C. elegans (100,286,401) and 'dm' for fruitfly (142,573,017), Default:hs. The effective genome size numbers for the above four species are collected from Deeptools https://deeptools.readthedocs.io/en/develop/content/feature/effectiveGenomeSize.html Please refer to deeptools to define the best genome size you plan to use.")
     argparser_filterdup.add_argument( "-s", "--tsize", dest = "tsize", type = int,
                                       help = "Tag size. This will override the auto detected tag size. DEFAULT: Not set" )
     argparser_filterdup.add_argument( "-p", "--pvalue", dest = "pvalue", type = float,
@@ -565,7 +565,7 @@ def add_bdgdiff_parser( subparsers ):
     argparser_bdgdiff.add_argument( "--c2", dest="c2bdg", type = str, required = True,
                                     help = "MACS control lambda bedGraph for condition 2. Incompatible with callpeak --SPMR output. REQUIRED" )
     argparser_bdgdiff.add_argument( "-C", "--cutoff", dest = "cutoff", type = float,
-                                    help = "logLR cutoff. Regions with signals lower than cutoff will not be considerred as enriched regions. DEFAULT: 3 (likelihood ratio=1000)", default = 3 )
+                                    help = "log10LR cutoff. Regions with signals lower than cutoff will not be considerred as enriched regions. DEFAULT: 3 (likelihood ratio=1000)", default = 3 )
     argparser_bdgdiff.add_argument( "-l", "--min-len", dest = "minlen", type = int,
                                     help = "Minimum length of differential region. Try bigger value to remove small regions. DEFAULT: 200", default = 200 )
     argparser_bdgdiff.add_argument( "-g", "--max-gap", dest = "maxgap", type = int,
@@ -627,7 +627,7 @@ def add_predictd_parser( subparsers ):
                                      help = "Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\" or \"BAMPE\" or \"BEDPE\". The default AUTO option will let MACS decide which format the file is. However, if you want to decide the average insertion size/fragment size from PE data such as BEDPE or BAMPE, please specify the format as BAMPE or BEDPE since MACS3 won't automatically recognize three two formats with -f AUTO. Please be aware that in PE mode, -g, -s, --bw, --d-min, -m, and --rfile have NO effect. DEFAULT: \"AUTO\"",
                                      default = "AUTO" )
     argparser_predictd.add_argument( "-g", "--gsize", dest = "gsize", type = str, default = "hs",
-                                     help = "Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2.7e9), 'mm' for mouse (1.87e9), 'ce' for C. elegans (9e7) and 'dm' for fruitfly (1.2e8), Default:hs" )
+                                     help = "Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2,913,022,398), 'mm' for mouse (2,652,783,500), 'ce' for C. elegans (100,286,401) and 'dm' for fruitfly (142,573,017), Default:hs. The effective genome size numbers for the above four species are collected from Deeptools https://deeptools.readthedocs.io/en/develop/content/feature/effectiveGenomeSize.html Please refer to deeptools to define the best genome size you plan to use.")
     argparser_predictd.add_argument( "-s", "--tsize",  dest = "tsize", type = int, default = None,
                                      help = "Tag size. This will override the auto detected tag size. DEFAULT: Not set")
     argparser_predictd.add_argument( "--bw", dest = "bw", type = int, default = 300,

diff --git a/docs/Advanced_Step-by-step_Peak_Calling.md b/docs/Advanced_Step-by-step_Peak_Calling.md
@@ -20,7 +20,7 @@ files `CTCF_ChIP_200K.bed.gz` and `CTCF_Control_200K.bed.gz`, that you
 can find in MACS3 github repository. 
 
 *Note, currently this tutorial is for single-end datasets. Please
-modify the instructions for paired-end data by yourself.*
+modify the command line for paired-end data by yourself.*
 
 ## Step 1: Filter duplicates
 

diff --git a/docs/bdgbroadcall.md b/docs/bdgbroadcall.md
@@ -3,20 +3,7 @@
 ## Overview
 The `bdgbroadcall` subcommand of the MACS3 suite identifies 'nested'
 broad peaks from a single bedGraph track for scores, a function
-essential in certain ChIP-Seq analyses. Unlike narrow peak calling
-performed using `bdgpeakcall` or `callpeak` without the `--broad`
-option, this command, along with the `--broad` option in `callpeak`,
-facilitates broad peak calling, producing results in the UCSC
-gappedPeak format which encapsulates a nested structure of peaks. To
-conceptualize 'nested' peaks, picture a gene structure housing regions
-analogous to exons (strong peaks) and introns coupled with UTRs (weak
-peaks). The broad peak calling process utilizes two distinct cutoffs
-to discern broader, weaker peaks and narrower, stronger peaks, which
-are subsequently nested to provide a detailed peak landscape. 
-
-Please note that, if you only want to call 'broader' peak and not
-interested in the nested peak structure, please simply use
-`bdgpeakcall` with weaker cutoff. 
+essential in certain ChIP-Seq analyses. 
 
 ## Detailed Description
 
@@ -26,6 +13,21 @@ bedGraph files from MACS3 are acceptable to use in the `bdgbroadcall`
 command, as All regions on the same chromosome in the bedGraph file
 should be continuous. 
 
+Unlike narrow peak calling performed using `bdgpeakcall` or `callpeak`
+without the `--broad` option, this command, along with the `--broad`
+option in `callpeak`, facilitates broad peak calling, producing
+results in the UCSC gappedPeak format which encapsulates a nested
+structure of peaks. To conceptualize 'nested' peaks, picture a gene
+structure housing regions analogous to exons (strong peaks) and
+introns coupled with UTRs (weak peaks). The broad peak calling process
+utilizes two distinct cutoffs to discern broader, weaker peaks and
+narrower, stronger peaks, which are subsequently nested to provide a
+detailed peak landscape.
+
+Please note that, if you only want to call 'broader' peaks and are
+not interested in the nested peak structure, please simply use
+`bdgpeakcall` with a weaker cutoff.
+
 ## Command Line Options
 
 The command line options for `bdgbroadcall` are defined in `macs3

diff --git a/docs/bdgcmp.md b/docs/bdgcmp.md
@@ -1,34 +1,82 @@
 # bdgcmp
 
 ## Overview
-The `bdgcmp` command is part of the MACS3 suite of tools and is used to compare two bedGraph files in each basepair that are commonly covered by the two files. The typical use case is to calculate pvalue or qvalue using Poisson model for each basepair given a treatment pileup signal file in bedGraph format and a control lambda bedGraph file. But we provides more functions rather than pvalue and qvalue, including subtract, division (FE) and more.
+The `bdgcmp` command is part of the MACS3 suite of tools and is used
+to compare two bedGraph files in each basepair that are commonly
+covered by the two files. The typical use case is to calculate pvalue
+or qvalue using Poisson model for each basepair given a treatment
+pileup signal file in bedGraph format and a control lambda bedGraph
+file. But we provide more functions than just pvalue and qvalue,
+including subtract, division (FE) and more.
 
 ## Detailed Description
 
-The `bdgcmp` command takes two input bedGraph files (e.g. a control and a treatment bedgraph) and produces an output bedGraph of comparison scores for each genomic position involved in the bedGraph files. The `bdgcmp` command normally is used to deduct noise from a signal track in bedGraph (e.g. ChIP treatment) over another signal track in bedGraph (e.g. control). Note: All regions on the same chromosome in the bedGraph file should be continuous so we recommand you use the bedGraph files from MACS3. We provide the following function to 'compare two tracks':
+The `bdgcmp` command takes two input bedGraph files (e.g. a control
+and a treatment bedgraph) and produces an output bedGraph of
+comparison scores for each genomic position involved in the bedGraph
+files. The `bdgcmp` command normally is used to deduct noise from a
+signal track in bedGraph (e.g. ChIP treatment) over another signal
+track in bedGraph (e.g. control). Note: All regions on the same
+chromosome in the bedGraph file should be continuous so we recommend
+you use the bedGraph files from MACS3. We provide the following
+functions to 'compare two tracks':
 
-- ppois Poisson p-value (-log10(pvalue) form) using the second file (-c) as lambda and treatment (-t) as observation
+- ppois Poisson p-value (-log10(pvalue) form) using the second file
+  (-c) as lambda and treatment (-t) as observation
 - qpois Q-value through a BH process for poisson pvalues
 - subtract Subtraction from treatment
-- FE linear scale fold enrichment, or the score from file A divided by the score from file B
+- FE linear scale fold enrichment, or the score from file A divided by
+  the score from file B
 - logFE log10 fold enrichment(need to set pseudocount)
-- logLR log10 likelihood between ChIP-enriched model and open chromatin model (need to set pseudocount)
-- symmetric log10 likelihood between two ChIP-enrichment models using Poison distribution, and this can be used to compare ChIP signals from two differen conditions (differential binding)
+- logLR log10 likelihood between ChIP-enriched model and open
+  chromatin model (need to set pseudocount)
+- slogLR symmetric log10 likelihood between two ChIP-enrichment models
+  using Poisson distribution, and this can be used to compare ChIP
+  signals from two different conditions (differential binding)
 - max Maximum value between the two tracks.
 
 ## Command Line Options
 
 Here is a brief description of the command line options for `bdgcmp` :
 
-- `-t` or `--tfile`: Treatment bedGraph file, e.g. *_treat_pileup.bdg from MACS. REQUIRED
-- `-c` or `--cfile`: Control bedGraph file, e.g. *_control_lambda.bdg from MACS. REQUIRED
-- `-S` or `--scaling-factor`: Scaling factor for treatment and control track. Keep it as 1.0 or default in most cases. Set it ONLY while you have SPMR output from MACS3 callpeak, and plan to calculate scores as MACS3 callpeak module. If you want to simulate 'callpeak' w/o '--to-large', calculate effective smaller sample size after filtering redundant reads in million (e.g., put 31.415926 if effective reads are 31,415,926) and input it for '-S'; for 'callpeak --to-large', calculate effective reads in a larger sample. DEFAULT: 1.0
-- `-p` or `--pseudocount`: The pseudocount used for calculating logLR, logFE or FE. The count will be applied after normalization of sequencing depth. DEFAULT: 0.0, no pseudocount is applied.
-- `-m` or `--method`: Method to use while calculating a score in any bin by comparing the treatment value and control value. Available choices are: ppois, qpois, subtract, logFE, logLR, and slogLR. They represent Poisson P-value (-log10(pvalue) form) using control as lambda and treatment as observation, q-value through a BH process for Poisson p-values, subtraction from treatment, linear scale fold enrichment, log10 fold enrichment (need to set pseudocount), log10 likelihood between ChIP-enriched model and open chromatin model (need to set pseudocount), symmetric log10 likelihood between two ChIP-enrichment models, or the maximum value between the two tracks. The default option is ppois.
-- `--verbose`: Set the verbose level of runtime messages. 0: only show critical messages, 1: show additional warning messages, 2: show process information, 3: show debug messages. DEFAULT: 2
-- `--outdir`: If specified, all output files will be written to that directory. Default: the current working directory
-- `--o-prefix`: The PREFIX of the output bedGraph file to write scores. If it is given as A, and the method is 'ppois', the output file will be A_ppois.bdg. Mutually exclusive with -o/--ofile.
-- `-o` or `--ofile`: Output filename. Mutually exclusive with --o-prefix. The number and the order of arguments for --ofile must be the same as for -m.
+- `-t` or `--tfile`: Treatment bedGraph file, e.g. *_treat_pileup.bdg
+  from MACS. REQUIRED
+- `-c` or `--cfile`: Control bedGraph file, e.g. *_control_lambda.bdg
+  from MACS. REQUIRED
+- `-S` or `--scaling-factor`: Scaling factor for treatment and control
+  track. Keep it as 1.0 or default in most cases. Set it ONLY while
+  you have SPMR output from MACS3 callpeak, and plan to calculate
+  scores as MACS3 callpeak module. If you want to simulate 'callpeak'
+  w/o '--to-large', calculate effective smaller sample size after
+  filtering redundant reads in million (e.g., put 31.415926 if
+  effective reads are 31,415,926) and input it for '-S'; for 'callpeak
+  --to-large', calculate effective reads in a larger sample. DEFAULT:
+  1.0
+- `-p` or `--pseudocount`: The pseudocount used for calculating logLR,
+  logFE or FE. The count will be applied after normalization of
+  sequencing depth. DEFAULT: 0.0, no pseudocount is applied.
+- `-m` or `--method`: Method to use while calculating a score in any
+  bin by comparing the treatment value and control value. Available
+  choices are: ppois, qpois, subtract, FE, logFE, logLR, slogLR, and
+  max. They represent Poisson P-value (-log10(pvalue) form) using
+  control as lambda and treatment as observation, q-value through a BH
+  process for Poisson p-values, subtraction from treatment, linear
+  scale fold enrichment, log10 fold enrichment (need to set
+  pseudocount), log10 likelihood between ChIP-enriched model and open
+  chromatin model (need to set pseudocount), symmetric log10
+  likelihood between two ChIP-enrichment models, or the maximum value
+  between the two tracks. The default option is ppois.
+- `--verbose`: Set the verbose level of runtime messages. 0: only show
+  critical messages, 1: show additional warning messages, 2: show
+  process information, 3: show debug messages. DEFAULT: 2
+- `--outdir`: If specified, all output files will be written to that
+  directory. Default: the current working directory
+- `--o-prefix`: The PREFIX of the output bedGraph file to write
+  scores. If it is given as A, and the method is 'ppois', the output
+  file will be A_ppois.bdg. Mutually exclusive with -o/--ofile.
+- `-o` or `--ofile`: Output filename. Mutually exclusive with
+  --o-prefix. The number and the order of arguments for --ofile must
+  be the same as for -m.
 
 ## Example Usage
 
@@ -38,4 +86,7 @@ Here is an example of how to use the `bdgcmp` command:
 macs3 bdgcmp -t treatment.bedGraph -c control.bedGraph -m ppois -p 1.0 -S 1.0 -o output.bedGraph
 ```
 
-In this example, the program will compare the `treatment.bedGraph` file and the `control.bedGraph` file and write the result to `output.bedGraph`. The method used for comparison is `ppois`, the pseudo-count is set to 1.0, and the scaling factor is set to 1.0.
+In this example, the program will compare the `treatment.bedGraph`
+file and the `control.bedGraph` file and write the result to
+`output.bedGraph`. The method used for comparison is `ppois`, the
+pseudo-count is set to 1.0, and the scaling factor is set to 1.0.