Skip to content

Commit

Permalink
fix descriptions of gsize
Browse files Browse the repository at this point in the history
  • Loading branch information
taoliu committed Nov 9, 2023
1 parent fa90a46 commit 4cf615d
Show file tree
Hide file tree
Showing 6 changed files with 230 additions and 71 deletions.
10 changes: 5 additions & 5 deletions bin/macs3
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env python
# Time-stamp: <2023-11-08 12:13:47 Tao Liu>
# Time-stamp: <2023-11-09 14:47:18 taoliu>

"""Description: MACS v3 main executable.
Expand Down Expand Up @@ -212,7 +212,7 @@ def add_callpeak_parser( subparsers ):
help = "Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\" or \"BAMPE\" or \"BEDPE\". The default AUTO option will let MACS decide which format (except for BAMPE and BEDPE which should be implicitly set) the file is. Please check the definition in README. Please note that if the format is set as BAMPE or BEDPE, MACS3 will call its special Paired-end mode to call peaks by piling up the actual ChIPed fragments defined by both aligned ends, instead of predicting the fragment size first and extending reads. Also please note that the BEDPE only contains three columns, and is NOT the same BEDPE format used by BEDTOOLS. DEFAULT: \"AUTO\"",
default = "AUTO" )
group_input.add_argument( "-g", "--gsize", dest = "gsize", type = str, default = "hs",
help = "Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2.9e9), 'mm' for mouse (2.6e9), 'ce' for C. elegans (1e8) and 'dm' for fruitfly (1.4e8), Default:hs. The effective genome size numbers are collected from Deeptools https://deeptools.readthedocs.io/en/develop/content/feature/effectiveGenomeSize.html Please refer to deeptools to define the best genome size you plan to use." )
help = "Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2,913,022,398), 'mm' for mouse (2,652,783,500), 'ce' for C. elegans (100,286,401) and 'dm' for fruitfly (142,573,017), Default:hs. The effective genome size numbers for the above four species are collected from Deeptools https://deeptools.readthedocs.io/en/develop/content/feature/effectiveGenomeSize.html Please refer to deeptools to define the best genome size you plan to use." )
group_input.add_argument( "-s", "--tsize", dest = "tsize", type = int, default = None,
help = "Tag size/read length. This will override the auto detected tag size. DEFAULT: Not set")
group_input.add_argument( "--keep-dup", dest = "keepduplicates", type = str, default = "1",
Expand Down Expand Up @@ -382,7 +382,7 @@ def add_filterdup_parser( subparsers ):
help = "Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\" or \"BAMPE\" or \"BEDPE\". The default AUTO option will let '%(prog)s' decide which format the file is. Please check the definition in README file if you choose ELAND/ELANDMULTI/ELANDEXPORT/SAM/BAM/BOWTIE or BAMPE/BEDPE. DEFAULT: \"AUTO\"",
default = "AUTO" )
argparser_filterdup.add_argument( "-g", "--gsize", dest = "gsize", type = str, default = "hs",
help = "Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2.7e9), 'mm' for mouse (1.87e9), 'ce' for C. elegans (9e7) and 'dm' for fruitfly (1.2e8), DEFAULT:hs" )
help = "Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2,913,022,398), 'mm' for mouse (2,652,783,500), 'ce' for C. elegans (100,286,401) and 'dm' for fruitfly (142,573,017), Default:hs. The effective genome size numbers for the above four species are collected from Deeptools https://deeptools.readthedocs.io/en/develop/content/feature/effectiveGenomeSize.html Please refer to deeptools to define the best genome size you plan to use.")
argparser_filterdup.add_argument( "-s", "--tsize", dest = "tsize", type = int,
help = "Tag size. This will override the auto detected tag size. DEFAULT: Not set" )
argparser_filterdup.add_argument( "-p", "--pvalue", dest = "pvalue", type = float,
Expand Down Expand Up @@ -565,7 +565,7 @@ def add_bdgdiff_parser( subparsers ):
argparser_bdgdiff.add_argument( "--c2", dest="c2bdg", type = str, required = True,
help = "MACS control lambda bedGraph for condition 2. Incompatible with callpeak --SPMR output. REQUIRED" )
argparser_bdgdiff.add_argument( "-C", "--cutoff", dest = "cutoff", type = float,
help = "logLR cutoff. Regions with signals lower than cutoff will not be considerred as enriched regions. DEFAULT: 3 (likelihood ratio=1000)", default = 3 )
help = "log10LR cutoff. Regions with signals lower than cutoff will not be considerred as enriched regions. DEFAULT: 3 (likelihood ratio=1000)", default = 3 )
argparser_bdgdiff.add_argument( "-l", "--min-len", dest = "minlen", type = int,
help = "Minimum length of differential region. Try bigger value to remove small regions. DEFAULT: 200", default = 200 )
argparser_bdgdiff.add_argument( "-g", "--max-gap", dest = "maxgap", type = int,
Expand Down Expand Up @@ -627,7 +627,7 @@ def add_predictd_parser( subparsers ):
help = "Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\" or \"BAMPE\" or \"BEDPE\". The default AUTO option will let MACS decide which format the file is. However, if you want to decide the average insertion size/fragment size from PE data such as BEDPE or BAMPE, please specify the format as BAMPE or BEDPE since MACS3 won't automatically recognize three two formats with -f AUTO. Please be aware that in PE mode, -g, -s, --bw, --d-min, -m, and --rfile have NO effect. DEFAULT: \"AUTO\"",
default = "AUTO" )
argparser_predictd.add_argument( "-g", "--gsize", dest = "gsize", type = str, default = "hs",
help = "Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2.7e9), 'mm' for mouse (1.87e9), 'ce' for C. elegans (9e7) and 'dm' for fruitfly (1.2e8), Default:hs" )
help = "Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2,913,022,398), 'mm' for mouse (2,652,783,500), 'ce' for C. elegans (100,286,401) and 'dm' for fruitfly (142,573,017), Default:hs. The effective genome size numbers for the above four species are collected from Deeptools https://deeptools.readthedocs.io/en/develop/content/feature/effectiveGenomeSize.html Please refer to deeptools to define the best genome size you plan to use.")
argparser_predictd.add_argument( "-s", "--tsize", dest = "tsize", type = int, default = None,
help = "Tag size. This will override the auto detected tag size. DEFAULT: Not set")
argparser_predictd.add_argument( "--bw", dest = "bw", type = int, default = 300,
Expand Down
2 changes: 1 addition & 1 deletion docs/Advanced_Step-by-step_Peak_Calling.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ files `CTCF_ChIP_200K.bed.gz` and `CTCF_Control_200K.bed.gz`, that you
can find in MACS3 github repository.

*Note, currently this tutorial is for single-end datasets. Please
modify the instructions for paired-end data by yourself.*
modify the command line for paired-end data by yourself.*

## Step 1: Filter duplicates

Expand Down
30 changes: 16 additions & 14 deletions docs/bdgbroadcall.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,7 @@
## Overview
The `bdgbroadcall` subcommand of the MACS3 suite identifies 'nested'
broad peaks from a single bedGraph track for scores, a function
essential in certain ChIP-Seq analyses. Unlike narrow peak calling
performed using `bdgpeakcall` or `callpeak` without the `--broad`
option, this command, along with the `--broad` option in `callpeak`,
facilitates broad peak calling, producing results in the UCSC
gappedPeak format which encapsulates a nested structure of peaks. To
conceptualize 'nested' peaks, picture a gene structure housing regions
analogous to exons (strong peaks) and introns coupled with UTRs (weak
peaks). The broad peak calling process utilizes two distinct cutoffs
to discern broader, weaker peaks and narrower, stronger peaks, which
are subsequently nested to provide a detailed peak landscape.

Please note that, if you only want to call 'broader' peak and not
interested in the nested peak structure, please simply use
`bdgpeakcall` with weaker cutoff.
essential in certain ChIP-Seq analyses.

## Detailed Description

Expand All @@ -26,6 +13,21 @@ bedGraph files from MACS3 are acceptable to use in the `bdgbroadcall`
command, as all regions on the same chromosome in the bedGraph file
should be continuous.

Unlike narrow peak calling performed using `bdgpeakcall` or `callpeak`
without the `--broad` option, this command, along with the `--broad`
option in `callpeak`, facilitates broad peak calling, producing
results in the UCSC gappedPeak format which encapsulates a nested
structure of peaks. To conceptualize 'nested' peaks, picture a gene
structure housing regions analogous to exons (strong peaks) and
introns coupled with UTRs (weak peaks). The broad peak calling process
utilizes two distinct cutoffs to discern broader, weaker peaks and
narrower, stronger peaks, which are subsequently nested to provide a
detailed peak landscape.

Please note that if you only want to call 'broader' peaks and are not
interested in the nested peak structure, simply use `bdgpeakcall` with
a weaker cutoff.

## Command Line Options

The command line options for `bdgbroadcall` are defined in `macs3
Expand Down
83 changes: 67 additions & 16 deletions docs/bdgcmp.md
Original file line number Diff line number Diff line change
@@ -1,34 +1,82 @@
# bdgcmp

## Overview
The `bdgcmp` command is part of the MACS3 suite of tools and is used to compare two bedGraph files in each basepair that are commonly covered by the two files. The typical use case is to calculate pvalue or qvalue using Poisson model for each basepair given a treatment pileup signal file in bedGraph format and a control lambda bedGraph file. But we provides more functions rather than pvalue and qvalue, including subtract, division (FE) and more.
The `bdgcmp` command is part of the MACS3 suite of tools and is used
to compare two bedGraph files in each basepair that are commonly
covered by the two files. The typical use case is to calculate pvalue
or qvalue using Poisson model for each basepair given a treatment
pileup signal file in bedGraph format and a control lambda bedGraph
file. But we provide more functions than just pvalue and qvalue,
including subtraction, division (FE) and more.

## Detailed Description

The `bdgcmp` command takes two input bedGraph files (e.g. a control and a treatment bedgraph) and produces an output bedGraph of comparison scores for each genomic position involved in the bedGraph files. The `bdgcmp` command normally is used to deduct noise from a signal track in bedGraph (e.g. ChIP treatment) over another signal track in bedGraph (e.g. control). Note: All regions on the same chromosome in the bedGraph file should be continuous so we recommand you use the bedGraph files from MACS3. We provide the following function to 'compare two tracks':
The `bdgcmp` command takes two input bedGraph files (e.g. a control
and a treatment bedgraph) and produces an output bedGraph of
comparison scores for each genomic position involved in the bedGraph
files. The `bdgcmp` command normally is used to deduct noise from a
signal track in bedGraph (e.g. ChIP treatment) over another signal
track in bedGraph (e.g. control). Note: All regions on the same
chromosome in the bedGraph file should be continuous so we recommend
you use the bedGraph files from MACS3. We provide the following
functions to 'compare two tracks':

- ppois Poisson p-value (-log10(pvalue) form) using the second file (-c) as lambda and treatment (-t) as observation
- ppois Poisson p-value (-log10(pvalue) form) using the second file
(-c) as lambda and treatment (-t) as observation
- qpois Q-value through a BH process for poisson pvalues
- subtract Subtraction from treatment
- FE linear scale fold enrichment, or the score from file A divided by the score from file B
- FE linear scale fold enrichment, or the score from file A divided by
the score from file B
- logFE log10 fold enrichment(need to set pseudocount)
- logLR log10 likelihood between ChIP-enriched model and open chromatin model (need to set pseudocount)
- symmetric log10 likelihood between two ChIP-enrichment models using Poison distribution, and this can be used to compare ChIP signals from two differen conditions (differential binding)
- logLR log10 likelihood between ChIP-enriched model and open
chromatin model (need to set pseudocount)
- slogLR symmetric log10 likelihood between two ChIP-enrichment models
using the Poisson distribution; this can be used to compare ChIP
signals from two different conditions (differential binding)
- max Maximum value between the two tracks.

## Command Line Options

Here is a brief description of the command line options for `bdgcmp` :

- `-t` or `--tfile`: Treatment bedGraph file, e.g. *_treat_pileup.bdg from MACS. REQUIRED
- `-c` or `--cfile`: Control bedGraph file, e.g. *_control_lambda.bdg from MACS. REQUIRED
- `-S` or `--scaling-factor`: Scaling factor for treatment and control track. Keep it as 1.0 or default in most cases. Set it ONLY while you have SPMR output from MACS3 callpeak, and plan to calculate scores as MACS3 callpeak module. If you want to simulate 'callpeak' w/o '--to-large', calculate effective smaller sample size after filtering redundant reads in million (e.g., put 31.415926 if effective reads are 31,415,926) and input it for '-S'; for 'callpeak --to-large', calculate effective reads in a larger sample. DEFAULT: 1.0
- `-p` or `--pseudocount`: The pseudocount used for calculating logLR, logFE or FE. The count will be applied after normalization of sequencing depth. DEFAULT: 0.0, no pseudocount is applied.
- `-m` or `--method`: Method to use while calculating a score in any bin by comparing the treatment value and control value. Available choices are: ppois, qpois, subtract, logFE, logLR, and slogLR. They represent Poisson P-value (-log10(pvalue) form) using control as lambda and treatment as observation, q-value through a BH process for Poisson p-values, subtraction from treatment, linear scale fold enrichment, log10 fold enrichment (need to set pseudocount), log10 likelihood between ChIP-enriched model and open chromatin model (need to set pseudocount), symmetric log10 likelihood between two ChIP-enrichment models, or the maximum value between the two tracks. The default option is ppois.
- `--verbose`: Set the verbose level of runtime messages. 0: only show critical messages, 1: show additional warning messages, 2: show process information, 3: show debug messages. DEFAULT: 2
- `--outdir`: If specified, all output files will be written to that directory. Default: the current working directory
- `--o-prefix`: The PREFIX of the output bedGraph file to write scores. If it is given as A, and the method is 'ppois', the output file will be A_ppois.bdg. Mutually exclusive with -o/--ofile.
- `-o` or `--ofile`: Output filename. Mutually exclusive with --o-prefix. The number and the order of arguments for --ofile must be the same as for -m.
- `-t` or `--tfile`: Treatment bedGraph file, e.g. *_treat_pileup.bdg
from MACS. REQUIRED
- `-c` or `--cfile`: Control bedGraph file, e.g. *_control_lambda.bdg
from MACS. REQUIRED
- `-S` or `--scaling-factor`: Scaling factor for treatment and control
track. Keep it as 1.0 or default in most cases. Set it ONLY while
you have SPMR output from MACS3 callpeak, and plan to calculate
scores as MACS3 callpeak module. If you want to simulate 'callpeak'
w/o '--to-large', calculate effective smaller sample size after
filtering redundant reads in million (e.g., put 31.415926 if
effective reads are 31,415,926) and input it for '-S'; for 'callpeak
--to-large', calculate effective reads in a larger sample. DEFAULT:
1.0
- `-p` or `--pseudocount`: The pseudocount used for calculating logLR,
logFE or FE. The count will be applied after normalization of
sequencing depth. DEFAULT: 0.0, no pseudocount is applied.
- `-m` or `--method`: Method to use while calculating a score in any
bin by comparing the treatment value and control value. Available
choices are: ppois, qpois, subtract, FE, logFE, logLR, slogLR, and max. They
represent Poisson P-value (-log10(pvalue) form) using control as
lambda and treatment as observation, q-value through a BH process
for Poisson p-values, subtraction from treatment, linear scale fold
enrichment, log10 fold enrichment (need to set pseudocount), log10
likelihood between ChIP-enriched model and open chromatin model
(need to set pseudocount), symmetric log10 likelihood between two
ChIP-enrichment models, or the maximum value between the two
tracks. The default option is ppois.
- `--verbose`: Set the verbose level of runtime messages. 0: only show
critical messages, 1: show additional warning messages, 2: show
process information, 3: show debug messages. DEFAULT: 2
- `--outdir`: If specified, all output files will be written to that
directory. Default: the current working directory
- `--o-prefix`: The PREFIX of the output bedGraph file to write
scores. If it is given as A, and the method is 'ppois', the output
file will be A_ppois.bdg. Mutually exclusive with -o/--ofile.
- `-o` or `--ofile`: Output filename. Mutually exclusive with
--o-prefix. The number and the order of arguments for --ofile must
be the same as for -m.

## Example Usage

Expand All @@ -38,4 +86,7 @@ Here is an example of how to use the `bdgcmp` command:
macs3 bdgcmp -t treatment.bedGraph -c control.bedGraph -m ppois -p 1.0 -S 1.0 -o output.bedGraph
```

In this example, the program will compare the `treatment.bedGraph` file and the `control.bedGraph` file and write the result to `output.bedGraph`. The method used for comparison is `ppois`, the pseudo-count is set to 1.0, and the scaling factor is set to 1.0.
In this example, the program will compare the `treatment.bedGraph`
file and the `control.bedGraph` file and write the result to
`output.bedGraph`. The method used for comparison is `ppois`, the
pseudo-count is set to 1.0, and the scaling factor is set to 1.0.
Loading

0 comments on commit 4cf615d

Please sign in to comment.