version 0.4.3

MikeAxtell · Mar 21, 2014 · fd394b8 · fd394b8
1 parent de7f0a6
commit fd394b8
Show file tree

Hide file tree

Showing 4 changed files with 215 additions and 76 deletions.
diff --git a/Prep_bam.pl b/Prep_bam.pl
@@ -1,7 +1,7 @@
 #!/usr/bin/perl -w
 use Getopt::Long;
 
-$version = "0.1.2";
+$version = "0.1.3";
 
 # Define a usage statement
 $usage = "$0 $version
@@ -16,15 +16,16 @@
 C\. SAM lines for mapped reads that have an invalid CIGAR string will be suppressed\!
 
 USAGE:
-1\. For \.bam files : samtools view [input\.bam\] | $0 --prefix [prefix] --genome [genome\.fasta]
-2\. For \.sam\.gz files : gzip -d -c [input\.sam\.gz] | $0 --prefix [prefix] --genome [genome\.fasta]
-3\. For uncompressed \.sam files : $0 --prefix [prefix] --genome [genome\.fasta] < [input\.sam]
-4\. Directly from bowtie version 1 \(eg 0-12-7, 0-12-8\): bowtie [bowtie options] -S [bowtie_index] [reads] | $0 --prefix [prefix] --genome [genome]
+1\. For \.bam files : samtools view [input\.bam\] | $0 --prefix [prefix] --genome [genome\.fasta] -- sortmem [INT]
+2\. For \.sam\.gz files : gzip -d -c [input\.sam\.gz] | $0 --prefix [prefix] --genome [genome\.fasta] -- sortmem [INT]
+3\. For uncompressed \.sam files : $0 --prefix [prefix] --genome [genome\.fasta] --sortmem [INT] < [input\.sam]
+4\. Directly from bowtie version 1 \(eg 0.12.x): bowtie [bowtie options] -S [bowtie_index] [reads] | $0 --prefix [prefix] --genome [genome] --sortmem [INT]
 
 
-OPTIONS \(both required\)
--- prefix [string] : base name for the final \.bam and \.bam\.bai files that will be created
--- genome [string] : Path to the fasta formatted genome file used for the mapping
+OPTIONS
+--prefix [string] : base name for the final \.bam and \.bam\.bai files that will be created :: REQUIRED
+--genome [string] : Path to the fasta formatted genome file used for the mapping :: REQUIRED
+--sortmem [integer] : Amount of memory to allocate to samtools sort, in bytes.  If not specified, defaults to 500000000 (e.g. 476.8Mb). Giving more memory to sorting causes fewer temp files to be created, and thus faster sorting.
 
 Type perldoc $0 for more details or see the README
 
@@ -34,9 +35,11 @@
 # initial option definitions
 $prefix = '';
 $genome = '';
+$sortmem = '';
 
 # get user options from command line
 GetOptions ('prefix=s' => \$prefix,
+	    'sortmem=i' => \$sortmem,
 	    'genome=s' => \$genome);
 
 # Verify existence of prefix
@@ -56,6 +59,12 @@
     die "samtools not found\n$full_samtools_test_text\n\n$usage\n";
 }
 
+# if option --sortmem is specified by user, verify it is an integer
+if($sortmem) {
+    unless($sortmem =~ /^\d+$/) {
+	die "Invalid entry for option --sortmem\.  Must be an integer\n";
+    }
+}
 
 # Initialize temp file
 $tempfile = "$prefix" . "\.sam\.gz";
@@ -208,7 +217,11 @@
 print STDERR "\nCreating initial \.bam file\n";
 system "samtools view -bT $genome $tempfile > $tempfile2";
 print STDERR "\nSorting bamfile by chromosomal location\n";
-system "samtools sort $tempfile2 $prefix";
+if($sortmem) {
+    system "samtools sort -m $sortmem $tempfile2 $prefix";
+} else {
+    system "samtools sort $tempfile2 $prefix";
+}
 print STDERR "\nIndexing the sorted file\n";
 system "samtools index $final_file";
 
@@ -287,7 +300,7 @@ =head1 LICENSE
 (at your option) any later version.                                                              
                                                                                                  
 This program is distributed in the hope that it will be useful,                                  
-    but WITHOUT ANY WARRANTY; without even the implied warranty of                                   
+but WITHOUT ANY WARRANTY; without even the implied warranty of                                   
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                                    
 GNU General Public License for more details.                                                     
                                                                                                  
@@ -302,15 +315,15 @@ =head1 SYNOPSIS
 
 =head1 CITATION
 
-If you use ShortStack in your work, please cite 
+If you use Prep_bam.pl in your work, please cite 
 
-Axtell MJ. (2013) ShortStack: Comprehensive annotation and quantification of small RNA genes.  RNA. (In press).
-
-As of this version release, a manuscript describing ShortStack is in press at the journal "RNA".  It might be published by the time you are reading this, so please check Pubmed before citing!
+Axtell MJ. (2013) ShortStack: Comprehensive annotation and quantification of small RNA genes.  RNA. doi:10.1261/rna.035279.112
 
 =head1 VERSIONS
 
-0.1.2 : THIS VERSION.  March 18, 2013.  Updates to documentation infromation only, program itself is the same as 0.1.1.  First released with ShortStack 0.4.2.
+0.1.3 : THIS VERSION. Developed April 12, 2013. First relased April 24, 2013 with ShortStack 0.4.3.
+
+0.1.2 : March 18, 2013.  Updates to documentation infromation only, program itself is the same as 0.1.1.  First released with ShortStack 0.4.2.
 
 0.1.1 : June 28, 2012.  Frist released with ShortStack 0.1.4.  Fixes program so it no longer breaks when unmapped lines are encountered (now it simply ignores them).  This also allows direct piping of bowtie data directly into Prep_bam.pl
 
@@ -337,26 +350,28 @@ =head1 INSTALL
 =head1 USAGE
                                                                                                                              
 1. For .bam files: 
-    
-    samtools view [input.bam] | Prep_bam.pl --prefix [prefix] --genome [genome.fasta]                                  
+	
+	samtools view [input.bam] | Prep_bam.pl --prefix [prefix] --genome [genome.fasta] --sortmem [INT]                                 
 
 2. For .sam.gz files:
 
-    gzip -d -c [input.sam.gz] | Prep_bam.pl --prefix [prefix] --genome [genome.fasta]                          
+	gzip -d -c [input.sam.gz] | Prep_bam.pl --prefix [prefix] --genome [genome.fasta] --sortmem [INT]                          
 
 3. For uncompressed .sam files:
 
-    Prep_bam.pl --prefix [prefix] --genome [genome.fasta] < [input.sam]     
+	Prep_bam.pl --prefix [prefix] --genome [genome.fasta] --sortmem [INT] < [input.sam]     
 
-4. To pipe in bowtie (version 1 e.g. 0.12.8) sam output directly:
+4. To pipe in bowtie (version 1 e.g. 0.12.x) sam output directly:
 
-    bowtie [bowtie_options] -S [bowtie_index] [trimmed_reads] | Prep_bam.pl --prefix [prefix] --genome [genome] 
+	bowtie [bowtie_options] -S [bowtie_index] [trimmed_reads] | Prep_bam.pl --prefix [prefix] --genome [genome] --sortmem [INT]
 
 =head1 OPTIONS
 
--- prefix [string] : base name for the final .bam and .bam.bai files that will be created    
+--prefix [string] : base name for the final .bam and .bam.bai files that will be created :: REQUIRED
                                       
--- genome [string] : Path to the fasta formatted genome file used for the mapping           
+--genome [string] : Path to the fasta formatted genome file used for the mapping :: REQUIRED
+
+--sortmem [integer] : Amount of memory, in bytes, to allocate to the samtools sort function. If not specified, samtools sort defaults to 500,000,000 (e.g. 476.8Mb).  Allocating more memory speeds sorting. :: OPTIONAL            
 
 =head1 NOTES
 
@@ -375,3 +390,6 @@ =head1 AUTHOR
 =cut
 
 
+
+
+
diff --git a/README b/README
@@ -6,14 +6,10 @@ CITATION
     If you use ShortStack in your work, please cite
 
     Axtell MJ. (2013) ShortStack: Comprehensive annotation and
-    quantification of small RNA genes. RNA. (In press).
-
-    As of this version release, a manuscript describing ShortStack is in
-    press at the journal "RNA". It might be published by the time you are
-    reading this, so please check Pubmed before citing!
+    quantification of small RNA genes. RNA. doi:10.1261/rna.035279.112
 
 VERSION
-    0.4.2 :: Released March 18, 2013
+    0.4.3 :: Released April 24, 2013
 
 AUTHOR
     Michael J. Axtell, Penn State University, mja18@psu.edu
@@ -42,7 +38,7 @@ INSTALL
     accordingly
 
 USAGE
-    Shortstack.pl [options] [in.bam] [genome.fasta]
+            Shortstack.pl [options] [in.bam] [genome.fasta] 
 
 QUICK START
     1. Install ShortStack.pl and Prep_bam.pl, and required third-party tools
@@ -64,8 +60,7 @@ QUICK START
     and indexing (assuming of course you've installed bowtie and built the
     bowtie index for your reference genome):
 
-    bowtie [bowtie_options] -S [bowtie_genome_index] [trimmed_reads] |
-    Prep_bam.pl --genome [genome.fasta] --prefix [file_name_prefix]
+            bowtie [bowtie_options] -S [bowtie_genome_index] [trimmed_reads] | Prep_bam.pl --genome [genome.fasta] --prefix [file_name_prefix]
 
     5. If you use another mapping method besides the one above, the final
     .bam formatted file must be sorted by chromosomal position, have NH:i:
@@ -369,7 +364,7 @@ OUTPUT
     To import this into R, here's a tip to deal with the first line, which
     has the headers but begins with a "#" character.
 
-        >results <- read.table("Results.txt", head=TRUE, sep="\t", comment.char="")
+            >results <- read.table("Results.txt", head=TRUE, sep="\t", comment.char="")
 
     Column 1: Locus : The genome-browser-friendly coordinates of the
     clusters. Coordinates are one-based, inclusive (e.g. Chr1:1-100 refers
@@ -455,12 +450,60 @@ OUTPUT
     this is not a HP or MIRNA locus, "NA" is entered instead.
 
     Column 21 : DuplexResult : The number of possible miRNA/miRNA* duplexes
-    in which neither partner spanned a loop and neither partnerill have its
-    own simple text file to display the details of the locus. These text
-    files all show A) the Name and genomic coordinates of the locus, B) the
-    sequence, in RNA form, C) the identified hairpin structure, in
-    dot-bracket notation, and D) all mappings whose start and stop is within
-    the interval being examined.
+    in which neither partner spanned a loop and neither partner had >
+    maxmiRUnpaired number of unpaired nucleotides in the putative duplex. If
+    this is not a HP or MIRNA locus, "NA" is entered instead.
+
+    Column 22 : StarResult : The number of putative miRNA*'s that were
+    actually sequenced. Values of 1 or more here indicate a MIRNA annoation.
+    If this is not a HP or MIRNA locus, "NA" is entered instead.
+
+    Column 23: Short : The total mappings from reads with lengths less than
+    --dicermin, either in raw reads (--raw mode), or mappings per million
+    mapped.
+
+    Column 24: Long : The total mappings from reads with lengths more than
+    --dicermax, either in raw reads (--raw mode), or mappings per million
+    mapped.
+
+    Columns 25 - the end : The total mappings from reads with the indicated
+    lengths. These are the sizes within the Dicer range.
+
+  Log.txt
+
+    This is a simple log file which records most of the information that is
+    also sent to STDERR during the run.
+
+  ShortStack.bed
+
+    This is a .bed file for viewing the clusters on a genome browser. It
+    follows the .bed specification given at the UCSC broswer site
+    <http://genome.ucsc.edu/FAQ/FAQformat.html>. Clusters are color-coded
+    based on the dominant size. Non-Dicer clusters are always dark gray.
+    Dicer-clusters are RGB-rainbow colored from red (shortest) to green
+    (middle) to blue (longest). Note that the bed coordinate system is
+    zero-based, and the 'stop' coordinate is the first nt NOT in the
+    interval. So, a 100 nt interval beginning at base 1 and ending at base
+    100 would have a start of 0 and a stop of 100 in the bed file.
+
+    Hairpins and MIRNAs are graphically indicated: The helical arms will be
+    shown as thick boxes, and the rest of the cluster will be thin lines.
+
+  miRNA_summary.txt
+
+    This is a tab-delimited text file that summarizes key features of the
+    loci annotated as MIRNAs, including mature miRNA sequences, miRNA-star
+    sequences, and the numbers of mappings for each and for the entire
+    locus.
+
+  Hairpin and MIRNA detail files
+
+    Unless the run was done in --nohp mode, each annotated hairpin-derived
+    and MIRNA locus will have its own simple text file to display the
+    details of the locus. These text files all show A) the Name and genomic
+    coordinates of the locus, B) the sequence, in RNA form, C) the
+    identified hairpin structure, in dot-bracket notation, and D) all
+    mappings whose start and stop is within the interval being examined.
 
     Reads mapped to the sense strand (sense relative to the hairpin, not
     necessarily relative to the genome) have "."s as placeholders and are
@@ -566,14 +609,50 @@ KEY METHODS
     criterion, than the one with the highest coverage fraction in the arms
     is retained. Note that this step contains a correction for reads that
     are "dyads" .. reads that map twice to a hairpin, once in each arm, on
-    opposite strands... this happens for perfect inverted repeat loci. Su
-    ShortStack's MIRNA annotation method is designed to reduce false
-    positives at the expense of an increased rate of false iRNA/miRNA* pair
-    must be at least 25% of the total small RNA abundance at the locus.
-    Finally, redundant candidate mature miRNAs are removed, as the steps
-    above initially might classify a small RNA as both a miRNA* and mature
-    miRNA. In such cases, the partner with the higher abundance is called
-    the miRNA, the other the miRNA*.
+    opposite strands... this happens for perfect inverted repeat loci. Such
+    reads are counted towards the sense strand only for a given hairpin.
+
+    9. The pattern of small RNA expression relative to the single hairpin
+    candidate is further scrutinized for polarity. The fraction of all
+    mappings in the hairpin interval must be >= [--minstrandfrac]. As in
+    step 8, this step corrects for "dyad" reads (see above). Hairpins that
+    pass this step are either HP or MIRNA loci. The coordinates of the
+    originally determined de novo locus are discarded, and replaced with the
+    hairpin coordinates.
+
+    10. Each potential hairpin that remains is next analyzed to see if it
+    qualifies as a MIRNA. MIRNA locus annotation is designed to satisfy the
+    criteria for de novo annotation of plant MIRNAs as described in Meyers
+    et al. (2008) Plant Cell 20:3186-3190. PMID: 19074682. In fact,
+    ShortStack's criteria is a little stricter than Meyers et al., in that
+    ShortStack has an absolute requirement for sequencing of the exact
+    predicted miRNA* sequence for a candidate mature miRNA. It is important
+    to note that ShortStack's MIRNA annotation method is designed to reduce
+    false positives at the expense of an increased rate of false negatives.
+    In other words, there are likely many bona fide MIRNA loci that end up
+    being classified as Hairpins, instead of MIRNAs, because they don't
+    quite meet the strict criteria set forth below.
+
+    - Hairpin Size: The total number of pairs in the hairpin must be <=
+    [--maxmiRHPPairs]
+
+    - Precision: There must be at least one candidate mature miRNA that
+    comprises at least 20% of the total abundance of small RNAs mapped to
+    the hairpin.
+
+    - Duplex: Both partners in a candidate miRNA/miRNA* duplex must not span
+    any loops (i.e. neither is allowed to have base-pairs to itself). In
+    addition, there must be <= [--maxmiRUnpaired] unpaired mature miRNA nts
+    in the miRNA/miRNA* duplex, and <= [--maxmiRUnpaired] unpaired miRNA*
+    nts in the miRNA/miRNA* duplex.
+
+    - Star: The exact predicted miRNA*s of candidate miRNAs must have at
+    least one mapped read, and the total abundance of any candidate mature
+    miRNA/miRNA* pair must be at least 25% of the total small RNA abundance
+    at the locus. Finally, redundant candidate mature miRNAs are removed, as
+    the steps above initially might classify a small RNA as both a miRNA*
+    and mature miRNA. In such cases, the partner with the higher abundance
+    is called the miRNA, the other the miRNA*.
 
     Hairpin loci passing all four of these loci are annotated as MIRNAs.
     Those failing one or more are reported as HP loci instead.

diff --git a/README_Prep_bam b/README_Prep_bam
@@ -4,19 +4,18 @@ SYNOPSIS
     chromosomal location, and a corresponding .bai index file.
 
 CITATION
-    If you use ShortStack in your work, please cite
+    If you use Prep_bam.pl in your work, please cite
 
     Axtell MJ. (2013) ShortStack: Comprehensive annotation and
-    quantification of small RNA genes. RNA. (In press).
-
-    As of this version release, a manuscript describing ShortStack is in
-    press at the journal "RNA". It might be published by the time you are
-    reading this, so please check Pubmed before citing!
+    quantification of small RNA genes. RNA. doi:10.1261/rna.035279.112
 
 VERSIONS
-    0.1.2 : THIS VERSION. March 18, 2013. Updates to documentation
-    infromation only, program itself is the same as 0.1.1. First released
-    with ShortStack 0.4.2.
+    0.1.3 : THIS VERSION. Developed April 12, 2013. First relased April 24,
+    2013 with ShortStack 0.4.3.
+
+    0.1.2 : March 18, 2013. Updates to documentation infromation only,
+    program itself is the same as 0.1.1. First released with ShortStack
+    0.4.2.
 
     0.1.1 : June 28, 2012. Frist released with ShortStack 0.1.4. Fixes
     program so it no longer breaks when unmapped lines are encountered (now
@@ -47,26 +46,31 @@ INSTALL
 USAGE
     1. For .bam files:
 
-        samtools view [input.bam] | Prep_bam.pl --prefix [prefix] --genome [genome.fasta]                                  
+            samtools view [input.bam] | Prep_bam.pl --prefix [prefix] --genome [genome.fasta] --sortmem [INT]                                 
 
     2. For .sam.gz files:
 
-        gzip -d -c [input.sam.gz] | Prep_bam.pl --prefix [prefix] --genome [genome.fasta]                          
+            gzip -d -c [input.sam.gz] | Prep_bam.pl --prefix [prefix] --genome [genome.fasta] --sortmem [INT]                          
 
     3. For uncompressed .sam files:
 
-        Prep_bam.pl --prefix [prefix] --genome [genome.fasta] < [input.sam]     
+            Prep_bam.pl --prefix [prefix] --genome [genome.fasta] --sortmem [INT] < [input.sam]     
 
-    4. To pipe in bowtie (version 1 e.g. 0.12.8) sam output directly:
+    4. To pipe in bowtie (version 1 e.g. 0.12.x) sam output directly:
 
-        bowtie [bowtie_options] -S [bowtie_index] [trimmed_reads] | Prep_bam.pl --prefix [prefix] --genome [genome] 
+            bowtie [bowtie_options] -S [bowtie_index] [trimmed_reads] | Prep_bam.pl --prefix [prefix] --genome [genome] --sortmem [INT]
 
 OPTIONS
-    -- prefix [string] : base name for the final .bam and .bam.bai files
-    that will be created
+    --prefix [string] : base name for the final .bam and .bam.bai files that
+    will be created :: REQUIRED
+
+    --genome [string] : Path to the fasta formatted genome file used for the
+    mapping :: REQUIRED
 
-    -- genome [string] : Path to the fasta formatted genome file used for
-    the mapping
+    --sortmem [integer] : Amount of memory, in bytes, to allocate to the
+    samtools sort function. If not specified, samtools sort defaults to
+    500,000,000 (e.g. 476.8Mb). Allocating more memory speeds sorting. ::
+    OPTIONAL
 
 NOTES
   Input format and assumptions