From e50d7ff1761b828e54026a7ba61354c634515bdd Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Sat, 17 Jan 2015 21:54:55 -0500 Subject: [PATCH 1/4] switch subsampled reads from fastq to bam --- assembly.py | 63 +++++++++++++++++++++++++++----------- pipes/rules/assembly.rules | 10 +++--- 2 files changed, 49 insertions(+), 24 deletions(-) diff --git a/assembly.py b/assembly.py index 86d2dd21c..b6dfb6219 100755 --- a/assembly.py +++ b/assembly.py @@ -17,26 +17,34 @@ log = logging.getLogger(__name__) -def bam_to_trim_rmdup_subsamp_fastqs(inBam, clipDb, n_reads=100000): +def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000, outFastqs=None): + ''' Take reads through Trimmomatic, Prinseq, and subsampling. + ''' + + # BAM -> fastq infq = list(map(util.file.mkstempfname, ['.in.1.fastq', '.in.2.fastq'])) tools.picard.SamToFastqTool().execute(inBam, infq[0], infq[1]) + # Trimmomatic trimfq = list(map(util.file.mkstempfname, ['.trim.1.fastq', '.trim.2.fastq'])) taxon_filter.trimmomatic(infq[0], infq[1], trimfq[0], trimfq[1], clipDb) os.unlink(infq[0]) os.unlink(infq[1]) + # Prinseq rmdupfq = list(map(util.file.mkstempfname, ['.rmdup.1.fastq', '.rmdup.2.fastq'])) read_utils.rmdup_prinseq_fastq(trimfq[0], rmdupfq[0]) read_utils.rmdup_prinseq_fastq(trimfq[1], rmdupfq[1]) os.unlink(trimfq[0]) os.unlink(trimfq[1]) - + + # Purge unmated purgefq = list(map(util.file.mkstempfname, ['.fix.1.fastq', '.fix.2.fastq'])) read_utils.purge_unmated(rmdupfq[0], rmdupfq[1], purgefq[0], purgefq[1]) os.unlink(rmdupfq[0]) os.unlink(rmdupfq[1]) + # Subsample subsampfq = list(map(util.file.mkstempfname, ['.subsamp.1.fastq', '.subsamp.2.fastq'])) cmd = [os.path.join(util.file.get_scripts_path(), 'subsampler.py'), '-n', str(n_reads), @@ -47,21 +55,40 @@ def bam_to_trim_rmdup_subsamp_fastqs(inBam, clipDb, n_reads=100000): subprocess.check_call(cmd) os.unlink(purgefq[0]) os.unlink(purgefq[1]) - return subsampfq + + # Fastq -> BAM + tmp_bam = util.file.mkstempfname('.subsamp.bam') + 
tmp_header = util.file.mkstempfname('.header.sam') + tools.picard.FastqToSamTool().execute( + subsampfq[0], subsampfq[1], 'Dummy', tmp_bam) + tools.samtools.SamtoolsTool().dumpHeader(inBam, tmp_header) + tools.samtools.SamtoolsTool().reheader(tmp_bam, tmp_header, outBam) + os.unlink(tmp_bam) + os.unlink(tmp_header) + + # Save fastqs if requested + if outFastqs: + shutil.copyfile(subsampfq[0], outFastqs[0]) + shutil.copyfile(subsampfq[1], outFastqs[1]) + os.unlink(subsampfq[0]) + os.unlink(subsampfq[1]) -def assemble_trinity(inBam, outFasta, clipDb, n_reads=100000, outFq=None): +def assemble_trinity(inBam, outFasta, clipDb, n_reads=100000, outReads=None): ''' This step runs the Trinity assembler. First trim reads with trimmomatic, rmdup with prinseq, and random subsample to no more than 100k reads. ''' - subsampfq = bam_to_trim_rmdup_subsamp_fastqs(inBam, clipDb, n_reads) + if outReads: + subsamp_bam = outReads + else: + subsamp_bam = util.file.mkstempfname('.subsamp.bam') + subsampfq = list(map(util.file.mkstempfname, ['.subsamp.1.fastq', '.subsamp.2.fastq'])) + trim_rmdup_subsamp_reads(inBam, clipDb, subsamp_bam, + n_reads=n_reads, outFastqs=subsampfq) tools.trinity.TrinityTool().execute(subsampfq[0], subsampfq[1], outFasta) - if outFq: - if len(outFq)!=2: - raise ValueError("outFq must have exactly two values") - shutil.copyfile(subsampfq[0], outFq[0]) - shutil.copyfile(subsampfq[1], outFq[1]) + if not outReads: + os.unlink(subsamp_bam) os.unlink(subsampfq[0]) os.unlink(subsampfq[1]) @@ -74,8 +101,8 @@ def parser_assemble_trinity(parser=argparse.ArgumentParser()): help='Output assembly.') parser.add_argument('--n_reads', default=100000, type=int, help='Subsample reads to no more than this many pairs. 
(default %(default)s)') - parser.add_argument('--outFq', default=None, nargs=2, - help='Save the trimmomatic/prinseq/subsamp reads to a pair of output fastq files') + parser.add_argument('--outReads', default=None, + help='Save the trimmomatic/prinseq/subsamp reads to a BAM file') util.cmd.common_args(parser, (('loglevel',None), ('version',None), ('tmpDir',None))) util.cmd.attach_main(parser, assemble_trinity, split_args=True) return parser @@ -103,13 +130,12 @@ def order_and_orient(inFasta, inReference, outFasta, inReads=None): '-musclepath', musclepath, '-samtoolspath', tools.samtools.SamtoolsTool().install_and_get_path()] if inReads: - if len(inReads)!=2: - raise ValueError("inReads must have exactly two values") + infq = list(map(util.file.mkstempfname, ['.in.1.fastq', '.in.2.fastq'])) + tools.picard.SamToFastqTool().execute(inReads, infq[0], infq[1]) mosaik = tools.mosaik.MosaikTool() - mosaikDir = os.path.dirname(mosaik.install_and_get_path()) cmd = cmd + [ - '-readfq', inReads[0], '-readfq2', inReads[1], - '-mosaikpath', mosaikDir, + '-readfq', infq[0], '-readfq2', infq[1], + '-mosaikpath', os.path.dirname(mosaik.install_and_get_path()), '-mosaiknetworkpath', mosaik.get_networkFile(), ] subprocess.check_call(cmd) @@ -131,7 +157,8 @@ def parser_order_and_orient(parser=argparse.ArgumentParser()): help='Reference genome, FASTA format.') parser.add_argument('outFasta', help='Output assembly, FASTA format.') - parser.add_argument('--inReads', default=None, nargs=2, help='Input reads in FASTQ format.') + parser.add_argument('--inReads', default=None, + help='Input reads in BAM format.') util.cmd.common_args(parser, (('loglevel',None), ('version',None), ('tmpDir',None))) util.cmd.attach_main(parser, order_and_orient, split_args=True) return parser diff --git a/pipes/rules/assembly.rules b/pipes/rules/assembly.rules index c64f8df09..0cd246f71 100644 --- a/pipes/rules/assembly.rules +++ b/pipes/rules/assembly.rules @@ -43,8 +43,7 @@ rule assemble_trinity: ''' input: 
config["dataDir"]+'/'+config["subdirs"]["per_sample"]+'/{sample}.taxfilt.bam' output: config["tmpDir"] +'/'+config["subdirs"]["assembly"]+'/{sample}.assembly1-trinity.fasta', - config["tmpDir"] +'/'+config["subdirs"]["assembly"]+'/{sample}.subsamp.1.fastq', - config["tmpDir"] +'/'+config["subdirs"]["assembly"]+'/{sample}.subsamp.2.fastq' + config["tmpDir"] +'/'+config["subdirs"]["assembly"]+'/{sample}.subsamp.bam' resources: mem=3 params: LSF=config.get('LSF_queues', {}).get('short', '-W 4:00'), n_reads="100000", @@ -54,7 +53,7 @@ rule assemble_trinity: makedirs(expand("{dir}/{subdir}", dir=[config["dataDir"],config["tmpDir"]], subdir=config["subdirs"]["assembly"])) - shell("{config[binDir]}/assembly.py assemble_trinity {input} {params.clipDb} {output[0]} --n_reads={params.n_reads} --outFq {output[1]} {output[2]}") + shell("{config[binDir]}/assembly.py assemble_trinity {input} {params.clipDb} {output[0]} --n_reads={params.n_reads} --outReads {output[1]}") rule orient_and_impute: ''' This step cleans up the Trinity assembly with a known reference genome. @@ -74,8 +73,7 @@ rule orient_and_impute: revert positions back to Ns where read support is lacking. 
''' input: config["tmpDir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.assembly1-trinity.fasta', - config["tmpDir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.subsamp.1.fastq', - config["tmpDir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.subsamp.2.fastq' + config["tmpDir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.subsamp.bam' output: config["tmpDir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.assembly2-vfat.fasta', config["tmpDir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.assembly3-modify.fasta' resources: mem=3 @@ -87,7 +85,7 @@ rule orient_and_impute: replace_length="55", logid="{sample}" run: - shell("{config[binDir]}/assembly.py order_and_orient {input[0]} {params.refGenome} {output[0]} --inReads {input[1]} {input[2]}") + shell("{config[binDir]}/assembly.py order_and_orient {input[0]} {params.refGenome} {output[0]} --inReads {input[1]}") shell("{config[binDir]}/assembly.py impute_from_reference {output[0]} {params.refGenome} {output[1]} --newName {params.renamed_prefix}{wildcards.sample} --replaceLength {params.replace_length} --minLength {params.length} --minUnambig {params.min_unambig}") rule refine_assembly_1: From c0b6c1e2cb7dfdef3320d4aadad57fed16c8d646 Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Sat, 17 Jan 2015 22:40:27 -0500 Subject: [PATCH 2/4] docs updates --- docs/assembly.rst | 4 +-- docs/broad_utils.rst | 2 +- docs/cmdline.rst | 14 ++++++++ docs/conf.py | 2 +- docs/index.rst | 20 +++++------ docs/install.rst | 68 ++++++++++++++++++++++++++++++++++++ docs/interhost.rst | 2 +- docs/intrahost.rst | 4 +-- docs/read_utils.rst | 2 +- docs/reports.rst | 2 +- docs/taxon_filter.rst | 2 +- pipes/ref_assisted/Snakefile | 11 +----- 12 files changed, 102 insertions(+), 31 deletions(-) create mode 100644 docs/cmdline.rst create mode 100644 docs/install.rst diff --git a/docs/assembly.rst b/docs/assembly.rst index 687f749be..edb502118 100644 --- a/docs/assembly.rst +++ b/docs/assembly.rst @@ -1,5 +1,5 @@ -Assembly commands 
-===================================== +assembly.py - *de novo* assembly +================================ .. argparse:: :module: assembly diff --git a/docs/broad_utils.rst b/docs/broad_utils.rst index 011d6c557..f33d24320 100644 --- a/docs/broad_utils.rst +++ b/docs/broad_utils.rst @@ -1,4 +1,4 @@ -Processing Broad Institute Sequencing outputs +broad_utils.py - for data generated at the Broad Institute ===================================== .. argparse:: diff --git a/docs/cmdline.rst b/docs/cmdline.rst new file mode 100644 index 000000000..444d082fa --- /dev/null +++ b/docs/cmdline.rst @@ -0,0 +1,14 @@ +Command line tools +================== + +.. toctree:: + + taxon_filter + assembly + interhost + intrahost + read_utils + reports + broad_utils + + diff --git a/docs/conf.py b/docs/conf.py index 0a4cce0c7..c5f21604b 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -254,7 +254,7 @@ # dir menu entry, description, category) texinfo_documents = [ ('index', 'viral-ngs', 'viral-ngs Documentation', - 'Broad Institute Viral Genomics', 'viral-ngs', 'One line description of project.', + 'Broad Institute Viral Genomics', 'viral-ngs', 'Viral genomics analysis pipelines', 'Miscellaneous'), ] diff --git a/docs/index.rst b/docs/index.rst index bd98bb15e..2e06110bd 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -3,19 +3,17 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -Welcome to viral-ngs's documentation! -===================================== +viral-ngs: genomic analysis pipelines for viral sequencing +========================================================== -Contents: + +Contents +-------- .. 
toctree:: :maxdepth: 2 - - taxon_filter - assembly - interhost - intrahost - read_utils - reports - broad_utils + :numbered: + + install + cmdline diff --git a/docs/install.rst b/docs/install.rst new file mode 100644 index 000000000..b82c670a1 --- /dev/null +++ b/docs/install.rst @@ -0,0 +1,68 @@ +Installation +============ + +This is known to install cleanly on most modern Linux systems with Python, +Java, and some basic development libraries. On Ubuntu 14.04 LTS, the +following APT packages should be installed on top of the vanilla setup:: + + python3 python3-pip python3-nose + python-software-properties + zlib zlib1g zlib1g-dev + libblas3gf libblas-dev liblapack3gf liblapack-dev + libatlas-dev libatlas3-base libatlas3gf-base libatlas-base-dev + gfortran + oracle-java8-installer + libncurses5-dev + +The Fortran libraries (including blas and atlas) are required to install +numpy via pip from source. numpy is not actually required if you have +Python 3.4, if you want to avoid this system dependency. + +**Java >= 1.7** is required by GATK and Picard. + + +Python dependencies +------------------- + +The **command line tools** require Python >= 2.7 or >= 3.4. Required packages +(like pysam and Biopython) are listed in requirements.txt and can be +installed the usual pip way:: + + pip install -r requirements.txt + +Additionally, in order to use the **pipeline infrastructure**, Python 3.4 +is required (Python 2 is not supported) and you must install snakemake +as well:: + + pip install snakemake==3.2 yappi==0.94 + +You should either sudo pip install or use a virtualenv (recommended). + + +Tool dependencies +----------------- + +A lot of effort has gone into writing auto download/compile wrappers for +most of the bioinformatic tools we rely on here. They will auto-download +and install the first time they are needed by any command. 
If you want +to pre-install all of the external tools, simply type this:: + + python -m unittest test.test_tools.TestToolsInstallation -v + +However, there are two tools in particular that cannot be auto-installed +due to licensing restrictions. You will need to download and install +these tools on your own (paying for it if your use case requires it) and +set environment variables pointing to their installed location. + + * GATK - http://www.broadinstitute.org/gatk/ + * Novoalign - http://www.novocraft.com/products/novoalign/ + +The environment variables you will need to set are GATK_PATH and +NOVOALIGN_PATH. These should be set to the full directory path +that contains these tools (the jar file for GATK and the executable +binaries for Novoalign). + +Alternatively, if you are using the Snakemake pipelines, you can create +a dictionary called "env_vars" in the config.json file for Snakemake, +and the pipelines will automatically set all environment variables prior +to running any scripts. diff --git a/docs/interhost.rst b/docs/interhost.rst index 48a2aadbe..2c6c7e052 100644 --- a/docs/interhost.rst +++ b/docs/interhost.rst @@ -1,4 +1,4 @@ -Interhost genetic variation +interhost.py - species and population-level genetic variation ===================================== .. argparse:: diff --git a/docs/intrahost.rst b/docs/intrahost.rst index da13aec94..8b5e47fea 100644 --- a/docs/intrahost.rst +++ b/docs/intrahost.rst @@ -1,5 +1,5 @@ -Intrahost genetic variation (iSNVs) -===================================== +intrahost.py - within-host genetic variation (iSNVs) +==================================================== .. argparse:: :module: intrahost diff --git a/docs/read_utils.rst b/docs/read_utils.rst index 63720b2ea..2d46eee99 100644 --- a/docs/read_utils.rst +++ b/docs/read_utils.rst @@ -1,4 +1,4 @@ -Read utilities +read_utils.py - utilities that manipulate bam and fastq files ===================================== .. 
argparse:: diff --git a/docs/reports.rst b/docs/reports.rst index 187c8b41e..46d61aca4 100644 --- a/docs/reports.rst +++ b/docs/reports.rst @@ -1,4 +1,4 @@ -Reports +reports.py - produce various metrics and reports ===================================== .. argparse:: diff --git a/docs/taxon_filter.rst b/docs/taxon_filter.rst index 775cfe0b2..e185e0dea 100644 --- a/docs/taxon_filter.rst +++ b/docs/taxon_filter.rst @@ -1,4 +1,4 @@ -Taxonomic filtration tools +taxon_filter.py - tools for taxonomic removal or filtration of reads ===================================== .. argparse:: diff --git a/pipes/ref_assisted/Snakefile b/pipes/ref_assisted/Snakefile index 1fde749ad..03c8504dd 100644 --- a/pipes/ref_assisted/Snakefile +++ b/pipes/ref_assisted/Snakefile @@ -12,16 +12,7 @@ Then type "ref_assisted" and wait a few hours for aligned BAMs, VCFs, and FASTAs. - This is designed for use on a single linux computer (e.g. Ubuntu 14.04 LTS) with: - apt-get install: - python3 python3-pip python-software-properties - zlib zlib1g zlib1g-dev - libblas3gf libblas-dev liblapack3gf liblapack-dev - libatlas-dev libatlas3-base libatlas3gf-base libatlas-base-dev - gfortran git oracle-java8-installer - libncurses5-dev python3-nose - pip3 install -r requirements.txt - pip3 install snakemake==3.2 yappi==0.94 + This is designed for use on a single linux computer. """ __author__ = 'Daniel Park ' From 78467ccb765170970d204aec79f737a8573b7d93 Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Sat, 17 Jan 2015 22:44:52 -0500 Subject: [PATCH 3/4] add subheader to install docs [ci skip] --- docs/install.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/install.rst b/docs/install.rst index b82c670a1..f8fabed00 100644 --- a/docs/install.rst +++ b/docs/install.rst @@ -1,6 +1,10 @@ Installation ============ + +System dependencies +------------------- + This is known to install cleanly on most modern Linux systems with Python, Java, and some basic development libraries. 
On Ubuntu 14.04 LTS, the following APT packages should be installed on top of the vanilla setup:: From 634a9c76904d5ff16e1565a3b477df6448c4eff4 Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Sat, 17 Jan 2015 22:58:02 -0500 Subject: [PATCH 4/4] more documentation --- README.md | 11 +++++------ docs/index.rst | 2 +- docs/pipeuse.rst | 29 +++++++++++++++++++++++++++++ 3 files changed, 35 insertions(+), 7 deletions(-) create mode 100644 docs/pipeuse.rst diff --git a/README.md b/README.md index 95a4a9e94..9d98aa601 100644 --- a/README.md +++ b/README.md @@ -5,11 +5,10 @@ viral-ngs ========= -A set of scripts and tools for the analysis of viral NGS data. More -details to come. +A set of scripts and tools for the analysis of viral NGS data. -Usage -========= - -Detailed command line help can be found at http://viral-ngs.readthedocs.org/ +More detailed documentation can be found at http://viral-ngs.readthedocs.org/ +This includes installation instructions, +usage instructions for the command line tools, +and usage of the pipeline infrastructure. diff --git a/docs/index.rst b/docs/index.rst index 2e06110bd..363ad264a 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -16,4 +16,4 @@ Contents install cmdline - + pipeuse diff --git a/docs/pipeuse.rst b/docs/pipeuse.rst new file mode 100644 index 000000000..60433dad6 --- /dev/null +++ b/docs/pipeuse.rst @@ -0,0 +1,29 @@ +Using the Snakemake pipelines +============================= + +Much more documentation to come... + +This utilizes Snakemake, which is documented at +https://bitbucket.org/johanneskoester/snakemake/wiki/Home + +Note that Python 3.4 is required to use these tools with Snakemake. 
+ + +Setting up an analysis directory +-------------------------------- + + +Configuring for your compute platform +------------------------------------- + + + +Assembly of pre-filtered reads +------------------------------ + +Taxonomic filtration of raw reads +--------------------------------- + +Starting from Illumina BCL directories +-------------------------------------- +