Merge pull request #74 from broadinstitute/dpark-dev

change subsampled reads from fastq to bam, add installation instructions to RTD
broadinstitute · Jan 18, 2015 · e1902ce · e1902ce
2 parents a594a66 + 634a9c7
commit e1902ce
Show file tree

Hide file tree

Showing 16 changed files with 190 additions and 62 deletions.
diff --git a/README.md b/README.md
@@ -5,11 +5,10 @@
 viral-ngs
 =========
 
-A set of scripts and tools for the analysis of viral NGS data. More 
-details to come.
+A set of scripts and tools for the analysis of viral NGS data.
 
 
-Usage
-=========
-
-Detailed command line help can be found at http://viral-ngs.readthedocs.org/
+More detailed documentation can be found at http://viral-ngs.readthedocs.org/
+This includes installation instructions,
+usage instructions for the command line tools,
+and usage of the pipeline infrastructure.
diff --git a/assembly.py b/assembly.py
@@ -17,26 +17,34 @@
 log = logging.getLogger(__name__)
 
 
-def bam_to_trim_rmdup_subsamp_fastqs(inBam, clipDb, n_reads=100000):
+def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000, outFastqs=None):
+ ''' Take reads through Trimmomatic, Prinseq, and subsampling.
+ '''
+
+ # BAM -> fastq
  infq = list(map(util.file.mkstempfname, ['.in.1.fastq', '.in.2.fastq']))
  tools.picard.SamToFastqTool().execute(inBam, infq[0], infq[1])
 
+ # Trimmomatic
  trimfq = list(map(util.file.mkstempfname, ['.trim.1.fastq', '.trim.2.fastq']))
  taxon_filter.trimmomatic(infq[0], infq[1], trimfq[0], trimfq[1], clipDb)
  os.unlink(infq[0])
  os.unlink(infq[1])
 
+ # Prinseq
  rmdupfq = list(map(util.file.mkstempfname, ['.rmdup.1.fastq', '.rmdup.2.fastq']))
  read_utils.rmdup_prinseq_fastq(trimfq[0], rmdupfq[0])
  read_utils.rmdup_prinseq_fastq(trimfq[1], rmdupfq[1])
  os.unlink(trimfq[0])
  os.unlink(trimfq[1])
-
+
+ # Purge unmated
  purgefq = list(map(util.file.mkstempfname, ['.fix.1.fastq', '.fix.2.fastq']))
  read_utils.purge_unmated(rmdupfq[0], rmdupfq[1], purgefq[0], purgefq[1])
  os.unlink(rmdupfq[0])
  os.unlink(rmdupfq[1])
 
+ # Subsample
  subsampfq = list(map(util.file.mkstempfname, ['.subsamp.1.fastq', '.subsamp.2.fastq']))
  cmd = [os.path.join(util.file.get_scripts_path(), 'subsampler.py'),
  '-n', str(n_reads),
@@ -47,21 +55,40 @@ def bam_to_trim_rmdup_subsamp_fastqs(inBam, clipDb, n_reads=100000):
  subprocess.check_call(cmd)
  os.unlink(purgefq[0])
  os.unlink(purgefq[1])
- return subsampfq
+
+ # Fastq -> BAM
+ tmp_bam = util.file.mkstempfname('.subsamp.bam')
+ tmp_header = util.file.mkstempfname('.header.sam')
+ tools.picard.FastqToSamTool().execute(
+ subsampfq[0], subsampfq[1], 'Dummy', tmp_bam)
+ tools.samtools.SamtoolsTool().dumpHeader(inBam, tmp_header)
+ tools.samtools.SamtoolsTool().reheader(tmp_bam, tmp_header, outBam)
+ os.unlink(tmp_bam)
+ os.unlink(tmp_header)
+
+ # Save fastqs if requested
+ if outFastqs:
+ shutil.copyfile(subsampfq[0], outFastqs[0])
+ shutil.copyfile(subsampfq[1], outFastqs[1])
+ os.unlink(subsampfq[0])
+ os.unlink(subsampfq[1])
 
 
-def assemble_trinity(inBam, outFasta, clipDb, n_reads=100000, outFq=None):
+def assemble_trinity(inBam, outFasta, clipDb, n_reads=100000, outReads=None):
  ''' This step runs the Trinity assembler.
  First trim reads with trimmomatic, rmdup with prinseq,
  and random subsample to no more than 100k reads.
  '''
- subsampfq = bam_to_trim_rmdup_subsamp_fastqs(inBam, clipDb, n_reads)
+ if outReads:
+ subsamp_bam = outReads
+ else:
+ subsamp_bam = util.file.mkstempfname('.subsamp.bam')
+ subsampfq = list(map(util.file.mkstempfname, ['.subsamp.1.fastq', '.subsamp.2.fastq']))
+ trim_rmdup_subsamp_reads(inBam, clipDb, subsamp_bam,
+ n_reads=n_reads, outFastqs=subsampfq)
  tools.trinity.TrinityTool().execute(subsampfq[0], subsampfq[1], outFasta)
- if outFq:
- if len(outFq)!=2:
- raise ValueError("outFq must have exactly two values")
- shutil.copyfile(subsampfq[0], outFq[0])
- shutil.copyfile(subsampfq[1], outFq[1])
+ if not outReads:
+ os.unlink(subsamp_bam)
  os.unlink(subsampfq[0])
  os.unlink(subsampfq[1])
 
@@ -74,8 +101,8 @@ def parser_assemble_trinity(parser=argparse.ArgumentParser()):
  help='Output assembly.')
  parser.add_argument('--n_reads', default=100000, type=int,
  help='Subsample reads to no more than this many pairs. (default %(default)s)')
- parser.add_argument('--outFq', default=None, nargs=2,
- help='Save the trimmomatic/prinseq/subsamp reads to a pair of output fastq files')
+ parser.add_argument('--outReads', default=None,
+ help='Save the trimmomatic/prinseq/subsamp reads to a BAM file')
  util.cmd.common_args(parser, (('loglevel',None), ('version',None), ('tmpDir',None)))
  util.cmd.attach_main(parser, assemble_trinity, split_args=True)
  return parser
@@ -103,13 +130,12 @@ def order_and_orient(inFasta, inReference, outFasta, inReads=None):
  '-musclepath', musclepath,
  '-samtoolspath', tools.samtools.SamtoolsTool().install_and_get_path()]
  if inReads:
- if len(inReads)!=2:
-  raise ValueError("inReads must have exactly two values")
+ infq = list(map(util.file.mkstempfname, ['.in.1.fastq', '.in.2.fastq']))
+ tools.picard.SamToFastqTool().execute(inReads, infq[0], infq[1])
  mosaik = tools.mosaik.MosaikTool()
- mosaikDir = os.path.dirname(mosaik.install_and_get_path())
  cmd = cmd + [
- '-readfq', inReads[0], '-readfq2', inReads[1],
- '-mosaikpath', mosaikDir,
+ '-readfq', infq[0], '-readfq2', infq[1],
+ '-mosaikpath', os.path.dirname(mosaik.install_and_get_path()),
  '-mosaiknetworkpath', mosaik.get_networkFile(),
  ]
  subprocess.check_call(cmd)
@@ -131,7 +157,8 @@ def parser_order_and_orient(parser=argparse.ArgumentParser()):
  help='Reference genome, FASTA format.')
  parser.add_argument('outFasta',
  help='Output assembly, FASTA format.')
- parser.add_argument('--inReads', default=None, nargs=2, help='Input reads in FASTQ format.')
+ parser.add_argument('--inReads', default=None,
+ help='Input reads in BAM format.')
  util.cmd.common_args(parser, (('loglevel',None), ('version',None), ('tmpDir',None)))
  util.cmd.attach_main(parser, order_and_orient, split_args=True)
  return parser

diff --git a/docs/assembly.rst b/docs/assembly.rst
@@ -1,5 +1,5 @@
-Assembly commands
-=====================================
+assembly.py - *de novo* assembly
+================================
 
 .. argparse::
  :module: assembly

diff --git a/docs/broad_utils.rst b/docs/broad_utils.rst
@@ -1,4 +1,4 @@
-Processing Broad Institute Sequencing outputs
+broad_utils.py - for data generated at the Broad Institute
 =====================================
 
 .. argparse::

diff --git a/docs/cmdline.rst b/docs/cmdline.rst
@@ -0,0 +1,14 @@
+Command line tools
+==================
+
+.. toctree::
+
+ taxon_filter
+ assembly
+ interhost
+ intrahost
+ read_utils
+ reports
+ broad_utils
+
+
diff --git a/docs/conf.py b/docs/conf.py
@@ -254,7 +254,7 @@
 # dir menu entry, description, category)
 texinfo_documents = [
  ('index', 'viral-ngs', 'viral-ngs Documentation',
- 'Broad Institute Viral Genomics', 'viral-ngs', 'One line description of project.',
+ 'Broad Institute Viral Genomics', 'viral-ngs', 'Viral genomics analysis pipelines',
  'Miscellaneous'),
 ]
 

diff --git a/docs/index.rst b/docs/index.rst
@@ -3,19 +3,17 @@
  You can adapt this file completely to your liking, but it should at least
  contain the root `toctree` directive.
 
-Welcome to viral-ngs's documentation!
-=====================================
+viral-ngs: genomic analysis pipelines for viral sequencing
+==========================================================
 
-Contents:
+
+Contents
+--------
 
 .. toctree::
  :maxdepth: 2
-
- taxon_filter
- assembly
- interhost
- intrahost
- read_utils
- reports
- broad_utils
-
+ :numbered:
+
+ install
+ cmdline
+ pipeuse
diff --git a/docs/install.rst b/docs/install.rst
@@ -0,0 +1,72 @@
+Installation
+============
+
+
+System dependencies
+-------------------
+
+This is known to install cleanly on most modern Linux systems with Python,
+Java, and some basic development libraries. On Ubuntu 14.04 LTS, the
+following APT packages should be installed on top of the vanilla setup::
+
+ python3 python3-pip python3-nose
+ python-software-properties
+ zlib zlib1g zlib1g-dev
+ libblas3gf libblas-dev liblapack3gf liblapack-dev
+ libatlas-dev libatlas3-base libatlas3gf-base libatlas-base-dev
+ gfortran
+ oracle-java8-installer
+ libncurses5-dev
+
+The Fortran libraries (including blas and atlas) are required to install
+numpy via pip from source. numpy is not actually required if you have
+Python 3.4, if you want to avoid this system dependency.
+
+**Java >= 1.7** is required by GATK and Picard.
+
+
+Python dependencies
+-------------------
+
+The **command line tools** require Python >= 2.7 or >= 3.4. Required packages
+(like pysam and Biopython) are listed in requirements.txt and can be
+installed the usual pip way::
+
+ pip install -r requirements.txt
+
+Additionally, in order to use the **pipeline infrastructure**, Python 3.4
+is required (Python 2 is not supported) and you must install snakemake
+as well::
+
+ pip install snakemake==3.2 yappi=0.94
+
+You should either sudo pip install or use a virtualenv (recommended).
+
+
+Tool dependencies
+-----------------
+
+A lot of effort has gone into writing auto download/compile wrappers for
+most of the bioinformatic tools we rely on here. They will auto-download
+and install the first time they are needed by any command. If you want
+to pre-install all of the external tools, simply type this::
+
+ python -m unittest test.test_tools.TestToolsInstallation -v
+
+However, there are two tools in particular that cannot be auto-installed
+due to licensing restrictions. You will need to download and install
+these tools on your own (paying for it if your use case requires it) and
+set environment variables pointing to their installed location.
+
+ * GATK - http://www.broadinstitute.org/gatk/
+ * Novoalign - http://www.novocraft.com/products/novoalign/
+
+The environment variables you will need to set are GATK_PATH and
+NOVOALIGN_PATH. These should be set to the full directory path
+that contains these tools (the jar file for GATK and the executable
+binaries for Novoalign).
+
+Alternatively, if you are using the Snakemake pipelines, you can create
+a dictionary called "env_vars" in the config.json file for Snakemake,
+and the pipelines will automatically set all environment variables prior
+to running any scripts.
diff --git a/docs/interhost.rst b/docs/interhost.rst
@@ -1,4 +1,4 @@
-Interhost genetic variation
+interhost.py - species and population-level genetic variation
 =====================================
 
 .. argparse::

diff --git a/docs/intrahost.rst b/docs/intrahost.rst
@@ -1,5 +1,5 @@
-Intrahost genetic variation (iSNVs)
-=====================================
+intrahost.py - within-host genetic variation (iSNVs)
+====================================================
 
 .. argparse::
  :module: intrahost

diff --git a/docs/pipeuse.rst b/docs/pipeuse.rst
@@ -0,0 +1,29 @@
+Using the Snakemake pipelines
+=============================
+
+Much more documentation to come...
+
+This utilizes Snakemake, which is documented at
+https://bitbucket.org/johanneskoester/snakemake/wiki/Home
+
+Note that Python 3.4 is required to use these tools with Snakemake.
+
+
+Setting up an analysis directory
+--------------------------------
+
+
+Configuring for your compute platform
+-------------------------------------
+
+
+
+Assembly of pre-filtered reads
+------------------------------
+
+Taxonomic filtration of raw reads
+---------------------------------
+
+Starting from Illumina BCL directories
+--------------------------------------
+
diff --git a/docs/read_utils.rst b/docs/read_utils.rst
@@ -1,4 +1,4 @@
-Read utilities
+read_utils.py - utilities that manipulate bam and fastq files
 =====================================
 
 .. argparse::

diff --git a/docs/reports.rst b/docs/reports.rst
@@ -1,4 +1,4 @@
-Reports
+reports.py - produce various metrics and reports
 =====================================
 
 .. argparse::

diff --git a/docs/taxon_filter.rst b/docs/taxon_filter.rst
@@ -1,4 +1,4 @@
-Taxonomic filtration tools
+taxon_filter.py - tools for taxonomic removal or filtration of reads
 =====================================
 
 .. argparse::

diff --git a/pipes/ref_assisted/Snakefile b/pipes/ref_assisted/Snakefile
@@ -12,16 +12,7 @@
  Then type "ref_assisted" and wait a few hours for aligned BAMs, VCFs, and
  FASTAs.
  
- This is designed for use on a single linux computer (e.g. Ubuntu 14.04 LTS) with:
- apt-get install:
- python3 python3-pip python-software-properties
- zlib zlib1g zlib1g-dev
- libblas3gf libblas-dev liblapack3gf liblapack-dev
- libatlas-dev libatlas3-base libatlas3gf-base libatlas-base-dev
- gfortran git oracle-java8-installer
- libncurses5-dev python3-nose
- pip3 install -r requirements.txt
- pip3 install snakemake==3.2 yappi==0.94
+ This is designed for use on a single linux computer.
 """
 
 __author__ = 'Daniel Park <dpark@broadinstitute.org>'