Skip to content

Commit

Permalink
Merge pull request #76 from broadinstitute/dpark-dev
Browse files Browse the repository at this point in the history
more docs, add align_and_fix python command
  • Loading branch information
dpark01 committed Jan 20, 2015
2 parents e1902ce + 43e5368 commit 2c10a9c
Show file tree
Hide file tree
Showing 18 changed files with 155 additions and 494 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
VERSION
test/output/

# Sphinx documentation
docs/_build/

# Mac OSX
Expand Down
21 changes: 11 additions & 10 deletions assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,9 @@
log = logging.getLogger(__name__)


def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000, outFastqs=None):
def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000):
''' Take reads through Trimmomatic, Prinseq, and subsampling.
This should probably move over to read_utils or taxon_filter.
'''

# BAM -> fastq
Expand Down Expand Up @@ -57,6 +58,9 @@ def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000, outFastqs=No
os.unlink(purgefq[1])

# Fastq -> BAM
# Note: this destroys RG IDs! We should instead just pull the list
# of IDs out of purge_unmated, random.sample them ourselves, and
# use FilterSamReadsTool to go straight from inBam -> outBam
tmp_bam = util.file.mkstempfname('.subsamp.bam')
tmp_header = util.file.mkstempfname('.header.sam')
tools.picard.FastqToSamTool().execute(
Expand All @@ -65,11 +69,6 @@ def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000, outFastqs=No
tools.samtools.SamtoolsTool().reheader(tmp_bam, tmp_header, outBam)
os.unlink(tmp_bam)
os.unlink(tmp_header)

# Save fastqs if requested
if outFastqs:
shutil.copyfile(subsampfq[0], outFastqs[0])
shutil.copyfile(subsampfq[1], outFastqs[1])
os.unlink(subsampfq[0])
os.unlink(subsampfq[1])

Expand All @@ -83,14 +82,16 @@ def assemble_trinity(inBam, outFasta, clipDb, n_reads=100000, outReads=None):
subsamp_bam = outReads
else:
subsamp_bam = util.file.mkstempfname('.subsamp.bam')

trim_rmdup_subsamp_reads(inBam, clipDb, subsamp_bam, n_reads=n_reads)
subsampfq = list(map(util.file.mkstempfname, ['.subsamp.1.fastq', '.subsamp.2.fastq']))
trim_rmdup_subsamp_reads(inBam, clipDb, subsamp_bam,
n_reads=n_reads, outFastqs=subsampfq)
tools.picard.SamToFastqTool().execute(subsamp_bam, subsampfq[0], subsampfq[1])
tools.trinity.TrinityTool().execute(subsampfq[0], subsampfq[1], outFasta)
if not outReads:
os.unlink(subsamp_bam)
os.unlink(subsampfq[0])
os.unlink(subsampfq[1])

if not outReads:
os.unlink(subsamp_bam)

def parser_assemble_trinity(parser=argparse.ArgumentParser()):
parser.add_argument('inBam',
Expand Down
2 changes: 1 addition & 1 deletion docs/broad_utils.rst
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
broad_utils.py - for data generated at the Broad Institute
=====================================
==========================================================

.. argparse::
:module: broad_utils
Expand Down
18 changes: 13 additions & 5 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,22 @@
# documentation root, use os.path.abspath to make it absolute, like shown here.
sys.path.insert(0, os.path.dirname(os.path.abspath('.')))

import util.version

# -- Mock out the heavyweight pip packages, esp those that require C ----
import mock
MOCK_MODULES = ['numpy', 'scipy', 'matplotlib', 'pysam',
'Bio', 'Bio.AlignIO', 'Bio.SeqIO', 'Bio.Data.IUPACData']
for mod_name in MOCK_MODULES:
sys.modules[mod_name] = mock.Mock()

# -- Obtain GIT version --
import subprocess
def _git_version():
    """Return the output of ``git describe --tags --always`` as text.

    The command is run with ``subprocess.check_output``. If the captured
    output is not already a ``str`` it is decoded as UTF-8, and surrounding
    whitespace (including the trailing newline) is stripped before returning.

    Raises:
        subprocess.CalledProcessError: if ``git`` exits with a non-zero
            status (standard ``check_output`` behavior).
    """
    # "--dirty" is deliberately omitted for the doc build.
    cmd = ['git', 'describe', '--tags', '--always']
    out = subprocess.check_output(cmd)
    # Use isinstance() rather than comparing type objects directly.
    if not isinstance(out, str):
        out = out.decode('utf-8')
    return out.strip()
__version__ = _git_version()

# -- General configuration ------------------------------------------------

Expand Down Expand Up @@ -65,7 +73,7 @@
# |version| and |release|, also used in various other places throughout the
# built documents.
#
release = util.version.get_version()
release = __version__
version = '.'.join(release.split('.')[:2])

# The language for content autogenerated by Sphinx. Refer to documentation
Expand All @@ -76,7 +84,7 @@
# non-false value, then it is used:
#today = ''
# Else, today_fmt is used as the format for a strftime call.
#today_fmt = '%B %d, %Y'
today_fmt = '%Y-%m-%d'

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
Expand Down Expand Up @@ -149,7 +157,7 @@

# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
#html_last_updated_fmt = '%b %d, %Y'
html_last_updated_fmt = '%Y-%m-%d. {}'.format(release)

# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
Expand Down
33 changes: 33 additions & 0 deletions docs/description.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
Description of the methods
==========================

Much more documentation to come...

TO DO: here we will put a high level description of the various tools that
exist here, perhaps with some pictures and such. We will describe why we
used certain tools and approaches / how other approaches fell short / what
kinds of problems certain steps are trying to solve. Perhaps some links to
papers and such. Kind of a mini-methods paper here.


Viral genome analysis
---------------------

*De novo* assembly, reference assisted assembly improvements,
gene annotation, species-level variation, within-host variation, etc.


Taxonomic read filtration
-------------------------

Especially human read depletion (prior to submission to NCBI SRA).
But also the part where we restrict to a particular taxon of interest
(the species you're studying).


Taxonomic read identification
-----------------------------

Nothing much here at the moment. This functionality will come later, and
we will integrate it when it's ready.

1 change: 1 addition & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ Contents
:maxdepth: 2
:numbered:

description
install
cmdline
pipeuse
4 changes: 4 additions & 0 deletions docs/install.rst
Original file line number Diff line number Diff line change
Expand Up @@ -70,3 +70,7 @@ Alternatively, if you are using the Snakemake pipelines, you can create
a dictionary called "env_vars" in the config.json file for Snakemake,
and the pipelines will automatically set all environment variables prior
to running any scripts.

The version of MOSAIK we use seems to fail to compile on GCC-4.9 but compiles
fine on GCC-4.4. We have not tried intermediate versions of GCC, nor the
latest versions of MOSAIK.
2 changes: 1 addition & 1 deletion docs/interhost.rst
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
interhost.py - species and population-level genetic variation
=====================================
=============================================================

.. argparse::
:module: interhost
Expand Down
2 changes: 1 addition & 1 deletion docs/read_utils.rst
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
read_utils.py - utilities that manipulate bam and fastq files
=====================================
=============================================================

.. argparse::
:module: read_utils
Expand Down
2 changes: 1 addition & 1 deletion docs/reports.rst
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
reports.py - produce various metrics and reports
=====================================
================================================

.. argparse::
:module: reports
Expand Down
2 changes: 1 addition & 1 deletion docs/taxon_filter.rst
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
taxon_filter.py - tools for taxonomic removal or filtration of reads
=====================================
====================================================================

.. argparse::
:module: taxon_filter
Expand Down
Loading

0 comments on commit 2c10a9c

Please sign in to comment.