add metagenomics.py::filter_bam_to_taxa #883

Merged: 6 commits, Aug 28, 2018
Changes from 4 commits
121 changes: 121 additions & 0 deletions metagenomics.py
@@ -38,6 +38,7 @@
import tools.kraken
import tools.krona
import tools.picard
import tools.samtools
from util.file import open_or_gzopen

__commands__ = []
@@ -60,6 +61,10 @@ def maybe_compressed(fn):


class TaxonomyDb(object):
    """
    This class loads NCBI taxonomy information from:
    ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/
    """

    def __init__(
        self,
@@ -1145,6 +1150,122 @@ def kraken_library_ids(library):
class KrakenBuildError(Exception):
    '''Error while building kraken database.'''

def parser_filter_bam_to_taxa(parser=argparse.ArgumentParser()):
    parser.add_argument('in_bam', help='Input bam file.')
    parser.add_argument('in_reads_to_tax_ID_map_file', help='TSV file mapping read IDs to taxIDs, Kraken-format by default. Assumes bijective mapping of read ID to tax ID.')
[Contributor] A shorter name perhaps. The help is very descriptive.

    parser.add_argument('out_bam', help='Output bam file, filtered to the taxa specified')
    parser.add_argument('in_taxdb_nodes_path', help='nodes.dmp file from ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/')
[Contributor] Perhaps just nodes_dmp.

    parser.add_argument('in_taxdb_names_path', help='names.dmp file from ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/')
    parser.add_argument('--taxNames', nargs="+", dest="tax_names", help='The taxonomic names to include. More than one can be specified. Mapped to tax IDs by lowercase exact match only (ex. "Viruses"). This is in addition to any taxonomic IDs provided.')
    parser.add_argument('--taxIDs', nargs="+", type=int, dest="tax_ids", help='The NCBI taxonomy IDs to include. More than one can be specified. This is in addition to any taxonomic names provided.')
    parser.add_argument('--omit_children', action='store_true', dest="omit_children", help='Omit reads classified more specifically than each taxon specified (without this, a taxon and its children are included).')
[Contributor] Perhaps --without-children is a little more conventional as an argument name.

    parser.add_argument('--read_id_col', type=int, dest="read_id_col", help='The (zero-indexed) number of the column in in_reads_to_tax_ID_map_file containing read IDs. (default: %(default)s)', default=1)
    parser.add_argument('--tax_id_col', type=int, dest="tax_id_col", help='The (zero-indexed) number of the column in in_reads_to_tax_ID_map_file containing taxonomy IDs. (default: %(default)s)', default=2)
    parser.add_argument(
        '--JVMmemory',
        default=tools.picard.FilterSamReadsTool.jvmMemDefault,
        help='JVM virtual memory size (default: %(default)s)'
    )
    util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
    util.cmd.attach_main(parser, filter_bam_to_taxa, split_args=True)
    return parser

def filter_bam_to_taxa( in_bam,
                        in_reads_to_tax_ID_map_file,
                        out_bam,
                        in_taxdb_nodes_path,
                        in_taxdb_names_path,
                        tax_names=None,
                        tax_ids=None,
                        omit_children=False,
                        read_id_col=1,
                        tax_id_col=2,
                        JVMmemory=None ):
    """
    Filter an (already classified) input bam file to only include reads that have been mapped to specified
    taxonomic IDs or scientific names. This requires a classification file, as produced
    by tools such as Kraken, as well as the NCBI taxonomy database.
    """
    tax_ids = set(tax_ids) if tax_ids else set()
    tax_names = tax_names or []
    # use TaxonomyDb() class above and tree traversal/collection functions above
    db = TaxonomyDb(nodes_path=in_taxdb_nodes_path, names_path=in_taxdb_names_path, load_nodes=True, load_names=True)
    db.children = parents_to_children(db.parents)

    paired_read_base_pattern = re.compile(r'^(.*)/[1-2]$')

    # get taxIDs for each of the heading values specified (exact matches only)
    tax_ids_from_headings = set()
    for heading in tax_names:
        # map heading to taxID
        name_pattern = re.compile(heading.lower())
        for row_tax_id, names in db.names.items():
[Contributor] Would be cleaner in a function but might be marginally slower.

            found_heading = False
            if type(names) == list:
                # if taxID->list (of names)
                for name in names:
                    if name_pattern.match(name.lower()):
[Contributor] Perhaps use re.I for ignorecase instead of comparing both with lower().
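A minimal sketch of that suggestion, with hypothetical example values (re.escape is an addition here so the taxon name is treated as a literal string rather than as a regex):

import re

heading = "Viruses"           # example value from --taxNames
names = ["viruses", "Vira"]   # hypothetical names for one tax ID

# re.I makes the match case-insensitive without lower() on either side;
# re.escape keeps the heading a literal string, not a pattern.
name_pattern = re.compile(re.escape(heading), re.I)
matched = [n for n in names if name_pattern.match(n)]
print(matched)  # ['viruses']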

                        tax_ids_from_headings.add(row_tax_id)
                        log.debug("Found taxName match: %s -> %s" % (row_tax_id, name))
                        found_heading = True
                        break
                if found_heading:
                    break
            else:
[Contributor] Why not just do names = list(names) to guarantee it will be a list and just use the top logic?
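One caveat with that suggestion as written: list(names) on a str would split it into individual characters, so the normalization needs an explicit type check. A hedged sketch, reusing the loop variables from the diff above:

                # Normalize to a list so one code path handles both shapes
                # of db.names. Note: list(names) on a str would split it
                # into characters, hence isinstance instead.
                names = names if isinstance(names, list) else [names]
                for name in names:
                    if name_pattern.match(name.lower()):
                        tax_ids_from_headings.add(row_tax_id)
                        log.debug("Found taxName match: %s -> %s", row_tax_id, name)
                        found_heading = True
                        break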

                # if taxID->str
                if name_pattern.match(names.lower()):
                    tax_ids_from_headings.add(row_tax_id)
                    log.debug("Found taxName match: %s -> %s" % (row_tax_id, names))
                    break

    tax_ids |= tax_ids_from_headings

    log.debug("tax_ids %s" % tax_ids)
[Contributor] Use log.debug("tax_ids %s", tax_ids) instead of %-formatting inside the log call.
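For reference, the suggested form hands the argument to the logging call itself, which defers string formatting until the record is actually emitted:

import logging

log = logging.getLogger(__name__)
tax_ids = {10239, 186536}            # example values

log.debug("tax_ids %s" % tax_ids)    # formats eagerly, even if DEBUG is off
log.debug("tax_ids %s", tax_ids)     # suggested: formats only when emitted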

log.debug("tax_names %s" % tax_names)

# extend tax_ids to include IDs of children
tax_ids_to_include = set()
for tax_id in tax_ids:
tax_ids_to_include.add(tax_id)
if not omit_children:
child_ids = collect_children(db.children, set([tax_id]))
tax_ids_to_include |= set(child_ids)

tax_ids_to_include = frozenset(tax_ids_to_include) # frozenset membership check slightly faster

[Contributor] log.debug here for tax_ids_to_include?

[Member Author] I removed such a line since the set can be huge depending on the number of children (ex. --taxNames="Viruses"), and since we usually set --loglevel=DEBUG on DNAnexus it's nice to avoid filling the logfile with tax IDs.
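collect_children() is defined elsewhere in metagenomics.py; as a rough sketch of the traversal it performs, assuming db.children maps each tax ID to a list of its direct child IDs (as built by parents_to_children() above), an iterative depth-first walk would look like:

def collect_children(children, tax_ids):
    # Sketch only -- yields the seed tax IDs plus all of their
    # descendants; the real implementation may differ in detail.
    stack = list(tax_ids)
    while stack:
        tax_id = stack.pop()
        yield tax_id
        stack.extend(children.get(tax_id, []))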

    # perform the actual filtering to return a list of read IDs, written to a temp file
    with util.file.tempfname(".txt.gz") as temp_read_list:
        with open_or_gzopen(temp_read_list, "wt") as read_IDs_file:
            read_ids_written = 0
            for row in util.file.read_tabfile(in_reads_to_tax_ID_map_file):
                assert tax_id_col < len(row), "tax_id_col does not appear to be in range for the number of columns present in the mapping file"
                assert read_id_col < len(row), "read_id_col does not appear to be in range for the number of columns present in the mapping file"
                read_id = row[read_id_col]
                read_tax_id = int(row[tax_id_col])

                # transform read ID to take read pairs into account
                read_id_match = re.match(paired_read_base_pattern, read_id)
[Contributor] This only supports read names ending in /1 or /2?

[Member Author] Thanks, it should be more flexible now.
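As an illustration only (the revised pattern landed in a later commit of this PR and may differ), one more permissive form makes the pair suffix optional, so unsuffixed read IDs also pass through:

import re

# Hypothetical pattern: the /1 or /2 suffix is optional, so unpaired
# read IDs match as well and group(1) is always the base name.
paired_read_base_pattern = re.compile(r'^(.*?)(/[12])?$')

for read_id in ("readA/1", "readA/2", "readB"):
    base = paired_read_base_pattern.match(read_id).group(1)
    print(base)  # readA, readA, readB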

                if (read_id_match and
                        read_tax_id in tax_ids_to_include):
                    log.debug("Found matching read ID: %s" % read_id_match.group(1))
                    read_IDs_file.write(read_id_match.group(1) + "\n")
                    read_ids_written += 1

        # if we found reads matching the taxNames requested,
        if read_ids_written > 0:
            # filter the input bam to include only these
            tools.picard.FilterSamReadsTool().execute(in_bam,
                                                      False,
                                                      temp_read_list,
                                                      out_bam,
                                                      JVMmemory=JVMmemory)
        else:
            # otherwise, "touch" the output bam to contain the
            # header of the input bam (no matching reads)
            tools.samtools.SamtoolsTool().dumpHeader(in_bam, out_bam)
__commands__.append(('filter_bam_to_taxa', parser_filter_bam_to_taxa))
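For illustration, a direct in-Python call of the new function with hypothetical file paths (the keyword arguments mirror the parser above; 186538 is the tax ID also exercised in the unit test below):

import metagenomics

# Hypothetical paths; equivalent to the CLI form
#   metagenomics.py filter_bam_to_taxa in.bam reads.tsv.gz out.bam nodes.dmp names.dmp ...
metagenomics.filter_bam_to_taxa(
    in_bam="classified.bam",
    in_reads_to_tax_ID_map_file="kraken_reads.tsv.gz",
    out_bam="filtered.bam",
    in_taxdb_nodes_path="taxonomy/nodes.dmp",
    in_taxdb_names_path="taxonomy/names.dmp",
    tax_names=["Ebolavirus"],   # included along with all child taxa
    tax_ids=[186538],           # additional tax IDs to include
)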



def parser_kraken_taxlevel_summary(parser=argparse.ArgumentParser()):
    parser.add_argument('summary_files_in', nargs="+", help='Kraken-format summary text file with tab-delimited taxonomic levels.')
4 changes: 4 additions & 0 deletions pipes/WDL/dx-defaults-filter_classified_bam_to_taxa.json
@@ -0,0 +1,4 @@
{
  "filter_classified_bam_to_taxa.filter_bam_to_taxa.ncbi_taxonomy_db_tgz":
    "dx://file-F8Qqj880xf5g5bx23g0KjJbK"
}
5 changes: 5 additions & 0 deletions pipes/WDL/workflows/filter_classified_bam_to_taxa.wdl
@@ -0,0 +1,5 @@
import "tasks_metagenomics.wdl" as metagenomics

workflow filter_classified_bam_to_taxa {
  call metagenomics.filter_bam_to_taxa
}
55 changes: 55 additions & 0 deletions pipes/WDL/workflows/tasks/tasks_metagenomics.wdl
@@ -121,6 +121,61 @@ task krona {
  }
}

task filter_bam_to_taxa {
  File classified_bam
  File classified_reads_txt_gz
  File ncbi_taxonomy_db_tgz # nodes.dmp names.dmp
  Array[String]? taxonomic_names
  Array[Int]? taxonomic_ids

  String input_basename = basename(classified_bam, ".bam")

  parameter_meta {
    ncbi_taxonomy_db_tgz : "stream" # for DNAnexus, until WDL implements the File| type
  }

  command {
    set -ex -o pipefail

    # decompress DB to /mnt/db
    read_utils.py extract_tarball \
      ${ncbi_taxonomy_db_tgz} . \
      --loglevel=DEBUG

    TAX_NAMES="${sep=' ' taxonomic_names}"
    if [ -n "$TAX_NAMES" ]; then TAX_NAMES="--taxNames $TAX_NAMES"; fi

    TAX_IDs="${sep=' ' taxonomic_ids}"
    if [ -n "$TAX_IDs" ]; then TAX_IDs="--taxIDs $TAX_IDs"; fi

    metagenomics.py filter_bam_to_taxa \
      ${classified_bam} \
      ${classified_reads_txt_gz} \
      "${input_basename}_filtered.bam" \
      taxonomy/nodes.dmp \
      taxonomy/names.dmp \
      $TAX_NAMES \
      $TAX_IDs \
      --loglevel=DEBUG

    samtools view -c ${classified_bam} | tee classified_taxonomic_filter_read_count_pre
[Contributor] Why tee?

[Member Author] We use read_int() downstream to ingest the value as a WDL output, and with tee we get the value in the log as well.

samtools view -c "${input_basename}_filtered.bam" | tee classified_taxonomic_filter_read_count_post
}

output {
File bam_filtered_to_taxa = "${input_basename}_filtered.bam"
Int classified_taxonomic_filter_read_count_pre = read_int("classified_taxonomic_filter_read_count_pre")
Int classified_taxonomic_filter_read_count_post = read_int("classified_taxonomic_filter_read_count_post")
}

runtime {
docker: "quay.io/broadinstitute/viral-ngs"
memory: "4 GB"
cpu: 1
dx_instance_type: "mem1_ssd2_x2"
}

}

task diamond_contigs {
  File contigs_fasta
Binary file added test/input/TestBamFilter/expected.bam
Binary file not shown.
Binary file added test/input/TestBamFilter/input.bam
Binary file not shown.
46 changes: 45 additions & 1 deletion test/unit/test_metagenomics.py
@@ -18,7 +18,7 @@
import metagenomics
import util.file
import util.misc
from test import TestCaseWithTmp, _CPUS
from test import TestCaseWithTmp, assert_equal_bam_reads, _CPUS

if six.PY2:
    from StringIO import StringIO
@@ -309,3 +309,47 @@ def test_coverage_lca(taxa_db):
    assert metagenomics.coverage_lca([6, 7, 8], taxa_db.parents) == 6
    assert metagenomics.coverage_lca([10, 11, 12], taxa_db.parents, 50) == 7
    assert metagenomics.coverage_lca([9], taxa_db.parents) is None

class TestBamFilter(TestCaseWithTmp):
    def test_bam_filter_simple(self):
        temp_dir = tempfile.gettempdir()
        input_dir = util.file.get_test_input_path(self)
        taxonomy_dir = os.path.join(util.file.get_test_input_path(), "TestMetagenomicsSimple", "db", "taxonomy")

        filtered_bam = util.file.mkstempfname('.bam')
        args = [
            os.path.join(input_dir, "input.bam"),
            os.path.join(input_dir, "input.kraken-reads.tsv.gz"),
            filtered_bam,
            os.path.join(taxonomy_dir, "nodes.dmp"),
            os.path.join(taxonomy_dir, "names.dmp"),
            "--taxNames",
            "Ebolavirus"
        ]
        args = metagenomics.parser_filter_bam_to_taxa(argparse.ArgumentParser()).parse_args(args)
        args.func_main(args)

        expected_bam = os.path.join(input_dir, "expected.bam")
        assert_equal_bam_reads(self, filtered_bam, expected_bam)

    def test_bam_filter_by_tax_id(self):
        temp_dir = tempfile.gettempdir()
        input_dir = util.file.get_test_input_path(self)
        taxonomy_dir = os.path.join(util.file.get_test_input_path(), "TestMetagenomicsSimple", "db", "taxonomy")

        filtered_bam = util.file.mkstempfname('.bam')
        args = [
            os.path.join(input_dir, "input.bam"),
            os.path.join(input_dir, "input.kraken-reads.tsv.gz"),
            filtered_bam,
            os.path.join(taxonomy_dir, "nodes.dmp"),
            os.path.join(taxonomy_dir, "names.dmp"),
            "--taxIDs",
            "186538"
        ]
        args = metagenomics.parser_filter_bam_to_taxa(argparse.ArgumentParser()).parse_args(args)
        args.func_main(args)

        expected_bam = os.path.join(input_dir, "expected.bam")
        assert_equal_bam_reads(self, filtered_bam, expected_bam)