add metagenomics.py::filter_bam_to_taxa #883

Merged: 6 commits, Aug 28, 2018
Changes from 4 commits
121 changes: 121 additions & 0 deletions metagenomics.py
@@ -38,6 +38,7 @@
import tools.kraken
import tools.krona
import tools.picard
import tools.samtools
from util.file import open_or_gzopen

__commands__ = []
@@ -60,6 +61,10 @@ def maybe_compressed(fn):


class TaxonomyDb(object):
    """
    This class loads NCBI taxonomy information from:
    ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/
    """

    def __init__(
        self,
@@ -1145,6 +1150,122 @@ def kraken_library_ids(library):
class KrakenBuildError(Exception):
    '''Error while building kraken database.'''

def parser_filter_bam_to_taxa(parser=argparse.ArgumentParser()):
    parser.add_argument('in_bam', help='Input bam file.')
    parser.add_argument('in_reads_to_tax_ID_map_file', help='TSV file mapping read IDs to taxIDs, Kraken-format by default. Assumes bijective mapping of read ID to tax ID.')
[Contributor] A shorter name perhaps. The help is very descriptive.

    parser.add_argument('out_bam', help='Output bam file, filtered to the taxa specified')
    parser.add_argument('in_taxdb_nodes_path', help='nodes.dmp file from ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/')
[Contributor] Perhaps just nodes_dmp.

    parser.add_argument('in_taxdb_names_path', help='names.dmp file from ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/')
    parser.add_argument('--taxNames', nargs="+", dest="tax_names", help='The taxonomic names to include. More than one can be specified. Mapped to tax IDs by lowercase exact match only (ex. "Viruses"). This is in addition to any taxonomic IDs provided.')
    parser.add_argument('--taxIDs', nargs="+", type=int, dest="tax_ids", help='The NCBI taxonomy IDs to include. More than one can be specified. This is in addition to any taxonomic names provided.')
    parser.add_argument('--omit_children', action='store_true', dest="omit_children", help='Omit reads classified more specifically than each taxon specified (without this, a taxon and its children are included).')
[Contributor] Perhaps --without-children is a little more conventional as an argument name.

    parser.add_argument('--read_id_col', type=int, dest="read_id_col", help='The (zero-indexed) number of the column in in_reads_to_tax_ID_map_file containing read IDs. (default: %(default)s)', default=1)
    parser.add_argument('--tax_id_col', type=int, dest="tax_id_col", help='The (zero-indexed) number of the column in in_reads_to_tax_ID_map_file containing taxonomy IDs. (default: %(default)s)', default=2)
    parser.add_argument(
        '--JVMmemory',
        default=tools.picard.FilterSamReadsTool.jvmMemDefault,
        help='JVM virtual memory size (default: %(default)s)'
    )
    util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
    util.cmd.attach_main(parser, filter_bam_to_taxa, split_args=True)
    return parser

def filter_bam_to_taxa( in_bam,
                        in_reads_to_tax_ID_map_file,
                        out_bam,
                        in_taxdb_nodes_path,
                        in_taxdb_names_path,
                        tax_names=None,
                        tax_ids=None,
                        omit_children=False,
                        read_id_col=1,
                        tax_id_col=2,
                        JVMmemory=None ):
    """
    Filter an (already classified) input bam file to only include reads that have been mapped to specified
    taxonomic IDs or scientific names. This requires a classification file, as produced
    by tools such as Kraken, as well as the NCBI taxonomy database.
    """
    tax_ids = set(tax_ids) if tax_ids else set()
    tax_names = tax_names or []
    # use TaxonomyDb() class above and tree traversal/collection functions above
    db = TaxonomyDb(nodes_path=in_taxdb_nodes_path, names_path=in_taxdb_names_path, load_nodes=True, load_names=True)
    db.children = parents_to_children(db.parents)

    paired_read_base_pattern = re.compile(r'^(.*)/[1-2]$')

    # get taxIDs for each of the heading values specified (exact matches only)
    tax_ids_from_headings = set()
    for heading in tax_names:
        # map heading to taxID
        name_pattern = re.compile(heading.lower())
        for row_tax_id, names in db.names.items():
[Contributor] Would be cleaner in a function but might be marginally slower.

            found_heading = False
            if type(names) == list:
                # if taxID->list (of names)
                for name in names:
                    if name_pattern.match(name.lower()):
[Contributor] Perhaps use re.I for ignorecase instead of comparing both with lower().
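A minimal sketch of that suggestion, with hypothetical example values (re.escape is an addition here so the taxon name is treated as a literal string rather than as a regex):

import re

heading = "Viruses"           # example value from --taxNames
names = ["viruses", "Vira"]   # hypothetical names for one tax ID

# re.I makes the match case-insensitive without lower() on either side;
# re.escape keeps the heading a literal string, not a pattern.
name_pattern = re.compile(re.escape(heading), re.I)
matched = [n for n in names if name_pattern.match(n)]
print(matched)  # ['viruses']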

                        tax_ids_from_headings.add(row_tax_id)
                        log.debug("Found taxName match: %s -> %s" % (row_tax_id, name))
                        found_heading = True
                        break
                if found_heading:
                    break
            else:
[Contributor] Why not just do names = list(names) to guarantee it will be a list and just use the top logic?
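One caveat with that suggestion as written: list(names) on a str would split it into individual characters, so the normalization needs an explicit type check. A hedged sketch, reusing the loop variables from the diff above:

                # Normalize to a list so one code path handles both shapes
                # of db.names. Note: list(names) on a str would split it
                # into characters, hence isinstance instead.
                names = names if isinstance(names, list) else [names]
                for name in names:
                    if name_pattern.match(name.lower()):
                        tax_ids_from_headings.add(row_tax_id)
                        log.debug("Found taxName match: %s -> %s", row_tax_id, name)
                        found_heading = True
                        break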

                # if taxID->str
                if name_pattern.match(names.lower()):
                    tax_ids_from_headings.add(row_tax_id)
                    log.debug("Found taxName match: %s -> %s" % (row_tax_id, names))
                    break

    tax_ids |= tax_ids_from_headings

    log.debug("tax_ids %s" % tax_ids)
[Contributor] Use log.debug("tax_ids %s", tax_ids) instead of %-formatting inside the log call.
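For reference, the suggested form hands the argument to the logging call itself, which defers string formatting until the record is actually emitted:

import logging

log = logging.getLogger(__name__)
tax_ids = {10239, 186536}            # example values

log.debug("tax_ids %s" % tax_ids)    # formats eagerly, even if DEBUG is off
log.debug("tax_ids %s", tax_ids)     # suggested: formats only when emitted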

log.debug("tax_names %s" % tax_names)

# extend tax_ids to include IDs of children
tax_ids_to_include = set()
for tax_id in tax_ids:
tax_ids_to_include.add(tax_id)
if not omit_children:
child_ids = collect_children(db.children, set([tax_id]))
tax_ids_to_include |= set(child_ids)

tax_ids_to_include = frozenset(tax_ids_to_include) # frozenset membership check slightly faster

[Contributor] log.debug here for tax_ids_to_include?

[Member Author] I removed such a line since the set can be huge depending on the number of children (ex. --taxNames="Viruses"), and since we usually set --loglevel=DEBUG on DNAnexus it's nice to avoid filling the logfile with tax IDs.
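collect_children() is defined elsewhere in metagenomics.py; as a rough sketch of the traversal it performs, assuming db.children maps each tax ID to a list of its direct child IDs (as built by parents_to_children() above), an iterative depth-first walk would look like:

def collect_children(children, tax_ids):
    # Sketch only -- yields the seed tax IDs plus all of their
    # descendants; the real implementation may differ in detail.
    stack = list(tax_ids)
    while stack:
        tax_id = stack.pop()
        yield tax_id
        stack.extend(children.get(tax_id, []))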

    # perform the actual filtering to return a list of read IDs, written to a temp file
    with util.file.tempfname(".txt.gz") as temp_read_list:
        with open_or_gzopen(temp_read_list, "wt") as read_IDs_file:
            read_ids_written = 0
            for row in util.file.read_tabfile(in_reads_to_tax_ID_map_file):
                assert tax_id_col < len(row), "tax_id_col does not appear to be in range for the number of columns present in the mapping file"
                assert read_id_col < len(row), "read_id_col does not appear to be in range for the number of columns present in the mapping file"
                read_id = row[read_id_col]
                read_tax_id = int(row[tax_id_col])

                # transform read ID to take read pairs into account
                read_id_match = re.match(paired_read_base_pattern, read_id)
[Contributor] This only supports read names ending in /1 or /2?

[Member Author] Thanks, it should be more flexible now.
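As an illustration only (the revised pattern landed in a later commit of this PR and may differ), one more permissive form makes the pair suffix optional, so unsuffixed read IDs also pass through:

import re

# Hypothetical pattern: the /1 or /2 suffix is optional, so unpaired
# read IDs match as well and group(1) is always the base name.
paired_read_base_pattern = re.compile(r'^(.*?)(/[12])?$')

for read_id in ("readA/1", "readA/2", "readB"):
    base = paired_read_base_pattern.match(read_id).group(1)
    print(base)  # readA, readA, readB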

                if (read_id_match and
                        read_tax_id in tax_ids_to_include):
                    log.debug("Found matching read ID: %s" % read_id_match.group(1))
                    read_IDs_file.write(read_id_match.group(1) + "\n")
                    read_ids_written += 1

        # if we found reads matching the taxNames requested,
        if read_ids_written > 0:
            # filter the input bam to include only these
            tools.picard.FilterSamReadsTool().execute(in_bam,
                                                      False,
                                                      temp_read_list,
                                                      out_bam,
                                                      JVMmemory=JVMmemory)
        else:
            # otherwise, "touch" the output bam to contain the
            # header of the input bam (no matching reads)
            tools.samtools.SamtoolsTool().dumpHeader(in_bam, out_bam)
__commands__.append(('filter_bam_to_taxa', parser_filter_bam_to_taxa))
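For illustration, a direct in-Python call of the new function with hypothetical file paths (the keyword arguments mirror the parser above; 186538 is the tax ID also exercised in the unit test below):

import metagenomics

# Hypothetical paths; equivalent to the CLI form
#   metagenomics.py filter_bam_to_taxa in.bam reads.tsv.gz out.bam nodes.dmp names.dmp ...
metagenomics.filter_bam_to_taxa(
    in_bam="classified.bam",
    in_reads_to_tax_ID_map_file="kraken_reads.tsv.gz",
    out_bam="filtered.bam",
    in_taxdb_nodes_path="taxonomy/nodes.dmp",
    in_taxdb_names_path="taxonomy/names.dmp",
    tax_names=["Ebolavirus"],   # included along with all child taxa
    tax_ids=[186538],           # additional tax IDs to include
)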



def parser_kraken_taxlevel_summary(parser=argparse.ArgumentParser()):
    parser.add_argument('summary_files_in', nargs="+", help='Kraken-format summary text file with tab-delimited taxonomic levels.')
4 changes: 4 additions & 0 deletions pipes/WDL/dx-defaults-filter_classified_bam_to_taxa.json
@@ -0,0 +1,4 @@
{
  "filter_classified_bam_to_taxa.filter_bam_to_taxa.ncbi_taxonomy_db_tgz":
    "dx://file-F8Qqj880xf5g5bx23g0KjJbK"
}
5 changes: 5 additions & 0 deletions pipes/WDL/workflows/filter_classified_bam_to_taxa.wdl
@@ -0,0 +1,5 @@
import "tasks_metagenomics.wdl" as metagenomics

workflow filter_classified_bam_to_taxa {
  call metagenomics.filter_bam_to_taxa
}
55 changes: 55 additions & 0 deletions pipes/WDL/workflows/tasks/tasks_metagenomics.wdl
@@ -121,6 +121,61 @@ task krona {
  }
}

task filter_bam_to_taxa {
  File classified_bam
  File classified_reads_txt_gz
  File ncbi_taxonomy_db_tgz # nodes.dmp names.dmp
  Array[String]? taxonomic_names
  Array[Int]? taxonomic_ids

  String input_basename = basename(classified_bam, ".bam")

  parameter_meta {
    ncbi_taxonomy_db_tgz : "stream" # for DNAnexus, until WDL implements the File| type
  }

  command {
    set -ex -o pipefail

    # decompress DB to /mnt/db
    read_utils.py extract_tarball \
      ${ncbi_taxonomy_db_tgz} . \
      --loglevel=DEBUG

    TAX_NAMES="${sep=' ' taxonomic_names}"
    if [ -n "$TAX_NAMES" ]; then TAX_NAMES="--taxNames $TAX_NAMES"; fi

    TAX_IDs="${sep=' ' taxonomic_ids}"
    if [ -n "$TAX_IDs" ]; then TAX_IDs="--taxIDs $TAX_IDs"; fi

    metagenomics.py filter_bam_to_taxa \
      ${classified_bam} \
      ${classified_reads_txt_gz} \
      "${input_basename}_filtered.bam" \
      taxonomy/nodes.dmp \
      taxonomy/names.dmp \
      $TAX_NAMES \
      $TAX_IDs \
      --loglevel=DEBUG

    samtools view -c ${classified_bam} | tee classified_taxonomic_filter_read_count_pre
[Contributor] Why tee?

[Member Author] We use read_int() downstream to ingest the value as a WDL output, and with tee we get the value in the log as well.

samtools view -c "${input_basename}_filtered.bam" | tee classified_taxonomic_filter_read_count_post
}

output {
File bam_filtered_to_taxa = "${input_basename}_filtered.bam"
Int classified_taxonomic_filter_read_count_pre = read_int("classified_taxonomic_filter_read_count_pre")
Int classified_taxonomic_filter_read_count_post = read_int("classified_taxonomic_filter_read_count_post")
}

runtime {
docker: "quay.io/broadinstitute/viral-ngs"
memory: "4 GB"
cpu: 1
dx_instance_type: "mem1_ssd2_x2"
}

}

task diamond_contigs {
  File contigs_fasta
Binary file added test/input/TestBamFilter/expected.bam
Binary file not shown.
Binary file added test/input/TestBamFilter/input.bam
Binary file not shown.
46 changes: 45 additions & 1 deletion test/unit/test_metagenomics.py
@@ -18,7 +18,7 @@
import metagenomics
import util.file
import util.misc
from test import TestCaseWithTmp, _CPUS
from test import TestCaseWithTmp, assert_equal_bam_reads, _CPUS

if six.PY2:
    from StringIO import StringIO
@@ -309,3 +309,47 @@ def test_coverage_lca(taxa_db):
    assert metagenomics.coverage_lca([6, 7, 8], taxa_db.parents) == 6
    assert metagenomics.coverage_lca([10, 11, 12], taxa_db.parents, 50) == 7
    assert metagenomics.coverage_lca([9], taxa_db.parents) is None

class TestBamFilter(TestCaseWithTmp):
    def test_bam_filter_simple(self):
        temp_dir = tempfile.gettempdir()
        input_dir = util.file.get_test_input_path(self)
        taxonomy_dir = os.path.join(util.file.get_test_input_path(), "TestMetagenomicsSimple", "db", "taxonomy")

        filtered_bam = util.file.mkstempfname('.bam')
        args = [
            os.path.join(input_dir, "input.bam"),
            os.path.join(input_dir, "input.kraken-reads.tsv.gz"),
            filtered_bam,
            os.path.join(taxonomy_dir, "nodes.dmp"),
            os.path.join(taxonomy_dir, "names.dmp"),
            "--taxNames",
            "Ebolavirus"
        ]
        args = metagenomics.parser_filter_bam_to_taxa(argparse.ArgumentParser()).parse_args(args)
        args.func_main(args)

        expected_bam = os.path.join(input_dir, "expected.bam")
        assert_equal_bam_reads(self, filtered_bam, expected_bam)

    def test_bam_filter_by_tax_id(self):
        temp_dir = tempfile.gettempdir()
        input_dir = util.file.get_test_input_path(self)
        taxonomy_dir = os.path.join(util.file.get_test_input_path(), "TestMetagenomicsSimple", "db", "taxonomy")

        filtered_bam = util.file.mkstempfname('.bam')
        args = [
            os.path.join(input_dir, "input.bam"),
            os.path.join(input_dir, "input.kraken-reads.tsv.gz"),
            filtered_bam,
            os.path.join(taxonomy_dir, "nodes.dmp"),
            os.path.join(taxonomy_dir, "names.dmp"),
            "--taxIDs",
            "186538"
        ]
        args = metagenomics.parser_filter_bam_to_taxa(argparse.ArgumentParser()).parse_args(args)
        args.func_main(args)

        expected_bam = os.path.join(input_dir, "expected.bam")
        assert_equal_bam_reads(self, filtered_bam, expected_bam)