add metagenomics.py::filter_bam_to_taxa #883
Changes from 4 commits
@@ -38,6 +38,7 @@
import tools.kraken
import tools.krona
import tools.picard
import tools.samtools
from util.file import open_or_gzopen

__commands__ = []
@@ -60,6 +61,10 @@ def maybe_compressed(fn):


class TaxonomyDb(object):
    """
    This class loads NCBI taxonomy information from:
    ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/
    """

    def __init__(
        self,

@@ -1145,6 +1150,122 @@ def kraken_library_ids(library):


class KrakenBuildError(Exception):
    '''Error while building kraken database.'''

def parser_filter_bam_to_taxa(parser=argparse.ArgumentParser()):
    parser.add_argument('in_bam', help='Input bam file.')
    parser.add_argument('in_reads_to_tax_ID_map_file', help='TSV file mapping read IDs to taxIDs, Kraken-format by default. Assumes bijective mapping of read ID to tax ID.')
    parser.add_argument('out_bam', help='Output bam file, filtered to the taxa specified')
    parser.add_argument('in_taxdb_nodes_path', help='nodes.dmp file from ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/')
    parser.add_argument('in_taxdb_names_path', help='names.dmp file from ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/')
    parser.add_argument('--taxNames', nargs="+", dest="tax_names", help='The taxonomic names to include. More than one can be specified. Mapped to tax IDs by lowercase exact match only. Ex. "Viruses". This is in addition to any taxonomic IDs provided.')
    parser.add_argument('--taxIDs', nargs="+", type=int, dest="tax_ids", help='The NCBI taxonomy IDs to include. More than one can be specified. This is in addition to any taxonomic names provided.')
    parser.add_argument('--omit_children', action='store_true', dest="omit_children", help='Omit reads classified more specifically than each taxon specified (without this, a taxon and its children are included).')
    parser.add_argument('--read_id_col', type=int, dest="read_id_col", help='The (zero-indexed) number of the column in in_reads_to_tax_ID_map_file containing read IDs. (default: %(default)s)', default=1)
    parser.add_argument('--tax_id_col', type=int, dest="tax_id_col", help='The (zero-indexed) number of the column in in_reads_to_tax_ID_map_file containing taxonomy IDs. (default: %(default)s)', default=2)
    parser.add_argument(
        '--JVMmemory',
        default=tools.picard.FilterSamReadsTool.jvmMemDefault,
        help='JVM virtual memory size (default: %(default)s)'
    )
    util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
    util.cmd.attach_main(parser, filter_bam_to_taxa, split_args=True)
    return parser


def filter_bam_to_taxa(in_bam,
                       in_reads_to_tax_ID_map_file,
                       out_bam,
                       in_taxdb_nodes_path,
                       in_taxdb_names_path,
                       tax_names=None,
                       tax_ids=None,
                       omit_children=False,
                       read_id_col=1,
                       tax_id_col=2,
                       JVMmemory=None):
    """
    Filter an (already classified) input bam file to only include reads that have been mapped to specified
    taxonomic IDs or scientific names. This requires a classification file, as produced
    by tools such as Kraken, as well as the NCBI taxonomy database.
    """
    tax_ids = set(tax_ids) if tax_ids else set()
    tax_names = tax_names or []
    # use the TaxonomyDb() class and the tree traversal/collection functions defined above
    db = TaxonomyDb(nodes_path=in_taxdb_nodes_path, names_path=in_taxdb_names_path, load_nodes=True, load_names=True)
    db.children = parents_to_children(db.parents)

    paired_read_base_pattern = re.compile(r'^(.*)/[1-2]$')

    # get tax IDs for each of the heading values specified (exact matches only)
    tax_ids_from_headings = set()
    for heading in tax_names:
        # map heading to tax ID
        name_pattern = re.compile(heading.lower())
        for row_tax_id, names in db.names.items():

Review comment: Would be cleaner in a function, but might be marginally slower.

            found_heading = False
            if type(names) == list:
                # if taxID -> list (of names)
                for name in names:
                    if name_pattern.match(name.lower()):
                        tax_ids_from_headings.add(row_tax_id)
                        log.debug("Found taxName match: %s -> %s" % (row_tax_id, name))
                        found_heading = True
                        break
                if found_heading:
                    break
            else:
                # if taxID -> str
                if name_pattern.match(names.lower()):
                    tax_ids_from_headings.add(row_tax_id)
                    log.debug("Found taxName match: %s -> %s" % (row_tax_id, names))
                    break
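As the review note above suggests, the name-to-ID lookup could be pulled out into a helper. A minimal sketch of that idea, not part of this diff (the helper name is hypothetical; it keeps the same lowercase first-match semantics as the loop above):

def _first_tax_id_for_name(db, name):
    # Illustrative helper only (not in this PR): return the first tax ID whose
    # name matches `name` (lowercase match via re.match, mirroring the loop above), or None.
    name_pattern = re.compile(name.lower())
    for row_tax_id, names in db.names.items():
        candidates = names if isinstance(names, list) else [names]
        for candidate in candidates:
            if name_pattern.match(candidate.lower()):
                return row_tax_id
    return None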

    tax_ids |= tax_ids_from_headings

    log.debug("tax_ids %s" % tax_ids)
    log.debug("tax_names %s" % tax_names)

    # extend tax_ids to include the IDs of children
    tax_ids_to_include = set()
    for tax_id in tax_ids:
        tax_ids_to_include.add(tax_id)
        if not omit_children:
            child_ids = collect_children(db.children, set([tax_id]))
            tax_ids_to_include |= set(child_ids)

    tax_ids_to_include = frozenset(tax_ids_to_include)  # frozenset membership check is slightly faster

Review reply: I removed such a line since the set can be huge depending on the number of children (ex. …).

    # perform the actual filtering to produce a list of read IDs, written to a temp file
    with util.file.tempfname(".txt.gz") as temp_read_list:
        with open_or_gzopen(temp_read_list, "wt") as read_IDs_file:
            read_ids_written = 0
            for row in util.file.read_tabfile(in_reads_to_tax_ID_map_file):
                assert tax_id_col < len(row), "tax_id_col does not appear to be in range for the number of columns present in the mapping file"
                assert read_id_col < len(row), "read_id_col does not appear to be in range for the number of columns present in the mapping file"
                read_id = row[read_id_col]
                read_tax_id = int(row[tax_id_col])

                # transform the read ID to take read pairs into account
                read_id_match = re.match(paired_read_base_pattern, read_id)

Review comment: This only supports read names with formats ending in /1 or /2. Reply: Thanks, it should be more flexible now.
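As an illustration of the "more flexible" handling mentioned in the reply, and not necessarily the exact change made in a later commit, the mate suffix could be made optional so that unpaired read names also match, with group(1) holding the bare read name in either case:

paired_read_base_pattern = re.compile(r'^(.*?)(/[1-2])?$')  # sketch only: optional /1 or /2 suffix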
                if (read_id_match and
                        read_tax_id in tax_ids_to_include):
                    log.debug("Found matching read ID: %s" % read_id_match.group(1))
                    read_IDs_file.write(read_id_match.group(1) + "\n")
                    read_ids_written += 1

        # if we found reads matching the taxNames requested,
        if read_ids_written > 0:
            # filter the input bam to include only these
            tools.picard.FilterSamReadsTool().execute(in_bam,
                                                      False,
                                                      temp_read_list,
                                                      out_bam,
                                                      JVMmemory=JVMmemory)
        else:
            # otherwise, "touch" the output bam to contain the
            # header of the input bam (no matching reads)
            tools.samtools.SamtoolsTool().dumpHeader(in_bam, out_bam)


__commands__.append(('filter_bam_to_taxa', parser_filter_bam_to_taxa))
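For orientation, a direct call of the new function might look like the following sketch; the file paths here are placeholders rather than values from this PR.

filter_bam_to_taxa(
    in_bam="classified.bam",                            # reads already classified by e.g. Kraken
    in_reads_to_tax_ID_map_file="kraken_reads.txt.gz",  # read ID -> tax ID mapping
    out_bam="classified.viruses.bam",
    in_taxdb_nodes_path="taxonomy/nodes.dmp",
    in_taxdb_names_path="taxonomy/names.dmp",
    tax_names=["Viruses"],                              # keep reads under the Viruses taxon and its children
)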


def parser_kraken_taxlevel_summary(parser=argparse.ArgumentParser()):
    parser.add_argument('summary_files_in', nargs="+", help='Kraken-format summary text file with tab-delimited taxonomic levels.')
@@ -0,0 +1,4 @@
{
  "filter_classified_bam_to_taxa.filter_bam_to_taxa.ncbi_taxonomy_db_tgz":
    "dx://file-F8Qqj880xf5g5bx23g0KjJbK"
}
@@ -0,0 +1,5 @@
import "tasks_metagenomics.wdl" as metagenomics | ||
|
||
workflow filter_classified_bam_to_taxa { | ||
call metagenomics.filter_bam_to_taxa | ||
} |
@@ -121,6 +121,61 @@ task krona {
    }
}


task filter_bam_to_taxa {
    File classified_bam
    File classified_reads_txt_gz
    File ncbi_taxonomy_db_tgz  # nodes.dmp names.dmp
    Array[String]? taxonomic_names
    Array[Int]? taxonomic_ids

    String input_basename = basename(classified_bam, ".bam")

    parameter_meta {
        ncbi_taxonomy_db_tgz : "stream"  # for DNAnexus, until WDL implements the File| type
    }

    command {
        set -ex -o pipefail

        # decompress the taxonomy DB tarball into the current directory (provides taxonomy/nodes.dmp and taxonomy/names.dmp)
        read_utils.py extract_tarball \
            ${ncbi_taxonomy_db_tgz} . \
            --loglevel=DEBUG

        TAX_NAMES="${sep=' ' taxonomic_names}"
        if [ -n "$TAX_NAMES" ]; then TAX_NAMES="--taxNames $TAX_NAMES"; fi

        TAX_IDs="${sep=' ' taxonomic_ids}"
        if [ -n "$TAX_IDs" ]; then TAX_IDs="--taxIDs $TAX_IDs"; fi

        metagenomics.py filter_bam_to_taxa \
            ${classified_bam} \
            ${classified_reads_txt_gz} \
            "${input_basename}_filtered.bam" \
            taxonomy/nodes.dmp \
            taxonomy/names.dmp \
            $TAX_NAMES \
            $TAX_IDs \
            --loglevel=DEBUG

        samtools view -c ${classified_bam} | tee classified_taxonomic_filter_read_count_pre

Review comment: Why tee?

        samtools view -c "${input_basename}_filtered.bam" | tee classified_taxonomic_filter_read_count_post
    }

    output {
        File bam_filtered_to_taxa = "${input_basename}_filtered.bam"
        Int classified_taxonomic_filter_read_count_pre = read_int("classified_taxonomic_filter_read_count_pre")
        Int classified_taxonomic_filter_read_count_post = read_int("classified_taxonomic_filter_read_count_post")
    }

    runtime {
        docker: "quay.io/broadinstitute/viral-ngs"
        memory: "4 GB"
        cpu: 1
        dx_instance_type: "mem1_ssd2_x2"
    }
}


task diamond_contigs {
    File contigs_fasta

Review comment: A shorter name, perhaps. The help is very descriptive.