Skip to content

Commit

Permalink
Merge pull request #9 from WiscEvan/master
Browse files Browse the repository at this point in the history
merge nodes when databases out of sync.
  • Loading branch information
jason-c-kwan authored Mar 2, 2020
2 parents bd3ab6a + 7d16d90 commit faf8243
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 3 deletions.
18 changes: 18 additions & 0 deletions pipeline/add_contig_taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,19 @@ def parse_nodes(nodes_dmp_path):
nodes_dmp.close()
return(nodes)

def parse_merged(fpath):
    """Parse a taxdump ``merged.dmp`` file into an old->new taxid mapping.

    Each non-blank line is expected to hold two integer taxids separated by
    ``\\t|\\t`` and terminated by ``\\t|\\n`` (old taxid first, new taxid second).

    Args:
        fpath: Path to the ``merged.dmp`` file.

    Returns:
        dict mapping each old (merged) taxid (int) to its new taxid (int).

    Raises:
        subprocess.CalledProcessError: if the ``wc -l`` line count fails.
        ValueError: if a non-blank line does not hold exactly two integers.
    """
    print(strftime("%Y-%m-%d %H:%M:%S") + ' Processing merged taxid nodes')
    # Count lines up front so the progress bar can report a total.
    wc_output = subprocess.check_output(['wc', '-l', fpath])
    number_of_lines = int(wc_output.split()[0])
    merged = {}
    # The context manager closes the file even if a malformed line raises.
    with open(fpath) as fh:
        for line in tqdm(fh, desc='parsing merged', total=number_of_lines, leave=False):
            # Tolerate blank lines (e.g. a trailing empty line) instead of
            # crashing on int('').
            if not line.strip():
                continue
            old_taxid, new_taxid = [int(taxid) for taxid in line.strip('\t|\n').split('\t|\t')]
            merged[old_taxid] = new_taxid
    return merged

def parse_lca(lca_fpath):
print( strftime("%Y-%m-%d %H:%M:%S") + ' Parsing lca taxonomy table')
# Work out number of lines in file
Expand All @@ -199,6 +212,9 @@ def parse_lca(lca_fpath):
orf, name, rank, taxid = line.strip().split('\t')
contig, orf_num = orf.rsplit('_', 1)
taxid = int(taxid)
# Convert any nodes that were recently suppressed/deprecated
# to their new node taxid. Otherwise keep the same taxid
taxid = merged.get(taxid, taxid)
if taxid != 1:
while rank not in set(rank_priority):
taxid = nodes[taxid]['parent']
Expand Down Expand Up @@ -330,12 +346,14 @@ def write_taxa(ranked_ctgs, contig_table_fpath, outfpath):
# Process NCBI taxdump files
name_fpath = os.path.join(taxdump_dir_path, 'names.dmp')
nodes_fpath = os.path.join(taxdump_dir_path, 'nodes.dmp')
merged_fpath = os.path.join(taxdump_dir_path, 'merged.dmp')

pp = pprint.PrettyPrinter(indent=4)

# Build taxid tree structure with associated canonical ranks and names
names = parse_names(name_fpath)
nodes = parse_nodes(nodes_fpath)
merged = parse_merged(merged_fpath)

rank_priority = [
'species',
Expand Down
6 changes: 3 additions & 3 deletions pipeline/make_taxonomy_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ def update_dbs(database_path, db='all'):
download_file(database_path, taxdump_url, taxdump_md5_url)

if os.path.isfile(database_path + '/taxdump.tar.gz'):
run_command('tar -xzf {}/taxdump.tar.gz -C {} names.dmp nodes.dmp'.format(database_path, database_path))
run_command('tar -xzf {}/taxdump.tar.gz -C {} names.dmp nodes.dmp merged.dmp'.format(database_path, database_path))
os.remove('{}/taxdump.tar.gz'.format(database_path))
print("nodes.dmp and names.dmp updated")

Expand All @@ -183,7 +183,7 @@ def check_dbs(db_path):
db_dict = {
'nr': ['nr.dmnd'],
'acc2taxid': ['prot.accession2taxid'],
'taxdump': ['names.dmp','nodes.dmp']
'taxdump': ['names.dmp','nodes.dmp', 'merged.dmp']
}
db_files = os.listdir(db_path)
for db in db_dict:
Expand Down Expand Up @@ -303,7 +303,7 @@ def run_taxonomy(pipeline_path, assembly_path, tax_table_path, db_dir_path,
parser.add_argument('-s', '--single_genome', help='Specifies single genome mode',
action='store_true')
parser.add_argument('-u', '--update', required=False, action='store_true',
help='Checks/Adds/Updates: nodes.dmp, names.dmp, accession2taxid, nr.dmnd files within specified directory.')
help='Checks/Adds/Updates: nodes.dmp, names.dmp, merged.dmp, accession2taxid, nr.dmnd files within specified directory.')

args = vars(parser.parse_args())

Expand Down

0 comments on commit faf8243

Please sign in to comment.