Merge pull request #9 from WiscEvan/master

Merge nodes when the taxonomy databases are out of sync.
KwanLab · Mar 2, 2020 · faf8243 · faf8243
2 parents bd3ab6a + 7d16d90
commit faf8243
Show file tree

Hide file tree

Showing 2 changed files with 21 additions and 3 deletions.
diff --git a/pipeline/add_contig_taxonomy.py b/pipeline/add_contig_taxonomy.py
@@ -185,6 +185,19 @@ def parse_nodes(nodes_dmp_path):
     nodes_dmp.close()
     return(nodes)
 
+def parse_merged(fpath):
+    """Parse the merged-taxid file at fpath into a dict of {old_taxid: new_taxid} ints."""
+    print(strftime("%Y-%m-%d %H:%M:%S") + ' Processing merged taxid nodes')
+    # The line count from `wc -l` is only used as the progress-bar total.
+    number_of_lines = int(subprocess.check_output(['wc', '-l', fpath]).split()[0])
+    merged = {}
+    with open(fpath) as fh:  # closed even if a malformed line makes int() raise
+        for line in tqdm(fh, desc='parsing merged', total=number_of_lines, leave=False):
+            # Trim tab, '|' and newline characters from both ends, then split on '\t|\t'.
+            old_taxid, new_taxid = [int(taxid) for taxid in line.strip('\t|\n').split('\t|\t')]
+            merged[old_taxid] = new_taxid
+    return merged
+
 def parse_lca(lca_fpath):
     print( strftime("%Y-%m-%d %H:%M:%S") + ' Parsing lca taxonomy table')
     # Work out number of lines in file
@@ -199,6 +212,9 @@ def parse_lca(lca_fpath):
         orf, name, rank, taxid = line.strip().split('\t')
         contig, orf_num = orf.rsplit('_', 1)
         taxid = int(taxid)
+        # Convert any nodes that were recently suppressed/deprecated
+        # to their new node taxid. Otherwise keep the same taxid
+        taxid = merged.get(taxid, taxid)
         if taxid != 1:
             while rank not in set(rank_priority):
                 taxid = nodes[taxid]['parent']
@@ -330,12 +346,14 @@ def write_taxa(ranked_ctgs, contig_table_fpath, outfpath):
 # Process NCBI taxdump files
 name_fpath = os.path.join(taxdump_dir_path, 'names.dmp')
 nodes_fpath = os.path.join(taxdump_dir_path, 'nodes.dmp')
+merged_fpath = os.path.join(taxdump_dir_path, 'merged.dmp')
 
 pp = pprint.PrettyPrinter(indent=4)
 
 # Build taxid tree structure with associated canoncial ranks and names
 names = parse_names(name_fpath)
 nodes = parse_nodes(nodes_fpath)
+merged = parse_merged(merged_fpath)
 
 rank_priority = [
     'species',

diff --git a/pipeline/make_taxonomy_table.py b/pipeline/make_taxonomy_table.py
@@ -164,7 +164,7 @@ def update_dbs(database_path, db='all'):
 			download_file(database_path, taxdump_url, taxdump_md5_url)
 
 		if os.path.isfile(database_path + '/taxdump.tar.gz'):
-			run_command('tar -xzf {}/taxdump.tar.gz -C {} names.dmp nodes.dmp'.format(database_path, database_path))
+			run_command('tar -xzf {}/taxdump.tar.gz -C {} names.dmp nodes.dmp merged.dmp'.format(database_path, database_path))
 			os.remove('{}/taxdump.tar.gz'.format(database_path))
 			print("nodes.dmp and names.dmp updated")
 
@@ -183,7 +183,7 @@ def check_dbs(db_path):
 		db_dict = {
 			'nr': ['nr.dmnd'],
 			'acc2taxid': ['prot.accession2taxid'],
-			'taxdump': ['names.dmp','nodes.dmp']
+			'taxdump': ['names.dmp','nodes.dmp', 'merged.dmp']
 			}
 	db_files = os.listdir(db_path)
 	for db in db_dict:
@@ -303,7 +303,7 @@ def run_taxonomy(pipeline_path, assembly_path, tax_table_path, db_dir_path,
 parser.add_argument('-s', '--single_genome', help='Specifies single genome mode',
 	action='store_true')
 parser.add_argument('-u', '--update', required=False, action='store_true',
-	help='Checks/Adds/Updates: nodes.dmp, names.dmp, accession2taxid, nr.dmnd files within specified directory.')
+	help='Checks/Adds/Updates: nodes.dmp, names.dmp, merged.dmp, accession2taxid, nr.dmnd files within specified directory.')
 
 args = vars(parser.parse_args())