From e16b4b834d6285d6460d571b014047ed2b9a1dc1 Mon Sep 17 00:00:00 2001
From: EvanRees
Date: Wed, 19 Feb 2020 17:54:46 -0600
Subject: [PATCH 1/4] updated add_contig_taxonomy.py to merge any nodes if
 databases nodes.dmp and merged.dmp are out of sync with nr.gz

---
 pipeline/add_contig_taxonomy.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/pipeline/add_contig_taxonomy.py b/pipeline/add_contig_taxonomy.py
index 02406fab8..083eff157 100755
--- a/pipeline/add_contig_taxonomy.py
+++ b/pipeline/add_contig_taxonomy.py
@@ -185,6 +185,19 @@ def parse_nodes(nodes_dmp_path):
     nodes_dmp.close()
     return(nodes)

+def parse_merged(fpath):
+    print(strftime("%Y-%m-%d %H:%M:%S") + ' Processing merged taxid nodes')
+    wc_output = subprocess.check_output(['wc', '-l', fpath])
+    wc_list = wc_output.split()
+    number_of_lines = int(wc_list[0])
+    fh = open(fpath)
+    merged = {}
+    for line in tqdm(fh, desc='parsing merged', total=number_of_lines, leave=False):
+        old_taxid, new_taxid = [int(taxid) for taxid in line.strip('\t|\n').split('\t|\t')]
+        merged.update({old_taxid:new_taxid})
+    fh.close()
+    return(merged)
+
 def parse_lca(lca_fpath):
     print( strftime("%Y-%m-%d %H:%M:%S") + ' Parsing lca taxonomy table')
     # Work out number of lines in file
@@ -199,6 +212,9 @@ def parse_lca(lca_fpath):
         orf, name, rank, taxid = line.strip().split('\t')
         contig, orf_num = orf.rsplit('_', 1)
         taxid = int(taxid)
+        # Convert any nodes that were recently suppressed/deprecated
+        # to their new node taxid. Otherwise keep the same taxid
+        taxid = merged.get(taxid, taxid)
         if taxid != 1:
             while rank not in set(rank_priority):
                 taxid = nodes[taxid]['parent']
@@ -336,6 +352,7 @@ def write_taxa(ranked_ctgs, contig_table_fpath, outfpath):
 # Build taxid tree structure with associated canoncial ranks and names
 names = parse_names(name_fpath)
 nodes = parse_nodes(nodes_fpath)
+merged = parse_merged(merged_fpath)

 rank_priority = [
     'species',

From 2889600cccdd0e7324ecb2eaf282b0999898704a Mon Sep 17 00:00:00 2001
From: EvanRees
Date: Wed, 19 Feb 2020 17:59:01 -0600
Subject: [PATCH 2/4] added filepath handling to merged.dmp

---
 pipeline/add_contig_taxonomy.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pipeline/add_contig_taxonomy.py b/pipeline/add_contig_taxonomy.py
index 083eff157..132eed2c2 100755
--- a/pipeline/add_contig_taxonomy.py
+++ b/pipeline/add_contig_taxonomy.py
@@ -346,6 +346,7 @@ def write_taxa(ranked_ctgs, contig_table_fpath, outfpath):
 # Process NCBI taxdump files
 name_fpath = os.path.join(taxdump_dir_path, 'names.dmp')
 nodes_fpath = os.path.join(taxdump_dir_path, 'nodes.dmp')
+nodes_fpath = os.path.join(taxdump_dir_path, 'merged.dmp')

 pp = pprint.PrettyPrinter(indent=4)

From 6505fda146bbae04229d072a691a443251eac473 Mon Sep 17 00:00:00 2001
From: EvanRees
Date: Wed, 19 Feb 2020 17:59:53 -0600
Subject: [PATCH 3/4] resolved merged_fpath variable

---
 pipeline/add_contig_taxonomy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipeline/add_contig_taxonomy.py b/pipeline/add_contig_taxonomy.py
index 132eed2c2..f87fad690 100755
--- a/pipeline/add_contig_taxonomy.py
+++ b/pipeline/add_contig_taxonomy.py
@@ -346,7 +346,7 @@ def write_taxa(ranked_ctgs, contig_table_fpath, outfpath):
 # Process NCBI taxdump files
 name_fpath = os.path.join(taxdump_dir_path, 'names.dmp')
 nodes_fpath = os.path.join(taxdump_dir_path, 'nodes.dmp')
-nodes_fpath = os.path.join(taxdump_dir_path, 'merged.dmp')
+merged_fpath = os.path.join(taxdump_dir_path, 'merged.dmp')

 pp = pprint.PrettyPrinter(indent=4)

From 7d16d90551e921b05064d7d7687c99fc1db7d69e Mon Sep 17 00:00:00 2001
From: EvanRees
Date: Mon, 2 Mar 2020 10:53:17 -0600
Subject: [PATCH 4/4] added extraction of merged.dmp from taxdump.tar.gz

---
 pipeline/make_taxonomy_table.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pipeline/make_taxonomy_table.py b/pipeline/make_taxonomy_table.py
index bfc47872f..507704d6d 100755
--- a/pipeline/make_taxonomy_table.py
+++ b/pipeline/make_taxonomy_table.py
@@ -164,7 +164,7 @@ def update_dbs(database_path, db='all'):

         download_file(database_path, taxdump_url, taxdump_md5_url)
         if os.path.isfile(database_path + '/taxdump.tar.gz'):
-            run_command('tar -xzf {}/taxdump.tar.gz -C {} names.dmp nodes.dmp'.format(database_path, database_path))
+            run_command('tar -xzf {}/taxdump.tar.gz -C {} names.dmp nodes.dmp merged.dmp'.format(database_path, database_path))
             os.remove('{}/taxdump.tar.gz'.format(database_path))
         print("nodes.dmp and names.dmp updated")

@@ -183,7 +183,7 @@ def check_dbs(db_path):
     db_dict = {
         'nr': ['nr.dmnd'],
         'acc2taxid': ['prot.accession2taxid'],
-        'taxdump': ['names.dmp','nodes.dmp']
+        'taxdump': ['names.dmp','nodes.dmp', 'merged.dmp']
     }
     db_files = os.listdir(db_path)
     for db in db_dict:
@@ -303,7 +303,7 @@ def run_taxonomy(pipeline_path, assembly_path, tax_table_path, db_dir_path,
 parser.add_argument('-s', '--single_genome', help='Specifies single genome mode',
     action='store_true')
 parser.add_argument('-u', '--update', required=False, action='store_true',
-    help='Checks/Adds/Updates: nodes.dmp, names.dmp, accession2taxid, nr.dmnd files within specified directory.')
+    help='Checks/Adds/Updates: nodes.dmp, names.dmp, merged.dmp, accession2taxid, nr.dmnd files within specified directory.')

 args = vars(parser.parse_args())
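
Taken together, these patches make the pipeline tolerant of taxids that NCBI has merged or suppressed since the protein database was built: merged.dmp is extracted alongside names.dmp and nodes.dmp, parsed into an old-taxid -> new-taxid map, and applied before walking the nodes tree. A minimal standalone sketch of that remapping is shown below; it mirrors the parse_merged() logic added in PATCH 1/4, but the load_merged helper name and the example taxid are illustrative only, and it assumes merged.dmp rows use the standard taxdump field layout ("old_taxid\t|\tnew_taxid\t|").

    # Sketch of the merged.dmp remapping used by the patches above (not the pipeline's exact code).
    def load_merged(merged_dmp_path):
        """Map old (merged/suppressed) taxids to their current taxids."""
        merged = {}
        with open(merged_dmp_path) as fh:
            for line in fh:
                # Each row: "<old_taxid>\t|\t<new_taxid>\t|"
                old_taxid, new_taxid = [int(t) for t in line.strip('\t|\n').split('\t|\t')]
                merged[old_taxid] = new_taxid
        return merged

    merged = load_merged('merged.dmp')
    taxid = 1236939  # hypothetical taxid read from an LCA table
    # Fall back to the original taxid when it was never merged.
    taxid = merged.get(taxid, taxid)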