From e16b4b834d6285d6460d571b014047ed2b9a1dc1 Mon Sep 17 00:00:00 2001
From: EvanRees
Date: Wed, 19 Feb 2020 17:54:46 -0600
Subject: [PATCH 1/4] updated add_contig_taxonomy.py to merge any nodes if
 databases nodes.dmp and merged.dmp are out of sync with nr.gz

---
 pipeline/add_contig_taxonomy.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/pipeline/add_contig_taxonomy.py b/pipeline/add_contig_taxonomy.py
index 02406fab8..083eff157 100755
--- a/pipeline/add_contig_taxonomy.py
+++ b/pipeline/add_contig_taxonomy.py
@@ -185,6 +185,19 @@ def parse_nodes(nodes_dmp_path):
     nodes_dmp.close()
     return(nodes)

+def parse_merged(fpath):
+    print(strftime("%Y-%m-%d %H:%M:%S") + ' Processing merged taxid nodes')
+    wc_output = subprocess.check_output(['wc', '-l', fpath])
+    wc_list = wc_output.split()
+    number_of_lines = int(wc_list[0])
+    fh = open(fpath)
+    merged = {}
+    for line in tqdm(fh, desc='parsing merged', total=number_of_lines, leave=False):
+        old_taxid, new_taxid = [int(taxid) for taxid in line.strip('\t|\n').split('\t|\t')]
+        merged.update({old_taxid:new_taxid})
+    fh.close()
+    return(merged)
+
 def parse_lca(lca_fpath):
     print( strftime("%Y-%m-%d %H:%M:%S") + ' Parsing lca taxonomy table')
     # Work out number of lines in file
@@ -199,6 +212,9 @@ def parse_lca(lca_fpath):
         orf, name, rank, taxid = line.strip().split('\t')
         contig, orf_num = orf.rsplit('_', 1)
         taxid = int(taxid)
+        # Convert any nodes that were recently suppressed/deprecated
+        # to their new node taxid. Otherwise keep the same taxid
+        taxid = merged.get(taxid, taxid)
         if taxid != 1:
             while rank not in set(rank_priority):
                 taxid = nodes[taxid]['parent']
@@ -336,6 +352,7 @@ def write_taxa(ranked_ctgs, contig_table_fpath, outfpath):
 # Build taxid tree structure with associated canoncial ranks and names
 names = parse_names(name_fpath)
 nodes = parse_nodes(nodes_fpath)
+merged = parse_merged(merged_fpath)

 rank_priority = [
     'species',

From 2889600cccdd0e7324ecb2eaf282b0999898704a Mon Sep 17 00:00:00 2001
From: EvanRees
Date: Wed, 19 Feb 2020 17:59:01 -0600
Subject: [PATCH 2/4] added filepath handling to merged.dmp

---
 pipeline/add_contig_taxonomy.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pipeline/add_contig_taxonomy.py b/pipeline/add_contig_taxonomy.py
index 083eff157..132eed2c2 100755
--- a/pipeline/add_contig_taxonomy.py
+++ b/pipeline/add_contig_taxonomy.py
@@ -346,6 +346,7 @@ def write_taxa(ranked_ctgs, contig_table_fpath, outfpath):
 # Process NCBI taxdump files
 name_fpath = os.path.join(taxdump_dir_path, 'names.dmp')
 nodes_fpath = os.path.join(taxdump_dir_path, 'nodes.dmp')
+nodes_fpath = os.path.join(taxdump_dir_path, 'merged.dmp')

 pp = pprint.PrettyPrinter(indent=4)

From 6505fda146bbae04229d072a691a443251eac473 Mon Sep 17 00:00:00 2001
From: EvanRees
Date: Wed, 19 Feb 2020 17:59:53 -0600
Subject: [PATCH 3/4] resolved merged_fpath variable

---
 pipeline/add_contig_taxonomy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipeline/add_contig_taxonomy.py b/pipeline/add_contig_taxonomy.py
index 132eed2c2..f87fad690 100755
--- a/pipeline/add_contig_taxonomy.py
+++ b/pipeline/add_contig_taxonomy.py
@@ -346,7 +346,7 @@ def write_taxa(ranked_ctgs, contig_table_fpath, outfpath):
 # Process NCBI taxdump files
 name_fpath = os.path.join(taxdump_dir_path, 'names.dmp')
 nodes_fpath = os.path.join(taxdump_dir_path, 'nodes.dmp')
-nodes_fpath = os.path.join(taxdump_dir_path, 'merged.dmp')
+merged_fpath = os.path.join(taxdump_dir_path, 'merged.dmp')

 pp = pprint.PrettyPrinter(indent=4)

From 7d16d90551e921b05064d7d7687c99fc1db7d69e Mon Sep 17 00:00:00 2001
From: EvanRees
Date: Mon, 2 Mar 2020 10:53:17 -0600
Subject: [PATCH 4/4] added extraction of merged.dmp from taxdump.tar.gz

---
 pipeline/make_taxonomy_table.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pipeline/make_taxonomy_table.py b/pipeline/make_taxonomy_table.py
index bfc47872f..507704d6d 100755
--- a/pipeline/make_taxonomy_table.py
+++ b/pipeline/make_taxonomy_table.py
@@ -164,7 +164,7 @@ def update_dbs(database_path, db='all'):

         download_file(database_path, taxdump_url, taxdump_md5_url)
         if os.path.isfile(database_path + '/taxdump.tar.gz'):
-            run_command('tar -xzf {}/taxdump.tar.gz -C {} names.dmp nodes.dmp'.format(database_path, database_path))
+            run_command('tar -xzf {}/taxdump.tar.gz -C {} names.dmp nodes.dmp merged.dmp'.format(database_path, database_path))
             os.remove('{}/taxdump.tar.gz'.format(database_path))
         print("nodes.dmp and names.dmp updated")

@@ -183,7 +183,7 @@ def check_dbs(db_path):
     db_dict = {
         'nr': ['nr.dmnd'],
         'acc2taxid': ['prot.accession2taxid'],
-        'taxdump': ['names.dmp','nodes.dmp']
+        'taxdump': ['names.dmp','nodes.dmp', 'merged.dmp']
     }
     db_files = os.listdir(db_path)
     for db in db_dict:
@@ -303,7 +303,7 @@ def run_taxonomy(pipeline_path, assembly_path, tax_table_path, db_dir_path,
 parser.add_argument('-s', '--single_genome', help='Specifies single genome mode',
     action='store_true')
 parser.add_argument('-u', '--update', required=False, action='store_true',
-    help='Checks/Adds/Updates: nodes.dmp, names.dmp, accession2taxid, nr.dmnd files within specified directory.')
+    help='Checks/Adds/Updates: nodes.dmp, names.dmp, merged.dmp, accession2taxid, nr.dmnd files within specified directory.')

 args = vars(parser.parse_args())
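
Taken together, these patches make the pipeline tolerant of taxids that NCBI has merged or suppressed since the protein database was built: merged.dmp is extracted alongside names.dmp and nodes.dmp, parsed into an old-taxid -> new-taxid map, and applied before walking the nodes tree. A minimal standalone sketch of that remapping is shown below; it mirrors the parse_merged() logic added in PATCH 1/4, but the load_merged helper name and the example taxid are illustrative only, and it assumes merged.dmp rows use the standard taxdump field layout ("old_taxid\t|\tnew_taxid\t|").

    # Sketch of the merged.dmp remapping used by the patches above (not the pipeline's exact code).
    def load_merged(merged_dmp_path):
        """Map old (merged/suppressed) taxids to their current taxids."""
        merged = {}
        with open(merged_dmp_path) as fh:
            for line in fh:
                # Each row: "<old_taxid>\t|\t<new_taxid>\t|"
                old_taxid, new_taxid = [int(t) for t in line.strip('\t|\n').split('\t|\t')]
                merged[old_taxid] = new_taxid
        return merged

    merged = load_merged('merged.dmp')
    taxid = 1236939  # hypothetical taxid read from an LCA table
    # Fall back to the original taxid when it was never merged.
    taxid = merged.get(taxid, taxid)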