Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

merge nodes when databases out of sync. #9

Merged
merged 4 commits into from
Mar 2, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions pipeline/add_contig_taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,19 @@ def parse_nodes(nodes_dmp_path):
nodes_dmp.close()
return(nodes)

def parse_merged(fpath):
    """Parse a taxdump ``merged.dmp`` file into an old-taxid -> new-taxid map.

    Each non-blank line is expected to hold two integer fields separated by
    ``'\\t|\\t'`` and optionally terminated by ``'\\t|'``, e.g.
    ``'12\\t|\\t74109\\t|\\n'``.

    Args:
        fpath: Path to the ``merged.dmp`` file.

    Returns:
        dict mapping each old (merged/deprecated) taxid (int) to the taxid
        (int) it was merged into.

    Raises:
        ValueError: If a non-blank line does not contain exactly two
            integer fields.
    """
    print(strftime("%Y-%m-%d %H:%M:%S") + ' Processing merged taxid nodes')
    merged = {}
    # The context manager guarantees the handle is closed even when a
    # malformed line makes the parsing below raise.
    with open(fpath) as fh:
        # Count lines in-process (instead of shelling out to `wc -l`) so the
        # progress bar total works on platforms without coreutils.
        number_of_lines = sum(1 for _ in fh)
        fh.seek(0)
        for line in tqdm(fh, desc='parsing merged', total=number_of_lines, leave=False):
            record = line.strip('\t|\n')
            # Tolerate blank lines (e.g. a trailing empty line).
            if not record.strip():
                continue
            old_taxid, new_taxid = [int(taxid) for taxid in record.split('\t|\t')]
            merged[old_taxid] = new_taxid
    return merged

def parse_lca(lca_fpath):
print( strftime("%Y-%m-%d %H:%M:%S") + ' Parsing lca taxonomy table')
# Work out number of lines in file
Expand All @@ -199,6 +212,9 @@ def parse_lca(lca_fpath):
orf, name, rank, taxid = line.strip().split('\t')
contig, orf_num = orf.rsplit('_', 1)
taxid = int(taxid)
# Convert any nodes that were recently suppressed/deprecated
# to their new node taxid. Otherwise keep the same taxid
taxid = merged.get(taxid, taxid)
if taxid != 1:
while rank not in set(rank_priority):
taxid = nodes[taxid]['parent']
Expand Down Expand Up @@ -330,12 +346,14 @@ def write_taxa(ranked_ctgs, contig_table_fpath, outfpath):
# Process NCBI taxdump files
name_fpath = os.path.join(taxdump_dir_path, 'names.dmp')
nodes_fpath = os.path.join(taxdump_dir_path, 'nodes.dmp')
merged_fpath = os.path.join(taxdump_dir_path, 'merged.dmp')

pp = pprint.PrettyPrinter(indent=4)

# Build taxid tree structure with associated canonical ranks and names
names = parse_names(name_fpath)
nodes = parse_nodes(nodes_fpath)
merged = parse_merged(merged_fpath)

rank_priority = [
'species',
Expand Down
6 changes: 3 additions & 3 deletions pipeline/make_taxonomy_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ def update_dbs(database_path, db='all'):
download_file(database_path, taxdump_url, taxdump_md5_url)

if os.path.isfile(database_path + '/taxdump.tar.gz'):
run_command('tar -xzf {}/taxdump.tar.gz -C {} names.dmp nodes.dmp'.format(database_path, database_path))
run_command('tar -xzf {}/taxdump.tar.gz -C {} names.dmp nodes.dmp merged.dmp'.format(database_path, database_path))
os.remove('{}/taxdump.tar.gz'.format(database_path))
print("nodes.dmp and names.dmp updated")

Expand All @@ -183,7 +183,7 @@ def check_dbs(db_path):
db_dict = {
'nr': ['nr.dmnd'],
'acc2taxid': ['prot.accession2taxid'],
'taxdump': ['names.dmp','nodes.dmp']
'taxdump': ['names.dmp','nodes.dmp', 'merged.dmp']
}
db_files = os.listdir(db_path)
for db in db_dict:
Expand Down Expand Up @@ -303,7 +303,7 @@ def run_taxonomy(pipeline_path, assembly_path, tax_table_path, db_dir_path,
parser.add_argument('-s', '--single_genome', help='Specifies single genome mode',
action='store_true')
parser.add_argument('-u', '--update', required=False, action='store_true',
help='Checks/Adds/Updates: nodes.dmp, names.dmp, accession2taxid, nr.dmnd files within specified directory.')
help='Checks/Adds/Updates: nodes.dmp, names.dmp, merged.dmp, accession2taxid, nr.dmnd files within specified directory.')

args = vars(parser.parse_args())

Expand Down