|
| 1 | +#!/usr/bin/env python3 |
| 2 | +"""Convert a blobDB to BlobDir Fields.""" |
| 3 | + |
| 4 | +# pylint: disable=too-many-locals |
| 5 | + |
| 6 | +import math |
| 7 | +from pathlib import Path |
| 8 | +from collections import defaultdict |
| 9 | +import file_io |
| 10 | +import cov |
| 11 | +import hits |
| 12 | +from field import Identifier, Variable, Category |
| 13 | + |
| 14 | + |
def field_name_from_path(path):
    """Derive a field name from a file path.

    The final extension is dropped and the remainder is split on dots. The
    last dot-separated piece is returned, unless that piece is an alignment
    format (``bam``, ``sam`` or ``cram``) preceded by another piece, in
    which case the preceding piece is returned instead
    (e.g. ``reads.bam.cov`` gives ``reads``).
    """
    pieces = Path(path).stem.split('.')
    if len(pieces) > 1 and pieces[-1] in ('bam', 'sam', 'cram'):
        return pieces[-2]
    return pieces[-1]
| 23 | + |
| 24 | + |
def values_from_blob_db(blob_db):
    """Read values from a blobDB into a dict of lists of values.

    Blobs are visited in ``order_of_blobs`` order, and one value per blob is
    appended to each list, so list positions line up with that order.

    Keys produced:
        * ``lengths``, ``gcs``, ``n_counts`` (missing entries become 0);
        * ``<lib>_cov`` and ``<lib>_read_cov`` for each coverage library
          (missing entries become 0);
        * ``<rule>_<rank>``, ``<rule>_<rank>_score`` and
          ``<rule>_<rank>_cindex`` for each tax rule and each rank present
          on the blob (defaults ``'no-hit'``, 0 and 0; score and c_index
          are converted with ``int``).

    Returns:
        collections.defaultdict: Column name mapped to its list of values.
    """
    scalar_columns = (('lengths', 'length'),
                      ('gcs', 'gc'),
                      ('n_counts', 'n_count'))
    columns = defaultdict(list)
    for blob_id in blob_db['order_of_blobs']:
        record = blob_db['dict_of_blobs'][blob_id]
        for column, key in scalar_columns:
            columns[column].append(record.get(key, 0))
        for lib in blob_db['covLibs']:
            columns["%s_cov" % lib].append(record['covs'].get(lib, 0))
            columns["%s_read_cov" % lib].append(record['read_cov'].get(lib, 0))
        for rule in blob_db['taxrules']:
            for rank, result in record['taxonomy'][rule].items():
                prefix = "%s_%s" % (rule, rank)
                columns[prefix].append(result.get('tax', 'no-hit'))
                columns[prefix + '_score'].append(int(result.get('score', 0)))
                columns[prefix + '_cindex'].append(int(result.get('c_index', 0)))
    return columns
| 42 | + |
| 43 | + |
def parse(file, **kwargs):
    """Parse a blobDB file into a list of BlobDir fields.

    The blobDB is loaded with ``file_io.load_yaml`` and converted, in this
    order, into:

    * an ``Identifier`` field built from ``order_of_blobs`` (only when the
      ``identifiers`` dependency is falsy);
    * ``gc``, ``length`` and ``ncount`` variables;
    * a ``<name>_cov`` and ``<name>_read_cov`` variable per coverage
      library, where ``<name>`` comes from ``field_name_from_path``;
    * for every tax rule and rank, a category field plus ``_cindex`` and
      ``_score`` variables.

    Args:
        file: Location of the blobDB, passed to ``file_io.load_yaml``.
        **kwargs: Must include ``dependencies``, a mapping with an
            ``identifiers`` entry (an existing identifiers field, or a
            falsy value to have one created here).

    Returns:
        list: The field objects created by this call. When the blobDB has
        no tax rules, no taxonomy fields are included.

    Raises:
        KeyError: If ``dependencies``/``identifiers`` or an expected blobDB
            key is missing.
        ValueError: If the blobDB lists no blobs, because the value ranges
            are computed with ``min``/``max``.
    """
    blob_db = file_io.load_yaml(file)
    parsed = []
    identifiers = kwargs['dependencies']['identifiers']
    if not identifiers:
        # No identifiers supplied by the caller: create them from the blobDB.
        identifiers = Identifier('identifiers',
                                 meta={'field_id': 'identifiers'},
                                 values=blob_db['order_of_blobs'],
                                 parents=[])
        parsed.append(identifiers)
    values = values_from_blob_db(blob_db)
    parsed.append(Variable('gc',
                           meta={
                               'preload': True,
                               'scale': 'scaleLinear',
                               'field_id': 'gc',
                               'name': 'GC',
                               'datatype': 'float',
                               'range': [min(values['gcs']), max(values['gcs'])]
                           },
                           values=values['gcs'],
                           parents=[]))
    _min = min(values['lengths'])
    parsed.append(Variable('length',
                           meta={
                               'field_id': 'length',
                               'preload': True,
                               'scale': 'scaleLog',
                               'name': 'Length',
                               # NOTE(review): clamp is only set when the
                               # minimum is 0 - presumably because a log
                               # scale cannot show 0; confirm with consumer.
                               'clamp': 100 if _min == 0 else False,
                               'datatype': 'integer',
                               'range': [_min, max(values['lengths'])]
                           },
                           parents=[],
                           values=values['lengths']))
    parsed.append(Variable('ncount',
                           meta={
                               'field_id': 'ncount',
                               'scale': 'scaleLinear',
                               'name': 'N count',
                               'datatype': 'integer',
                               'range': [min(values['n_counts']), max(values['n_counts'])]
                           },
                           values=values['n_counts'],
                           parents=[]))
    # Running [min, max] over the coverage libraries seen so far; each
    # library's parent entry records the range accumulated up to that point.
    cov_range = [math.inf, -math.inf]
    read_cov_range = [math.inf, -math.inf]
    for cov_lib in blob_db['covLibs']:
        cov_file_name = field_name_from_path(blob_db['covLibs'][cov_lib]['f'])
        # Values are keyed by library id; field ids use the file-derived name.
        covs = values["%s_cov" % cov_lib]
        read_covs = values["%s_read_cov" % cov_lib]
        cov_range = [min(covs+[cov_range[0]]),
                     max(covs+[cov_range[1]])]
        read_cov_range = [min(read_covs+[read_cov_range[0]]),
                          max(read_covs+[read_cov_range[1]])]
        parsed.append(Variable("%s_cov" % cov_file_name,
                               values=covs,
                               meta={'field_id': "%s_cov" % cov_file_name},
                               parents=cov.parent() + ['children',
                                                       {'id': 'base_coverage',
                                                        'clamp': 1 if cov_range[0] == 0 else False,
                                                        'range': cov_range},
                                                       'children']
                               ))
        parsed.append(Variable("%s_read_cov" % cov_file_name,
                               values=read_covs,
                               meta={'field_id': "%s_read_cov" % cov_file_name},
                               parents=cov.parent() + ['children',
                                                       {'id': 'read_coverage',
                                                        'datatype': 'integer',
                                                        'clamp': 1 if read_cov_range[0] == 0 else False,
                                                        'range': read_cov_range},
                                                       'children']
                               ))
    tax_rules = blob_db['taxrules']
    if not tax_rules:
        # Without tax rules there are no taxonomy fields to build; indexing
        # the first rule below would raise IndexError.
        return parsed
    # The ranks are read from the first blob under the first tax rule and
    # reused for every rule.
    ranks = blob_db['dict_of_blobs'][
        identifiers.values[0]]['taxonomy'][tax_rules[0]].keys()
    for tax_rule in tax_rules:
        for rank in ranks:
            field_id = "%s_%s" % (tax_rule, rank)
            parsed.append(Category(field_id,
                                   values=values[field_id],
                                   meta={'field_id': field_id},
                                   parents=hits.parent() + ['children']))
            # The cindex and score variables are nested under the category.
            parents = hits.parent() + ['children', {'id': field_id}, 'data']
            field_id = "%s_%s_cindex" % (tax_rule, rank)
            parsed.append(Variable(field_id,
                                   values=values[field_id],
                                   meta={
                                       'scale': 'scaleLinear',
                                       'field_id': field_id,
                                       'datatype': 'integer',
                                       'range': [min(values[field_id]),
                                                 max(values[field_id])],
                                       'preload': False,
                                       'active': False
                                   },
                                   parents=parents))
            field_id = "%s_%s_score" % (tax_rule, rank)
            _min = min(values[field_id])
            parsed.append(Variable(field_id,
                                   values=values[field_id],
                                   meta={
                                       'scale': 'scaleLog',
                                       'field_id': field_id,
                                       'clamp': 1 if _min == 0 else False,
                                       'datatype': 'integer',
                                       'range': [_min,
                                                 max(values[field_id])],
                                       'preload': False,
                                       'active': False
                                   },
                                   parents=parents))

    return parsed
| 159 | + |
| 160 | + |
def parent():
    """Return the parent entries for fields from this module.

    Always a new, empty list.
    """
    return list()
0 commit comments