Skip to content

Commit 2837194

Browse files
committed
add dataset import from blobDB.json file
1 parent c4fc8fd commit 2837194

File tree

8 files changed

+196
-30
lines changed

8 files changed

+196
-30
lines changed

.pylintrc

+1-1
Original file line numberDiff line numberDiff line change
@@ -317,7 +317,7 @@ indent-after-paren=4
317317
indent-string=' '
318318

319319
# Maximum number of characters on a single line.
320-
max-line-length=100
320+
max-line-length=120
321321

322322
# Maximum number of lines in a module.
323323
max-module-lines=1000

lib/add.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
Usage:
99
blobtools add [--busco TSV...] [--cov BAM...] [--hits TSV...] [--fasta FASTA]
1010
[--key path=value...] [--link path=url...] [--skip-link-test]
11-
[--meta YAML] [--synonyms TSV...]
11+
[--blobDB JSON] [--meta YAML] [--synonyms TSV...]
1212
[--taxdump DIRECTORY] [--taxrule bestsum|bestsumorder]
1313
[--threads INT] [--create] [--replace] DIRECTORY
1414
@@ -24,6 +24,7 @@
2424
--link path=URL Link to an external resource.
2525
--skip-link-test Skip test to see if link URL can be resolved.
2626
--meta YAML Dataset metadata.
27+
--blobDB JSON Blobtools v1 blobDB.
2728
--synonyms TSV TSV file containing current identifiers and synonyms.
2829
--taxdump DIRECTORY Location of NCBI new_taxdump directory.
2930
--taxrule bestsum|bestsumorder
@@ -40,6 +41,7 @@
4041

4142
from docopt import docopt
4243
import file_io
44+
import blob_db
4345
import busco
4446
import cov
4547
import fasta
@@ -51,6 +53,7 @@
5153
from fetch import fetch_field, fetch_metadata, fetch_taxdump
5254

5355
FIELDS = [{'flag': '--fasta', 'module': fasta, 'depends': ['identifiers']},
56+
{'flag': '--blobDB', 'module': blob_db, 'depends': ['identifiers']},
5457
{'flag': '--busco', 'module': busco, 'depends': ['identifiers']},
5558
{'flag': '--cov', 'module': cov, 'depends': ['identifiers', 'length', 'ncount']},
5659
{'flag': '--hits', 'module': hits, 'depends': ['identifiers']},

lib/blob_db.py

+163
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
#!/usr/bin/env python3
2+
"""Convert a blobDB to BlobDir Fields."""
3+
4+
# pylint: disable=too-many-locals
5+
6+
import math
7+
from pathlib import Path
8+
from collections import defaultdict
9+
import file_io
10+
import cov
11+
import hits
12+
from field import Identifier, Variable, Category
13+
14+
15+
def field_name_from_path(path):
    """Derive a field name from a file path.

    The file name has its final extension removed and is then split on
    dots. The last remaining component is used as the field name, unless
    that component is an alignment extension (``bam``, ``sam`` or ``cram``)
    and something precedes it, in which case the preceding component is
    used instead (e.g. ``reads.bam.cov`` gives ``reads``).
    """
    *leading, last = Path(path).stem.split('.')
    # Only step back past an alignment extension when there is a component
    # in front of it to fall back on.
    if leading and last in ('bam', 'sam', 'cram'):
        return leading[-1]
    return last
23+
24+
25+
def values_from_blob_db(blob_db):
    """Read values from a blobDB into a dict of lists of values.

    Blobs are visited in ``order_of_blobs`` order, so index ``i`` of every
    list refers to the same sequence. The returned mapping is a
    ``defaultdict(list)`` holding:

    * ``lengths``, ``gcs`` and ``n_counts`` (0 when a blob lacks the key);
    * ``<covLib>_cov`` and ``<covLib>_read_cov`` for each entry in
      ``covLibs`` (0 when a blob has no value for that library);
    * ``<taxrule>_<rank>``, ``<taxrule>_<rank>_score`` and
      ``<taxrule>_<rank>_cindex`` for each taxrule and each rank present in
      the blob's taxonomy (``'no-hit'`` / 0 / 0 when absent; score and
      c_index are converted with ``int``).
    """
    columns = defaultdict(list)
    fixed_columns = (('lengths', 'length'), ('gcs', 'gc'), ('n_counts', 'n_count'))
    for seq_id in blob_db['order_of_blobs']:
        record = blob_db['dict_of_blobs'][seq_id]
        for column, key in fixed_columns:
            columns[column].append(record.get(key, 0))
        for lib in blob_db['covLibs']:
            base_cov = record['covs'].get(lib, 0)
            columns["%s_cov" % lib].append(base_cov)
            read_cov = record['read_cov'].get(lib, 0)
            columns["%s_read_cov" % lib].append(read_cov)
        for rule in blob_db['taxrules']:
            for rank, assignment in record['taxonomy'][rule].items():
                name = assignment.get('tax', 'no-hit')
                score = int(assignment.get('score', 0))
                c_index = int(assignment.get('c_index', 0))
                columns["%s_%s" % (rule, rank)].append(name)
                columns["%s_%s_score" % (rule, rank)].append(score)
                columns["%s_%s_cindex" % (rule, rank)].append(c_index)
    return columns
42+
43+
44+
def parse(file, **kwargs):
    """Parse a Blobtools v1 blobDB file into a list of BlobDir fields.

    Args:
        file: Path to the blobDB file; it is read with
            ``file_io.load_yaml``.
        **kwargs: Must provide ``dependencies['identifiers']``. When that
            entry is falsy, an ``Identifier`` field is created from the
            blobDB's ``order_of_blobs`` and included in the result.

    Returns:
        A list of fields in this order: the identifiers field (only when
        newly created), ``gc``, ``length``, ``ncount``, then a base-coverage
        and a read-coverage Variable per coverage library, then for every
        taxrule/rank combination a Category followed by its ``cindex`` and
        ``score`` Variables.
    """
    blob_db = file_io.load_yaml(file)
    fields = []
    identifiers = kwargs['dependencies']['identifiers']
    if not identifiers:
        # No identifiers were supplied, so take them from the blobDB.
        identifiers = Identifier('identifiers',
                                 meta={'field_id': 'identifiers'},
                                 values=blob_db['order_of_blobs'],
                                 parents=[])
        fields.append(identifiers)
    values = values_from_blob_db(blob_db)

    # Sequence-level statistics.
    gcs = values['gcs']
    gc_meta = {
        'preload': True,
        'scale': 'scaleLinear',
        'field_id': 'gc',
        'name': 'GC',
        'datatype': 'float',
        'range': [min(gcs), max(gcs)]
    }
    fields.append(Variable('gc', meta=gc_meta, values=gcs, parents=[]))

    lengths = values['lengths']
    shortest = min(lengths)
    length_meta = {
        'field_id': 'length',
        'preload': True,
        'scale': 'scaleLog',
        'name': 'Length',
        # The field uses a log scale, so a clamp is set only when the
        # minimum value is zero.
        'clamp': 100 if shortest == 0 else False,
        'datatype': 'integer',
        'range': [shortest, max(lengths)]
    }
    fields.append(Variable('length', meta=length_meta, parents=[], values=lengths))

    n_counts = values['n_counts']
    ncount_meta = {
        'field_id': 'ncount',
        'scale': 'scaleLinear',
        'name': 'N count',
        'datatype': 'integer',
        'range': [min(n_counts), max(n_counts)]
    }
    fields.append(Variable('ncount', meta=ncount_meta, values=n_counts, parents=[]))

    # Coverage fields. The ranges accumulate across libraries, so each
    # library's parent entry carries the range of every library seen so far.
    cov_range = [math.inf, -math.inf]
    read_cov_range = [math.inf, -math.inf]
    for cov_lib, lib_info in blob_db['covLibs'].items():
        cov_file_name = field_name_from_path(lib_info['f'])
        covs = values["%s_cov" % cov_lib]
        read_covs = values["%s_read_cov" % cov_lib]
        cov_low, cov_high = cov_range
        cov_range = [min(covs + [cov_low]), max(covs + [cov_high])]
        read_low, read_high = read_cov_range
        read_cov_range = [min(read_covs + [read_low]), max(read_covs + [read_high])]

        base_id = "%s_cov" % cov_file_name
        base_parents = cov.parent() + ['children',
                                       {'id': 'base_coverage',
                                        'clamp': 1 if cov_range[0] == 0 else False,
                                        'range': cov_range},
                                       'children']
        fields.append(Variable(base_id,
                               values=covs,
                               meta={'field_id': base_id},
                               parents=base_parents))

        read_id = "%s_read_cov" % cov_file_name
        read_parents = cov.parent() + ['children',
                                       {'id': 'read_coverage',
                                        'datatype': 'integer',
                                        'clamp': 1 if read_cov_range[0] == 0 else False,
                                        'range': read_cov_range},
                                       'children']
        fields.append(Variable(read_id,
                               values=read_covs,
                               meta={'field_id': read_id},
                               parents=read_parents))

    # Taxonomy fields. The ranks are read from the first sequence under the
    # first taxrule and reused for every taxrule.
    first_blob = blob_db['dict_of_blobs'][identifiers.values[0]]
    ranks = first_blob['taxonomy'][blob_db['taxrules'][0]].keys()
    for tax_rule in blob_db['taxrules']:
        for rank in ranks:
            category_id = "%s_%s" % (tax_rule, rank)
            fields.append(Category(category_id,
                                   values=values[category_id],
                                   meta={'field_id': category_id},
                                   parents=hits.parent() + ['children']))
            # The cindex and score Variables share one parents list that
            # points at the Category created above.
            data_parents = hits.parent() + ['children', {'id': category_id}, 'data']

            cindex_id = "%s_%s_cindex" % (tax_rule, rank)
            cindexes = values[cindex_id]
            cindex_meta = {
                'scale': 'scaleLinear',
                'field_id': cindex_id,
                'datatype': 'integer',
                'range': [min(cindexes), max(cindexes)],
                'preload': False,
                'active': False
            }
            fields.append(Variable(cindex_id,
                                   values=cindexes,
                                   meta=cindex_meta,
                                   parents=data_parents))

            score_id = "%s_%s_score" % (tax_rule, rank)
            scores = values[score_id]
            lowest_score = min(scores)
            score_meta = {
                'scale': 'scaleLog',
                'field_id': score_id,
                'clamp': 1 if lowest_score == 0 else False,
                'datatype': 'integer',
                'range': [lowest_score, max(scores)],
                'preload': False,
                'active': False
            }
            fields.append(Variable(score_id,
                                   values=scores,
                                   meta=score_meta,
                                   parents=data_parents))

    return fields
159+
160+
161+
def parent():
    """Return the parent entries for blobDB fields: always a new empty list."""
    no_parents = []
    return no_parents

lib/cov.py

+2-13
Original file line numberDiff line numberDiff line change
@@ -54,17 +54,6 @@ def parse_bam(bam_file, **kwargs):
5454
for result in results:
5555
_covs.update({result[0]: result[1]})
5656
_read_covs.update({result[0]: result[2]})
57-
# for seq_id in tqdm(ids):
58-
# reads = set()
59-
# for pileupcolumn in samfile.pileup(seq_id):
60-
# _covs[seq_id] += pileupcolumn.n
61-
# for pileupread in pileupcolumn.pileups:
62-
# if not pileupread.is_del and not pileupread.is_refskip:
63-
# reads.add(pileupread.alignment.query_name)
64-
# _read_covs[seq_id] = len(reads)
65-
# samfile.close()
66-
# stats = pysam.flagstat(bam_file)
67-
# print(stats)
6857
if index_file:
6958
os.remove(index_file)
7059
if not identifiers.validate_list(list(_covs.keys())):
@@ -84,7 +73,7 @@ def parse_bam(bam_file, **kwargs):
8473
meta={'field_id': field_id},
8574
parents=['children',
8675
{'id': 'base_coverage',
87-
'clamp': 0.1,
76+
'clamp': 1 if fields['cov_range'][0] == 0 else False,
8877
'range': fields['cov_range']},
8978
'children']
9079
)
@@ -97,7 +86,7 @@ def parse_bam(bam_file, **kwargs):
9786
parents=['children',
9887
{'id': 'read_coverage',
9988
'datatype': 'integer',
100-
'clamp': 1,
89+
'clamp': 1 if fields['read_cov_range'][0] == 0 else False,
10190
'range': fields['read_cov_range']},
10291
'children']
10392
)

lib/fasta.py

+4-12
Original file line numberDiff line numberDiff line change
@@ -60,25 +60,17 @@ def parse(file, **kwargs):
6060
lengths.append(_lengths[seq_id] if seq_id in _lengths else 0)
6161
gc_portions.append(_gc_portions[seq_id] if seq_id in _gc_portions else 0)
6262
n_counts.append(_n_counts[seq_id] if seq_id in _n_counts else 0)
63-
parsed.append(Variable('gc',
64-
meta={
65-
'preload': True,
66-
'scale': 'scaleLinear',
67-
'field_id': 'gc',
68-
'name': 'GC',
69-
'datatype': 'float',
70-
'range': [min(gc_portions), max(gc_portions)]
71-
},
72-
values=gc_portions,
73-
parents=[]))
63+
parsed.append()
64+
_min = min(lengths)
7465
parsed.append(Variable('length',
7566
meta={
7667
'preload': True,
7768
'scale': 'scaleLog',
7869
'field_id': 'length',
7970
'name': 'Length',
71+
'clamp': 1 if _min == 0 else False,
8072
'datatype': 'integer',
81-
'range': [min(lengths), max(lengths)]
73+
'range': [_min, max(lengths)]
8274
},
8375
values=lengths,
8476
parents=[]))

lib/fetch.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,10 @@ def fetch_metadata(path_to_dataset, **kwargs):
5656
elif not kwargs.get('meta'):
5757
meta = file_io.load_yaml("%s/meta.json" % path_to_dataset)
5858
if not meta:
59-
meta = kwargs['meta']
59+
if kwargs.get('meta'):
60+
meta = kwargs['meta']
61+
else:
62+
meta = {}
6063
if 'id' not in meta:
6164
meta['id'] = dataset_id
6265
meta['name'] = dataset_id

lib/field.py

+15
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,9 @@ class Identifier(Field):
153153
def __init__(self, field_id, **kwargs):
154154
"""Init Identifier class."""
155155
kwargs['type'] = 'identifier'
156+
if 'meta' not in kwargs:
157+
kwargs['meta'] = {}
158+
kwargs['meta']['type'] = kwargs['type']
156159
super().__init__(field_id, **kwargs)
157160

158161
def to_set(self):
@@ -187,6 +190,9 @@ class Variable(Field):
187190
def __init__(self, field_id, **kwargs):
188191
"""Init Variable class."""
189192
kwargs['type'] = 'variable'
193+
if 'meta' not in kwargs:
194+
kwargs['meta'] = {}
195+
kwargs['meta']['type'] = kwargs['type']
190196
super().__init__(field_id, **kwargs)
191197

192198
def get_indices_in_range(self, min_max, invert=False):
@@ -231,6 +237,9 @@ def __init__(self, field_id, **kwargs):
231237
for index, value in enumerate(kwargs['values']):
232238
value[slot] = values[index]
233239
kwargs['type'] = 'array'
240+
if 'meta' not in kwargs:
241+
kwargs['meta'] = {}
242+
kwargs['meta']['type'] = kwargs['type']
234243
super().__init__(field_id, **kwargs)
235244

236245
def get_values_by_indices_for_slots(self, indices, slots):
@@ -273,6 +282,9 @@ def __init__(self, field_id, **kwargs):
273282
"""Init MultiArray class."""
274283
self.category_slot = None
275284
kwargs['type'] = 'multiarray'
285+
if 'meta' not in kwargs:
286+
kwargs['meta'] = {}
287+
kwargs['meta']['type'] = kwargs['type']
276288
if 'category_slot' in kwargs:
277289
self.category_slot = kwargs['category_slot']
278290
if self.category_slot is not None and not kwargs.get('keys'):
@@ -318,6 +330,9 @@ class Category(Field):
318330
def __init__(self, field_id, **kwargs):
319331
"""Init Category class."""
320332
kwargs['type'] = 'category'
333+
if 'meta' not in kwargs:
334+
kwargs['meta'] = {}
335+
kwargs['meta']['type'] = kwargs['type']
321336
if 'keys' not in kwargs or kwargs['keys'] is None:
322337
keys = kwargs.get('fixed_keys', [])
323338
kwargs['keys'], kwargs['values'] = self._collapse_values(kwargs['values'], keys)

lib/hits.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -116,15 +116,16 @@ def create_fields(results, fields=None):
116116
},
117117
parents=parents))
118118
field_id = "%s_%s" % (result['field_id'], 'score')
119+
_min = min(result['data']['score'])
119120
fields.append(Variable(field_id,
120121
values=result['data']['score'],
121122
meta={
122123
'scale': 'scaleLog',
123124
'field_id': field_id,
124125
'name': field_id,
125-
'clamp': 1,
126+
'clamp': 1 if _min == 0 else False,
126127
'datatype': 'integer',
127-
'range': [min(result['data']['score']),
128+
'range': [_min,
128129
max(result['data']['score'])],
129130
'preload': False,
130131
'active': False

0 commit comments

Comments
 (0)