collect_data.py
from __future__ import division, print_function
import argparse
import numpy as np
import pysam
from locuscollectors import NonCandidateCollector, CandidateCollector
import util as ut
import cyprocessreads as cpr
import h5py
import h5py_util

desc = 'jointly infer sequencing error profiles and polymorphisms'
parser = argparse.ArgumentParser(
    description=desc,
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('bams', help='file containing a list of BAM files')
parser.add_argument('references',
                    help='name of a file containing a list of reference '
                         'sequence names, or a comma-separated list of '
                         'reference sequence names')
parser.add_argument('output', help='filename for HDF5 data output')
parser.add_argument('--min-bq', help='minimum base quality',
                    type=ut.positive_int, default=20)
parser.add_argument('--min-mq', help='minimum mapping quality',
                    type=ut.positive_int, default=20)
parser.add_argument('--bam-data',
                    help='CSV dataset of supplementary variables for the '
                         'different BAM files, added to the covariates for '
                         'each observation from that BAM file. The first '
                         'column is the BAM name; the remaining columns are '
                         'the covariates and must be dummy-encoded.')
parser.add_argument('--context-length', type=ut.nonneg_int, default=2,
                    help='number of preceding bases to use as a covariate')
parser.add_argument('--round-distance-by',
                    type=ut.positive_int, default=10,
                    help='round the distance from the start of the read by '
                         'this amount; larger values compress the data more '
                         'and make likelihood evaluations faster')
parser.add_argument('--no-mapq', action='store_true',
                    help='do not use mapping qualities')
parser.add_argument('--no-bam', action='store_true',
                    help='do not add a dummy variable for each BAM')
parser.add_argument('--min-coverage', type=int, default=20,
                    help='minimum coverage for calling a consensus base')
parser.add_argument('--do-not-remove-nonvariable', action='store_true',
                    help='keep covariate columns that never vary')
args = parser.parse_args()
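# Example invocation (file names here are hypothetical; this assumes
# ut.get_bams expects one BAM path per line in the list file):
#
#   python collect_data.py bam_list.txt chrM,chrI errors.h5 --min-bq 20 --min-mq 30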
min_bq = args.min_bq
min_mq = args.min_mq
context_len = args.context_length
# split the BAM paths into a shared prefix and per-file names
prefix, bam_fns, bams = ut.get_bams(args.bams)
ref_names = ut.get_ref_names(args.references, bams)
print('getting counts')
all_counts = ut.get_all_counts(bams, ref_names, min_bq, prefix)
print('getting consensus')
all_consensuses = ut.get_all_consensuses(all_counts, min_coverage=args.min_coverage)
print('getting major-minor')
all_majorminor = ut.get_all_majorminor(all_counts)
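# all_majorminor is assumed to map ref -> bam_fn -> per-locus (major, minor)
# base pairs, i.e. the most and second-most frequent bases in the counts
# collected above.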
# make row makers: a nested dict keyed by reference name and BAM filename
row_makers, rowlen = ut.get_row_makers(bam_fns, ref_names, context_len,
                                       args.round_distance_by, all_consensuses,
                                       not args.no_mapq, not args.no_bam)
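# row_makers is assumed to hold one row maker per (ref, bam_fn) pair, where
# each row maker turns a single read observation into a covariate row of
# length rowlen (sequence context, rounded read position, mapping quality,
# and per-BAM dummies, depending on the options above).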
covariate_matrices = ut.get_covariate_matrices(rowlen)
output = h5py.File(args.output, 'w')
h5lo = output.create_group('collapsed_locus_observations')
h5mm = output.create_group('major_minor')
h5py_util.add_major_minor(all_majorminor, h5mm)
for ref in ref_names:
    h5lo_ref = h5lo.create_group(ref)
    for bam_fn in bam_fns:
        h5lo_bam = h5lo_ref.create_group(bam_fn)
        reflen = len(all_consensuses[ref][bam_fn])
        bam = bams[bam_fn]
        rm = row_makers[ref][bam_fn]
        mm = all_majorminor[ref][bam_fn]
        consensus = all_consensuses[ref][bam_fn]
        cpr.add_bam_observations(bam, ref, reflen, min_bq, min_mq,
                                 context_len, rm, bam_fn, consensus,
                                 covariate_matrices, h5lo_bam, output, mm)
# It is probably best to translate the various locus observations into two
# numpy arrays each: one holding the data and another holding the metadata;
# the names of the BAMs and references can be stored as HDF5 attributes.
cm = ut.collect_covariate_matrices(covariate_matrices)
# The covariate names are identical for every (ref, bam_fn) pair, so the
# values left over from the loop above are safe to use here.
cm_names = row_makers[ref][bam_fn].get_covariate_names()
output.attrs['covariate_column_names'] = ','.join(list(cm_names))
if not args.do_not_remove_nonvariable:
    # Flag columns that never vary; among those, record which are constant
    # 1.0 columns (candidate intercepts).
    nonvariables = []
    consts = []
    for j in range(cm.shape[1]):
        uniques = np.unique(cm[:, j])
        isnonvar = uniques.shape[0] == 1
        isconst = isnonvar and uniques[0] == 1.0
        nonvariables.append(isnonvar)
        if isconst:
            consts.append(j)
    # Keep every variable column, plus a single constant column to act as
    # the intercept.
    keeper_columns = []
    keeper_column_names = []
    wrote_const = False
    for j in range(cm.shape[1]):
        write_col = False
        if j in consts:
            if not wrote_const:
                wrote_const = True
                write_col = True
            assert nonvariables[j]
        if not nonvariables[j]:
            write_col = True
        if write_col:
            keeper_columns.append(j)
            keeper_column_names.append(cm_names[j])
    old_cm = cm
    cm = old_cm[:, np.array(keeper_columns)]
    # Keep the stored column names in sync with the filtered matrix.
    output.attrs['covariate_column_names'] = ','.join(keeper_column_names)
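# An illustrative example of the filtering above, with hypothetical values:
#
#   cm = np.array([[1., 0., 1., 5.],
#                  [1., 0., 1., 7.]])
#
# Columns 0-2 never vary, and columns 0 and 2 are constant 1.0. Only column 0
# (a single intercept) and column 3 (the one variable column) survive, so
# keeper_columns == [0, 3].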
origcm = cm[:, :]
rowlen = origcm.shape[1]
cm, cm_minmaxes = ut.normalize_covariates(origcm)
output.attrs['rowlen'] = rowlen
output.create_dataset('covariates_mins_maxes', data=cm_minmaxes)
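# ut.normalize_covariates is assumed to min-max scale each column, returning
# the scaled matrix and the per-column (min, max) pairs; storing the pairs
# lets downstream code scale new rows identically:
#
#   scaled = (x - col_min) / (col_max - col_min)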
outcm = output.create_group('covariate_matrices')
outlo = output.create_group('locus_observations')
for chrom in output['collapsed_locus_observations'].keys():
    chromcm = outcm.create_group(chrom)
    chromlo = outlo.create_group(chrom)
    for bam in output['collapsed_locus_observations'][chrom].keys():
        bamcm = chromcm.create_group(bam)
        bamlo = chromlo.create_group(bam)
        for locus in output['collapsed_locus_observations'][chrom][bam].keys():
            locuslo = bamlo.create_group(locus)
            reg_cm = []
            cur_cm_idx = 0
            for reg in ['f1', 'f2', 'r1', 'r2']:
                locobs = output['collapsed_locus_observations'][chrom][bam][locus][reg][:, :]
                # Column 0 of locobs indexes into the global covariate matrix
                # cm; re-index it against the per-locus matrix built here.
                reg_idxs = []
                for idx in locobs[:, 0]:
                    reg_cm.append(cm[idx])
                    reg_idxs.append(cur_cm_idx)
                    cur_cm_idx += 1
                loclodat = np.column_stack((reg_idxs, locobs[:, 1:]))
                locuslo.create_dataset(reg, shape=(locobs.shape[0], 5),
                                       dtype=np.uint32, data=loclodat,
                                       compression='gzip', compression_opts=7)
            reg_cm = np.array(reg_cm)
            bamcm.create_dataset(locus, data=reg_cm, dtype=np.float64,
                                 compression='gzip', compression_opts=7)
# Drop the intermediate group. Note that HDF5 only unlinks the data; the file
# space is not reclaimed unless the file is repacked (e.g. with h5repack).
del output['collapsed_locus_observations']
output.close()
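
# A minimal sketch of reading the output back with h5py (the group names
# 'chrM', 'sample.bam', and '100' are hypothetical; the layout matches what
# is written above):
#
#   with h5py.File('errors.h5', 'r') as f:
#       names = f.attrs['covariate_column_names'].split(',')
#       lo = f['locus_observations']['chrM']['sample.bam']['100']['f1'][:]
#       locus_cm = f['covariate_matrices']['chrM']['sample.bam']['100'][:]
#       rows = locus_cm[lo[:, 0]]  # covariate row for each observation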