-
Notifications
You must be signed in to change notification settings - Fork 77
/
Copy pathhlatyper.py
778 lines (624 loc) · 37.2 KB
/
hlatyper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
from __future__ import division
from __future__ import print_function
from builtins import zip
from builtins import str
from builtins import map
from builtins import range
from past.utils import old_div
import pandas as pd
import numpy as np
import re
import pylab
import warnings
from collections import OrderedDict
from datetime import datetime
import sys
try:
import pysam
PYSAM_AVAILABLE = True
except ImportError:
PYSAM_AVAILABLE = False
VERBOSE = False
def now(start=datetime.now()):
# function argument NOT to be ever set! It gets initialized at function definition time
# so we can calculate difference without further hassle.
return str(datetime.now()-start)[:-4]
def memoize(f): # from http://code.activestate.com/recipes/578231-probably-the-fastest-memoization-decorator-in-the-/
class MemoDict(dict):
def __missing__(self, key):
ret = self[key] = f(key)
return ret
return MemoDict().__getitem__
# if we used MDINSHP=X chars this could be a full cigar parser, however, we only need M and D to
# determine a read's span on the reference sequence to create a coverage plot.
CIGAR_SLICER = re.compile(r'[0-9]+[MD]')
@memoize
def length_on_reference(cigar_string):
return sum([int(p[:-1]) for p in re.findall(CIGAR_SLICER, cigar_string)])
def feature_order(feature):
# to use it in sorted(..., key=feature_order) this returns:
# 0 for ('UTR', 1)
# 2 for exon, 1
# 3 for intron, 1
# 4 for exon, 2
# 5 for intron, 2
# ...
# 999 for UTR, 2
feat_type, feat_number = feature
assert feat_type in ('intron', 'exon', 'UTR'), 'Unknown feature in list (function accepts intron/exon/UTR)'
assert isinstance(feat_number, int), 'Feature number has to be integer'
return 0 if feature==('UTR', 1) else 999 if feature==('UTR', 2) else (feat_number*2 + (feat_type=='intron'))
def store_dataframes(out_hdf, **kwargs):
# DataFrames to serialize have to be passed by keyword arguments. An argument matrix1=DataFrame(...)
# will be written into table 'matrix1' in the HDF file.
complevel = kwargs.pop('complevel', 9) # default complevel & complib values if
complib = kwargs.pop('complib', 'zlib') # not explicitly asked for as arguments
if VERBOSE:
print(now(), 'Storing %d DataFrames in file %s with compression settings %d %s...' % (len(kwargs), out_hdf, complevel, complib))
store = pd.HDFStore(out_hdf, complevel=complevel, complib=complib) # TODO: WRITE ONLY? it probably appends now
for table_name, dataframe in kwargs.items():
store[table_name] = dataframe
store.close()
if VERBOSE:
print(now(), 'DataFrames stored in file.')
def load_hdf(in_hdf, as_dict=False, *args): # isn't really neccesary, but makes a read-only flag on it to be sure
store = pd.HDFStore(in_hdf, 'r')
if len(args):
if as_dict:
to_return = {table: store[table] for table in args}
else:
to_return = tuple((store[table] for table in args))
store.close()
return to_return
else:
return store # store doesn't get closed! Either the user closes it manually or gets closed on exit
def sam_to_hdf(samfile):
if VERBOSE:
print(now(), 'Loading alleles and read IDs from %s...' % samfile)
# run through the SAM file once to see how many reads and alleles we are dealing with
# for a one-step DataFrame initialization instead of slow growing
read_ids, allele_ids = [], []
first_hit_row = True
total_hits = 0
with open(samfile, 'rb') as f:
last_read_id = None
for line in f:
line = line.decode('utf-8')
if line.startswith('@'):
if line.startswith('@SQ'):
allele_ids.append(line.split('\t')[1][3:]) # SN:HLA:HLA00001
continue
total_hits += 1
read_id = line.split('\t')[0]
if last_read_id != read_id:
read_ids.append(read_id)
last_read_id = read_id
if first_hit_row: # analyze SAM file structure and find MD tag column (describes mismatches).
first_hit_row = False
columns = line.split()
try:
nm_index = list(map(lambda x: x.startswith('NM:'), columns)).index(True)
except ValueError:
# TODO: we don't really handle the case if NM-tag is not present, code will fail later
print('\tNo NM-tag found in SAM file!')
nm_index = None
if VERBOSE:
print(now(), '%d alleles and %d reads found.' % (len(allele_ids), len(read_ids)))
print(now(), 'Initializing mapping matrix...')
# major performance increase if we initialize a numpy zero-matrix and pass that in the constructor
# than if we just let pandas initialize its default NaN matrix
matrix_pos = pd.DataFrame(np.zeros((len(read_ids), len(allele_ids)), dtype=np.uint16), columns=allele_ids, index=read_ids)
# read_details contains NM and read length tuples, calculated from the first encountered CIGAR string.
read_details = OrderedDict()
if VERBOSE:
print(now(), '%dx%d mapping matrix initialized. Populating %d hits from SAM file...' % (len(read_ids), len(allele_ids), total_hits))
milestones = [x * total_hits / 10 for x in range(1, 11)] # for progress bar
with open(samfile, 'rb') as f:
counter = 0
percent = 0
for line in f:
line = line.decode('utf-8')
if line.startswith('@'):
continue
fields = line.strip().split('\t')
read_id, allele_id, position, cigar, nm = (fields[i] for i in (0, 2, 3, 5, nm_index))
if read_id not in read_details:
read_details[read_id] = (int(nm[5:]), length_on_reference(cigar))
matrix_pos[allele_id][read_id] = int(position) # SAM indexes from 1, so null elements are not hits
counter += 1
if counter in milestones:
percent += 10
if VERBOSE:
print('\t%d%% completed' % percent)
if VERBOSE:
print(now(), '%d elements filled. Matrix sparsity: 1 in %.2f' % (counter, matrix_pos.shape[0]*matrix_pos.shape[1]/float(counter)))
# convert HLA:HLA00001 identifiers to HLA00001
matrix_pos.rename(columns=lambda x: x.replace('HLA:', ''), inplace=True)
details_df = pd.DataFrame.from_dict(read_details, orient='index')
details_df.columns = ['mismatches', 'read_length']
return matrix_pos, details_df
def pysam_to_hdf(samfile):
if not PYSAM_AVAILABLE:
print("Warning: PySam not available on the system. Falling back to primitive SAM parsing.")
return sam_to_hdf(samfile)
sam_or_bam = 'rb' if samfile.endswith('.bam') else 'r'
sam = pysam.AlignmentFile(samfile, sam_or_bam)
is_yara = (sam.header['PG'][0]['ID'] in ('Yara', 'yara'))
# If yara produced the sam/bam file, we need to know in what form to expect secondary alignments.
# If the -os flag was used in the call, they are proper secondary alignments. Otherwise, they are
# one line per read with a long XA custom tag containing alternative hits.
xa_tag = is_yara and (' -os ' not in sam.header['PG'][0]['CL'])
nref = sam.nreferences
hits = OrderedDict()
allele_id_to_index = {aa: ii for ii, aa in enumerate(sam.references)}
# read_details contains NM and read length tuples. We ignore length on reference from now on.
# Not worth the effort and calculation. Indels are rare and they have to be dealt with differently. The coverage
# plot wil still be fine and +-1 bp regarding read end isn't concerning.
read_details = OrderedDict()
if VERBOSE:
print(now(), 'Loading %s started. Number of HLA reads loaded (updated every thousand):' % samfile)
read_counter = 0
hit_counter = 0
for aln in sam:
# TODO: we could spare on accessing hits[aln.qname] if we were guaranteed alignments of one read come in batches
# and never mix. Most aligners behave like this but who knows...
if aln.qname not in hits: # for primary alignments (first hit of read)
# not using defaultdict because we have other one-time things to do when new reads come in anyway
hits[aln.qname] = np.zeros(nref, dtype=np.uint16) # 16 bits are enough for 65K positions, enough for HLA.
# TODO: as soon as we don't best-map but let in suboptimal alignments, this below is not good enough,
# as we need to store NM for every read-allele pair.
# and then there are cases (usually 1D/1I artefacts at the end) where aligned reference length isn't the same
# for all alignments of a read. How to handle that? Maybe needless if we re-map reads anyway.
read_details[aln.qname] = (aln.get_tag('NM'), aln.query_length) # aln.reference_length it used to be. Soft-trimming is out of question now.
read_counter += 1
if VERBOSE and not (read_counter % 1000):
sys.stdout.write('%dK...' % (old_div(len(hits),1000)))
sys.stdout.flush()
if xa_tag and aln.has_tag('XA'):
current_row = hits[aln.qname] # we may access this hundreds of times, better do it directly
subtags = aln.get_tag('XA').split(';')[:-1]
hit_counter += len(subtags)
for subtag in subtags:
allele, pos = subtag.split(',')[:2] # subtag is like HLA02552,691,792,-,1 (id, start, end, orient, nm)
current_row[allele_id_to_index[allele]] = int(pos) # 1-based positions
# this runs for primary and secondary alignments as well:
hits[aln.qname][aln.reference_id] = aln.reference_start + 1 # pysam reports 0-based positions
hit_counter += 1
# num_mismatches = aln.get_tag('NM') # if we ever need suboptimal alignments... doubtful.
if VERBOSE:
print('\n', now(), len(hits), 'reads loaded. Creating dataframe...')
pos_df = pd.DataFrame.from_dict(hits, orient='index')
pos_df.columns = sam.references[:]
details_df = pd.DataFrame.from_dict(read_details, orient='index')
details_df.columns = ['mismatches', 'read_length']
if VERBOSE:
print(now(), 'Dataframes created. Shape: %d x %d, hits: %d (%d), sparsity: 1 in %.2f' % (
pos_df.shape[0], pos_df.shape[1], np.sign(pos_df).sum().sum(), hit_counter, pos_df.shape[0]*pos_df.shape[1]/float(hit_counter)
)) # TODO: maybe return the binary here right away if we're using it to calculate density anyway.
return pos_df, details_df
def get_compact_model(hit_df, weak_hit_df=None, weight=None):
# turn a binary hit matrix dataframe into a smaller matrix DF that removes duplicate rows and
# creates the "occurence" vector with the number of rows the representative read represents.
# Note: one can pass "weak" hits (e.g., unpaired reads) and use them with a lower weight.
hit_df = hit_df.loc[hit_df.any(axis=1)] # remove all-zero rows
occurence = {r[0]: len(r) for r in hit_df.groupby(hit_df.columns.tolist()).groups.values()}
if weak_hit_df is not None:
weak_hit_df = weak_hit_df.loc[weak_hit_df.any(axis=1)]
assert 0 < weight <= 1, 'weak hit weight must be in (0, 1]'
weak_occ = {r[0]: len(r)*weight for r in weak_hit_df.groupby(weak_hit_df.columns.tolist()).groups.values()}
occurence.update(weak_occ)
unique_mtx = pd.concat([hit_df.drop_duplicates(), weak_hit_df.drop_duplicates()])
else:
unique_mtx = hit_df.drop_duplicates()
return unique_mtx, occurence
def mtx_to_sparse_dict(hit_df):
# takes a hit matrix and creates a dictionary of (read, allele):1 tuples corresponding to hits
# (needed by OptiType)
all_hits = {}
for read_id, alleles in hit_df.iterrows():
hit_alleles = alleles[alleles!=0].index
for hit_allele in hit_alleles:
all_hits[(read_id, hit_allele)] = 1
return all_hits
#return {(read, allele): 1 for read in hit_df.index for allele in hit_df.columns if hit_df[allele][read]>0} # awfully inefficient
def create_allele_dataframes(imgt_dat, fasta_gen, fasta_nuc):
from Bio import SeqIO
if VERBOSE:
print(now(), 'Loading IMGT allele dat file...')
alleles = OrderedDict()
with open(imgt_dat, 'rU') as handle:
for i, record in enumerate(SeqIO.parse(handle, "imgt")):
# TODO: IMGT has changed the ID system. Now it's HLA00001.1 or HLA12345.2 showing versions. I'll get rid of this now though
record.id = record.id.split('.')[0]
alleles[record.id] = record
if VERBOSE:
print(now(), 'Initializing allele DataFrame...')
'''
id HLA000001
type A*01:01:01:01
locus A
class I --- I, II, TAP, MIC, other
flags 0 --- problems with allele stored here, see below
len_gen 3503
len_nuc 1098
full_gen 1
full_nuc 1
flagging: sum of the below codes
+1 if HLA type ends in a letter (N, Q, etc.)
+2 if CDS annotation != exon annotation
+4 if features don't add up to gen sequence
+8 if exons don't add up to nuc sequence
'''
allele_info = 'id type 4digit locus flags len_dat len_gen len_nuc full_gen full_nuc'
table = pd.DataFrame(index=list(alleles.keys()), columns=allele_info.split())
sequences = []
if VERBOSE:
print(now(), 'Filling DataFrame with allele data...')
all_features = [] # contains tuples: (HLA id, feature type, feature number, feature start, feature end)
for allele in alleles.values():
allele_type = allele.description.replace('HLA-', '').split(',')[0]
table.loc[allele.id]['id'] = allele.id
table.loc[allele.id]['type'] = allele_type
table.loc[allele.id]['4digit'] = ':'.join(allele_type.split(':')[:2])
table.loc[allele.id]['locus'] = allele_type.split('*')[0]
table.loc[allele.id]['flags'] = 0 if allele_type[-1].isdigit() else 1
table.loc[allele.id]['len_dat'] = len(str(allele.seq))
table.loc[allele.id]['len_gen'] = 0 # TODO: IT STILL DOESNT SOLVE PERFORMANCEWARNING!
table.loc[allele.id]['len_nuc'] = 0 # we initialize these nulls so that we don't get a
table.loc[allele.id]['full_gen'] = 0 # PerformanceWarning + pickling when storing HDF
table.loc[allele.id]['full_nuc'] = 0 # because of NaNs (they don't map to ctypes)
sequences.append((allele.id, 'dat', str(allele.seq)))
# number of features in 2013-04-30 hla.dat:
# 9296 source (total # of alleles)
# 9291 CDS, 5 gene (HLA-P pseudogenes)
# 24493 exons, 3697 introns, 1027 UTRs
# CDS matches exons in 99+% of cases, some have a single base-pair extension at the end for some
# weird single-base exons or such (all on unimportant pseudogene loci)
# so we extract exons, introns and UTRs
features = [f for f in allele.features if f.type in ('exon', 'intron', 'UTR')]
for feature in features:
if feature.type in ('exon', 'intron'):
feature_num = int(feature.qualifiers['number'][0])
else: # UTR
# UTR either starts at the beginning or ends at the end
assert feature.location.start == 0 or feature.location.end == len(allele.seq)
feature_num = 1 if feature.location.start == 0 else 2 # 1 if 5' UTR, 2 if 3' UTR
all_features.append(
(allele.id, feature.type, feature_num,
int(feature.location.start), int(feature.location.end), len(feature), feature_order((feature.type, feature_num)))
)
# small sanity check, can be commented out
cds = [f for f in allele.features if f.type == 'CDS']
if cds:
if sum(map(len, [f for f in features if f.type == 'exon'])) != len(cds[0]):
if VERBOSE:
print("\tCDS length doesn't match sum of exons for", allele.id, allele_type)
table.loc[allele.id]['flags'] += 2
else:
if VERBOSE:
print("\tNo CDS found for", allele.id, allele_type)
table.loc[allele.id]['flags'] += 2
if VERBOSE:
print(now(), 'Loading gen and nuc files...')
with open(fasta_gen, 'r') as fasta_gen:
for record in SeqIO.parse(fasta_gen, 'fasta'):
allele_id = record.id.replace('HLA:', '')
table.loc[allele_id]['len_gen'] = len(record.seq)
sequences.append((allele_id, 'gen', str(record.seq)))
with open(fasta_nuc, 'r') as fasta_nuc:
for record in SeqIO.parse(fasta_nuc, 'fasta'):
allele_id = record.id.replace('HLA:', '')
table.loc[allele_id]['len_nuc'] = len(record.seq)
sequences.append((allele_id, 'nuc', str(record.seq)))
# convert list of tuples into DataFrame for features and sequences
all_features = pd.DataFrame(all_features, columns=['id', 'feature', 'number', 'start', 'end', 'length', 'order'])
sequences = pd.DataFrame(sequences, columns=['id', 'source', 'sequence'])
joined = pd.merge(table, all_features, how='inner', on='id')
exons_for_locus = {}
for i_locus, i_group in joined.groupby('locus'):
exons_for_locus[i_locus] = i_group[i_group['feature']=='exon']['number'].max()
# for i_id, i_group in joined.groupby('id'):
# max_exons = exons_for_locus[table.loc[i_id]['locus']]
# if len(i_group) >= 2*max_exons-1:
# print i_id, 'is fully annotated on all exons and introns and UTRs'
if VERBOSE:
print(now(), 'Checking dat features vs gen/nuc sequences...')
for allele, features in joined.groupby('id'):
row = features.irow(0) # first row of the features subtable. Contains all allele information because of the join
sum_features_length = features['length'].sum()
sum_exons_length = features.loc[features['feature']=='exon']['length'].sum()
if row['len_gen']>0 and row['len_gen'] != sum_features_length:
if VERBOSE:
print("\tFeature lengths don't add up to gen sequence length", allele, row['len_gen'], sum_features_length, row['type'])
table.loc[allele]['flags'] += 4
if row['len_nuc']>0 and row['len_nuc'] != sum_exons_length:
if VERBOSE:
print("\tExon lengths don't add up to nuc sequence length", allele, row['len_nuc'], sum_exons_length, row['type'])
table.loc[allele]['flags'] += 8
if VERBOSE:
print(now(), 'Sanity check finished. Computing feature sequences...')
ft_seq_lookup = OrderedDict()
ft_seq_lookup['---DUMMY---'] = 0 # it will be useful later on if 0 isn't used. lookup_id*boolean operation, etc.
ft_counter = 1
all_ft_counter = 0
all_features['seq_id'] = 0
for i_id, i_features in all_features.groupby('id'):
seq = sequences.loc[(sequences['id']==i_id) & (sequences['source']=='dat')].irow(0)['sequence']
for ft_idx, feature in i_features.iterrows():
ft_seq = seq[feature['start']:feature['end']]
all_ft_counter += 1
if ft_seq not in ft_seq_lookup:
ft_seq_lookup[ft_seq] = ft_counter
all_features.loc[ft_idx, 'seq_id'] = ft_counter
ft_counter += 1
else:
all_features.loc[ft_idx, 'seq_id'] = ft_seq_lookup[ft_seq]
feature_sequences = pd.DataFrame([seq for seq in list(ft_seq_lookup.keys())], columns=['sequence']) # , index=ft_seq_lookup.values() but it's 0,1,2,... anyway, as the default
return table, all_features, sequences, feature_sequences
def prune_identical_alleles(binary_mtx, report_groups=False):
# return binary_mtx.transpose().drop_duplicates().transpose()
# # faster:
hash_columns = binary_mtx.transpose().dot(np.random.rand(binary_mtx.shape[0])) # dtype np.uint16 okay here because result will be float64
if report_groups:
grouper = hash_columns.groupby(hash_columns)
groups = {g[1].index[0]: g[1].index.tolist() for g in grouper}
alleles_to_keep = hash_columns.drop_duplicates().index # relying on keeping the first representative
# TODO: maybe return sets of alleles that were collapsed into the representative (identical patterned sets)
return binary_mtx[alleles_to_keep] if not report_groups else (binary_mtx[alleles_to_keep], groups)
def prune_identical_reads(binary_mtx):
# almost the same as ht.get_compact_model() except it doesn't return an occurence vector.
# It should only be used to compactify a matrix before passing it to the function finding
# overshadowed alleles. Final compactifying should be later done on the original binary matrix.
#
# return binary_mtx.drop_duplicates()
# # faster:
reads_to_keep = binary_mtx.dot(np.random.rand(binary_mtx.shape[1])).drop_duplicates().index # dtype np.uint16 okay because result will be float64
return binary_mtx.loc[reads_to_keep]
def prune_overshadowed_alleles(binary_mtx):
# Calculates B_T*B of the (pruned) binary matrix to determine if certain alleles "overshadow" others, i.e.
# have the same hits as other alleles plus more. In this case, these "other alleles" can be thrown out early, as
# they would be never chosen over the one overshadowing them.
#
# For a 1000reads x 3600alleles matrix it takes 15 seconds.
# So you should always prune identical columns and rows before to give it a matrix as small as possible.
# So ideal usage:
# prune_overshadowed_alleles(prune_identical_alleles(prune_identical_reads(binary_mtx)))
# np.dot() is prone to overflow and doesn't expand to int64 like np.sum(). Our binary matrices are np.uint16.
# If we have less than 65K rows in the binary mtx, we're good with uint16. We're also good if there's no
# column with 65K+ hits. We check for both with lazy 'or' to avoid calculating the sum if not needed anyway.
# If we would overflow, we change to uint32.
if (binary_mtx.shape[0] < np.iinfo(np.uint16).max) or (binary_mtx.sum(axis=0).max() < np.iinfo(np.uint16).max):
# In case we would reduce the binary mtx to np.uint8 we should to ensure it's at least uint16.
bb = binary_mtx if all(binary_mtx.dtypes == np.uint16) else binary_mtx.astype(np.uint16)
else:
bb = binary_mtx.astype(np.uint32)
covariance = bb.transpose().dot(bb)
diagonal = pd.Series([covariance[ii][ii] for ii in covariance.columns], index=covariance.columns)
new_covariance = covariance[covariance.columns]
for ii in new_covariance.columns:
new_covariance[ii][ii] = 0
overshadowed = []
for ii in new_covariance.columns:
potential_superiors = new_covariance[ii][new_covariance[ii]==diagonal[ii]].index
if any(diagonal[potential_superiors] > diagonal[ii]):
overshadowed.append(ii)
non_overshadowed = covariance.columns.difference(overshadowed)
return non_overshadowed
def create_paired_matrix(binary_1, binary_2, id_cleaning=None):
# id_cleaning is a function object that turns a read ID that contains pair member information (like XYZ_1234:5678/1) into an
# ID that identifies the pair itself (like XYZ_1234:5678) so we can match the two DF's IDs. In the above case a suitable
# id_cleaning function would be lambda x: x[:-2] (gets rid of /1 and /2 from the end). Sometimes it's trickier as pair member
# information can be somewhere in the middle, and sometimes it's not even required at all as both pair members have the same ID
# just in two different files (1000 genomes).
if id_cleaning is not None:
binary_1.index = list(map(id_cleaning, binary_1.index))
binary_2.index = list(map(id_cleaning, binary_2.index))
common_read_ids = binary_1.index.intersection(binary_2.index)
only_1 = binary_1.index.difference(binary_2.index)
only_2 = binary_2.index.difference(binary_1.index)
b_1 = binary_1.loc[common_read_ids]
b_2 = binary_2.loc[common_read_ids]
b_12 = b_1 * b_2 # elementwise AND
b_ispaired = b_12.any(axis=1) # reads with at least one allele w/ paired hits
b_paired = b_12.loc[b_ispaired]
b_mispaired = b_1.loc[~b_ispaired] + b_2.loc[~b_ispaired] # elementwise AND where two ends only hit different alleles
b_unpaired = pd.concat([binary_1.loc[only_1], binary_2.loc[only_2]]) # concatenation for reads w/ just one end mapping anywhere
if VERBOSE:
print(now(), ('Alignment pairing completed. %d paired, %d unpaired, %d discordant ' %
(b_paired.shape[0], b_unpaired.shape[0], b_mispaired.shape[0])))
return b_paired, b_mispaired, b_unpaired
def get_features(allele_id, features, feature_list):
# allele_id can be like HLA12345_HLA67890: then we take features from HLA12345 first if possible, then from HLA67890
# for sequence reconstruction (in that case HLA67890 should be a nearest neighbor of HLA12345)
if '_' in allele_id:
partial_allele, complete_allele = allele_id.split('_')
else:
complete_allele = allele_id
feats_complete = {(of['feature'], of['number']): of for _, of in features.loc[features['id']==complete_allele].iterrows()}
feats_partial = {(of['feature'], of['number']): of for _, of in features.loc[features['id']==partial_allele].iterrows()} if '_' in allele_id else feats_complete
feats_to_include = []
for feat in sorted(feature_list, key=feature_order):
if feat in feats_partial:
feats_to_include.append(feats_partial[feat])
elif feat in feats_complete:
feats_to_include.append(feats_complete[feat])
else:
warnings.warn('Feature %s not found for allele %s' % (feat, allele_id))
return pd.DataFrame(feats_to_include)
def calculate_coverage(alignment, features, alleles_to_plot, features_used):
assert len(alignment) in (2, 4, 5), ("Alignment tuple either has to have 2, 4 or 5 elements. First four: pos, read_details "
"once or twice depending on single or paired end, and an optional binary DF at the end for PROPER paired-end plotting")
has_pairing_info = (len(alignment) == 5)
if len(alignment) == 2:
matrix_pos, read_details = alignment[:2]
pos = matrix_pos[alleles_to_plot]
pairing = np.sign(pos) * 2 # see explanation on the else branch. These mean unpaired hits.
hit_counts = np.sign(pos).sum(axis=1) # how many of the selected alleles does a particular read hit? (unambiguous hit or not)
# if only a single allele is fed to this function that has no hits at all, we still want to create a null-matrix
# for it. Its initialization depends on max_ambiguity (it's the size of one of its dimensions) so we set this to 1 at least.
max_ambiguity = max(1, hit_counts.max())
to_process = [(pos, read_details, hit_counts, pairing)]
elif len(alignment) == 4: # TODO: deprecated. We don't use this, it should probably be taken out.
matrix_pos1, read_details1, matrix_pos2, read_details2 = alignment
pos1 = matrix_pos1[alleles_to_plot]
pos2 = matrix_pos2[alleles_to_plot]
pairing1 = np.sign(pos1) * 2 # every read is considered unpaired
pairing2 = np.sign(pos2) * 2
hit_counts1 = np.sign(pos1).sum(axis=1)
hit_counts2 = np.sign(pos2).sum(axis=1)
max_ambiguity = max(1, hit_counts1.max(), hit_counts2.max())
to_process = [(pos1, read_details1, hit_counts1, pairing1), (pos2, read_details2, hit_counts2, pairing2)]
else:
matrix_pos1, read_details1, matrix_pos2, read_details2, pairing_binaries = alignment
bin_p, bin_u, bin_m = pairing_binaries
pos1 = matrix_pos1[alleles_to_plot]
pos2 = matrix_pos2[alleles_to_plot]
# pairing's values - 0: no hit, 1: paired hit, 2: unpaired hit, 3: discordantly paired hit
pairing = pd.concat([bin_p[alleles_to_plot], bin_u[alleles_to_plot]*2, bin_m[alleles_to_plot]*3])
pairing1 = pairing.loc[pos1.index] # pairing information for first ends' hit matrix
pairing2 = pairing.loc[pos2.index] # pairing information for second ends' hit matrix
hit_counts1 = np.sign(pairing1).sum(axis=1)
hit_counts2 = np.sign(pairing2).sum(axis=1)
max_ambiguity = max(1, hit_counts1.max(), hit_counts2.max())
to_process = [(pos1, read_details1, hit_counts1, pairing1), (pos2, read_details2, hit_counts2, pairing2)]
coverage_matrices = []
for allele in alleles_to_plot:
allele_features = get_features(allele, features, features_used)
allele_length = allele_features['length'].sum()
# Dimensions:
# 1st - 0: perfect hit, 1: hit w/ mismatch(es)
# 2nd - 0: paired, 1: unpaired, 2: discordantly paired # maybe introduce unpaired despite being paired on other alleles
# 3rd - 0: unique hit, 1: ambiguous hit between two alleles, ... n: ambiguous bw/ n+1 alleles
# 4th - 0 ... [sequence length]
coverage = np.zeros((2, 3, max_ambiguity, allele_length), dtype=int)
# to_process is a list containing tuples with mapping/position/hit property dataframes to superimpose.
# For single-end plotting to_process has just one element. For paired end, to_process has two elements that
# were pre-processed to only plot reads where both pairs map, etc. but the point is that superimposing the two
# will provide the correct result.
for pos, read_details, hit_counts, pairing_info in to_process:
reads = pos[pos[allele]!=0].index # reads hitting allele: coverage plot will be built using these
for i_pos, i_read_length, i_mismatches, i_hitcount, i_pairing in zip(
pos.loc[reads][allele],
read_details.loc[reads]['read_length'],
read_details.loc[reads]['mismatches'],
hit_counts[reads],
pairing_info.loc[reads][allele]):
if not i_pairing:
continue # or i_pairing = 4. Happens if one end maps to the allele but a different allele has a full paired hit.
coverage[int(bool(i_mismatches))][i_pairing-1][i_hitcount-1][i_pos-1:i_pos-1+i_read_length] += 1
coverage_matrices.append((allele, coverage))
return coverage_matrices
def plot_coverage(outfile, coverage_matrices, allele_data, features, features_used, columns=2):
def start_end_zeros(cov_array):
# the areaplot polygon gets screwed up if the series don't end/start with zero
# this adds a zero to the sides to circumvent that
return np.append(np.append([0], cov_array), [0])
def allele_sorter(allele_cov_mtx_tuple):
allele, _ = allele_cov_mtx_tuple
return allele_data.loc[allele.split('_')[0]]['type']
def get_allele_locus(allele):
return allele_data.loc[allele.split('_')[0]]['locus']
number_of_loci = len(set((get_allele_locus(allele) for allele, _ in coverage_matrices)))
dpi = 50
box_size = (7, 1)
# subplot_rows = len(coverage_matrices)/columns + bool(len(coverage_matrices) % columns) # instead of float division + rounding up
# subplot_rows = 3 # TODO
subplot_rows = 3 * number_of_loci + 1 # rowspan=3 for each plot, legend at the bottom with third height and colspan=2
area_colors = [ # stacked plot colors
(0.26, 0.76, 0.26), # perfect, paired, unique
(0.40, 0.84, 0.40), # perfect, paired, shared
(0.99, 0.75, 0.20), # perfect, unpaired, unique
(0.99, 0.75, 0.20), # perfect, mispaired, unique
(0.99, 0.85, 0.35), # perfect, unpaired, shared
(0.99, 0.85, 0.35), # perfect, mispaired, shared
(0.99, 0.23, 0.23), # mismatch, paired, unique
(0.99, 0.49, 0.49), # mismatch, paired, shared
(0.14, 0.55, 0.72), # mismatch, unpaired, unique
(0.14, 0.55, 0.72), # mismatch, mispaired, unique
(0.33, 0.70, 0.88), # mismatch, unpaired, shared
(0.33, 0.70, 0.88)] # mismatch, mispaired, shared
figure = pylab.figure(figsize=(box_size[0]*columns, box_size[1]*subplot_rows), dpi=dpi) # TODO: dpi doesn't seem to do shit. Is it stuck in 100?
coverage_matrices = sorted(coverage_matrices, key=allele_sorter) # so that the A alleles come first, then B, and so on.
prev_locus = ''
i_locus = -1
for allele, coverage in coverage_matrices:
if '_' in allele:
partial, complete = allele.split('_')
plot_title = '%s (introns from %s)' % (allele_data.loc[partial]['type'], allele_data.loc[complete]['type']) # , allele for debugging (original ID)
else:
plot_title = allele_data.loc[allele]['type'] # + allele for debugging original ID
if prev_locus != get_allele_locus(allele): # new locus, start new row
i_locus += 1
i_allele_in_locus = 0
else:
i_allele_in_locus = 1
prev_locus = get_allele_locus(allele)
plot = pylab.subplot2grid((subplot_rows, columns), (3*i_locus, i_allele_in_locus), rowspan=3, adjustable='box')
_, _, max_ambig, seq_length = coverage.shape # first two dimensions known (mismatched[2], pairing[3])
shared_weighting = np.reciprocal(np.arange(max_ambig)+1.0) # --> 1, 1/2, 1/3...
shared_weighting[0] = 0 # --> 0, 1/2, 1/3, so the unique part doesn't get mixed in
perfect_paired_unique = start_end_zeros(coverage[0][0][0])
mismatch_paired_unique = start_end_zeros(coverage[1][0][0])
perfect_unpaired_unique = start_end_zeros(coverage[0][1][0])
mismatch_unpaired_unique = start_end_zeros(coverage[1][1][0])
perfect_mispaired_unique = start_end_zeros(coverage[0][2][0])
mismatch_mispaired_unique = start_end_zeros(coverage[1][2][0])
perfect_paired_shared = start_end_zeros(shared_weighting.dot(coverage[0][0]))
mismatch_paired_shared = start_end_zeros(shared_weighting.dot(coverage[1][0]))
perfect_unpaired_shared = start_end_zeros(shared_weighting.dot(coverage[0][1]))
mismatch_unpaired_shared = start_end_zeros(shared_weighting.dot(coverage[1][1]))
perfect_mispaired_shared = start_end_zeros(shared_weighting.dot(coverage[0][2]))
mismatch_mispaired_shared = start_end_zeros(shared_weighting.dot(coverage[1][2]))
# Exon annotation
i_start = 1 # position of last feature's end. It's one because we padded with zeros above
for _, ft in get_features(allele, features, features_used).iterrows():
if ft['feature'] == 'exon':
plot.axvspan(i_start, i_start + ft['length'], facecolor='black', alpha=0.1, linewidth=0, zorder=1)
i_start += ft['length']
areas = plot.stackplot(np.arange(seq_length+2), # seq_length+2 because of the zero padding at either end
perfect_paired_unique + 0.001, # so that 0 isn't -inf on the logplot, but still below cutoff
perfect_paired_shared,
perfect_unpaired_unique,
perfect_mispaired_unique,
perfect_unpaired_shared,
perfect_mispaired_shared,
mismatch_paired_unique,
mismatch_paired_shared,
mismatch_unpaired_unique,
mismatch_mispaired_unique,
mismatch_unpaired_shared,
mismatch_mispaired_shared,
linewidth=0, colors=area_colors, zorder=5)
for aa in areas:
# if you output to pdf it strangely doesn't respect linewidth=0 perfectly, you'll have
# to set line colors identical to area color to avoid having a black line between them
aa.set_edgecolor(aa.get_facecolor())
plot.tick_params(axis='both', labelsize=10, direction='out', which='both', top=False)
plot.text(.015, 0.97, plot_title, horizontalalignment='left', verticalalignment='top', transform=plot.transAxes, fontsize=10, zorder=6)
_, _, _, y2 = plot.axis()
plot.axis((0, seq_length, 1, y2)) # enforce y axis minimum at 10^0. This corresponds to zero coverage because of the +1 above
plot.set_yscale('log')
plot.set_ylim(bottom=0.5)
legend = pylab.subplot2grid((subplot_rows, columns), (subplot_rows-1, 0), colspan=2, adjustable='box')
ppp = pylab.matplotlib.patches
legend.add_patch(ppp.Rectangle((0, 2), 2, 2, color=area_colors[0]))
legend.add_patch(ppp.Rectangle((0, 0), 2, 2, color=area_colors[1]))
legend.add_patch(ppp.Rectangle((25, 2), 2, 2, color=area_colors[2]))
legend.add_patch(ppp.Rectangle((25, 0), 2, 2, color=area_colors[4]))
legend.add_patch(ppp.Rectangle((50, 2), 2, 2, color=area_colors[6]))
legend.add_patch(ppp.Rectangle((50, 0), 2, 2, color=area_colors[7]))
legend.add_patch(ppp.Rectangle((75, 2), 2, 2, color=area_colors[8]))
legend.add_patch(ppp.Rectangle((75, 0), 2, 2, color=area_colors[10]))
legend.text( 2.5, 3, 'paired, no mismatches, unique', va='center', size='smaller')
legend.text( 2.5, 1, 'paired, no mismatches, ambiguous', va='center', size='smaller')
legend.text(27.5, 3, 'unpaired, no mismatches, unique', va='center', size='smaller')
legend.text(27.5, 1, 'unpaired, no mismatches, ambiguous', va='center', size='smaller')
legend.text(52.5, 3, 'paired, mismatched, unique', va='center', size='smaller')
legend.text(52.5, 1, 'paired, mismatched, ambiguous', va='center', size='smaller')
legend.text(77.5, 3, 'unpaired, mismatched, unique', va='center', size='smaller')
legend.text(77.5, 1, 'unpaired, mismatched, ambiguous', va='center', size='smaller')
legend.set_xlim(0, 100)
legend.set_ylim(0, 4)
legend.axison = False
figure.tight_layout()
figure.savefig(outfile)