-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathCrossMap+.py
executable file
·1707 lines (1505 loc) · 60.7 KB
/
CrossMap+.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/opt/apps/python2/bin/python
"""
---------------------------------------------------------------------------------------
CrossMap: lift over genomics coordinates between assemblies.
Support BED, GFF/GTF, BAM/SAM, BigWig/Wig file formats.
------------------------------------------------------------------------------------------
"""
import os,sys
# Hard requirement: this script uses Python-2-only syntax (print statements,
# tuple parameter unpacking), so refuse to start on anything but CPython 2.7.x.
if sys.version_info[0] != 2 or sys.version_info[1] != 7:
    print >>sys.stderr, "\nYou are using python" + str(sys.version_info[0]) + '.' + str(sys.version_info[1]) + " CrossMap needs python2.7.*!\n"
    sys.exit()
import optparse
import collections
import sets
import subprocess
import string
from textwrap import wrap
from time import strftime
import pysam
from bx.bbi.bigwig_file import BigWigFile
from bx.intervals import *
import numpy as np
import datetime
from cmmodule import ireader
from cmmodule import BED
from cmmodule import annoGene
from cmmodule import bam_cigar
from cmmodule import sam_header
from cmmodule import myutils
from cmmodule import fasta
from cmmodule import bgrMerge
__author__ = "Liguo Wang, Hao Zhao"
__contributor__="Liguo Wang, Hao Zhao"
__copyright__ = "Copyleft"
__credits__ = []
__license__ = "GPLv2"
__version__="0.2.7"
__maintainer__ = "Liguo Wang"
__email__ = "wangliguo78@gmail.com"
__status__ = "Production"
def printlog (mesg_lst):
    """
    Write a timestamped progress message to stderr.

    mesg_lst is a list of strings; a single-element list is printed as-is,
    multiple elements are joined with single spaces.
    """
    stamp = "@ " + strftime("%Y-%m-%d %H:%M:%S") + ": "
    if len(mesg_lst) == 1:
        body = mesg_lst[0]
    else:
        body = ' '.join(mesg_lst)
    print >>sys.stderr, stamp + body
def parse_header( line ):
    """
    Parse a wiggle section header such as 'fixedStep chrom=chr1 start=10 step=5'
    into a dict, ignoring the leading keyword.  Values stay as strings.
    """
    result = {}
    for token in line.split()[1:]:
        key, value = token.split('=')
        result[key] = value
    return result
def revcomp_DNA(dna):
    """
    Return the reverse complement of a DNA sequence.

    Input is uppercased first; only A/C/G/T/N/X are accepted (any other
    character raises KeyError).  The result is always uppercase.
    """
    pairs = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N', 'X': 'X'}
    out = []
    for base in dna.upper()[::-1]:
        out.append(pairs[base])
    return "".join(out)
def wiggleReader( f ):
    """
    Iterate over a wiggle track file, yielding (chrom, start, end, strand, value).

    Coordinates are zero-based, half-open; strand is always '+' except for
    BED6+ lines, which carry their own strand column.  Supports bedGraph-style
    lines plus 'variableStep' and 'fixedStep' sections; track/browser/comment
    lines are skipped, and BED regions lacking a score are ignored.
    """
    current_chrom = None
    current_pos = None
    current_step = None
    # always for wiggle data
    strand = '+'
    mode = "bed"
    for line in ireader.reader(f):
        if line.isspace() or line.startswith( "track" ) or line.startswith( "#" ) or line.startswith( "browser" ):
            continue
        elif line.startswith( "variableStep" ):
            header = parse_header( line )
            current_chrom = header['chrom']
            current_pos = None
            current_step = None
            if 'span' in header: current_span = int( header['span'] )
            else: current_span = 1
            mode = "variableStep"
        elif line.startswith( "fixedStep" ):
            header = parse_header( line )
            current_chrom = header['chrom']
            current_pos = int( header['start'] ) - 1    # wiggle files are 1-based
            current_step = int( header['step'] )
            if 'span' in header: current_span = int( header['span'] )
            else: current_span = 1
            mode = "fixedStep"
        elif mode == "bed":
            fields = line.split()
            if len( fields ) > 3:
                if len( fields ) > 5:
                    # BED6+: strand lives in column 6
                    yield fields[0], int( fields[1] ), int( fields[2] ), fields[5], float( fields[3] )
                else:
                    yield fields[0], int( fields[1] ), int( fields[2] ), strand, float( fields[3] )
        elif mode == "variableStep":
            fields = line.split()
            pos = int( fields[0] ) - 1
            yield current_chrom, pos, pos + current_span, strand, float( fields[1] )
        elif mode == "fixedStep":
            yield current_chrom, current_pos, current_pos + current_span, strand, float( line.split()[0] )
            current_pos += current_step
        else:
            # Bug fix: raising a plain string is a TypeError on Python >= 2.6;
            # raise a real exception object instead.
            raise Exception("Unexpected input line: %s" % line.strip())
def bigwigReader(infile, chrom_sizes=None, bin_size = 2000):
    '''
    Iterate over a bigwig file, yielding (chrom, start, end, score) tuples
    that describe maximal runs of identical signal values.

    infile: bigwig format file
    chrom_sizes: dict of chrom_name -> size.  NOTE(review): the None default
        would crash on .items(); callers are assumed to always pass a dict.
    bin_size: number of bases fetched per query window.
    Windows that have no data, or whose values sum to zero, are skipped.
    '''
    bw_obj = BigWigFile(file=open(infile))
    for chr_name, chr_size in chrom_sizes.items():
        # walk each chromosome in fixed-size windows
        for chrom, st, end in BED.tillingBed(chrName = chr_name,chrSize = chr_size,stepSize = bin_size):
            sig_list = bw_obj.get_as_array(chrom,st,end)
            if sig_list is None:
                continue
            sig_list = np.nan_to_num(sig_list)
            if np.sum(sig_list)==0:
                continue    # window carries no signal at all
            # run-length encode the per-base values of this window
            low_bound = st
            point_num = 1
            score = sig_list[0]
            for value in (sig_list[1:]):
                if value == score:
                    point_num += 1
                else:
                    yield(( chrom, low_bound, low_bound + point_num, score))
                    score = value
                    low_bound = low_bound + point_num
                    point_num = 1
            # Bug fix: the run that reaches the end of the window was
            # previously never yielded, silently dropping trailing signal.
            yield(( chrom, low_bound, low_bound + point_num, score))
def check_bed12(bedline):
    '''
    Validate a single BED12 line.

    Returns True only when the line has exactly 12 whitespace-separated
    fields, a recognised strand ('+', '-' or '.'), numerically consistent
    coordinates (start <= end, thick region inside the record), and block
    counts/sizes/starts that agree with each other and are non-negative.
    Returns False otherwise; never raises on malformed input.
    '''
    fields = bedline.strip().split()
    if len(fields) !=12:
        return False
    if fields[5] not in ['+','-','.']:
        return False
    try:
        chromStart = int(fields[1])
        chromEnd = int(fields[2])
        thickStart = int(fields[6])
        thickEnd = int(fields[7])
        blockCount = int(fields[9])
        blockSizes = [int(i) for i in fields[10].rstrip(',').split(',')]
        blockStarts = [int(i) for i in fields[11].rstrip(',').split(',')]
    except (ValueError, IndexError):
        # narrowed from a bare 'except:': only parse failures mean "invalid",
        # anything else (e.g. KeyboardInterrupt) should propagate
        return False
    if chromStart > chromEnd or thickStart > thickEnd:
        return False
    if thickStart < chromStart or thickEnd > chromEnd:
        return False
    if len(blockSizes) != blockCount:
        return False
    if len(blockStarts) != blockCount:
        return False
    if blockCount <1:
        return False
    for i in blockSizes:
        if i < 0: return False
    for i in blockStarts:
        if i < 0: return False
    return True
def intersectBed(region1, region2):
    '''
    Return the intersection of two BED intervals as (chrom, start, end),
    or None when the regions lie on different chromosomes or do not overlap.

    Each region is a (chrom, start, end) tuple; touching intervals
    (end1 == st2) yield a zero-length intersection rather than None.
    Raises Exception when start > end in either region.

    Rewritten without Python-2-only tuple parameter unpacking (removed by
    PEP 3113); the call signature -- two positional tuples -- is unchanged.
    '''
    (chr1, st1, end1) = region1
    (chr2, st2, end2) = region2
    if int(st1) > int(end1) or int(st2) > int(end2):
        raise Exception ("Start cannot be larger than end")
    if chr1 != chr2:
        return None
    if int(st1) > int(end2) or int(end1) < int(st2):
        return None
    return (chr1, max(st1, st2), min(end1,end2))
def read_chain_file (chain_file, print_table=False):
'''
input chain_file could be either plain text, compressed file (".gz", ".Z", ".z", ".bz", ".bz2", ".bzip2"),
or a URL pointing to the chain file ("http://", "https://", "ftp://"). If url was used, chain file must be plain text
'''
printlog(["Read chain_file: ", chain_file]),
maps={}
target_chromSize={}
source_chromSize={}
if print_table:
blocks=[]
for line in ireader.reader(chain_file):
# Example: chain 4900 chrY 58368225 + 25985403 25985638 chr5 151006098 - 43257292 43257528 1
if not line.strip():
continue
line=line.strip()
if line.startswith(('#',' ')):continue
fields = line.split()
if fields[0] == 'chain' and len(fields) in [12, 13]:
score = int(fields[1]) # Alignment score
source_name = fields[2] # E.g. chrY
source_size = int(fields[3]) # Full length of the chromosome
source_strand = fields[4] # Must be +
if source_strand != '+':
raise Exception("Source strand in an .over.chain file must be +. (%s)" % line)
source_start = int(fields[5]) # Start of source region
source_end = int(fields[6]) # End of source region
target_name = fields[7] # E.g. chr5
target_size = int(fields[8]) # Full length of the chromosome
target_strand = fields[9] # + or -
target_start = int(fields[10])
target_end = int(fields[11])
target_chromSize[target_name]= target_size
source_chromSize[source_name] = source_size
if target_strand not in ['+', '-']:
raise Exception("Target strand must be - or +. (%s)" % line)
#if target_strand == '+':
# target_start = int(fields[10])
# target_end = int(fields[11])
#if target_strand == '-':
# target_start = target_size - target_end
# target_end = target_size - target_start
chain_id = None if len(fields) == 12 else fields[12]
if source_name not in maps:
maps[source_name] = Intersecter()
sfrom, tfrom = source_start, target_start
# Now read the alignment chain from the file and store it as a list (source_from, source_to) -> (target_from, target_to)
elif fields[0] != 'chain' and len(fields) == 3:
size, sgap, tgap = int(fields[0]), int(fields[1]), int(fields[2])
if print_table:
if target_strand == '+': blocks.append((source_name,sfrom, sfrom+size, source_strand, target_name, tfrom, tfrom+size, target_strand))
elif target_strand == '-': blocks.append((source_name,sfrom, sfrom+size, source_strand, target_name, target_size - (tfrom+size), target_size - tfrom, target_strand))
if target_strand == '+':
maps[source_name].add_interval( Interval(sfrom, sfrom+size,(target_name,tfrom, tfrom+size,target_strand)))
elif target_strand == '-':
maps[source_name].add_interval( Interval(sfrom, sfrom+size,(target_name,target_size - (tfrom+size), target_size - tfrom, target_strand)))
sfrom += size + sgap
tfrom += size + tgap
elif fields[0] != 'chain' and len(fields) == 1:
size = int(fields[0])
if print_table:
if target_strand == '+': blocks.append((source_name,sfrom, sfrom+size, source_strand, target_name, tfrom, tfrom+size, target_strand))
elif target_strand == '-': blocks.append((source_name,sfrom, sfrom+size, source_strand, target_name, target_size - (tfrom+size), target_size - tfrom, target_strand))
if target_strand == '+':
maps[source_name].add_interval( Interval(sfrom, sfrom+size,(target_name,tfrom, tfrom+size,target_strand)))
elif target_strand == '-':
maps[source_name].add_interval( Interval(sfrom, sfrom+size,(target_name,target_size - (tfrom+size), target_size - tfrom, target_strand)))
else:
raise Exception("Invalid chain format. (%s)" % line)
if (sfrom + size) != source_end or (tfrom + size) != target_end:
raise Exception("Alignment blocks do not match specified block sizes. (%s)" % header)
if print_table:
for i in blocks:
print '\t'.join([str(n) for n in i])
return (maps,target_chromSize, source_chromSize)
def map_coordinates(mapping, q_chr, q_start, q_end, q_strand='+', print_match=False):
'''
Map coordinates from source assembly to target assembly
'''
matches=[]
complement ={'+':'-','-':'+'}
if q_chr not in mapping:
return None
else:
targets = mapping[q_chr].find(q_start, q_end)
if len(targets)==0:
return None
elif len(targets)==1:
s_start = targets[0].start
s_end = targets[0].end
t_chrom = targets[0].value[0]
t_start = targets[0].value[1]
t_end = targets[0].value[2]
t_strand = targets[0].value[3]
(chr, real_start, real_end) = intersectBed((q_chr,q_start,q_end),(q_chr,s_start,s_end))
l_offset = abs(real_start - s_start)
#r_offset = real_end - s_end
size = abs(real_end - real_start)
matches.append( (chr, real_start, real_end,q_strand))
if t_strand == '+':
i_start = t_start + l_offset
if q_strand == '+':
matches.append( (t_chrom, i_start, i_start + size, t_strand))
else:
matches.append( (t_chrom, i_start, i_start + size, complement[t_strand]))
elif t_strand == '-':
i_start = t_end - l_offset - size
if q_strand == '+':
matches.append( (t_chrom, i_start, i_start + size, t_strand))
else:
matches.append( (t_chrom, i_start, i_start + size, complement[t_strand]))
else:
raise Exception("Unknown strand: %s. Can only be '+' or '-'." % q_strand)
elif len(targets) > 1:
for t in targets:
s_start = t.start
s_end = t.end
t_chrom = t.value[0]
t_start = t.value[1]
t_end = t.value[2]
t_strand = t.value[3]
(chr, real_start, real_end) = intersectBed((q_chr,q_start,q_end),(q_chr,s_start,s_end))
l_offset = abs(real_start - s_start)
#r_offset = abs(real_end - s_end)
size = abs(real_end - real_start)
matches.append( (chr, real_start, real_end,q_strand) )
if t_strand == '+':
i_start = t_start + l_offset
if q_strand == '+':
matches.append( (t_chrom, i_start, i_start + size, t_strand))
else:
matches.append( (t_chrom, i_start, i_start + size, complement[t_strand]))
elif t_strand == '-':
i_start = t_end - l_offset - size
if q_strand == '+':
matches.append( (t_chrom, i_start, i_start + size, t_strand))
else:
matches.append( (t_chrom, i_start, i_start + size, complement[t_strand]))
else:
raise Exception("Unknown strand: %s. Can only be '+' or '-'." % q_strand)
if print_match:
print matches
# input: 'chr1',246974830,247024835
# output: [('chr1', 246974830, 246974833, '+' ), ('chr1', 248908207, 248908210, '+' ), ('chr1', 247024833, 247024835, '+'), ('chr1', 249058210, 249058212,'+')]
# [('chr1', 246974830, 246974833), ('chr1', 248908207, 248908210)]
return matches
def crossmap_vcf_file(mapping, infile,outfile, liftoverfile, refgenome):
    '''
    Convert genome coordinates in VCF format.

    mapping      : interval trees built by read_chain_file()
    infile       : input VCF (plain, compressed or URL, via ireader)
    outfile      : output VCF; unmapped/rejected records go to outfile + '.unmap'
    liftoverfile : chain file name (recorded in the output header only)
    refgenome    : FASTA of the target assembly, used to rewrite REF alleles
    '''
    #index refegenome file if it hasn't been done
    if not os.path.exists(refgenome + '.fai'):
        printlog(["Creating index for", refgenome])
        pysam.faidx(refgenome)
    refFasta = pysam.Fastafile(refgenome)
    FILE_OUT = open(outfile ,'w')
    UNMAP = open(outfile + '.unmap','w')
    total = 0       # data lines seen
    fail = 0        # data lines that could not be lifted
    withChr = False # sticky flag: True once any 'chr'-prefixed name is seen
    for line in ireader.reader(infile):
        if not line.strip():
            continue
        line=line.strip()
        # '##' meta-information lines are copied to both outputs unchanged
        if line.startswith('##'):
            print >>FILE_OUT,line
            print >>UNMAP, line
            continue
        # split only the 8 fixed VCF columns; FORMAT/genotype columns
        # stay joined inside fields[7]
        fields = string.split(line,maxsplit=7)
        # '#CHROM ...' column-header line: inject liftover provenance first
        if fields[0].startswith('#'):
            print >>FILE_OUT, "##liftOverProgram=CrossMap(https://sourceforge.net/projects/crossmap/)"
            print >>FILE_OUT, "##liftOverFile=" + liftoverfile
            print >>FILE_OUT, "##new_reference_genome=" + refgenome
            print >>FILE_OUT, "##liftOverTime=" + datetime.date.today().strftime("%B%d,%Y")
            print >>FILE_OUT,line
            print >>UNMAP, line
        else:
            total += 1
            # normalize the chromosome name to the 'chr'-prefixed form
            # (presumably matching the chain file naming -- TODO confirm)
            if fields[0].startswith('chr'):
                withChr = True
                chrom = fields[0]
            else:
                chrom = 'chr' + fields[0]
            start = int(fields[1])-1 # 0 based
            # REF allele length defines the query interval
            end = start + len(fields[3])
            a = map_coordinates(mapping, chrom, start, end,'+')
            if a is None:
                print >>UNMAP, line
                fail += 1
                continue
            # len(a) == 2 means a unique (non-split) mapping: a = [query, target]
            if len(a) == 2:
                # update chrom -- strip the 'chr' prefix again if the input lacked it
                if withChr is False:
                    fields[0] = str(a[1][0]).replace('chr','')
                else:
                    fields[0] = str(a[1][0])
                # update start coordinate (back to 1-based)
                fields[1] = a[1][1] + 1
                # update ref allele from the target-assembly sequence
                # (fetch uses 0-based half-open coordinates)
                fields[3] = refFasta.fetch(str(a[1][0]),a[1][1],a[1][2]).upper()
                # drop records whose new REF equals ALT (no longer a variant)
                if fields[3] != fields[4]:
                    print >>FILE_OUT, '\t'.join(map(str, fields))
                else:
                    print >>UNMAP, line
                    fail += 1
            else:
                # split across multiple target regions: treated as unmappable
                print >>UNMAP, line
                fail += 1
                continue
    FILE_OUT.close()
    UNMAP.close()
    printlog (["Total entries:", str(total)])
    printlog (["Failed to map:", str(fail)])
def crossmap_bed_file(mapping, inbed,outfile=None):
    '''
    Convert genome coordinates (in bed format) between assemblies.
    BED format: http://genome.ucsc.edu/FAQ/FAQformat.html#format1

    mapping : interval trees built by read_chain_file()
    inbed   : input BED file (plain, compressed or URL, via ireader)
    outfile : if None, results (or failure markers) are printed to stdout
              alongside the original line; otherwise converted lines go to
              'outfile' and failed lines to 'outfile.unmap'.

    Lines with fewer than 12 columns are mapped region-by-region (a region
    split across several target blocks produces one output line per piece);
    12- and 20-column lines are treated as gene models, where every exon
    must lift uniquely to the same target chromosome and strand.
    '''
    # check if 'outfile' was set. If not set, print to screen, if set, print to file
    if outfile is not None:
        FILE_OUT = open(outfile,'w')
        UNMAP = open(outfile + '.unmap','w')
    else:
        pass
    for line in ireader.reader(inbed):
        # skip headers, browser/track directives and blank lines
        if line.startswith('#'):continue
        if line.startswith('track'):continue
        if line.startswith('browser'):continue
        if not line.strip():continue
        line=line.strip()
        fields=line.split()
        strand = '+'
        # filter out line less than 3 columns
        if len(fields)<3:
            print >>sys.stderr, "Less than 3 fields. skip " + line
            if outfile:
                print >>UNMAP, line
            continue
        # start column must parse as an integer
        try:
            int(fields[1])
        except:
            print >>sys.stderr, "Start corrdinate is not an integer. skip " + line
            if outfile:
                print >>UNMAP, line
            continue
        # end column must parse as an integer
        try:
            int(fields[2])
        except:
            print >>sys.stderr, "End corrdinate is not an integer. skip " + line
            if outfile:
                print >>UNMAP, line
            continue
        # start must not exceed end
        if int(fields[1]) > int(fields[2]):
            print >>sys.stderr, "\"Start\" is larger than \"End\" corrdinate is not an integer. skip " + line
            if outfile:
                print >>UNMAP, line
            continue
        # deal with bed less than 12 columns
        if len(fields)<12:
            # try to reset strand from any '+'/'-' column (the last one wins)
            try:
                for f in fields:
                    if f in ['+','-']:
                        strand = f
            except:
                pass
            a = map_coordinates(mapping, fields[0], int(fields[1]),int(fields[2]),strand)
            # example of a: [('chr1', 246974830, 246974833,'+'), ('chr1', 248908207, 248908210,'+')]
            try:
                # a alternates query/target pieces; an odd length means an
                # incomplete pairing, so treat the line as failed.
                # NOTE: if a is None the len() call raises and is handled below.
                if len(a) % 2 != 0:
                    if outfile is None:
                        print line + '\tFail'
                    else:
                        print >>UNMAP, line
                    continue
                if len(a) >=2 :
                    # emit one output line per (query, target) pair
                    count=0
                    for j in range(1,len(a),2):
                        count += 1
                        fields[0] = a[j][0]
                        fields[1] = a[j][1]
                        fields[2] = a[j][2]
                        for i in range(0,len(fields)): #update the strand information
                            if fields[i] in ['+','-']:
                                fields[i] = a[j][3]
                        if outfile is None:
                            print line + '\t'+ '(split.' + str(count) + ':' + ':'.join([str(i) for i in a[j-1]]) + ')\t' + '\t'.join([str(i) for i in fields])
                        else:
                            print >>FILE_OUT, '\t'.join([str(i) for i in fields])
            except:
                # reached when a is None (no match) or malformed
                if outfile is None:
                    print line + '\tFail'
                else:
                    print >>UNMAP, line
                continue
        # deal with bed12 and bed12+8 (genePred format)
        if len(fields)==12 or len(fields)==20:
            strand = fields[5]
            if strand not in ['+','-']:
                raise Exception("Unknown strand: %s. Can only be '+' or '-'." % strand)
            fail_flag=False
            exons_old_pos = annoGene.getExonFromLine(line) #[[chr,st,end],[chr,st,end],...]
            exons_new_pos = []
            for e_chr, e_start, e_end in exons_old_pos:
                a = map_coordinates(mapping, e_chr, e_start, e_end, strand)
                if a is None:
                    fail_flag =True
                    break
                if len(a) == 2:
                    # a has two elements, first is query, 2nd is target.
                    # [('chr1', 246974830, 246974833,'+'), ('chr1', 248908207, 248908210,'+')]
                    exons_new_pos.append(a[1])
                else:
                    # exon split across target regions: whole gene model fails
                    fail_flag =True
                    break
            if not fail_flag:
                # check if all exons were mapped to the same chromosome the same strand
                chr_id = set()
                exon_strand = set()
                for e_chr, e_start, e_end, e_strand in exons_new_pos:
                    chr_id.add(e_chr)
                    exon_strand.add(e_strand)
                if len(chr_id) != 1 or len(exon_strand) != 1:
                    fail_flag = True
                if not fail_flag:
                    # build new bed; thickStart/thickEnd keep their original
                    # offsets relative to the transcript boundaries
                    cds_start_offset = int(fields[6]) - int(fields[1])
                    cds_end_offset = int(fields[2]) - int(fields[7])
                    new_chrom = exons_new_pos[0][0]
                    new_chrom_st = exons_new_pos[0][1]
                    new_chrom_end = exons_new_pos[-1][2]
                    new_name = fields[3]
                    new_score = fields[4]
                    new_strand = exons_new_pos[0][3]
                    new_thickStart = new_chrom_st + cds_start_offset
                    new_thickEnd = new_chrom_end - cds_end_offset
                    new_ittemRgb = fields[8]
                    new_blockCount = len(exons_new_pos)
                    new_blockSizes = ','.join([str(o - n) for m,n,o,p in exons_new_pos])
                    new_blockStarts = ','.join([str(n - new_chrom_st) for m,n,o,p in exons_new_pos])
                    new_bedline = '\t'.join(str(i) for i in (new_chrom,new_chrom_st,new_chrom_end,new_name,new_score,new_strand,new_thickStart,new_thickEnd,new_ittemRgb,new_blockCount,new_blockSizes,new_blockStarts))
                    # final sanity check on the assembled bed12 line
                    if check_bed12(new_bedline) is False:
                        fail_flag = True
                    else:
                        if outfile is None:
                            print line + '\t->\t' + new_bedline
                        else:
                            print >>FILE_OUT, new_bedline
            if fail_flag:
                if outfile is None:
                    print line + '\tFail'
                else:
                    print >>UNMAP, line
def crossmap_gff_file(mapping, ingff,outfile=None):
'''
Convert genome coordinates (in GFF/GTF format) between assemblies.
GFF (General Feature Format) lines have nine required fields that must be Tab-separated:
1. seqname - The name of the sequence. Must be a chromosome or scaffold.
2. source - The program that generated this feature.
3. feature - The name of this type of feature. Some examples of standard feature types
are "CDS", "start_codon", "stop_codon", and "exon".
4. start - The starting position of the feature in the sequence. The first base is numbered 1.
5. end - The ending position of the feature (inclusive).
6. score - A score between 0 and 1000. If the track line useScore attribute is set to 1
for this annotation data set, the score value will determine the level of gray in
which this feature is displayed (higher numbers = darker gray). If there is no score
value, enter ".".
7. strand - Valid entries include '+', '-', or '.' (for don't know/don't care).
8. frame - If the feature is a coding exon, frame should be a number between 0-2 that
represents the reading frame of the first base. If the feature is not a coding exon,
the value should be '.'.
9. group - All lines with the same group are linked together into a single item.
GFF format: http://genome.ucsc.edu/FAQ/FAQformat.html#format3
GTF (Gene Transfer Format) is a refinement to GFF that tightens the specification. The
first eight GTF fields are the same as GFF. The group field has been expanded into a
list of attributes. Each attribute consists of a type/value pair. Attributes must end
in a semi-colon, and be separated from any following attribute by exactly one space.
GTF format: http://genome.ucsc.edu/FAQ/FAQformat.html#format4
We do NOT check if features (exon, CDS, etc) belonging to the same gene originally were
converted into the same chromosome/strand.
'''
if outfile is not None:
FILE_OUT = open(outfile,'w')
UNMAP = open(outfile + '.unmap', 'w')
for line in ireader.reader(ingff):
if line.startswith('#'):continue
if line.startswith('track'):continue
if line.startswith('browser'):continue
if line.startswith('visibility'):continue
if not line.strip():continue
line=line.strip()
fields=line.split('\t')
# check GFF file
#if len(fields) != 9:
# print >>sys.stderr, 'GFF file must has 9 columns. Skip ' + line
# continue
try:
start = int(fields[3]) - 1 #0-based
end = int(fields[4])/1
feature_size = end - start
except:
print >>sys.stderr, 'Cannot recognize \"start\" and \"end\" coordinates. Skip ' + line
if outfile:
print >>UNMAP, line
continue
if fields[6] not in ['+','-','.']:
print >>sys.stderr, 'Cannot recognize \"strand\". Skip ' + line
if outfile:
print >>UNMAP, line
continue
strand = '-' if fields[6] == '-' else '+'
chrom = fields[0]
a = map_coordinates(mapping, chrom,start,end,strand)
if a is None:
if outfile is None:
print line + '\tfail (no match to target assembly)'
else:
print >>UNMAP, line
continue
if len(a) !=2:
if outfile is None:
print line + '\tfail (multpile match to target assembly)'
else:
print >>UNMAP, line
else:
if (int(a[1][2]) - int(a[1][1])) != feature_size: # check if it is exact match
if outfile is None:
print line + '\tfail (not exact match)'
else:
print >>UNMAP, line
fields[0] = a[1][0] # chrom
fields[3] = a[1][1] + 1 # start, 1-based
fields[4] = a[1][2]
fields[6] = a[1][3]
if outfile is None:
print line + '\t->\t' + '\t'.join([str(i) for i in fields])
else:
print >>FILE_OUT, '\t'.join([str(i) for i in fields])
def crossmap_bam_file(mapping, chainfile, infile, outfile_prefix, chrom_size, IS_size=200, IS_std=30, fold=3, addtag = True):
'''
Convert genome coordinates (in BAM/SAM format) between assemblies.
BAM/SAM format: http://samtools.sourceforge.net/
chrom_size is target chromosome size
if addtag is set to True, will add tags to each alignmnet:
Q = QC (QC failed)
N = unmapped (originally unmapped or originally mapped but failed to liftover to new assembly)
M = multiple mapped (alignment can be liftover to multiple places)
U = unique mapped (alignment can be liftover to only 1 place)
tags for pair-end sequencing include:
QF: QC failed
NN: both read1 and read2 unmapped
NU: read1 unmapped, read2 unique mapped
NM: read1 unmapped, multiple mapped
UN: read1 uniquely mapped, read2 unmap
UU: both read1 and read2 uniquely mapped
UM: read1 uniquely mapped, read2 multiple mapped
MN: read1 multiple mapped, read2 unmapped
MU: read1 multiple mapped, read2 unique mapped
MM: both read1 and read2 multiple mapped
tags for single-end sequencing include:
QF: QC failed
SN: unmaped
SM: multiple mapped
SU: uniquely mapped
'''
# determine the input file format (BAM or SAM)
try:
samfile = pysam.Samfile(infile,'rb')
if len(samfile.header) ==0:
print >>sys.stderr, "BAM file has no header section. Exit!"
sys.exit(1)
bam_format = True
except:
samfile = pysam.Samfile(infile,'r')
if len(samfile.header) ==0:
print >>sys.stderr, "SAM file has no header section. Exit!"
sys.exit(1)
bam_format = False
# add comments into BAM/SAM header section
if bam_format:
comments=['Liftover from original BAM file: ' + infile]
else:
comments=['Liftover from original SAM file: ' + infile]
comments.append('Liftover is based on the chain file: ' + chainfile)
sam_ori_header = samfile.header
(new_header, name_to_id) = sam_header.bam_header_generator(orig_header = sam_ori_header, chrom_size = chrom_size, prog_name="CrossMap",prog_ver = __version__, format_ver=1.0,sort_type = 'coordinate',co=comments)
#
if outfile_prefix is not None:
if bam_format:
OUT_FILE = pysam.Samfile( outfile_prefix + '.bam', "wb", header = new_header )
#OUT_FILE_UNMAP = pysam.Samfile( outfile_prefix + '.unmap.bam', "wb", template=samfile )
printlog (["Liftover BAM file:", infile, '==>', outfile_prefix + '.bam'])
else:
OUT_FILE = pysam.Samfile( outfile_prefix + '.sam', "wh", header = new_header )
#OUT_FILE_UNMAP = pysam.Samfile( outfile_prefix + '.unmap.sam', "wh", template=samfile )
printlog (["Liftover SAM file:", infile, '==>', outfile_prefix + '.sam'])
else:
if bam_format:
OUT_FILE = pysam.Samfile( '-', "wb", header = new_header )
#OUT_FILE_UNMAP = pysam.Samfile( infile.replace('.bam','') + '.unmap.bam', "wb", template=samfile )
printlog (["Liftover BAM file:", infile])
else:
OUT_FILE = pysam.Samfile( '-', "w", header = new_header )
#OUT_FILE_UNMAP = pysam.Samfile( infile.replace('.sam','') + '.unmap.sam', "wh", template=samfile )
printlog (["Liftover SAM file:", infile])
QF = 0
NN = 0
NU = 0
NM = 0
UN = 0
UU = 0
UM = 0
MN = 0
MU = 0
MM = 0
SN = 0
SM = 0
SU = 0
total_item = 0
try:
while(1):
total_item += 1
new_alignment = pysam.AlignedRead() # create AlignedRead object
old_alignment = samfile.next()
# qcfailed reads will be written to OUT_FILE_UNMAP for both SE and PE reads
if old_alignment.is_qcfail:
QF += 1
if addtag: old_alignment.set_tag(tag="QF", value=0)
OUT_FILE.write(old_alignment)
continue
# new_alignment.qqual = old_alignment.qqual # quality string of read, exclude soft clipped part
# new_alignment.qlen = old_alignment.qlen # length of the "aligned part of read"
# new_alignment. aligned_pairs = old_alignment. aligned_pairs #read coordinates <=> ref coordinates
# new_alignment_aend = old_alignment.aend # reference End position of the aligned part (of read)
#new_alignment.qname = old_alignment.qname # 1st column. read name.
#new_alignment.flag = old_alignment.flag # 2nd column. subject to change. flag value
#new_alignment.tid = old_alignment.tid # 3rd column. samfile.getrname(tid) == chrom name
#new_alignment.pos = old_alignment.pos # 4th column. reference Start position of the aligned part (of read) [0-based]
#new_alignment.mapq = old_alignment.mapq # 5th column. mapping quality
#new_alignment.cigar= old_alignment.cigar # 6th column. subject to change.
#new_alignment.rnext = old_alignment.rnext # 7th column. tid of the reference (mate read mapped to)
#new_alignment.pnext = old_alignment.pnext # 8th column. position of the reference (0 based, mate read mapped to)
#new_alignment.tlen = old_alignment.tlen # 9th column. insert size
#new_alignment.seq = old_alignment.seq # 10th column. read sequence. all bases.
#new_alignment.qual = old_alignment.qual # 11th column. read sequence quality. all bases.
#new_alignment.tags = old_alignment.tags # 12 - columns
new_alignment.qname = old_alignment.qname # 1st column. read name.
new_alignment.seq = old_alignment.seq # 10th column. read sequence. all bases.
new_alignment.qual = old_alignment.qual # 11th column. read sequence quality. all bases.
new_alignment.tags = old_alignment.tags # 12 - columns
# by default pysam will change RG:Z to RG:A, which can cause downstream failures with GATK and freebayes
# Thanks Wolfgang Resch <wresch@helix.nih.gov> identified this bug and provided solution.
try:
rg, rgt = old_alignment.get_tag("RG", with_value_type=True)
except KeyError:
pass
else:
new_alignment.set_tag("RG", str(rg), rgt)
if old_alignment.is_paired:
new_alignment.flag = 0x0001 #pair-end
if old_alignment.is_read1:
new_alignment.flag = new_alignment.flag | 0x40
elif old_alignment.is_read2:
new_alignment.flag = new_alignment.flag | 0x80
#==================================
# R1 is originally unmapped
#==================================
if old_alignment.is_unmapped:
#------------------------------------
# both R1 and R2 unmapped
#------------------------------------
if old_alignment.mate_is_unmapped:
NN += 1
if addtag: old_alignment.set_tag(tag="NN", value=0)
OUT_FILE.write(old_alignment)
continue
else: # originally, read-1 unmapped, read-2 is mapped
try:
read2_chr = samfile.getrname(old_alignment.rnext)
read2_strand = '-' if old_alignment.mate_is_reverse else '+'
read2_start = old_alignment.pnext
read2_end = read2_start + 1
read2_maps = map_coordinates(mapping, read2_chr, read2_start, read2_end, read2_strand)
# [('chr1', 246974830, 246974833, '+' ), ('chr1', 248908207, 248908210, '+' )]
except:
read2_maps = None
#------------------------------------
# both R1 and R2 unmapped
#------------------------------------
if read2_maps is None:
NN += 1
if addtag: old_alignment.set_tag(tag="NN", value=0)
OUT_FILE.write(old_alignment)
continue
#------------------------------------
# R1 unmapped, R2 unique
#------------------------------------
elif len(read2_maps) == 2:
# 2
if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20
# 3
new_alignment.tid = name_to_id[read2_maps[1][0]] #recommend to set the RNAME of unmapped read to its mate's
# 4
new_alignment.pos = read2_maps[1][1] #recommend to set the POS of unmapped read to its mate's
# 5
new_alignment.mapq = old_alignment.mapq
# 6
new_alignment.cigar = old_alignment.cigar
# 7
new_alignment.rnext = name_to_id[read2_maps[1][0]]
# 8
new_alignment.pnext = read2_maps[1][1] #start
# 9
new_alignment.tlen = 0
NU += 1
if addtag: new_alignment.set_tag(tag="NU", value=0)
OUT_FILE.write(new_alignment)
continue
#------------------------------------
# R1 unmapped, R2 multiple
#------------------------------------
else:
# 2
if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20
# 2
new_alignment.flag = new_alignment.flag | 0x100
# 3
new_alignment.tid = name_to_id[read2_maps[1][0]]
# 4
new_alignment.pos = read2_maps[1][1]
# 5
new_alignment.mapq = old_alignment.mapq
# 6
new_alignment.cigar = old_alignment.cigar
# 7
new_alignment.rnext = name_to_id[read2_maps[1][0]]
# 8
new_alignment.pnext = read2_maps[1][1] #start
# 9
new_alignment.tlen = 0
NM += 1
if addtag: new_alignment.set_tag(tag="NM", value=0)
OUT_FILE.write(new_alignment)
continue
#==================================
# R1 is originally mapped
#==================================
else:
try:
read1_chr = samfile.getrname(old_alignment.tid)
read1_strand = '-' if old_alignment.is_reverse else '+'
read1_start = old_alignment.pos
read1_end = old_alignment.aend
read1_maps = map_coordinates(mapping, read1_chr, read1_start, read1_end, read1_strand)
except:
read1_maps = None
if not old_alignment.mate_is_unmapped:
try:
read2_chr = samfile.getrname(old_alignment.rnext)
read2_strand = '-' if old_alignment.mate_is_reverse else '+'
read2_start = old_alignment.pnext
read2_end = read2_start + 1
read2_maps = map_coordinates(mapping, read2_chr, read2_start, read2_end, read2_strand)
# [('chr1', 246974830, 246974833, '+' ), ('chr1', 248908207, 248908210, '+' )]
except:
read2_maps = None
#------------------------------------
# R1 unmapped (failed to liftover)
#------------------------------------
if read1_maps is None:
# 2 update flag (0x4: segment unmapped)
new_alignment.flag = new_alignment.flag | 0x4
# 3
new_alignment.tid = -1
# 4
new_alignment.pos = 0
# 5
new_alignment.mapq = 255
# 6
new_alignment.cigar = old_alignment.cigar
# 7
new_alignment.rnext = -1
# 8
new_alignment.pnext = 0
# 9
new_alignment.tlen = 0
# (1) read2 is unmapped before conversion
if old_alignment.mate_is_unmapped:
NN += 1
if addtag: new_alignment.set_tag(tag="NN", value=0)
OUT_FILE.write(new_alignment)
continue