forked from gringer/bioinfscripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
maf2gfa.pl
executable file
·119 lines (105 loc) · 2.76 KB
/
maf2gfa.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/perl
## Filters out non-end and identical LAST results to simulate overlap alignments
use warnings;
use strict;
use Getopt::Long qw(:config auto_help pass_through);
use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
my $seqFileName = "";
my $ignoreSelf = 0;
my $endLeniency = 100; ## base pairs from end to allow
my $containFrac = 0.9; ## minimum containment fraction
my $containIdent = 0.9; ## minimum containment identity
my %quals = ();
my $qSeq = "";
my $qStart = 0;
my $qEnd = 0;
my $qLen = 0;
my $qStrand = "";
my $qMatchLen = 0;
my $qName = "";
my $tSeq = "";
my $tStart = 0;
my $tEnd = 0;
my $tMatchLen = 0;
my $tLen = 0;
my $tName = "";
my $lineBuffer = "";
my $GFAspec = 1;
my %matches = ();
if($GFAspec == 2){
print("H\tVN:Z:2.0\n");
} else {
print("H\tVN:Z:bogart/edges\n");
}
my $fastaFileName = shift(@ARGV);
if(!(-e "${fastaFileName}.fai")){
print(STDERR "Generating index file... ");
system(("samtools", "faidx", $fastaFileName));
print("done\n");
}
open(my $fastaFile, "<", "${fastaFileName}.fai");
while(<$fastaFile>){
chomp;
my @F = split(/\s+/);
if($GFAspec == 2){
printf("S\t%s\t%d\t*\n", $F[0], $F[1]);
} else {
printf("S\t%s\t*\tLN:i:%d\n", $F[0], $F[1]);
}
}
close($fastaFile);
while(<>){
if(/^$/){
next;
}
if(!/^[as]/){
next;
}
$lineBuffer .= $_;
my @F = split(/\s+/);
if($F[0] eq "a"){
$qSeq = "";
$tSeq = "";
} elsif($F[0] eq "s"){
if($tSeq){
$qName = $F[1];
$qStart = $F[2];
$qMatchLen = $F[3];
$qEnd = $qStart + $qMatchLen;
$qStrand = $F[4];
$qLen = $F[5];
$qSeq = $F[6];
if($GFAspec == 2){
if(($qSeq ne $tSeq) &&
(($qStart < $endLeniency) || ($tStart < $endLeniency) ||
($qEnd > ($qLen-$endLeniency)) || ($tEnd > ($tLen-$endLeniency)))){
print(join("\t",("E","${tName}+","${qName}${qStrand}",$tStart,$tEnd,
$qStart,$qEnd))."\n");
}
} else {
if($qSeq ne $tSeq){
if(($tEnd > ($tLen-$endLeniency)) && ($qStart < $endLeniency)){
print(join("\t",("L",$tName,"+",$qName,$qStrand,"*"))."\n");
}
if(($qEnd > ($qLen-$endLeniency)) && ($tStart < $endLeniency)){
print(join("\t",("L",$qName,$qStrand,$tName,"+","*"))."\n");
}
if(($qMatchLen / $qLen) >= $containFrac){
print(join("\t",("C",$tName,"+",$qName,$qStrand,"*"))."\n");
}
if(($tMatchLen / $tLen) >= $containFrac){
print(join("\t",("C",$qName,"+",$tName,$qStrand,"*"))."\n");
}
}
}
$lineBuffer = "";
} else {
$tName = $F[1];
$tStart = $F[2];
$tMatchLen = $F[3];
$tEnd = $tStart + $tMatchLen;
$tLen = $F[5];
$tSeq = $F[6];
}
}
}