-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathsimulate_gtf_for_fusion_circRNA.pl
135 lines (134 loc) · 4.93 KB
/
simulate_gtf_for_fusion_circRNA.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/perl -w
use strict;
die "Usage: $0 \"input_gtf\" \"output\" \"\(optional\)Nr_fusion_transcripts==10\" \"\(optional\)min_Exons==5\" \"\(optional\)minExonLen==100\" " if (@ARGV < 2);
my $gtf=$ARGV[0];
my $fileout=$ARGV[1];
my $NrFT=10;
if (scalar(@ARGV) > 2) { $NrFT=$ARGV[2]; }
my $min_Exs=5;
if (scalar(@ARGV) > 3) { $min_Exs=$ARGV[3]; }
my $minExonLen=100;
if (scalar(@ARGV) > 4) { $minExonLen=$ARGV[4]; }
my %uniq;
my %biotype;
my %Gname;
my %ExonCnt;
open IN,$gtf;
my @Gid;
my %minExLen;
my $gcnt=0;
while(<IN>) {
chomp;
my @a=split("\t",$_);
# MT protein_coding exon 3307 4262 . + . gene_id "ENSG00000198888"; transcript_id "ENST00000361390"; exon_number "1"; gene_name "MT-ND1"; gene_biotype "protein_coding"; transcript_name "MT-ND1-201";
if (($a[2] eq "exon") and ($a[0]!~m/NT/) and ($a[0]!~m/Y/)){
my @b=split(/\"/,$a[8]);
my $Nr=scalar(@b);
my $gene_id="";
my $transcript_id="";
#my $exon_number="";
my $gene_name="";
my $gene_biotype=$a[5];
for(my $i=0; $i<$Nr; $i=$i+2){
$b[$i]=~s/\s//g; $b[$i]=~s/\;//g;
if ($b[$i] eq "gene_id") {$gene_id=$b[$i+1];}
elsif ($b[$i] eq "transcript_id") {$transcript_id=$b[$i+1];}
elsif ($b[$i] eq "gene_name") {$gene_name=$b[$i+1];}
elsif ($b[$i] eq "gene_biotype") {$gene_biotype=$b[$i+1];}
}
if ($gene_id=~m/^NR_/) { next; }
if ($gene_name eq "") {$gene_name=$gene_id;}
if ($transcript_id eq "") {$transcript_id=$gene_id;}
my $id=$gene_id."\t".$transcript_id."\t".$a[0]."\t".$a[6];
if (!exists $uniq{$id}) {
$biotype{$id}=$gene_biotype;
$Gname{$id}=$gene_name;
$Gid[$gcnt]=$id;
$gcnt++;
$minExLen{$id}=$a[4]-$a[3];
}
else {
my $tmplen=$a[4]-$a[3];
if ($tmplen < $minExLen{$id}) { $minExLen{$id}=$tmplen; }
}
$uniq{$id}{$a[3]}=$a[4];
$ExonCnt{$id}++;
}
}
close IN;
open(OUT, ">".$fileout);
my %circ;
for (my $i=1; $i<=$NrFT; $i++) {
my $left_gid=int(rand($gcnt-1));
while (($ExonCnt{$Gid[$left_gid]} < $min_Exs) or ($minExLen{$Gid[$left_gid]} <= $minExonLen)) { $left_gid=int(rand($gcnt-1)); }
my @aL=split("\t",$Gid[$left_gid]); # Gid Tid chr strand
my $right_gid=int(rand($gcnt-1));
my @aR=split("\t",$Gid[$right_gid]); # Gid Tid chr strand
while (($left_gid eq $right_gid) or ($ExonCnt{$Gid[$right_gid]} < $min_Exs) or ($aL[2] eq $aR[2]) or ($minExLen{$Gid[$right_gid]} <= $minExonLen)) {
$right_gid=int(rand($gcnt-1));
@aR=split("\t",$Gid[$right_gid]);
}
my $left_Ex1=2+int(rand($ExonCnt{$Gid[$left_gid]}-3));
my $left_Ex2=2+int(rand($ExonCnt{$Gid[$left_gid]}-3));
while ($left_Ex1 > $left_Ex2) {
$left_Ex1=2+int(rand($ExonCnt{$Gid[$left_gid]}-3));
$left_Ex2=2+int(rand($ExonCnt{$Gid[$left_gid]}-3));
}
my $right_Ex1=2+int(rand($ExonCnt{$Gid[$right_gid]}-3));
my $right_Ex2=2+int(rand($ExonCnt{$Gid[$right_gid]}-3));
while ($right_Ex1 > $right_Ex2) {
$right_Ex1=2+int(rand($ExonCnt{$Gid[$right_gid]}-3));
$right_Ex2=2+int(rand($ExonCnt{$Gid[$right_gid]}-3));
}
my $circ_id0=join("_",$aL[1],$left_Ex1,$left_Ex2,$aR[1],$right_Ex1,$right_Ex2);
my $circ_id1=join("_",$aR[1],$right_Ex1,$right_Ex2,$aL[1],$left_Ex1,$left_Ex2);
if ((exists $circ{$circ_id0}) or (exists $circ{$circ_id1})) {
$i--;
next;
}
$circ{$circ_id0}=1;
$circ{$circ_id1}=1;
my @StartL;
my @EndL;
my $cntL=0;
foreach my $pos (sort{$a <=> $b} keys %{$uniq{$Gid[$left_gid]}}) {
$StartL[$cntL]=$pos;
$EndL[$cntL]=$uniq{$Gid[$left_gid]}{$pos};
$cntL++;
}
my @StartR;
my @EndR;
my $cntR=0;
foreach my $pos (sort{$a <=> $b} keys %{$uniq{$Gid[$right_gid]}}) {
$StartR[$cntR]=$pos;
$EndR[$cntR]=$uniq{$Gid[$right_gid]}{$pos};
$cntR++;
}
my $circ_id=join("__",$aL[1],$left_Ex1,$left_Ex2,$aR[1],$right_Ex1,$right_Ex2,$aL[1]);
my $exoncnt=1;
my $tmp_gtf="";
for(my $j=$left_Ex1; $j<=$left_Ex2; $j++) {
my $tmp_info=join(";","gene_id \"".$circ_id."\"", " transcript_id \"".$circ_id."\""," exon_number \"".$exoncnt."\""," gene_name \"".$circ_id."\"");
if ($tmp_gtf eq "") {
$tmp_gtf=join("\t",$aL[2],"fusion","exon",$StartL[$j],$EndL[$j],".",$aL[3],".",$tmp_info);
}
else {
$tmp_gtf=$tmp_gtf."\n".join("\t",$aL[2],"fusion","exon",$StartL[$j],$EndL[$j],".",$aL[3],".",$tmp_info);
}
$exoncnt++;
}
print OUT $tmp_gtf,"\n";
$circ_id=join("__",$aL[1],$left_Ex1,$left_Ex2,$aR[1],$right_Ex1,$right_Ex2,$aR[1]);
$tmp_gtf="";
for(my $j=$right_Ex1; $j<=$right_Ex2; $j++) {
my $tmp_info=join(";","gene_id \"".$circ_id."\"", " transcript_id \"".$circ_id."\""," exon_number \"".$exoncnt."\""," gene_name \"".$circ_id."\"");
if ($tmp_gtf eq "") {
$tmp_gtf=join("\t",$aR[2],"fusion","exon",$StartR[$j],$EndR[$j],".",$aR[3],".",$tmp_info);
}
else {
$tmp_gtf=$tmp_gtf."\n".join("\t",$aR[2],"fusion","exon",$StartR[$j],$EndR[$j],".",$aR[3],".",$tmp_info);
}
$exoncnt++;
}
print OUT $tmp_gtf,"\n";
}