-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcalculateCoverageAndGaps2.pl
executable file
·87 lines (67 loc) · 2.18 KB
/
calculateCoverageAndGaps2.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/perl
# script to calculate average coverage of genes or contigs from file with depth of coverage at every position (except at the positions with 0 depth of coverage; output of samtools depth);
# it replaces Mads Albertsen's calculateCoverageAndGaps.pl, which miscalculated the covered length
# takes 2 inputs: - the fasta file with genes or contigs
# - the depth file
# 1 output is written to standard out: a tab-separated table with the gene or contig IDs, lengths, average coverage and covered length
# written by Anna Heintz-Buschart (April 2015)
use strict;
use warnings;
use Scalar::Util qw(looks_like_number);

# Open an input file for reading and return the filehandle.
#   $path - file name, or '-' for standard input (which the former
#           two-argument open also accepted).
# Dies with the file name and the OS error when the file cannot be opened.
# The three-argument open keeps a file name from being interpreted as an
# open mode or a shell pipe.
sub open_input {
    my ($path) = @_;

    return \*STDIN if $path eq '-';

    open my $fh, '<', $path
        or die "Cannot read file: $path: $!\n";
    return $fh;
}

# Read FASTA records from $fh and measure the length of every sequence.
# Returns two references:
#   - an array of the sequence IDs in the order they appear in the input
#   - a hash mapping each ID to its sequence length
# The ID is the first whitespace-delimited word after '>'.  A header that is
# not followed by any sequence is registered with length 0.
# Dies on a duplicate ID or on a header that carries no ID.
sub read_reference_lengths {
    my ($fh) = @_;

    my @order;
    my %length_of;
    my $id;

    while (my $line = <$fh>) {
        # Removes "\n" as well as "\r\n", so files with Windows line endings
        # do not get one extra character counted per sequence line.
        $line =~ s/\s+\z//;
        next if $line eq '';

        if ($line =~ /\A>(\S*)/) {
            $id = $1;
            die "FASTA header without an ID at line $.\n" if $id eq '';
            die "$id already exists\n" if exists $length_of{$id};

            # Register the ID right away so that header-only records are
            # still known (and reported) later on.
            $length_of{$id} = 0;
            push @order, $id;
        }
        elsif (defined $id) {
            # Only the length is needed; the sequence itself is not stored.
            $length_of{$id} += length $line;
        }
        # Sequence text in front of the first header belongs to no record
        # and is ignored.
    }

    return (\@order, \%length_of);
}

# Read tab-separated depth records (sequence ID, position, depth) from $fh.
# Returns two hash references keyed by sequence ID:
#   - the sum of the depth values (third column)
#   - the number of records with a depth greater than 0
# Columns after the third are ignored.  Blank lines are skipped, and so is a
# first line starting with '#' (the column header of "samtools depth -H").
# Dies on a record with fewer than three columns or a non-numeric depth.
sub read_depth_totals {
    my ($fh) = @_;

    my %depth_sum_of;
    my %covered_of;

    while (my $line = <$fh>) {
        $line =~ s/\s+\z//;
        next if $line eq '';
        next if $. == 1 && $line =~ /\A#/;

        my ($id, undef, $depth) = split /\t/, $line;
        die "Malformed depth record at line $.: $line\n"
            unless defined $depth && looks_like_number($depth);

        $depth_sum_of{$id} += $depth;

        # Rows with depth 0 (written by "samtools depth -a") are positions
        # that are not covered, so they must not add to the covered length.
        $covered_of{$id} += $depth > 0 ? 1 : 0;
    }

    return (\%depth_sum_of, \%covered_of);
}

# Print the coverage table to the filehandle $out.
#   $order        - array ref: sequence IDs in output order
#   $length_of    - hash ref: ID => reference length
#   $depth_sum_of - hash ref: ID => summed depth
#   $covered_of   - hash ref: ID => number of covered positions
# Sequences without any depth record get average coverage 0 and covered
# length 0.  Sequences of length 0 are reported on STDERR and left out,
# because no average can be computed for them.
sub print_coverage_table {
    my ($out, $order, $length_of, $depth_sum_of, $covered_of) = @_;

    print {$out}
        "SequenceID\tReference.length\tAverage.coverage\tCovered.length\n";

    for my $id (@{$order}) {
        my $ref_length = $length_of->{$id};
        if ($ref_length == 0) {
            print STDERR "ERROR: $id has no sequence; skipped\n";
            next;
        }

        my $average = 0;
        my $covered = 0;
        if (exists $depth_sum_of->{$id}) {
            # Averaged over the whole reference, not just the covered part.
            $average = sprintf '%.3f', $depth_sum_of->{$id} / $ref_length;
            $covered = $covered_of->{$id};
        }

        print {$out} join("\t", $id, $ref_length, $average, $covered), "\n";
    }

    return;
}

# Entry point.
#   $fasta_path - FASTA file with the genes or contigs
#   $depth_path - depth file (output of samtools depth)
# Writes the tab-separated coverage table to standard output.
sub main {
    my ($fasta_path, $depth_path) = @_;

    die "Usage: $0 <contigs.fasta> <depth.tsv>\n"
        unless defined $fasta_path && defined $depth_path;

    my $fasta_fh = open_input($fasta_path);
    my ($order, $length_of) = read_reference_lengths($fasta_fh);
    close $fasta_fh;

    my $depth_fh = open_input($depth_path);
    my ($depth_sum_of, $covered_of) = read_depth_totals($depth_fh);
    close $depth_fh;

    print_coverage_table(\*STDOUT, $order, $length_of, $depth_sum_of,
        $covered_of);

    return;
}

# Run only when executed as a script; when the file is loaded with
# require/do, the subroutines are defined without being run.
main(@ARGV) unless caller;

1;