-
Notifications
You must be signed in to change notification settings - Fork 2
/
gffcount
executable file
·73 lines (67 loc) · 2.26 KB
/
gffcount
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/bin/bash
show_usage() {
cat << EOF
Usage:
gffcount [options] [<gff3_file>]
Default operation is to show the number of distinct transcript IDs (or any
transcript-like features parenting exonic or coding sequence subfeatures)
in the input (expected at stdin when <gffr_file> is not provided)
Options (only use one at a time):
-l list the feature IDs having exonic or coding sequence sub-features
-c count features having CDS/codon subfeatures
-cl list feature IDs having CDS features
-g count distinct gene IDs only (feature type ending with 'gene')
-gl list distinct gene IDs (feature type ending with 'gene')
-il list all distinct IDs with their feature type
-pl list all Parent identifiers with their feature type
-r report transcript counts per reference sequence
-rg report gene counts per reference sequence
EOF
}
if [[ "$1" == "-h" || "$1" == "--help" ]]; then
show_usage
exit 1
fi
a=""
if [[ $1 =~ ^-.+ ]]; then
a="${1:1}" #remove first character (the dash)
shift
fi
if [[ -z "$a" ]]; then
perl -aF'\t' -ne 'if($F[2]=~m/(?:exon|CDS|utr)$/i && m/\bParent=([^;]+)/){$_=$1;chomp;print "$_\n"}' $1 | sort -u | wc -l
exit 0
fi
case $a in
l)
perl -aF'\t' -ne 'if($F[2]=~m/(?:exon|CDS|utr)$/i && m/\bParent=([^;]+)/){$_=$1;chomp;print "$_\n" if (++$n{$_})==1}' $1
;;
g)
perl -aF'\t' -ne 'chomp;@t=split(/\t/);print "$1\n" if $t[2]=~m/gene$/i && m/\bID=([^;]+)/' $1 | wc -l
;;
gl)
perl -aF'\t' -ne 'chomp;@t=split(/\t/);print "$1\n" if $t[2]=~m/gene$/i && m/\bID=([^;]+)/' $1
;;
c)
perl -aF'\t' -ne 'chomp;@t=split(/\t/);print "$1\n" if ($t[2] eq "CDS") && m/\bParent=([^;]+)/ && (++$n{$1})==1' $1 | wc -l
;;
cl)
perl -aF'\t' -ne 'chomp;@t=split(/\t/);print "$1\n" if ($t[2] eq "CDS") && m/\bParent=([^;]+)/ && (++$n{$1})==1' $1
;;
il)
perl -aF'\t' -ne 'chomp;@t=split(/\t/);print "$1\t$t[2]\n" if m/\bID=([^;]+)/ && (++$n{$1})==1' $1
;;
pl)
#assumes parent IDs are always found BEFORE their children reference them with Parent attribute
read -r -d '' script <<'EOF'
chomp; @t=split(/\t/);
$h{$1}=$t[2] if m/\bID=([^;]+)/;
print "$1\t$h{$1}\n" if m/\bParent=([^;]+)/ &&(++$n{$1})==1;
EOF
perl -ne "$script" $1
;;
*)
show_usage
echo "Error: -$a option not recognized!"
exit 1
;;
esac