forked from nextgenusfs/genome_scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
jgi2gbk.sh
executable file
·72 lines (66 loc) · 3.5 KB
/
jgi2gbk.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/bin/bash
# script to format JGI GFF and Fasta files to GBK
# Jon Palmer: nextgenusfs@gmail.com
#run test if no files given, print usage
if [ -z "$1" ]; then
echo "Run this script as follows:
$0 <GFF file> <Fasta file> <KOG tab> <Genome Name>
GFF file downloaded from JGI, Genome FASTA file, and KOG tab file"
exit
else
#check to see if all programs are installed and in path
command -v gsed >/dev/null 2>&1 || { echo "I require gsed but it's not installed. Aborting." >&2; exit 1; }
command -v gag.py >/dev/null 2>&1 || { echo "I require gag.py but it's not installed. Aborting." >&2; exit 1; }
command -v tbl2asn >/dev/null 2>&1 || { echo "I require tbl2asn but it's not installed. Aborting." >&2; exit 1; }
command -v blastp >/dev/null 2>&1 || { echo "I require blastp but it's not installed. Aborting." >&2; exit 1; }
command -v gsort >/dev/null 2>&1 || { echo "I require gsort but it's not installed. Aborting." >&2; exit 1; }
command -v gawk >/dev/null 2>&1 || { echo "I require gawk but it's not installed. Aborting." >&2; exit 1; }
#get directory of script to find genbank template
script_dir=$(command -v format_jgi_kog.sh | gsed 's,/format_jgi_kog.sh,,g')
#get current directory for writing results to
dir=$PWD
#make folder
mkdir -p $4
#get name of fasta headers from human input, get first 5 headers and print out.
header_count=$(grep -c "^>" $2)
header=$(grep "^>" $2 | head -n5 | gsed 's/>//g')
printf "\nThere are $header_count fasta files: first 5 headers look like this:\n\n$header\n\n"
printf "Do you need to edit headers? (you need to simplify complicated headers) [y/n]: "
read header_question
if [ $header_question == "n" ]; then
cp $2 $4/genome.fasta
gsed 's/jgi.p|//g' $1 | gsed 's/|/_/g' > $4/genome.gff
else
printf "Enter leading portion of header to remove (copy and paste works well): "
read header_trim
#now reformat fasta headers and save
gsed "s/>$header_trim/>/g" $2 > $4/genome.fasta
#print back new headers
new_header=$(grep "^>" $4/genome.fasta | head -n5 | gsed 's/>//g')
echo "New fasta headers look like this:
$new_header"
#now reformat GFF to match fasta headers
gsed "s/^$header_trim//g" $1 | gsed 's/jgi.p|//g' | gsed 's/|/_/g' > $4/genome.gff
fi
#now get annotations from KOG file, first get example gene name from gff
name=$(grep $'\tgene\t' $4/genome.gff | head -n1 | gsed 's/;/\t/g' | gsed 's/Name=//g' | cut -f10)
name_stem=${name/_*/}
#now need to make a 2 column "association" file
grep $'\tmRNA\t' $1 | cut -f9 | gsed 's/ID=//g' | gsed 's/;/\t/g' | gsed 's/proteinId=//g' | gawk -F"\t" -v n="$name_stem" '{ print n"_"$4"\t"$1;}' > $4/annotations.map
grep -v "#transcriptId" $3 | gawk -F"\t" -v n="$name_stem" '{ print n"_"$2"\t""product""\t"$4;}' > $4/annotations.tmp
gawk -F"\t" 'NR==FNR{a[$1]=$2;next}$1 in a{print a[$1]"\t"$2"\t"$3;}' $4/annotations.map $4/annotations.tmp > $4/annotations.txt
#try running command line gag
cd $dir/$4
echo "------------------------"
echo "Running GAG."
echo "------------------------"
gag.py -f genome.fasta -g genome.gff -a annotations.txt
cd $dir/$4/gag_output
cp genome.fasta genome.fsa
echo "------------------------"
echo "Running tbl2asn."
echo "------------------------"
tbl2asn -p . -t $script_dir/test.sbt -M n -Z discrep -a r10u -l paired-ends -j "[organism=$4]" -V b -c fx
cp genome.gbf $dir/$4.gbk
cd $dir
fi