-
Notifications
You must be signed in to change notification settings - Fork 1
/
get_data_sgd.sh
54 lines (41 loc) · 2.41 KB
/
get_data_sgd.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# Abort on the first command that exits with a non-zero status.
set -e
# Everything below is downloaded into data/sgd/ (relative to the caller's
# working directory).
cd data/sgd/

# Download the current S288C reference genome release and keep only its GFF.
# -f makes curl exit non-zero on an HTTP error (status >= 400) instead of
# saving the server's error page as sgd_genome.tgz.
# NOTE(review): -k turns off TLS certificate verification whenever -L follows
# a redirect to HTTPS -- confirm it is still needed.
curl -fkL http://sgd-archive.yeastgenome.org/sequence/S288C_reference/genome_releases/S288C_reference_genome_Current_Release.tgz -o sgd_genome.tgz
tar -xvzf sgd_genome.tgz -C .
rm sgd_genome.tgz
# The archive unpacks into a directory matching S288C*; pull out its
# compressed GFF and discard the rest of the release.
mv S288C*/*.gff.gz sgd_genome.gff.gz
rm -rf S288C*
# -f overwrites an sgd_genome.gff left over from a previous run.
gzip -fd sgd_genome.gff.gz
# Split into fasta and gff
perl -ne 'if ($found) { print; } elsif (m/##FASTA/) { $found = 1; }' sgd_genome.gff > genome_sequence.fsa
perl -ne 'print; last if /##FASTA/' sgd_genome.gff > features.gff
# Remove problematic ARS lines that are not used
grep -v 'SGD ARS' features.gff > features.gff.tmp
mv features.gff.tmp features.gff
# Use intermine to get the latest SGD alleles.
# TODO: include unique identifier
python get_sgd_alleles.py alleles_sgd_raw.tsv

# TODO: download the latest genome
# Convert the GFF to EMBL. Per the original note, this uses the emblmygff3
# docker image, and the translation*.json files drive the transformation.
bash convert_sgd_gff2embl.sh

# Download all previous protein sequences (see the
# pombase/all_previous_sgd_peptide_sequences repository) as well as the
# current protein sequences, to make the dictionary.
# -f makes curl exit non-zero on an HTTP error (status >= 400); without it the
# error body would be written to the output file and used as if it were data.
# NOTE(review): -k turns off TLS certificate verification for the GitHub
# download -- confirm it is still needed.
curl -fkL https://raw.githubusercontent.com/pombase/all_previous_sgd_peptide_sequences/master/all_previous_seqs.tsv -o all_previous_seqs.tsv
# -L follows redirects, matching the other sgd-archive download in this script.
curl -fL http://sgd-archive.yeastgenome.org/sequence/S288C_reference/orf_protein/orf_trans_all.fasta.gz -o current_protein_seqs.fasta.gz
# -f overwrites a current_protein_seqs.fasta left over from a previous run.
gzip -fd current_protein_seqs.fasta.gz

# Leave data/sgd/ again; the remaining commands use paths relative to the
# directory two levels up.
cd ../..
# Derive allele descriptions from each allele's name or description field.
# TODO: switch to a description field provided by SGD; the same applies to
# every later command that uses _description_name or description_semicolon.
python format_alleles_sgd.py

# Load the genome from the EMBL files into a pickle file.
sgd_dir=data/sgd
python load_genome.py \
  --output "${sgd_dir}/genome.pickle" \
  --config "${sgd_dir}/config.sgd.json" \
  "${sgd_dir}"/genome_embl_files/*.embl
# Remove unknown ids (not in gff), or pseudogene (YLL016W), no main feature (YJL018W)
# TODO: Check why these are missing
missing_genes="R0010W YSC0029 R0040C YLL016W YSC0032 YJL018W"

# drop_gene_lines FILE GENE...
# Rewrites FILE in place, dropping every line that contains any GENE as a
# literal substring. All genes are handled in a single grep pass.
# Returns non-zero, leaving FILE untouched, if grep or mv fails.
# NOTE(review): this is a substring match on the whole line, so a gene id
# appearing in any column removes the row -- confirm that is intended.
drop_gene_lines() {
  local target=$1
  shift
  # With nothing to remove, leave the file as it is.
  [ "$#" -gt 0 ] || return 0

  # grep accepts a newline-separated list as alternative patterns; -F matches
  # them as fixed strings rather than regular expressions.
  local patterns
  patterns=$(printf '%s\n' "$@")

  # grep exits 1 when it selects no lines (every row was dropped). That is a
  # valid result and must not abort the script under `set -e`; only a status
  # above 1 is a real error.
  local status=0
  grep -vF -e "$patterns" -- "$target" > "${target}.tmp" || status=$?
  if [ "$status" -gt 1 ]; then
    rm -f -- "${target}.tmp"
    return "$status"
  fi
  mv -- "${target}.tmp" "$target"
}

# $missing_genes is left unquoted on purpose so that it splits into one
# argument per gene id.
# shellcheck disable=SC2086
drop_gene_lines data/sgd/alleles_description_name.tsv $missing_genes
# shellcheck disable=SC2086
drop_gene_lines data/sgd/alleles_description_semicolon.tsv $missing_genes