-
Notifications
You must be signed in to change notification settings - Fork 8
/
generate_alphapulldown_scripts.sh
141 lines (116 loc) · 5.23 KB
/
generate_alphapulldown_scripts.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#!/bin/bash
# Generate PBS job scripts (MSA search, prediction, result gathering) and a
# step-by-step instruction file for an AlphaPulldown pulldown screen between
# the sequences of two FASTA files.
#
# Usage: $0 output_directory fasta_location_1 fasta_location_2 number_of_predictions_per_model

# Check if the correct number of arguments is provided
if [ "$#" -ne 4 ]; then
  # Usage/diagnostics belong on stderr so stdout stays clean.
  echo "Usage: $0 output_directory fasta_location_1 fasta_location_2 number_of_predictions_per_model" >&2
  exit 1
fi
# Fail early with a clear message if an input FASTA is missing/unreadable,
# instead of failing later inside realpath or grep.
for fasta in "$2" "$3"; do
  if [ ! -r "$fasta" ]; then
    echo "Error: fasta file '$fasta' does not exist or is not readable" >&2
    exit 1
  fi
done
# The per-model prediction count must be a positive integer.
case "$4" in
  ''|*[!0-9]*)
    echo "Error: number_of_predictions_per_model must be a positive integer (got '$4')" >&2
    exit 1
    ;;
esac
# Create the output directory if it doesn't exist
mkdir -p "$1"
# Resolve all paths to absolute ones (quoted, $(...) instead of backticks)
# so the generated job scripts work no matter where they are submitted from,
# including paths that contain spaces.
output_directory=$(realpath "$1")
fasta_location_1=$(realpath "$2")
fasta_location_2=$(realpath "$3")
pred_per_model=$4
# count_entries FILE
# Print the number of sequences in FILE, i.e. the number of header lines
# beginning with '>'. Prints 0 for a file with no headers.
count_entries() {
  awk '/^>/ { n++ } END { print n + 0 }' "$1"
}
# create_protein_list FASTA OUTFILE
# Write one line per FASTA header to OUTFILE: the leading '>' is stripped and
# the characters '|', '=', ' ', '#' and ';' are each replaced by '_' so the
# names are safe to use as AlphaPulldown protein identifiers.
create_protein_list() {
  awk '/^>/ { sub(/^>/, ""); gsub(/[|= #;]/, "_"); print }' "$1" > "$2"
}
# Sequence counts drive the PBS array sizes: the MSA step runs one task per
# sequence (sum), the prediction step one task per bait/candidate pair (product).
n_first=$(count_entries "$fasta_location_1")
n_second=$(count_entries "$fasta_location_2")
total_count=$((n_first + n_second))
total_preds=$((n_first * n_second))
# Write the sanitised protein-name lists consumed by run_multimer_jobs.py.
protein_list_1="${output_directory}/protein_list_1.txt"
protein_list_2="${output_directory}/protein_list_2.txt"
create_protein_list "$fasta_location_1" "$protein_list_1"
create_protein_list "$fasta_location_2" "$protein_list_2"
# Create MSA job script (CPU array job, one task per sequence).
# The here-doc expands $variables NOW, at generation time; '\$' escapes keep
# ALPHAFOLD_DATA_DIR and PBS_ARRAYID for expansion at job runtime.
# Paths are quoted inside the generated script so it survives spaces.
msa_job_script="${output_directory}/msa_job_script.sh"
cat <<EOT > "$msa_job_script"
#!/bin/bash
#PBS -N msa_pulldown
#PBS -l nodes=1:ppn=4
#PBS -l walltime=12:00:00
#PBS -l mem=64gb
module load AlphaPulldown/2.0.0b4-foss-2022a
export ALPHAFOLD_DATA_DIR=/arcanine/scratch/gent/apps/AlphaFold/20230310
cd "$output_directory" || exit 1
create_individual_features.py \
  --fasta_paths="$fasta_location_1,$fasta_location_2" \
  --data_dir=\$ALPHAFOLD_DATA_DIR \
  --save_msa_files=False \
  --output_dir="$output_directory/msas" \
  --use_precomputed_msas=False \
  --max_template_date=2050-01-01 \
  --skip_existing=True \
  --uniref30_database_path=/arcanine/scratch/gent/apps/AlphaFold/20230310/uniref30/UniRef30_2021_03 \
  --seq_index \${PBS_ARRAYID}
EOT
# All user-facing instructions accumulate in this file; it is printed at the end.
instr_file="${output_directory}/instructions.txt"
echo "" > "$instr_file"
# Print MSA job script command; -t1-$total_count runs one array task per sequence.
# Redirect targets are quoted: bash errors out on unquoted targets containing spaces.
echo "######################################" >> "$instr_file"
echo "##### STEP 1. MSA search #####" >> "$instr_file"
echo "######################################" >> "$instr_file"
echo ">>> To launch the MSA search, run: " >> "$instr_file"
echo " module swap cluster/doduo" >> "$instr_file"
echo " qsub -m ae $msa_job_script -t1-$total_count -o $output_directory/output_msa.log -e $output_directory/output_msa.log" >> "$instr_file"
echo ">>> Note that you can change the doduo cluster to any CPU cluster you wish" >> "$instr_file"
echo ">>> Wait until all jobs have been successfully completed - you will receive an email when finished." >> "$instr_file"
echo " " >> "$instr_file"
# Create the GPU job script (array job, one task per bait/candidate pair).
# $variables expand at generation time; '\$' escapes defer to job runtime.
# mem unit normalised to "gb" for consistency with the other job scripts.
# NOTE(review): the generated script sets EBROOTCOLABFOLD to
# $PBS_O_WORKDIR/<output_directory>, but output_directory is already an
# absolute path, so this concatenates two absolute paths -- confirm intent.
predict_job_script="${output_directory}/predict_job_script.sh"
cat <<EOT > "$predict_job_script"
#!/bin/bash
#PBS -N predict_pulldown
#PBS -l nodes=1:ppn=8,gpus=1
#PBS -l mem=64gb
#PBS -l walltime=12:00:00
module load AlphaPulldown/2.0.0b4-foss-2022a-CUDA-11.7.0
cd "$output_directory" || exit 1
export EBROOTCOLABFOLD=\$PBS_O_WORKDIR/$output_directory
export ALPHAFOLD_DATA_DIR=/arcanine/scratch/gent/apps/AlphaFold/20230310
run_multimer_jobs.py --mode=pulldown \
  --num_cycle=3 \
  --num_predictions_per_model=$pred_per_model \
  --output_path="$output_directory/preds/" \
  --data_dir=\$ALPHAFOLD_DATA_DIR \
  --protein_lists="$protein_list_1,$protein_list_2" \
  --monomer_objects_dir="$output_directory/msas/" \
  --compress_result_pickles=True \
  --remove_result_pickles=True \
  --job_index=\${PBS_ARRAYID}
EOT
# Step 2 instructions; -t1-$total_preds runs one array task per protein pair.
# Redirect targets quoted so paths with spaces don't break the script.
echo "######################################" >> "$instr_file"
echo "##### STEP 2. Prediction #####" >> "$instr_file"
echo "######################################" >> "$instr_file"
echo ">>> To launch the prediction search, run: " >> "$instr_file"
echo " module swap cluster/joltik" >> "$instr_file"
echo " qsub -m ae $predict_job_script -t1-$total_preds -o $output_directory/output_predict.log -e $output_directory/output_predict.log" >> "$instr_file"
echo ">>> Note that you can change the joltik cluster to any GPU cluster you wish" >> "$instr_file"
echo ">>> You will receive an email when finished." >> "$instr_file"
echo " " >> "$instr_file"
# Create the result-gathering job script (CPU, single task): runs the
# AlphaPulldown analysis container over the prediction output directory.
gather_results_script="${output_directory}/gather_results_script.sh"
cat <<EOT > "$gather_results_script"
#!/bin/bash
#PBS -N gather_results_pulldown
#PBS -l nodes=1:ppn=4
#PBS -l walltime=6:00:00
#PBS -l mem=16gb
cd "$output_directory" || exit 1
apptainer exec /arcanine/scratch/gent/apps/AlphaPulldown/alpha-analysis_jax_0.4.sif run_get_good_pae.sh --cutoff=10 --output_dir="$output_directory/preds"
EOT
# Step 3 instructions, then show the full instruction file to the user.
# Redirect targets quoted so paths with spaces don't break the script.
echo "######################################" >> "$instr_file"
echo "##### STEP 3. Results #####" >> "$instr_file"
echo "######################################" >> "$instr_file"
echo ">>> Finally, you can gather results in a csv by running" >> "$instr_file"
echo " module swap cluster/doduo" >> "$instr_file"
echo " qsub -m ae $gather_results_script -o $output_directory/output_gather.log -e $output_directory/output_gather.log" >> "$instr_file"
echo ">>> Note that you can adjust the cutoff parameter in the file to include lower quality interactions as well" >> "$instr_file"
echo ">>> Note that you can change the doduo cluster to any CPU cluster you wish" >> "$instr_file"
echo " " >> "$instr_file"
# NOTE(review): earlier steps write predictions into "$output_directory/preds";
# confirm that an "outputs" directory is actually produced as stated below.
echo ">>> Outputs can be found in $output_directory/outputs." >> "$instr_file"
echo "Instructions can be found at $instr_file as well"
cat "$instr_file"