-
Notifications
You must be signed in to change notification settings - Fork 3
/
PubGrep.sh
executable file
·397 lines (344 loc) · 13.3 KB
/
PubGrep.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
#!/bin/bash
#######################################
# Percent-encode a string so it can be used as a URL path segment.
# The RFC 3986 "unreserved" characters (A-Z a-z 0-9 - _ . ~) are copied
# unchanged; every other byte is written as %xx with lowercase hex digits.
# Globals:   REPLY (written) - receives the encoded string
# Arguments: $1 - string to encode
# Outputs:   the encoded string followed by a newline on stdout
#######################################
rawurlencode() {
  local string="${1}"
  local encoded=""
  local pos c o
  # Switch to the C locale for the duration of the call: the string is then
  # walked byte by byte (so multi-byte UTF-8 characters become one %xx per
  # byte) and the a-z / A-Z / 0-9 ranges below match ASCII only.
  local LC_ALL=C
  local strlen=${#string}
  for (( pos = 0; pos < strlen; pos++ )); do
    c=${string:pos:1}
    case "$c" in
      [-_.~a-zA-Z0-9])
        o="${c}"
        ;;
      *)
        # od reports the raw byte value, independent of how the shell's
        # printf would interpret a non-ASCII character.
        o="%$(printf '%s' "$c" | od -An -tx1 | tr -d ' \n')"
        ;;
    esac
    encoded+="${o}"
  done
  # printf instead of echo: an encoded value such as "-n" must be printed
  # literally and not be taken as an echo option.
  printf '%s\n' "${encoded}"
  REPLY="${encoded}"
}
# Program banner: name, version, purpose and citation hint, followed by one
# empty line. Printed with a single printf call, one argument per line.
printf '%s\n' \
  "---------------------------------------------------------------------" \
  "- PubGrep 0.3.2 -" \
  "- This Program tries to search CIDs from the Pubchem Database based -" \
  "- on a list of compounds given as Input. Afterwards it creates sdf -" \
  "- Files for each Compound given in an appropriate subdirectory. -" \
  "- If you are using this program extensively (like, a lot!) -" \
  "- for your Research, please consider citing 10.1039/D3RA01705B -" \
  "- MS, 2021-2023 -" \
  "---------------------------------------------------------------------" \
  ""
# Detect an xtb executable. to3="y" enables the 2D -> 3D conversion that the
# sdf output step runs through "$xtb"; it stays "n" when xtb is not available.
# The path must be checked for being non-empty: a bare `[ -e $xtb ]` with an
# empty value degenerates to `[ -e ]`, which is always true.
to3="n"
xtb=$(command -v xtb 2>/dev/null)
if [ -n "$xtb" ] && [ -x "$xtb" ]; then
  to3="y"
fi
# Option defaults; parse_args below overwrites them from the command line.
skip="n"          # "y": skip the identifier lookup phase
argcheck=y        # kept for compatibility; set to "n" once parsing is done
optstr="$*"
compounds=""      # the single positional argument (compound or list file)
compound_list=""
input="name"      # identifier type of the input
helper="n"        # "y": print the help text and quit
output="sdf"      # kind of data to produce
single=false

#######################################
# Parse the command line into the global option variables.
# Globals:   input, output, helper, skip, compounds, argcheck (written)
# Arguments: the command-line arguments of the script
# Returns:   0 on success; exits with status 1 on a usage error
#######################################
parse_args() {
  # Loop on the argument count rather than on "$1" being non-empty, so an
  # empty argument does not silently end the parse.
  while [ "$#" -gt 0 ]; do
    case "$1" in
      "--input")
        if [ "$#" -lt 2 ]; then
          echo "ERROR: --input requires a value."
          exit 1
        fi
        shift
        input=$1
        ;;
      "--output")
        if [ "$#" -lt 2 ]; then
          echo "ERROR: --output requires a value."
          exit 1
        fi
        shift
        output=$1
        ;;
      "--help")
        helper="y"
        ;;
      "--skip")
        # A plain flag: it takes no value, so nothing extra is shifted here
        # (an extra shift would swallow the argument that follows it).
        skip="y"
        ;;
      *)
        if [ -n "$compounds" ]; then
          echo "ERROR: Too many positional arguments."
          exit 1
        fi
        compounds=$1
        ;;
    esac
    shift
  done
  argcheck=n
}

parse_args "$@"
# --help was given: show the usage summary and stop before any work is done.
if [[ ${helper} == "y" ]]; then
  printf '%s\n' \
    "This Programm uses a commandline interface to determine the file" \
    "containing the Compound Data and the Input format." \
    "Uses a compound list or a single compound as input." \
    "Usage: PubGrep [compound/compound_list] [options]" \
    "Possible options are:" \
    "--input [name, cid, smiles, cas, inchi]: Determines the input format." \
    "--output [sdf, logP, list]: Determines the output data."
  exit
fi
# Decide how the positional argument is interpreted:
#   - it names a non-empty file -> list mode, the file is read line by line
#   - anything else             -> single compound mode, the argument itself
#                                  is written to the temporary list list.tmp
# All expansions are quoted: identifiers such as SMILES may contain
# characters ([, ], *) that the shell would otherwise treat as glob patterns.
if [ -z "$compounds" ]; then
  echo "No Compound list given or Compound List does not exist."
  echo "To get help, use --help."
  exit 1
fi
if [ ! -s "$compounds" ]; then
  echo "Single compound mode for $compounds."
  printf '%s\n' "$compounds" > list.tmp
  compound_list="list.tmp"
  single=true
else
  echo "Multiple compound mode, reading input from $compounds."
  compound_list=$compounds
fi
# Catches e.g. a directory given as list, or list.tmp not being writable.
if [ ! -f "$compound_list" ]; then
  echo "No Compound list given or Compound List does not exist."
  echo "To get help, use --help."
  exit 1
fi
# Connectivity check: request the CID list of compound 1, which must come
# back as the single value "1". Problems are recorded in the file "error"
# and end the run with a non-zero status.
rm -f error   # drop the diagnostics file of a previous run

echo "Testing Pubchem Server..."
# Plain GET (curl's default method); -s suppresses the progress meter.
tester=$(curl -s "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/1/cids/TXT" 2>/dev/null)
if [ -z "$tester" ]; then
  echo "No connection could be established. Check if you have access to the internet." | tee error
  exit 1
fi
# Quoted comparison: an error reply consists of several words/lines.
if [ "$tester" == "1" ]; then
  echo "Pubchem Server is working fine."
  echo ""
else
  echo "Some Problem occured. Check error message."
  printf '%s\n' "$tester" > error
  exit 1
fi
# ---------------------------------------------------------------------------
# Identifier lookup phase (not run when --skip was given).
# Reads one identifier per line from $compound_list, queries the PubChem
# PUG REST service and writes the tab-separated table found.results:
#   inchi, name : <input>       <CID>
#   cid         : <IUPAC name>  <CID>
#   smiles      : <IUPAC name>  <CID>  <SMILES>
#   cas         : <IUPAC name>  <CID>  <CAS>
# Identifiers that could not be resolved are appended to not_found.compound
# (inchi, name, cid), not_found.smiles or not_found.cas.
# Whitespace is removed from identifiers and names so that every column of
# found.results is a single whitespace-free token for the later steps.
# ---------------------------------------------------------------------------
pug_compound_url="https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound"

# Print the body returned by a GET request to URL $1; curl's own diagnostics
# are discarded.
pug_get() {
  curl -s "$1" 2>/dev/null
}

# Succeed when response text $1 carries a PUG REST error code
# (PUGREST.NotFound, PUGREST.BadRequest, ...).
pug_failed() {
  [[ "$1" == *"PUGREST."* ]]
}

# Print $1 with every run of whitespace (newlines included) collapsed to a
# single blank and without leading/trailing whitespace.
squeeze_ws() {
  local -a words=()
  # read returns non-zero at end of input; the array is filled regardless.
  read -r -d '' -a words <<< "$1"
  printf '%s' "${words[*]}"
}

if [ "$skip" != "y" ]; then
  case "$input" in
    inchi | name)
      rm -f cid.tmp not_found.compound
      : > found.compound
      : > found.cid
      # "|| [ -n ... ]" also processes a last line without trailing newline.
      while IFS= read -r line || [ -n "$line" ]; do
        line_nows=${line//[[:space:]]/}
        [ -n "$line_nows" ] || continue   # ignore blank lines
        if [ "$input" == "inchi" ]; then
          # InChI strings go into the POST body; --data-urlencode keeps
          # characters such as '+' from being decoded as blanks.
          cid=$(curl -s --data-urlencode "inchi=$line_nows" \
            "$pug_compound_url/inchi/cids/TXT" 2>/dev/null)
        else
          url_encoded=$(rawurlencode "$line_nows")
          cid=$(pug_get "$pug_compound_url/name/$url_encoded/cids/TXT")
        fi
        if pug_failed "$cid"; then
          echo "Compound: ${line_nows}, CID not found, check your Input."
          printf '%s\n' "$line_nows" >> not_found.compound
        else
          cid=$(squeeze_ws "$cid")
          echo "Compound: ${line_nows}, CID:${cid:+ $cid}"
          printf '%s\n' "$line_nows" >> found.compound
          printf '%s\n' "$cid" >> found.cid
        fi
      done < "$compound_list"
      paste found.compound found.cid > found.results 2> error
      rm -f found.compound found.cid
      ;;
    cid)
      rm -f name.tmp not_found.compound
      : > found.compound
      : > found.cid
      while IFS= read -r line || [ -n "$line" ]; do
        line_nows=${line//[[:space:]]/}
        [ -n "$line_nows" ] || continue
        name=$(pug_get "$pug_compound_url/cid/$line_nows/property/IUPACname/TXT")
        if pug_failed "$name"; then
          echo "Compound: ${line_nows}, Name not found, check your Input."
          printf '%s\n' "$line_nows" >> not_found.compound
        else
          name_flat=$(squeeze_ws "$name")
          echo "Compound: ${line_nows}, Name:${name_flat:+ $name_flat}"
          printf '%s\n' "$line_nows" >> found.cid
          printf '%s\n' "${name//[[:space:]]/}" >> found.compound
        fi
      done < "$compound_list"
      paste found.compound found.cid > found.results
      rm -f found.compound found.cid
      ;;
    smile | smiles)
      rm -f name.tmp cid.tmp not_found.compound not_found.smiles
      : > found.compound
      : > found.cid
      : > found.smiles
      while IFS= read -r line || [ -n "$line" ]; do
        # Pure parameter expansion: SMILES may contain backslashes, which
        # must not be interpreted as escape sequences.
        line_nows=${line//[[:space:]]/}
        [ -n "$line_nows" ] || continue
        url_encoded=$(rawurlencode "$line_nows")
        name=$(pug_get "$pug_compound_url/smiles/$url_encoded/property/IUPACname/TXT")
        cid=$(pug_get "$pug_compound_url/smiles/$url_encoded/cids/TXT")
        if pug_failed "$name" || pug_failed "$cid"; then
          echo "Compound: ${line_nows}, CID not found, check your Input."
          printf '%s\n' "$line_nows" >> not_found.smiles
        else
          name_flat=$(squeeze_ws "$name")
          cid=$(squeeze_ws "$cid")
          echo "Compound: ${line_nows}, Name: ${name_flat}, CID:${cid}"
          printf '%s\n' "$cid" >> found.cid
          # Stored without whitespace, like in the cid and cas modes, so a
          # name containing blanks cannot shift the CID column.
          printf '%s\n' "${name//[[:space:]]/}" >> found.compound
          printf '%s\n' "$line_nows" >> found.smiles
        fi
      done < "$compound_list"
      paste found.compound found.cid found.smiles > found.results 2> error
      rm -f found.compound found.cid found.smiles
      ;;
    cas | regid)
      rm -f name.tmp cid.tmp not_found.compound not_found.cas
      : > found.compound
      : > found.cid
      : > found.cas
      echo "------------------------------------------------------------------------------------------------------"
      echo " CAS Inputs may lead to Problems while searching th PubChem Database. "
      echo " You may consider using the standard input (name) insted! "
      echo " If an CAS is not found as a Registry ID, it will be searched for as a Registry Number. "
      echo " Note however, that there may be Problems with this approach, like duplicate entrys. "
      echo " It is therefore recommendend to first use a list output, and use the resulting CIDs as the new input."
      echo "------------------------------------------------------------------------------------------------------"
      while IFS= read -r line || [ -n "$line" ]; do
        line_nows=${line//[[:space:]]/}
        [ -n "$line_nows" ] || continue
        url_encoded=$(rawurlencode "$line_nows")
        name=$(pug_get "$pug_compound_url/xref/RegistryID/$url_encoded/property/IUPACname/TXT")
        cid=$(pug_get "$pug_compound_url/xref/RegistryID/$url_encoded/cids/TXT")
        if pug_failed "$name"; then
          # Second attempt: look the number up as a Registry Number (RN).
          echo "CAS Number ,${line_nows}, was not found as a Registry ID. Trying to get ,${line_nows}, as a Registry Number."
          name=$(pug_get "$pug_compound_url/xref/RN/$url_encoded/property/IUPACname/TXT")
          cid=$(pug_get "$pug_compound_url/xref/RN/$url_encoded/cids/TXT")
        fi
        if pug_failed "$name" || pug_failed "$cid"; then
          echo "Compound: ${line_nows}, CID not found, check your Input."
          printf '%s\n' "$line_nows" >> not_found.cas
          echo ""
        else
          name_flat=$(squeeze_ws "$name")
          cid=$(squeeze_ws "$cid")
          echo "Compound: ${line_nows}, Name: ${name_flat}, CID:${cid}"
          echo ""
          printf '%s\n' "$cid" >> found.cid
          printf '%s\n' "${name//[[:space:]]/}" >> found.compound
          printf '%s\n' "$line_nows" >> found.cas
        fi
      done < "$compound_list"
      paste found.compound found.cid found.cas > found.results 2> error
      rm -f found.compound found.cid found.cas
      ;;
    *)
      echo "Input Format not known or not supported."
      exit 1
      ;;
  esac
fi
# ---------------------------------------------------------------------------
# Output phase: turn the table found.results into the requested data.
#   sdf  : one <cid>.sdf per compound (3D record, else 2D record which is
#          converted with xtb when available), plus the files pubchem_data
#          and iupac. Single compound mode writes into the current
#          directory, list mode into pubchem_compounds/<cid>/.
#   logP : appends "<compound> <cid> <XlogP>" lines to pubchem_logP.data
#   list : nothing beyond found.results
# ---------------------------------------------------------------------------
pug_cid_url="https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid"

# Terminate with status $1 (default 0); in single compound mode the temporary
# list list.tmp is removed first.
leave() {
  if [ "$single" = true ]; then
    rm -f list.tmp
  fi
  exit "${1:-0}"
}

# Print $1 with every run of whitespace collapsed to one blank.
flatten_ws() {
  local -a words=()
  # read returns non-zero at end of input; the array is filled regardless.
  read -r -d '' -a words <<< "$1"
  printf '%s' "${words[*]}"
}

# Split one found.results line ($1) into the globals compound (1st column)
# and cid (2nd column, or the 1st one when the line has a single column).
parse_result_line() {
  read -r compound cid _ <<< "$1"
  if [ -z "$cid" ]; then
    cid=$compound
  fi
}

#######################################
# Download the structure of one compound into the current directory.
# Globals:   pug_cid_url, to3, xtb (read)
# Arguments: $1 - compound label, $2 - PubChem CID
# Outputs:   files <cid>.sdf, pubchem_data, iupac (and xtb_3d.out when xtb
#            is run); progress messages on stdout
#######################################
fetch_sdf() {
  local compound=$1 cid=$2
  # The URL is quoted: '?' would otherwise be a glob character.
  curl -s "$pug_cid_url/$cid/sdf?record_type=3d" > "$cid.sdf" 2>/dev/null
  if grep -qF -- "PUGREST.NotFound" "$cid.sdf"; then
    echo "No 3D Conformer Data found for $compound"
    echo "Retrieving 2D Conformer Data instead."
    curl -s "$pug_cid_url/$cid/sdf" > "$cid.sdf" 2>/dev/null
    if [ "$to3" == "y" ] && [ -n "$xtb" ]; then
      echo "Using xTB for an attempt to convert the 2D structure to 3D."
      "$xtb" "$cid.sdf" --gfn 2 --sp --ceasefiles > xtb_3d.out
      if grep -q "converted geometry written to" xtb_3d.out; then
        echo "3D conversion successfull."
        # -f: not every one of these by-products exists after each run.
        rm -f convert.log mdrestart xtbmdok xtb.trj
        mv gfnff_convert.sdf "$cid.sdf"
      fi
    fi
  fi
  echo "$compound $cid" > pubchem_data
  echo "$compound" > iupac
}

case "$output" in
  sdf)
    if [ ! -s found.results ]; then
      echo "No compounds found, can't search for geometries."
      leave 1
    fi
    if [ "$single" = true ]; then
      while IFS= read -r line || [ -n "$line" ]; do
        parse_result_line "$line"
        [ -n "$cid" ] || continue   # blank line
        fetch_sdf "$compound" "$cid"
      done < found.results
      echo "Done!"
      leave 0
    fi
    mkdir -p pubchem_compounds
    # The directory is not left again afterwards; the script ends shortly
    # after this step.
    if ! pushd pubchem_compounds > /dev/null 2>&1; then
      echo "ERROR: Cannot enter directory pubchem_compounds."
      leave 1
    fi
    while IFS= read -r line || [ -n "$line" ]; do
      parse_result_line "$line"
      [ -n "$cid" ] || continue
      if [ ! -d "$cid" ]; then
        echo "$cid"
        mkdir "$cid"
        pushd "$cid" > /dev/null 2>&1 || continue
        fetch_sdf "$compound" "$cid"
        popd > /dev/null
      else
        echo "$compound  already exists."
      fi
    done < ../found.results
    ;;
  logp | logP)
    if [ ! -s found.results ]; then
      echo "No compounds found, can't retrieve log P data."
      leave 1
    fi
    echo "Retrieving log P data:"
    while IFS= read -r line || [ -n "$line" ]; do
      parse_result_line "$line"
      [ -n "$cid" ] || continue
      logP=$(flatten_ws "$(curl -s "$pug_cid_url/$cid/property/XlogP/txt" 2>/dev/null)")
      entry="$compound $cid${logP:+ $logP}"
      printf '%s\n' "$entry"
      printf '%s\n' "$entry" >> pubchem_logP.data
    done < found.results
    ;;
  list)
    # Must be a real comparison: `[ $output="list" ]` is a one-word test
    # that is always true and made the branch below unreachable.
    echo "Only List output choosen. No additional data will be created."
    ;;
  *)
    echo "Output Format not known."
    leave 1
    ;;
esac
# Remove the temporary list of single compound mode. Only a run with
# single=true created list.tmp itself; in list mode a file of that name in
# the working directory belongs to the user and must not be deleted.
cleanup_tmp_list() {
  if [ "$single" = true ] && [ -e list.tmp ]; then
    rm -f list.tmp
  fi
}

cleanup_tmp_list
echo "Done!"