#!/bin/bash
# bootstrap_ukb_cat.sh
set -e -u
###############################################################################
#                                HOW TO USE                                   #
#                                                                             #
# Please adjust every variable marked with a "FIXME" comment to match your    #
# filesystem, data, and software container.                                   #
# Depending on which job scheduling system you use, comment out or remove     #
# the setup for the irrelevant system.                                        #
# More information about this script can be found in the README.              #
#                                                                             #
###############################################################################
# Jobs are set up to not require a shared filesystem (except for the lockfile)
# ------------------------------------------------------------------------------
# FIXME: Supply a RIA URL to a RIA store that will collect all outputs, and a
# RIA URL to a different RIA store from which the dataset will be cloned.
# Both RIA stores will be created if they do not yet exist.
output_store="ria+file:///data/group/psyinf/forrestoutput"
input_store="ria+file:///data/group/psyinf/forrestinput"
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
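# (RIA URLs are not limited to local paths; a store on a remote server can
# also be addressed via SSH, e.g. with a hypothetical host and path:
# output_store="ria+ssh://user@example.com:/path/to/store")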
# ------------------------------------------------------------------------------
# FIXME: Supply the name of the container you have registered (see README for
# info), a path or URL to the place where your container dataset is located,
# and a path or URL to the place where an input (super)dataset exists.
containername='cat'
containerstore="ria+http://containers.ds.inm7.de"
data="https://github.com/psychoinformatics-de/studyforrest-data-structural.git"
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#-------------------------------------------------------------------------------
# FIXME: Replace this name with a dataset name of your choice.
source_ds="cat"
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# Create a source dataset with all analysis components as an analysis access
# point.
datalad create -c yoda "$source_ds"
cd "$source_ds"
# clone the container dataset as a subdataset. Please see the README for
# instructions on how to create a container dataset.
datalad clone -d . "${containerstore}#~${containername}" code/pipeline
# Register the container in the top-level dataset.
#-------------------------------------------------------------------------------
# FIXME: If necessary, configure your own container call in the --call-fmt
# argument. If your container does not need a custom call format, remove the
# --call-fmt flag and its options below.
datalad containers-add \
--call-fmt 'singularity exec -B {{pwd}} --cleanenv {img} {cmd}' \
-i code/pipeline/.datalad/environments/${containername}/image \
$containername
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# amend the previous commit with a nicer commit message
git commit --amend -m 'Register pipeline dataset'
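# (optional sanity check: `datalad containers-list` from the datalad-container
# extension should now report the container registered above)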
# import custom code
# ------------------------------------------------------------------------------
# FIXME: If you need custom scripts, copy them into the analysis source
# dataset. If you don't need custom scripts, remove the copy and commit
# operations below. (The scripts below are only relevant for CAT processing)
cp /data/group/psyinf/ukb_workflow_template/finalize_job_outputs.sh code
datalad save -m "Import script to tune the CAT outputs for storage"
git commit --no-edit --amend --author "Małgorzata Wierzba <gosia.wierzba@gmail.com>"
cp /data/group/psyinf/ukb_workflow_template/code_cat_standalone_batchUKB.txt code/cat_standalone_batch.txt
datalad save -m "Import desired CAT batch configuration"
git commit --no-edit --amend --author "Felix Hoffstaedter <f.hoffstaedter@fz-juelich.de>"
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# create dedicated input and output locations. Results will be pushed to the
# output sibling, and each analysis job will start with a clone from the input
# sibling.
datalad create-sibling-ria --new-store-ok -s output "${output_store}"
pushremote=$(git remote get-url --push output)
datalad create-sibling-ria --new-store-ok -s input --storage-sibling off "${input_store}"
# register the input dataset
datalad clone -d . "${data}" inputs/data
# amend the previous commit with a nicer commit message
git commit --amend -m 'Register input data dataset as a subdataset'
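# (optional sanity check: `datalad subdatasets` should now list both
# code/pipeline and inputs/data as registered subdatasets)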
# the actual compute job specification
cat > code/participant_job << "EOT"
#!/bin/bash
# the job assumes that it is a good idea to run everything in PWD
# the job manager should make sure that is true
# fail whenever something is fishy, use -x to get verbose logfiles
set -e -u -x
dssource="$1"
pushgitremote="$2"
subid="$3"
# get the analysis dataset, which includes the inputs as well
# importantly, we do not clone from the location that we want to push the
# results to, in order to avoid too many jobs blocking access to the same
# location and creating a throughput bottleneck
datalad clone "${dssource}" ds
# all following actions are performed in the context of the superdataset
cd ds
# in order to avoid the accumulation of temporary git-annex availability
# information, and to avoid a synchronization bottleneck from having to
# consolidate the git-annex branch across jobs, we will only push the main
# tracking branch back to the output store (plus the actual file content).
# Final availability information can be established via an eventual
# `git annex fsck -f output-storage`.
# this remote is never fetched; it accumulates a large number of branches,
# and we want to avoid progressive slowdown. Instead we only ever push
# a unique branch per job (with a subject AND process specific name)
git remote add outputstore "$pushgitremote"
# all results of this job will be put into a dedicated branch
git checkout -b "job-$JOBID"
# we pull down the input subject manually in order to discover relevant
# files. We do this outside the recorded call, because on a potential
# re-run we want to be able to do fine-grained recomputing of individual
# outputs. The recorded calls will have specific paths that will enable
# recomputation outside the scope of the original Condor setup
datalad get -n "inputs/data/${subid}"
# ------------------------------------------------------------------------------
# FIXME: Replace the datalad containers-run command starting below with a
# command that fits your analysis. (Note that the command for CAT processing
# is quite complex and involves separate scripts; if you are not using CAT
# processing, remove everything up to and including the trailing \; )
# the meat of the matter
# look for T1w files in the input data for the given participant
# it is critical for reproducibility that the command given to
# `containers-run` does not rely on any property of the immediate
# computational environment (env vars, services, etc)
find \
inputs/data/${subid} \
-name '*T1w.nii.gz' \
-exec sh -c '
odir=$(echo {} | cut -d / -f3-4);
datalad -c datalad.annex.retry=12 containers-run \
-m "Compute $odir" \
-n cat \
--explicit \
-o $odir \
-i {} \
-i code/cat_standalone_batch.txt \
-i code/finalize_job_outputs.sh \
sh -e -u -x -c "
rm -rf {outputs[0]}/tmp ;
mkdir -p {outputs[0]}/tmp \
&& cp {inputs[0]} {outputs[0]}/tmp \
&& gzip -d {outputs[0]}/tmp/* \
&& /singularity -b {inputs[1]} {outputs[0]}/tmp/*.nii \
&& bash {inputs[2]} {inputs[0]} {outputs[0]}/tmp {outputs[0]} \
&& rm -rf {outputs[0]}/tmp \
" \
' \;
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# it may be that the above command did not yield any outputs and no commit
# was made (i.e., no T1w images were found for the given participant). We
# nevertheless push the branch to have a record that this was attempted
# and did not fail
# file content first -- does not need a lock, no interaction with Git
datalad push --to output-storage
# and the output branch
flock --verbose "$DSLOCKFILE" git push outputstore
echo SUCCESS
# job handler should clean up workspace
EOT
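# for reference, the scheduler invokes this script with three positional
# arguments; an interactive test call could look like this (hypothetical
# subject ID):
#   code/participant_job "${input_store}#<dataset-id>" "${pushremote}" sub-01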
chmod +x code/participant_job
datalad save -m "Participant compute job implementation"
mkdir logs
echo logs >> .gitignore
###############################################################################
# HTCONDOR SETUP START - FIXME remove or adjust this according to your needs.
###############################################################################
# HTCondor compute setup
# the workspace is to be ignored by git
echo dag_tmp >> .gitignore
echo .condor_datalad_lock >> .gitignore
# define the identity for git commits (taken from the local user configuration)
git_name="$(git config user.name)"
git_email="$(git config user.email)"
# compute environment for a single job
#-------------------------------------------------------------------------------
# FIXME: Adjust job requirements to your needs
cat > code/process.condor_submit << EOT
universe = vanilla
# resource requirements for each job
request_cpus = 1
request_memory = 4G
request_disk = 5G
# tell condor that a job is self contained and the executable
# is enough to bootstrap the computation on the execute node
should_transfer_files = yes
# explicitly do not transfer anything back
# we are using datalad for everything that matters
transfer_output_files = ""
# the actual job script, nothing condor-specific in it
executable = \$ENV(PWD)/code/participant_job
# the job expects these environment variables for labeling and synchronization
# - JOBID: subject AND process specific ID to make a branch name from
# (must be unique across all (even multiple) submissions)
# including the cluster ID will enable sorting multiple computing attempts
# - DSLOCKFILE: lock (must be accessible from all compute jobs) to synchronize
# write access to the output dataset
# - DATALAD_GET_SUBDATASET__SOURCE__CANDIDATE__...:
# (additional) locations for datalad to locate relevant subdatasets, in case
# a configured URL is outdated
# - GIT_AUTHOR_...: Identity information used to save dataset changes in compute
# jobs
environment = "\\
JOBID=\$(subject).\$(Cluster) \\
DSLOCKFILE=\$ENV(PWD)/.condor_datalad_lock \\
DATALAD_GET_SUBDATASET__SOURCE__CANDIDATE__101cat='${containerstore}#{id}' \\
GIT_AUTHOR_NAME='${git_name}' \\
GIT_AUTHOR_EMAIL='${git_email}' \\
"
# place the job logs into PWD/logs, using the same name as for the result branches
# (JOBID)
log = \$ENV(PWD)/logs/\$(subject)_\$(Cluster).log
output = \$ENV(PWD)/logs/\$(subject)_\$(Cluster).out
error = \$ENV(PWD)/logs/\$(subject)_\$(Cluster).err
# essential args for 'participant_job'
# 1: where to clone the analysis dataset
# 2: location to push the result git branch to. The 'ria+' prefix is stripped.
# 3: ID of the subject to process
arguments = "\\
${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset) \\
${pushremote} \\
sub-\$(subject) \\
"
queue
EOT
# ------------------------------------------------------------------------------
# FIXME: Adjust the find command below to return the unit over which your
# analysis should parallelize. Here, subject directories on the first hierarchy
# level in the input data are returned by searching for the 'sub-*' prefix.
# The setup below creates an HTCondor DAG.
# ------------------------------------------------------------------------------
# processing graph specification for computing all jobs
cat > code/process.condor_dag << "EOT"
# Processing DAG
EOT
for s in $(find inputs/data -maxdepth 1 -type d -name 'sub-*' -printf '%f\n'); do
s=${s:4}
printf "JOB sub-$s code/process.condor_submit\nVARS sub-$s subject=\"$s\"\n" >> code/process.condor_dag
done
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
datalad save -m "HTCondor submission setup" code/ .gitignore
################################################################################
# HTCONDOR SETUP END
################################################################################
################################################################################
# SLURM SETUP START - FIXME remove or adjust to your needs
################################################################################
echo .SLURM_datalad_lock >> .gitignore
cat > code/runJOB.sh << "EOT"
#!/bin/bash
#
# split the all.jobs file for distribution across nodes;
# numbered files [1 -> splits] are created in PWD and deleted after submission
JOBFILE=code/all.jobs
splits=FIXME
# the JOBFILE, containing one job per line, is split into separate files
# numbered 1 to [splits], each including the header, and written into PWD
parallel -j${splits} --block -1 -a $JOBFILE --header : --pipepart 'cat > {#}'
# submit independent SLURM jobs for efficiency and robustness:
# parallel SLURM submission, each job receiving one of the [1..splits] jobfiles
parallel 'sbatch code/process.sbatch {}' ::: $(seq ${splits})
EOT
chmod +x code/runJOB.sh
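# usage sketch: after replacing the 'splits' FIXME above, run this helper from
# the dataset root to split the job list and submit all SLURM jobs:
#   ./code/runJOB.sh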
# SLURM compute environment
# make sure that the jobs per node do not exceed the RAM and wall clock limits!
cat > code/process.sbatch << "EOT"
#!/bin/bash -x
### If you need a compute time project for job submission, set it here
#SBATCH --account=FIXME
#SBATCH --mail-user=FIXME
#SBATCH --mail-type=END
#SBATCH --job-name=FIXME
#SBATCH --output=logs/processing-out.%j
#SBATCH --error=logs/processing-err.%j
### If there is a time limit for job runs, set the maximum here
#SBATCH --time=24:00:00
### If specific partitions are available (e.g., with more RAM), define here
#SBATCH --partition=FIXME
#SBATCH --nodes=1
### Define the number of jobs run simultaneously (-j): 125 is good with a RAM disk
srun parallel --delay 0.2 -a "$1" -j FIXME
rm "$1"
wait
EOT
# create the call file with the commands to run for each job
# each subject is processed in its own dataset on a temporary/local store
# define local storage for writing temporary files, both for performance and
# to save inodes
# FIXME: Adjust the temporary_store variable to point to a temporary/scratch
# location on your system
temporary_store=/dev/shm/
cat > code/call.job << EOT
#!/bin/bash -x
#
# the subject ID is passed in as the single positional argument
subid=\$1
# define DSLOCKFILE, DATALAD & GIT ENV for participant_job
export DSLOCKFILE=$(pwd)/.SLURM_datalad_lock \
       DATALAD_GET_SUBDATASET__SOURCE__CANDIDATE__101cat=${containerstore}#{id} \
       GIT_AUTHOR_NAME=\$(git config user.name) \
       GIT_AUTHOR_EMAIL=\$(git config user.email) \
       JOBID=\${subid:4}.\${SLURM_JOB_ID}
# use a subject-specific folder to clone the analysis dataset into
mkdir ${temporary_store}/\${JOBID}
cd ${temporary_store}/\${JOBID}
# run things
$(pwd)/code/participant_job \
${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset) \
$(git remote get-url --push output) \
\${subid} \
>$(pwd)/logs/\${JOBID}.out \
2>$(pwd)/logs/\${JOBID}.err
# wipe the job-specific data from the local store after completion; this is
# especially important when using a RAM disk
cd ${temporary_store}/
chmod -R 777 ${temporary_store}/\${JOBID}
rm -rf ${temporary_store}/\${JOBID}
EOT
chmod +x code/call.job
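# (a single job can be tested interactively before bulk submission, e.g. with
# a hypothetical subject directory:  ./code/call.job sub-01 )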
# create the job file with the commands for all jobs
cat > code/all.jobs << "EOT"
#!/bin/bash
EOT
# ------------------------------------------------------------------------------
# FIXME: Adjust the find command below to return the unit over which your
# analysis should parallelize. Here, subject directories on the first hierarchy
# level in the input data are returned by searching for the 'sub-*' prefix.
#
for s in $(find inputs/data -maxdepth 1 -type d -name 'sub-*' -printf '%f\n'); do
printf "code/call.job $s\n" >> code/all.jobs
done
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
chmod +x code/all.jobs
datalad save -m "SLURM submission setup" code/ .gitignore
################################################################################
# SLURM SETUP END
################################################################################
# cleanup: we have generated the job definitions, so we do not need to keep a
# massive input dataset around. Having it around wastes resources and makes
# many git operations needlessly slow
datalad uninstall -r --nocheck inputs/data
# make sure the fully configured output dataset is available from the
# designated stores: jobs clone from the input store and push results to the
# output store
datalad push --to input
datalad push --to output
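# NOTE: once all jobs have finished, the per-job result branches still need to
# be merged in a clone of the output dataset, and the file availability info
# re-established; a hedged sketch (clone location and merge strategy may need
# adjustment):
#   datalad clone "${output_store}#<dataset-id>" merge_ds && cd merge_ds
#   git merge -m "Merge results" $(git branch -al | grep 'job-' | tr -d ' ')
#   git annex fsck --fast -f output-storage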
# if we get here, we are happy
echo SUCCESS