-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgitlab-lib.yaml
253 lines (223 loc) · 8.23 KB
/
gitlab-lib.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
include:
- remote: "https://raw.githubusercontent.com/pnnl-miscscripts/gitlab-lib/v1/gitlab-lib.yaml"
.pnnl_default_vars:
variables:
# TODO - have stage that verifies `test.sh` is "correct":
# - uses /bin/bash not anything else
# - has the correct shebang without `-xve`
# - has the correct exit/cleanup scripts w/ return statement
TEST_SCRIPT: ./scripts/test.sh
GIT_SUBMODULE_STRATEGY: none
SLURM_Q: shared,short
NTASKS: 3
TIME_LIMIT: 1:00:00
SLURM_ACCT: ops
SLURM_ARGS: " --exclusive "
MY_CLUSTER: deception
WORKDIR_SUFFIX: $MY_CLUSTER
OUTPUT_SUFFIX: $WORKDIR_SUFFIX
.pnnl_after_script_template:
extends: .pnnl_default_vars
after_script:
- |
export WORKDIR="$HOME/gitlab/${CI_PIPELINE_ID}/${WORKDIR_SUFFIX}/"
# Iterate over possible jobid named files (jobid_%J)
job_ids="$WORKDIR/jobid_*"
for job in $job_ids
do
if [[ -f "$job" ]]; then
jobid=$(cat "$job")
scancel $jobid
fi
done
rm -rf $WORKDIR
.pnnl_cluster_test:
extends:
- .pnnl_after_script_template
- .pnnl_default_vars
script:
- |
# Run TEST_SCRIPT in a SLURM job on the HPC cluster
#
# NOTES: WORKDIR is on constance/deception/newell
# ./ is only on the Kubernetes instance
#
set -xv
export WORKDIR="$HOME/gitlab/${CI_PIPELINE_ID}/${WORKDIR_SUFFIX}"
if [[ ! -d "$WORKDIR" ]]; then
# if workdir already exists, we're in the testing job
mkdir -p "$WORKDIR"
cp -r . "$WORKDIR"
fi
cd "$WORKDIR"
# Unique output file for this stage
output="output${OUTPUT_SUFFIX}"
[ -f $output ] && rm $output
touch $output
tail -f $output &
tailpid=$!
# jobid used in pnnl_after_script_template to cancel job if cancelled or
# timed out by gitlab through the UI
# TODO - maybe fix the permissions on this outside of pipeline
chmod +x $TEST_SCRIPT
STBATCH_CMD="sbatch --export==ALL -A ${SLURM_ACCT} ${SLURM_ARGS} --ntasks=${NTASKS} -p ${SLURM_Q} -o ${output} -e ${output} -t ${TIME_LIMIT} ${TEST_SCRIPT}"
echo "Using sbatch command: " $SBATCH_CMD
jobid=$(${SBATCH_CMD})
export jobid=$(echo $jobid | cut -f4 -d' ')
# Unique jobid filename for this job
echo $jobid > "$WORKDIR/jobid_${jobid}"
res=1
while :;
do
if [[ "$(awk 'BEGIN{i=0}/BUILD_STATUS/{i++}END{print i}' $output)" != "0" ]]; then
kill $tailpid
echo 'Last tail of build $output:'
tail -n 200 $output
# Original line below --- res=$(grep BUILD_STATUS $output | tail -n 1 | cut -f2 -d':')
grep BUILD_STATUS $output
res=$(grep BUILD_STATUS $output | tail -n 1 | cut -f2 -d':')
# res=$(grep BUILD_STATUS $output | tail -n 5 )
break
fi
sleep 10
done
echo "Finished batch job with exit code: $res"
rm "$WORKDIR/jobid_${jobid}"
set +xv
exit $res
.pnnl_cleanup_template:
resource_group: cleanup
allow_failure: true
image: ubuntu:22.04
stage: .pre
variables:
# No need to clone for cleanup jobs
GIT_STRATEGY: none
MIN_THRESHOLD: 360
script:
- export WORKDIR="$HOME/gitlab/"
# clears directory of files more than MIN_THRESHOLD minutes old
- find $WORKDIR -type d -mindepth 1 -mmin +$MIN_THRESHOLD -prune -print -exec rm -rf {} \; || true
- ls -hal $WORKDIR
# --- Spack TCL module utilities ---
.spack_tcl_module:
extends: .pnnl_cluster_test
variables:
TEST_SCRIPT: ./spack_script.sh
# This has to be a valid branch/remote ref - raw commit hash could work but would need to refactor
SPACK_VERSION: develop
SPACK_UPDATE: "false"
SPACK_INSTALL: /rcfs/scratch/$USER/spack
SPACK_YAML: ./spack.yaml
NUM_CORES: 64
USER: svcmlops
FORCE_CLEAN: "false"
SPACK_CLEAN: "false"
HPC_PYTHON_MODULE: miniconda23.5.2
COMPILER: gcc
COMPILER_VERSION: 9.1.0
VERBOSE_OUTPUT: "false"
VERBOSE_SPACK: "false"
before_script:
- |
# Create the shell script to be ran in SLURM job
# Adding this script to heredoc to run in SLURM job
cat << 'EOF' > spack_script.sh
#!/bin/bash
exit() {
# Clear all trap handlers so this isn't echo'ed multiple times, potentially
# throwing off the CI script watching for this output
trap - `seq 1 31`
# If called without an argument, assume not an error
local ec=${1:-0}
# Echo the snippet the CI script is looking for
echo BUILD_STATUS:${ec}
# Actually exit with that code, although it won't matter in most cases, as CI
# is only looking for the string 'BUILD_STATUS:N'
builtin exit ${ec}
}
# This will be the catch-all trap handler after arguments are parsed.
cleanup() {
# Clear all trap handlers
trap - `seq 1 31`
# When 'trap' is invoked, each signal handler will be a curried version of
# this function which has the first argument bound to the signal it's catching
local sig=$1
echo
echo Exit code $2 caught in build script triggered by signal ${sig}.
echo
exit $2
}
# If verbose output, toggle xv
if [[ $VERBOSE_OUTPUT == "true" ]]; then
set -xv
fi
. /etc/bashrc
groups
module purge
module load git
git --version
# Configure proxy...
# export HTTPS_PROXY=http://proxy01.pnl.gov:3128
# export https_proxy=http://proxy01.pnl.gov:3128
# This is debugging/useful for cleanup
# If there is no `.git` folder in directory or no setup-spack.sh script or we are force cleaning with FORCE_CLEAN,
# force clean before clone
if [[ ! -d $SPACK_INSTALL/.git ]] || [[ -f $SPACK_INSTALL/share/spack/setup-spack.sh ]] || [[ $FORCE_CLEAN == "true" ]]; then
rm -rfd $SPACK_INSTALL &&
# Need to change this if spack-tmp is ever changed...
rm -rfd $SPACK_INSTALL/../spack-tmp &&
rm -rfd ~/.spack &&
git clone --depth=1 -c feature.manyFiles=true https://github.com/spack/spack.git $SPACK_INSTALL
fi &&
pushd $SPACK_INSTALL &&
# If we are updating (SPACK_UPDATE == true)
if [[ $SPACK_UPDATE == "true" ]]; then
# Fetch SPACK_VERSION's latest changes and reset branch to that
git fetch origin $SPACK_VERSION &&
git reset --hard origin/$SPACK_VERSION &&
fi &&
# Attempt to set spack temp directories to make things easier for users...
export tempdir=$SPACK_INSTALL/../spack-tmp &&
export user_cache_path=$tempdir &&
export TMP=$tempdir &&
export TMPDIR=$tempdir &&
export SPACK_DISABLE_LOCAL_CONFIG=1 &&
export SPACK_USER_CACHE_PATH=$user_cache_path &&
# Configure spack's python for use
module load python/$HPC_PYTHON_MODULE > /dev/null 2>&1 &&
# If miniconda at start of python version, load extra shell script for conda
if [[ $HPC_PYTHON_MODULE == "miniconda"* ]]; then
source /share/apps/python/$HPC_PYTHON_MODULE/etc/profile.d/conda.sh
fi &&
export SPACK_PYTHON=$(which python3) &&
# If verbose spack, set flag to `--debug`
if [[ $VERBOSE_SPACK == "true" ]]; then
export SPACK_DEBUG="--debug"
fi &&
source ./share/spack/setup-env.sh &&
popd &&
spack -V &&
spack $SPACK_DEBUG env create -d $(pwd)/tmp-env &&
cp $SPACK_YAML $(pwd)/tmp-env/spack.yaml &&
# Configure compilers...
module load $COMPILER/$COMPILER_VERSION &&
spack $SPACK_DEBUG -e tmp-env compiler find &&
# Print config for debugging
spack -e tmp-env config get config &&
# Build modules
spack $SPACK_DEBUG -e tmp-env concretize -f &&
spack $SPACK_DEBUG -e tmp-env install -j $NUM_CORES --fail-fast &&
spack $SPACK_DEBUG -e tmp-env module tcl refresh -y
ret_code=$?
# Cleanup spack
if [[ $SPACK_CLEAN == "true" ]]; then
spack $SPACK_DEBUG -e tmp-env clean -abm
fi
chmod -R ugo+wrx $SPACK_INSTALL > /dev/null 2>&1
chmod -R ugo+wrx $SPACK_USER_CACHE_PATH > /dev/null 2>&1
if [[ $VERBOSE_OUTPUT == "true" ]]; then
set +xv
fi
exit $ret_code
EOF