-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #23 from ORNL/deception-gitlab-pipelines
Deception CI Status
- Loading branch information
Showing
6 changed files
with
373 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
trigger_deception: | ||
needs: [] | ||
trigger: | ||
include: | ||
- local: .gitlab/pnnl/base.gitlab-ci.yml | ||
- local: .gitlab/pnnl/deception.gitlab-ci.yml | ||
|
||
SVC-Account-Cleanup: | ||
image: kfox1111/slurm:deception2 | ||
tags: | ||
- k8s | ||
- ikp | ||
- exasgd | ||
- marianas | ||
resource_group: cleanup | ||
stage: .pre | ||
allow_failure: true | ||
variables: | ||
# Don't clone for cleanup job | ||
GIT_STRATEGY: none | ||
script: | ||
- export WORKDIR="$HOME/gitlab/" | ||
# clears directory of files more than 3 hours/180 minutes old | ||
- find $WORKDIR -type d -mindepth 1 -mmin +180 -prune -print -exec rm -rf {} \; || true | ||
- ls -hal $WORKDIR |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,253 @@ | ||
include: | ||
- remote: 'https://raw.githubusercontent.com/pnnl-miscscripts/gitlab-lib/v1/gitlab-lib.yaml' | ||
|
||
# https://stackoverflow.com/questions/67009007 | ||
# We have to use YAML anchors for rules here | ||
# Rules are also evaluated sequentially, so you need to be careful about | ||
# the order in which these rules are included in child jobs - list "never" rules before other rules | ||
.rules: | ||
rules: | ||
# -- | ||
# Rule to fall back on... | ||
- &default | ||
when: always | ||
# -- | ||
# Only run when the commit **DOES NOT** contains "[deception-test]" | ||
- &rule_no_deception_test | ||
if: '$CI_COMMIT_TITLE =~ /\[deception-test\]/' | ||
when: never | ||
# Only run when the commit **DOES** contains "[deception-test]" | ||
- &rule_yes_deception_test | ||
if: '$CI_COMMIT_TITLE =~ /\[deception-test\]/' | ||
when: always | ||
|
||
|
||
variables: | ||
GIT_SUBMODULE_STRATEGY: recursive | ||
KUBERNETES_HELPER_CPU_REQUEST: 100m | ||
KUBERNETES_HELPER_CPU_LIMIT: 500m | ||
KUBERNETES_HELPER_MEMORY_REQUEST: 512Mi | ||
KUBERNETES_HELPER_MEMORY_LIMIT: 512Mi | ||
|
||
stages: | ||
- build | ||
- test | ||
|
||
.non_hpc_job: | ||
needs: [] | ||
allow_failure: true | ||
when: always | ||
stage: test | ||
extends: .pnnl_nonhpc_tags | ||
variables: | ||
GIT_SUBMODULE_STRATEGY: none | ||
|
||
.cluster_build: | ||
needs: [] | ||
stage: build | ||
retry: 1 | ||
extends: | ||
- .pnnl_tags_template | ||
- .pnnl_script_template | ||
variables: | ||
OUTPUT_SUFFIX: "_build" | ||
SCRIPT_ARGS: " --build-only " | ||
|
||
.cluster_test: | ||
stage: test | ||
extends: | ||
- .pnnl_tags_template | ||
- .pnnl_script_template | ||
- .pnnl_after_script_template | ||
variables: | ||
# Don't clone for test jobs | ||
GIT_STRATEGY: none | ||
CTESTARGS: "--timeout 240 --output-on-failure" | ||
OUTPUT_SUFFIX: "_test" | ||
artifacts: | ||
when: always | ||
paths: | ||
- partition | ||
|
||
.pnnl_after_script_template: | ||
after_script: | ||
- | | ||
export WORKDIR="$HOME/gitlab/${CI_PIPELINE_ID}/${MY_CLUSTER}/" | ||
# Iterate over possible jobid named files (jobid_%J) | ||
job_ids="$WORKDIR/jobid_*" | ||
for job in $job_ids | ||
do | ||
if [[ -f "$job" ]]; then | ||
jobid=$(cat "$job") | ||
scancel $jobid | ||
fi | ||
done | ||
rm -rf $WORKDIR | ||
.pnnl_script_template: | ||
script: | ||
- | | ||
# pass --verbose to build.sh for verbose debugging | ||
# | ||
# NOTES: WORKDIR is on constance/deception/newell | ||
# ./ is only on the Kubernetes instance | ||
# | ||
export WORKDIR="$HOME/gitlab/${CI_PIPELINE_ID}/${WORKDIR_SUFFIX}" | ||
if [[ ! -d "$WORKDIR" ]]; then | ||
# if workdir already exists, we're in the testing job | ||
mkdir -p "$WORKDIR" | ||
cp -r . "$WORKDIR" | ||
fi | ||
pushd "$WORKDIR" | ||
if [[ $MY_CLUSTER = "deception" ]]; then | ||
export SLURM_Q="dl,dl_shared,dlv,a100_shared,a100_80_shared,fat_shared,dlt_shared" | ||
elif [[ $MY_CLUSTER = "incline" ]]; then | ||
export SLURM_Q="incline" | ||
else | ||
export SLURM_Q="newell8" | ||
fi | ||
# Unique output file for this stage | ||
output="output${OUTPUT_SUFFIX}" | ||
[ -f $output ] && rm $output | ||
touch $output | ||
tail -f $output & | ||
tailpid=$! | ||
# Set some directories used in the build script manually, as they | ||
# can be inconsistent in gitlab pipelines | ||
export srcdir=$WORKDIR builddir=$WORKDIR/build installdir=$WORKDIR/install | ||
# Export CTESTARGS defined in variables to run correct tests for stage | ||
echo $CTESTARGS being used as testing args | ||
export CTESTARGS=$CTESTARGS | ||
# jobid used in pnnl_after_script_template to cancel job if cancelled or | ||
# timed out by gitlab through the UI | ||
jobid=$(sbatch --export=ALL -A EXASGD $SLURM_ARGS -p $SLURM_Q -o $output -e $output -t 1:00:00 $WORKDIR/buildsystem/build.sh $SCRIPT_ARGS) | ||
export jobid=$(echo $jobid | cut -f4 -d' ') | ||
partition=$(squeue -j $jobid -h --format="%P") | ||
export partition=$(echo $partition | cut -f2 -d'=' | cut -f1 -d' ') | ||
popd | ||
echo "$partition" >> ./partition | ||
pushd $WORKDIR | ||
# Unique jobid filename for this job | ||
echo $jobid > "$WORKDIR/jobid_${jobid}" | ||
res=1 | ||
while :; | ||
do | ||
if [[ "$(awk 'BEGIN{i=0}/BUILD_STATUS/{i++}END{print i}' $output)" != "0" ]]; then | ||
kill $tailpid | ||
echo 'Last tail of build $output:' | ||
tail -n 200 $output | ||
res=$(grep BUILD_STATUS $output | tail -n 1 | cut -f2 -d':') | ||
break | ||
fi | ||
sleep 10 | ||
done | ||
popd | ||
echo "Finished batch job with exit code: $res" | ||
rm "$WORKDIR/jobid_${jobid}" | ||
exit $res | ||
.report-job: | ||
retry: 1 | ||
image: mrnonz/alpine-git-curl:alpine3.16 | ||
resource_group: status | ||
environment: | ||
name: reporting-gitlab | ||
variables: | ||
GIT_STRATEGY: none | ||
STATUS_PROJECT: ORNL/ReSolve | ||
STATUS_NAME: NotSet | ||
extends: .pnnl_nonhpc_tags | ||
|
||
.report-status: | ||
extends: .report-job | ||
script: | ||
- | | ||
set -x | ||
if [[ ! -e partition ]]; then | ||
echo "No partition file found" | ||
export part="none" | ||
else | ||
export part=$(cat partition) | ||
fi | ||
export newell_status="ppc64le/gcc@8.5/cuda@11.4/v100@70" | ||
export deception_status="x86_64/gcc@9.1/cuda@11.4" | ||
export incline_status="arm64/clang@15.0/rocm@5.3/MI100@gfx908" | ||
if [[ "$part" == *"newell"* ]]; then | ||
export STATUS_NAME=$newell_status | ||
elif [[ "$part" == *"incline"* ]]; then | ||
export STATUS_NAME="$incline_status" | ||
elif [[ "$part" == *"a100"* ]]; then | ||
export gpu_arch=a100@80 | ||
export STATUS_NAME="$deception_status/$gpu_arch" | ||
elif [[ "$part" == *"dl"* ]]; then | ||
gpu_arch=p100@60 | ||
export STATUS_NAME="$deception_status/$gpu_arch" | ||
elif [[ "$part" == *"dlv"* ]]; then | ||
gpu_arch=v100@70 | ||
export STATUS_NAME="$deception_status/$gpu_arch" | ||
elif [[ "$part" == *"dlt"* ]]; then | ||
gpu_arch=RTX2080@75 | ||
export STATUS_NAME="$deception_status/$gpu_arch" | ||
else | ||
echo "Unknown partition" | ||
export STATUS_NAME="Unknown Partition/Tests Skipped" | ||
fi | ||
curl -L \ | ||
-X POST \ | ||
-H @${GITHUB_CURL_HEADER}\ | ||
https://api.github.com/repos/${STATUS_PROJECT}/statuses/${CI_COMMIT_SHA} \ | ||
-d "{\"state\":\"${CI_JOB_NAME}\",\"target_url\":\"${CI_PIPELINE_URL}\",\"description\":\"${STATUS_NAME}\",\"context\":\"${MY_CLUSTER}\"}" | ||
.report-pending: | ||
extends: .report-job | ||
script: | ||
- | | ||
set -x | ||
curl -L \ | ||
-X POST \ | ||
-H @${GITHUB_CURL_HEADER}\ | ||
https://api.github.com/repos/${STATUS_PROJECT}/statuses/${CI_COMMIT_SHA} \ | ||
-d "{\"state\":\"${CI_JOB_NAME}\",\"target_url\":\"${CI_PIPELINE_URL}\",\"context\":\"${MY_CLUSTER}\"}" | ||
.pnnl_tags_template: | ||
# This image allows you to connect to SLURM | ||
image: kfox1111/slurm:deception2 | ||
tags: | ||
- k8s | ||
- ikp | ||
- exasgd | ||
- marianas | ||
|
||
.pnnl_nonhpc_tags: | ||
tags: | ||
- k8s | ||
- ikp | ||
- exasgd | ||
- basic | ||
|
||
.deception: | ||
rules: | ||
- *rule_yes_deception_test | ||
- *default | ||
variables: | ||
MY_CLUSTER: "deception" | ||
WORKDIR_SUFFIX: "x86_64-build" | ||
SLURM_ARGS: " --gres=gpu:1 --ntasks=3 " |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
Deception Build: | ||
extends: | ||
- .cluster_build | ||
- .deception | ||
variables: | ||
SCRIPT_ARGS: " --build-only " | ||
|
||
Deception Test: | ||
extends: | ||
- .cluster_test | ||
- .deception | ||
variables: | ||
CTESTARGS: " --timeout 240 --output-on-failure -LE deception-skip " | ||
SCRIPT_ARGS: " --test-only " | ||
needs: ['Deception Build'] | ||
|
||
pending: | ||
variables: | ||
MY_CLUSTER: "Deception" | ||
extends: | ||
- .report-pending | ||
stage: .pre | ||
|
||
success: | ||
variables: | ||
MY_CLUSTER: "Deception" | ||
extends: | ||
- .report-status | ||
stage: .post | ||
|
||
failure: | ||
stage: .post | ||
variables: | ||
MY_CLUSTER: "Deception" | ||
extends: | ||
- .report-status | ||
rules: | ||
- when: on_failure |
Oops, something went wrong.