Skip to content

Commit

Permalink
Merge pull request #23 from ORNL/deception-gitlab-pipelines
Browse files Browse the repository at this point in the history
Deception CI Status
  • Loading branch information
ryandanehy authored Oct 25, 2023
2 parents 395267b + f47c97e commit 00b5cb1
Show file tree
Hide file tree
Showing 6 changed files with 373 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ jobs:
# Give actions access to some secrets
with:
GIT_PUSH_ARGS: '--tags --force --push-option=ci.skip'
REMOTE: 'https://gitlab.pnnl.gov/exasgd/frameworks/exago-github-mirror'
REMOTE: 'https://gitlab.pnnl.gov/exasgd/resolve-mirror'
GIT_USERNAME: ${{ secrets.GIT_USER }}
GIT_PASSWORD: ${{ secrets.GIT_PASSWORD }}
GIT_PASSWORD: ${{ secrets.PNNL_GIT_PASSWORD }}

# Trigger CI pipeline since it was skipped in the above push
- name: Trigger Pipeline
run: curl -X POST -F token=${{ secrets.PNNL_PIPELINE_TRIGGER }} -F ref=${{ steps.extract_branch.outputs.branch }} https://gitlab.pnnl.gov/api/v4/projects/1662/trigger/pipeline
14 changes: 13 additions & 1 deletion .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,19 @@ stages:
- &rule_yes_ascent_test
if: '$CI_COMMIT_TITLE =~ /\[ascent-test\]/'
when: always
# --
# Only run when the commit **DOES NOT** contain "[deception-rebuild]"
- &rule_no_deception_rebuild
if: '$CI_COMMIT_TITLE =~ /\[deception-rebuild\]/'
when: never
# Only run when the commit **DOES NOT** contain "[deception-test]"
- &rule_no_deception_test
if: '$CI_COMMIT_TITLE =~ /\[deception-test\]/'
when: never
# Only run when the commit **DOES** contain "[deception-test]"
- &rule_yes_deception_test
if: '$CI_COMMIT_TITLE =~ /\[deception-test\]/'
when: always

# Ascent Variables
.ornl_environment_template:
Expand Down Expand Up @@ -127,4 +140,3 @@ failure:
- .report-status
rules:
- when: on_failure

25 changes: 25 additions & 0 deletions .gitlab/pnnl/.gitlab-ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Launches a child pipeline assembled from the PNNL base templates plus the
# Deception job definitions.
trigger_deception:
  trigger:
    include:
      - local: '.gitlab/pnnl/base.gitlab-ci.yml'
      - local: '.gitlab/pnnl/deception.gitlab-ci.yml'
  # Empty needs list: this job does not wait on any other job.
  needs: []

# Housekeeping job: prunes stale per-pipeline working directories that CI jobs
# leave under $HOME/gitlab/. Failures here never block the pipeline.
SVC-Account-Cleanup:
  image: kfox1111/slurm:deception2
  tags:
    - k8s
    - ikp
    - exasgd
    - marianas
  # Serialise cleanup runs so two pipelines never delete concurrently.
  resource_group: cleanup
  stage: .pre
  allow_failure: true
  variables:
    # Don't clone for cleanup job
    GIT_STRATEGY: none
  script:
    - export WORKDIR="$HOME/gitlab/"
    # Remove directories older than 3 hours (180 minutes).
    # -mindepth is a global find option, so it is listed before the tests
    # (-type, -mmin); GNU find warns when it follows them. -prune stops find
    # from descending into a directory that is about to be deleted.
    - find "$WORKDIR" -mindepth 1 -type d -mmin +180 -prune -print -exec rm -rf {} \; || true
    - ls -hal "$WORKDIR"
253 changes: 253 additions & 0 deletions .gitlab/pnnl/base.gitlab-ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,253 @@
# Pull in the external gitlab-lib definitions (the URL pins the v1 ref).
include:
  - remote: "https://raw.githubusercontent.com/pnnl-miscscripts/gitlab-lib/v1/gitlab-lib.yaml"

# https://stackoverflow.com/questions/67009007
# We have to use YAML anchors for rules here
# Rules are also evaluated sequentially, so you need to be careful about
# the order in which these rules are included in child jobs - list "never" rules before other rules
# Hidden job that exists only to hold reusable rule anchors. Jobs reference the
# anchors with aliases (e.g. *default) inside their own `rules:` list.
.rules:
  rules:
    # Fallback rule: always run the job.
    - &default
      when: always
    # Never run the job when the commit title contains "[deception-test]".
    - &rule_no_deception_test
      when: never
      if: '$CI_COMMIT_TITLE =~ /\[deception-test\]/'
    # Always run the job when the commit title contains "[deception-test]".
    - &rule_yes_deception_test
      when: always
      if: '$CI_COMMIT_TITLE =~ /\[deception-test\]/'


# Pipeline-wide defaults; individual jobs may override them.
variables:
  # CPU and memory request/limit values for the Kubernetes helper container.
  KUBERNETES_HELPER_CPU_REQUEST: '100m'
  KUBERNETES_HELPER_CPU_LIMIT: '500m'
  KUBERNETES_HELPER_MEMORY_REQUEST: '512Mi'
  KUBERNETES_HELPER_MEMORY_LIMIT: '512Mi'
  # Fetch git submodules recursively with every clone.
  GIT_SUBMODULE_STRATEGY: 'recursive'

# Stage order: everything is built first, then tested.
stages: [build, test]

# Template for test-stage jobs that run on the non-HPC runners. Such jobs are
# allowed to fail and do not wait on any other job.
.non_hpc_job:
  extends: .pnnl_nonhpc_tags
  stage: test
  when: always
  allow_failure: true
  needs: []
  variables:
    # Submodules are not fetched for these jobs.
    GIT_SUBMODULE_STRATEGY: 'none'

# Template for build-stage jobs submitted to a cluster through the shared
# script template. Retried once on failure.
.cluster_build:
  extends:
    - .pnnl_tags_template
    - .pnnl_script_template
  stage: build
  needs: []
  retry: 1
  variables:
    # Passed to buildsystem/build.sh by the script template.
    SCRIPT_ARGS: ' --build-only '
    # Appended to the name of the batch output file ("output_build").
    OUTPUT_SUFFIX: '_build'

# Template for test-stage jobs submitted to a cluster. Uses the shared script
# template and adds the after_script that cancels leftover batch jobs.
.cluster_test:
  extends:
    - .pnnl_tags_template
    - .pnnl_script_template
    - .pnnl_after_script_template
  stage: test
  variables:
    # Don't clone for test jobs
    GIT_STRATEGY: 'none'
    # Appended to the name of the batch output file ("output_test").
    OUTPUT_SUFFIX: '_test'
    CTESTARGS: '--timeout 240 --output-on-failure'
  artifacts:
    # Always upload the `partition` file written by the script template,
    # whether the job passed or failed.
    paths:
      - partition
    when: always

# after_script shared by cluster jobs: cancels any Slurm job whose jobid file
# is still present, then removes the pipeline's working directory.
.pnnl_after_script_template:
  after_script:
    - |
      # Must be the same directory .pnnl_script_template works in. That
      # template keys WORKDIR on WORKDIR_SUFFIX (not MY_CLUSTER) and writes its
      # jobid_<id> files there, so the same suffix is used here; otherwise the
      # loop below never finds a jobid file and nothing is cleaned up.
      export WORKDIR="$HOME/gitlab/${CI_PIPELINE_ID}/${WORKDIR_SUFFIX}"
      # Iterate over possible jobid named files (jobid_%J)
      for job in "$WORKDIR"/jobid_*
      do
        # When nothing matches, the glob stays literal and -f rejects it.
        if [[ -f "$job" ]]; then
          jobid=$(cat "$job")
          scancel $jobid
        fi
      done
      rm -rf "$WORKDIR"
# Main script shared by cluster build/test jobs. It stages the checkout into a
# per-pipeline WORKDIR, submits buildsystem/build.sh through sbatch, streams
# the batch output, and exits with the status reported on the BUILD_STATUS
# line of that output.
#
# Variables read: CI_PIPELINE_ID, WORKDIR_SUFFIX, MY_CLUSTER, OUTPUT_SUFFIX,
# CTESTARGS, SLURM_ARGS, SCRIPT_ARGS.
.pnnl_script_template:
  script:
    - |
      # pass --verbose to build.sh for verbose debugging
      #
      # NOTES: WORKDIR is on constance/deception/newell
      #        ./ is only on the Kubernetes instance
      #
      export WORKDIR="$HOME/gitlab/${CI_PIPELINE_ID}/${WORKDIR_SUFFIX}"
      # If WORKDIR already exists we are in the testing job and reuse it;
      # otherwise create it and copy the current checkout into it.
      if [[ ! -d "$WORKDIR" ]]; then
        mkdir -p "$WORKDIR"
        cp -r . "$WORKDIR"
      fi
      pushd "$WORKDIR"
      # Pick the Slurm partition list for the target cluster.
      if [[ $MY_CLUSTER = "deception" ]]; then
        export SLURM_Q="dl,dl_shared,dlv,a100_shared,a100_80_shared,fat_shared,dlt_shared"
      elif [[ $MY_CLUSTER = "incline" ]]; then
        export SLURM_Q="incline"
      else
        export SLURM_Q="newell8"
      fi
      # Unique output file for this stage; start from an empty file.
      output="output${OUTPUT_SUFFIX}"
      rm -f "$output"
      touch $output
      # Stream the batch output into the CI log while the job runs.
      tail -f $output &
      tailpid=$!
      # Set some directories used in the build script manually, as they
      # can be inconsistent in gitlab pipelines
      export srcdir=$WORKDIR builddir=$WORKDIR/build installdir=$WORKDIR/install
      # Export CTESTARGS defined in variables to run correct tests for stage
      echo $CTESTARGS being used as testing args
      export CTESTARGS=$CTESTARGS
      # jobid used in pnnl_after_script_template to cancel job if cancelled or
      # timed out by gitlab through the UI
      jobid=$(sbatch --export=ALL -A EXASGD $SLURM_ARGS -p $SLURM_Q -o $output -e $output -t 1:00:00 $WORKDIR/buildsystem/build.sh $SCRIPT_ARGS)
      # The job id is the 4th space-separated field of sbatch's output.
      export jobid=$(echo $jobid | cut -f4 -d' ')
      partition=$(squeue -j $jobid -h --format="%P")
      export partition=$(echo $partition | cut -f2 -d'=' | cut -f1 -d' ')
      popd
      # Written in the CI checkout directory (not WORKDIR).
      echo "$partition" >> ./partition
      pushd $WORKDIR
      # Unique jobid filename for this job
      echo $jobid > "$WORKDIR/jobid_${jobid}"
      res=1
      # Poll until the batch output contains a BUILD_STATUS line.
      while :;
      do
        if [[ "$(awk 'BEGIN{i=0}/BUILD_STATUS/{i++}END{print i}' $output)" != "0" ]]; then
          kill $tailpid
          # Double quotes so the output file name is actually expanded.
          echo "Last tail of build $output:"
          tail -n 200 $output
          # Exit status is the text after ':' on the last BUILD_STATUS line.
          res=$(grep BUILD_STATUS $output | tail -n 1 | cut -f2 -d':')
          break
        fi
        sleep 10
      done
      popd
      echo "Finished batch job with exit code: $res"
      # Job finished normally; the after_script has nothing left to cancel.
      rm "$WORKDIR/jobid_${jobid}"
      exit $res
# Base template for the status-reporting jobs. Runs on the non-HPC runners
# without cloning the repository; retried once on failure.
.report-job:
  extends: .pnnl_nonhpc_tags
  image: mrnonz/alpine-git-curl:alpine3.16
  environment:
    name: reporting-gitlab
  # Reporting jobs share one resource group, so they run one at a time.
  resource_group: status
  retry: 1
  variables:
    GIT_STRATEGY: 'none'
    # GitHub "owner/repo" that receives the commit status.
    STATUS_PROJECT: 'ORNL/ReSolve'
    # Placeholder; .report-status computes the real value in its script.
    STATUS_NAME: 'NotSet'

# Posts the final commit status to GitHub. The status description names the
# platform, derived from the Slurm partition recorded in the `partition` file.
# The GitHub "state" is taken from the job name (CI_JOB_NAME) and the
# "context" from MY_CLUSTER.
.report-status:
  extends: .report-job
  script:
    - |
      set -x
      if [[ ! -e partition ]]; then
        echo "No partition file found"
        export part="none"
      else
        export part=$(cat partition)
      fi
      export newell_status="ppc64le/gcc@8.5/cuda@11.4/v100@70"
      export deception_status="x86_64/gcc@9.1/cuda@11.4"
      export incline_status="arm64/clang@15.0/rocm@5.3/MI100@gfx908"
      # Order matters: "dlv" and "dlt" also contain "dl", so the more
      # specific patterns must be tested before the generic *dl* one.
      case "$part" in
        *newell*)
          export STATUS_NAME="$newell_status"
          ;;
        *incline*)
          export STATUS_NAME="$incline_status"
          ;;
        *a100*)
          gpu_arch=a100@80
          export STATUS_NAME="$deception_status/$gpu_arch"
          ;;
        *dlv*)
          gpu_arch=v100@70
          export STATUS_NAME="$deception_status/$gpu_arch"
          ;;
        *dlt*)
          gpu_arch=RTX2080@75
          export STATUS_NAME="$deception_status/$gpu_arch"
          ;;
        *dl*)
          gpu_arch=p100@60
          export STATUS_NAME="$deception_status/$gpu_arch"
          ;;
        *)
          echo "Unknown partition"
          export STATUS_NAME="Unknown Partition/Tests Skipped"
          ;;
      esac
      # GITHUB_CURL_HEADER is passed to curl as @file, i.e. the request
      # headers are read from that file.
      curl -L \
        -X POST \
        -H @${GITHUB_CURL_HEADER} \
        https://api.github.com/repos/${STATUS_PROJECT}/statuses/${CI_COMMIT_SHA} \
        -d "{\"state\":\"${CI_JOB_NAME}\",\"target_url\":\"${CI_PIPELINE_URL}\",\"description\":\"${STATUS_NAME}\",\"context\":\"${MY_CLUSTER}\"}"
# Posts a commit status to GitHub without a description. The "state" comes from
# the job name (CI_JOB_NAME) and the "context" from MY_CLUSTER.
.report-pending:
  extends: .report-job
  script:
    - |
      set -x
      curl -L \
        -X POST \
        -H @${GITHUB_CURL_HEADER}\
        https://api.github.com/repos/${STATUS_PROJECT}/statuses/${CI_COMMIT_SHA} \
        -d "{\"state\":\"${CI_JOB_NAME}\",\"target_url\":\"${CI_PIPELINE_URL}\",\"context\":\"${MY_CLUSTER}\"}"
# Runner selection for jobs that talk to the clusters.
.pnnl_tags_template:
  tags:
    - k8s
    - ikp
    - exasgd
    - marianas
  # This image allows you to connect to SLURM
  image: 'kfox1111/slurm:deception2'

# Runner selection for jobs that use the `basic` runners instead of the
# `marianas` ones.
.pnnl_nonhpc_tags:
  tags: [k8s, ikp, exasgd, basic]

# Settings shared by every job that targets the Deception cluster.
.deception:
  variables:
    # Selects the Deception partition list in the script template.
    MY_CLUSTER: 'deception'
    # Last path component of the per-pipeline working directory.
    WORKDIR_SUFFIX: 'x86_64-build'
    # Extra arguments passed to sbatch.
    SLURM_ARGS: ' --gres=gpu:1 --ntasks=3 '
  # Rules are evaluated in order; the first match decides.
  rules:
    - *rule_yes_deception_test
    - *default
38 changes: 38 additions & 0 deletions .gitlab/pnnl/deception.gitlab-ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Build job for the Deception cluster.
Deception Build:
  variables:
    SCRIPT_ARGS: ' --build-only '
  extends:
    - .cluster_build
    - .deception

# Test job for the Deception cluster; starts once the build job has finished.
Deception Test:
  needs:
    - Deception Build
  extends:
    - .cluster_test
    - .deception
  variables:
    SCRIPT_ARGS: ' --test-only '
    # -LE excludes tests labelled "deception-skip".
    CTESTARGS: ' --timeout 240 --output-on-failure -LE deception-skip '

# Reports the "pending" state before any other stage runs. The job name is
# what gets sent as the GitHub state.
pending:
  stage: .pre
  extends:
    - .report-pending
  variables:
    MY_CLUSTER: 'Deception'

# Reports the "success" state in the final stage. The job name is what gets
# sent as the GitHub state.
success:
  stage: .post
  extends:
    - .report-status
  variables:
    MY_CLUSTER: 'Deception'

# Reports the "failure" state in the final stage, and only when an earlier job
# has failed. The job name is what gets sent as the GitHub state.
failure:
  extends:
    - .report-status
  rules:
    - when: on_failure
  variables:
    MY_CLUSTER: 'Deception'
  stage: .post
Loading

0 comments on commit 00b5cb1

Please sign in to comment.