-
Notifications
You must be signed in to change notification settings - Fork 32
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Build and host algorithm images on GitHub (base images) (#20)
* wip: first version of PR test workflow * feat: adapt first algorithm to new infrastructure * fix: job generation script * feat: adapt second algorithm to new infrastructure * fix: python version * feat: allow matrix generation script to run in different contexts (folders) * refactor: split up base images and intermediate images; rename folders * feat: adjust workflow to build images in order * fix: build matrix computation script * test empty matrix * feat: prepare image publishing and adapt docker images * chore: restore lof and sublof algorithms and use kmeans; also add licenses to base images * feat: adjust r base image * chore: cleanup workflow definition * refactor: revert changes to intermediate images and algos (later PR) * feat: test image push * feat: test image push again * fix: image license information * feat: fix version information in image labels and finish PR
- Loading branch information
Sebastian Schmidl
authored
Nov 20, 2023
1 parent
60814e3
commit 96af897
Showing
25 changed files
with
563 additions
and
105 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
#!/usr/bin/env python3 | ||
import json | ||
import sys | ||
|
||
from pathlib import Path | ||
|
||
# Result files, relative to the current working directory, that main() inspects. | ||
MODEL_FILEPATH = Path("./results/model.pkl") | ||
SCORES_FILEPATH = Path("./results/scores.csv") | ||
|
||
|
||
def parse_manifest(algorithm: str) -> dict:
    """Load the ``manifest.json`` of an algorithm.

    Args:
        algorithm: Name of the algorithm directory, resolved relative to the
            current working directory.

    Returns:
        The parsed content of ``<algorithm>/manifest.json``.

    Raises:
        FileNotFoundError: If the manifest file does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    manifest_path = Path(".") / algorithm / "manifest.json"
    # JSON is UTF-8 by specification; do not depend on the locale's default
    # encoding, which can differ between CI runners and containers.
    with manifest_path.open("r", encoding="utf-8") as fh:
        return json.load(fh)
|
||
|
||
def is_readable(filename: Path, uid: int = 1000, gid: int = 1000) -> bool:
    """Check that a file is owned by the expected user and group.

    Despite its name, this function does not test read permissions; it only
    compares the owner and group recorded in the file's stat information.

    Args:
        filename: Path of the file to inspect.
        uid: Expected numeric user id of the file owner.
        gid: Expected numeric group id of the file.

    Returns:
        ``True`` if the file belongs to ``uid`` and ``gid``, else ``False``.

    Raises:
        FileNotFoundError: If ``filename`` does not exist.
    """
    stat = filename.stat()
    return stat.st_uid == uid and stat.st_gid == gid
|
||
|
||
def has_postprocessing(algorithm: str) -> bool:
    """Tell whether an algorithm's README declares a post-processing section.

    Args:
        algorithm: Name of the algorithm directory, resolved relative to the
            current working directory.

    Returns:
        ``True`` if ``<algorithm>/README.md`` exists and any of its lines
        contains one of the ``timeeval-post`` marker comments, else ``False``.
    """
    readme_path = Path(".") / algorithm / "README.md"
    if not readme_path.exists():
        return False

    markers = ("<!--BEGIN:timeeval-post-->", "<!--END:timeeval-post-->")
    # The markers are plain ASCII, so undecodable bytes elsewhere in the README
    # can be replaced without affecting the result.
    with readme_path.open("r", encoding="utf-8", errors="replace") as fh:
        # Stream the file and stop at the first hit instead of building a list.
        return any(marker in line for line in fh for marker in markers)
|
||
|
||
def main(algorithm):
    """Validate the result files an algorithm run left in ``./results``.

    All problems are collected first, then printed to stderr. The checks are:

    * ``model.pkl`` (only for supervised and semi-supervised algorithms) and
      ``scores.csv`` must be owned by the expected user and group.
    * Unless the algorithm's README declares post-processing, ``scores.csv``
      must contain a single column, must not start with a header, and must
      have exactly one line per data row of the input dataset.

    Args:
        algorithm: Name of the algorithm directory, resolved relative to the
            current working directory.

    Raises:
        SystemExit: With exit code 1 if at least one check failed.
    """
    manifest = parse_manifest(algorithm)
    errors = []

    if manifest["learningType"].lower() in ("supervised", "semi-supervised"):
        # check model.pkl
        if not is_readable(MODEL_FILEPATH):
            errors.append("Model file was written with the wrong user and/or group. Do you use a TimeEval base image?")

    # check scores.csv
    if not is_readable(SCORES_FILEPATH):
        errors.append("Scoring was written with the wrong user and/or group. Do you use a TimeEval base image?")

    with SCORES_FILEPATH.open("r") as fh:
        lines = fh.readlines()

    # if not post-processing, check length
    if has_postprocessing(algorithm):
        print("Skipping scoring (scores.csv) check, because algorithm uses post-processing!")
    else:
        # only a single column/dimension:
        if any("," in line for line in lines):
            errors.append("Scoring contains multiple dimensions (found a ',' in the file). "
                          "Only a single anomaly score is allowed per time step!")

        # there should be no header; an empty file has no first line to test
        # and is reported by the length check below instead of crashing here
        if lines:
            try:
                float(lines[0])
            except ValueError as e:
                errors.append(f"No header allowed for the scoring file! First value is not a number! {e}")

        # same length as dataset
        if manifest["inputDimensionality"].lower() == "univariate":
            data_path = Path("./data/dataset.csv")
        else:
            data_path = Path("./data/multi-dataset.csv")

        with data_path.open("r") as fh:
            # subtract header
            n_data = sum(1 for _ in fh) - 1

        if len(lines) != n_data:
            errors.append("Scoring has wrong length; each input time step needs an anomaly score "
                          f"(expected={n_data}, found={len(lines)})!")

    for error in errors:
        print(error, file=sys.stderr)

    if errors:
        # sys.exit instead of the site-provided exit(), which is meant for
        # interactive use and is not guaranteed to exist
        sys.exit(1)
|
||
|
||
if __name__ == "__main__":
    # Expect exactly one positional argument: the algorithm name.
    args = sys.argv
    if len(args) != 2:
        raise ValueError("You have to specify an algorithm name (directory / docker image name)!")

    main(args[1])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
#!/usr/bin/env bash | ||
|
||
# Abort the script as soon as a command fails. | ||
set -e | ||
|
||
# Branch name that the event checks below compare against. | ||
default_branch=main | ||
# Folder to inspect: first positional argument, falling back to the current directory. | ||
folder="${1:-.}" | ||
# Extended regex (applied below with `grep -x -v -E`) of entries that are dropped from the result. | ||
ignore_pattern="0-base-images|1-intermediate-images|2-scripts|data|results|Dockerfile|README.md|\..*|.*\.py|.*\.yml|.*\.sh|.*\.png" | ||
# Receives the changed top-level entries; stays empty if no known event type is detected. | ||
changes_in_basedir="" | ||
|
||
# Print all arguments, separated by single spaces, as one line on stderr.
function echoerr () {
    # printf instead of echo, so that arguments such as "-n" or "-e" are
    # printed literally rather than being interpreted as echo options
    printf '%s\n' "$*" >&2
}
|
||
# GITHUB_EVENT_NAME=pull_request | ||
# GITHUB_BASE_REF=PR target branch (probably default branch) | ||
# GITHUB_HEAD_REF=PR source branch | ||
# GITHUB_REF=refs/pull/<pr_number>/merge | ||
# GITHUB_REF_TYPE=tag or branch | ||
# RUNNER_ARCH=X86, X64, ARM, or ARM64 | ||
# RUNNER_OS=Linux, Windows, or macOS | ||
|
||
# Determine the changed top-level entries of "$folder", depending on the
# GitHub event that triggered the workflow. The result is stored in
# $changes_in_basedir (one entry per line).

# if this is a workflow for a PR targeting the default branch
if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]] && [[ "$GITHUB_BASE_REF" == "$default_branch" ]]; then
    # build diff to main
    echoerr "Detected pipeline for a non-default branch (assuming pull request with target $GITHUB_BASE_REF)"
    git fetch origin || echoerr "Could not update remote 'origin'! Repository might be out of date."
    changes_in_basedir=$( git diff --name-only "refs/remotes/origin/$GITHUB_BASE_REF..HEAD" -- "$folder" | sed "s#${folder//\./\\.}/##" | cut -d '/' -f 1 )

# if this is a workflow for the default branch
# GITHUB_BASE_REF is only set for pull request events, so a push to the default
# branch has to be recognised via GITHUB_REF_TYPE / GITHUB_REF_NAME. The old
# GITHUB_BASE_REF comparison is kept for callers that export it themselves.
elif [[ "$GITHUB_EVENT_NAME" == "push" && ( "$GITHUB_BASE_REF" == "$default_branch" || ( "$GITHUB_REF_TYPE" == "branch" && "$GITHUB_REF_NAME" == "$default_branch" ) ) ]]; then
    # build latest commit for the default branch
    echoerr "Detected pipeline for default branch"
    changes_in_basedir=$( git diff --name-only HEAD~1..HEAD -- "$folder" | sed "s#${folder//\./\\.}/##" | cut -d '/' -f 1 )

# if this is a tag-workflow: build all algorithm images
elif [[ "$GITHUB_EVENT_NAME" == "push" ]] && [[ "$GITHUB_REF_TYPE" == "tag" ]]; then
    echoerr "Detected pipeline for a tag"
    # list the inspected folder (not the current directory), so that the
    # entries are relative to "$folder" like in the other branches
    changes_in_basedir=$( ls -1 "$folder" )

else
    echoerr "Cannot determine algorithm images to build! Please check the environment variables:"
    # grep exits with 1 if no GITHUB variable is set; that must not abort the script
    env | grep "GITHUB" >&2 || true
    echoerr ""
fi
|
||
# filter changes: remove non-algorithm-files/-folders and allow grep to find nothing (exit code 1)
changed_algos=$( echo "$changes_in_basedir" | sort -u | grep -x -v -E "${ignore_pattern}" || [[ $? == 1 ]] )
# filter changes: remove non-existing algos (e.g. when branch is not up-to-date with default branch or an algorithm was removed)
changed_algos=$(
    # IFS= keeps leading/trailing whitespace of an entry intact
    while IFS= read -r f; do
        # skip empty entries: "$folder/" alone would always pass the directory test
        if [[ -n "$f" && -d "$folder/$f" ]]; then
            printf '%s\n' "$f"
        fi
    done <<<"$changed_algos"
)

if [[ -z "$changed_algos" ]]; then
    echoerr "No algorithm changed!"
fi

echoerr "Generating pipeline for algorithms: $(xargs <<<"$changed_algos")"
# Emit the build matrix as compact JSON: {"algorithm_name": [<one entry per line>]}
# NOTE(review): for an empty $changed_algos the here-string still feeds one empty
# line to jq, so the list contains a single empty string - confirm that the
# consuming workflow expects this.
(jq -Rc '[.]' | jq -sc '{"algorithm_name": add}') <<<"${changed_algos}"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
#!/usr/bin/env bash | ||
|
||
# Abort the script as soon as a command fails.
set -e

# Folder that contains version.txt or manifest.json: first positional argument.
# Defaults to the current directory; an empty default would make the checks
# below look at /version.txt and /manifest.json in the filesystem root.
folder="${1:-.}"
# Extended regex for a SemVer version string: X.Y.Z with optional -PRERELEASE and +BUILD parts.
SEMVER_REGEX="^(0|[1-9][0-9]*)\\.(0|[1-9][0-9]*)\\.(0|[1-9][0-9]*)(\\-[0-9A-Za-z-]+(\\.[0-9A-Za-z-]+)*)?(\\+[0-9A-Za-z-]+(\\.[0-9A-Za-z-]+)*)?$"
|
||
# Strip surrounding whitespace from the given text and print it (without a
# trailing newline) if it is a SemVer version string. Otherwise report the
# problem on stderr and terminate the script with exit code 1.
trim-and-validate() {
    local candidate="$*"
    # cut off everything before the first non-whitespace character
    candidate="${candidate#"${candidate%%[![:space:]]*}"}"
    # cut off everything after the last non-whitespace character
    candidate="${candidate%"${candidate##*[![:space:]]}"}"

    # guard clause: reject anything that is not a SemVer version string
    if ! [[ "$candidate" =~ $SEMVER_REGEX ]]; then
        echo "Version $candidate is not a proper version string according to SemVer 'X.Y.Z(-PRERELEASE)(+BUILD)'!" >&2
        exit 1
    fi

    printf '%s' "$candidate"
}
|
||
# Pick the version source: a plain version.txt takes precedence over the
# "version" field of manifest.json. Without either file the script fails.
version_file="$folder/version.txt"
manifest_file="$folder/manifest.json"

if [[ -f "$version_file" ]]; then
    trim-and-validate "$( <"$version_file" )"
elif [[ -f "$manifest_file" ]]; then
    trim-and-validate "$( jq -r '.version' "$manifest_file" )"
else
    echo "No version.txt or manifest.json present. Cannot determine Docker image version!" >&2
    exit 1
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
#!/usr/bin/env python3 | ||
import json | ||
import sys | ||
|
||
from pathlib import Path | ||
|
||
if __name__ == "__main__":
    # Exactly one positional argument is expected: the algorithm name.
    if len(sys.argv) != 2:
        raise ValueError("You have to specify an algorithm name (directory / docker image name)!")
    algorithm = sys.argv[1]

    # Read the algorithm's manifest from its directory below the working directory.
    with (Path(".") / algorithm / "manifest.json").open("r") as fh:
        manifest = json.load(fh)

    # Print the dataset path that matches the declared input dimensionality.
    value = manifest["inputDimensionality"]
    dataset_by_dimensionality = {
        "univariate": "data/dataset.csv",
        "multivariate": "data/multi-dataset.csv",
    }
    dataset = dataset_by_dimensionality.get(value.lower())
    if dataset is None:
        raise ValueError(f"Input dimensionality ({value}) of {algorithm}'s manifest is unknown!")
    print(dataset)
Oops, something went wrong.