openqa-investigate

#!/bin/bash

# Usage
# echo jobnumber | openqa-investigate

# http://redsymbol.net/articles/unofficial-bash-strict-mode/
set -euo pipefail

# shellcheck source=/dev/null
. "$(dirname "$0")"/_common

host="${host:-"openqa.opensuse.org"}"
scheme="${scheme:-"https"}"
host_url="$scheme://$host"
dry_run="${dry_run:-"0"}"
verbose="${verbose:-"false"}"
prio_add="${prio_add:-"100"}"
exclude_name_regex="${exclude_name_regex:-":investigate:"}"
exclude_no_group="${exclude_no_group:-"true"}"
# exclude_group_regex checks a combined string "<parent job group name> / <job group name>"
exclude_group_regex="${exclude_group_regex:-"Development.*/ "}"
client_args=(api --header 'User-Agent: openqa-investigate (https://github.com/os-autoinst/scripts)' --host "$host_url")
jq_output_limit="${jq_output_limit:-15}"
curl_args=(--user-agent "openqa-investigate")
echoerr() { echo "$@" >&2; }

clone() {
    local origin id name_suffix refspec job_data unsupported_cluster_jobs name base_prio clone_settings casedir repo out url clone_id
    origin=${1:?"Need 'origin'"}
    id=${2:?"Need 'id'"}
    name_suffix=${3+":$3"}
    refspec=${4+$4}
    job_data=$(openqa-cli "${client_args[@]}" --json jobs/"$id")
    # shellcheck disable=SC2181
    [[ $? != 0 ]] && echoerr "unable to query job data for $id: $job_data" && return 1
    unsupported_cluster_jobs=$(echo "$job_data" | runjq -r '(.job.children["Parallel"] | length) + (.job.parents["Parallel"] | length) + (.job.children["Directly chained"] | length) + (.job.parents["Directly chained"] | length)') || return $?
    [[ $unsupported_cluster_jobs != 0 ]] \
        && echoerr "unable to clone job $id: it is part of a parallel or directly chained cluster (not supported)" && return 2
    name="$(echo "$job_data" | runjq -r '.job.test'):investigate$name_suffix" || return $?
    base_prio=$(echo "$job_data" | runjq -r '.job.priority') || return $?
    clone_settings=("TEST=$name" '_GROUP_ID=0' 'BUILD=')
    if [[ $refspec ]]; then
        casedir=$(echo "$job_data" | runjq -r '.job.settings.CASEDIR') || return $?
        [[ $casedir == null ]] && casedir=''
        repo=${casedir:-'https://github.com/os-autoinst/os-autoinst-distri-opensuse.git'}
        clone_settings+=("CASEDIR=${repo%#*}#${refspec}")
    fi
    [[ -n ${*:5} ]] && clone_settings+=("${@:5}")
    # clear "PUBLISH_" settings to avoid overriding production assets
    # shellcheck disable=SC2207
    clone_settings+=($(echo "$job_data" | runjq -r '.job.settings | keys[] | select (startswith("PUBLISH_")) | . + "="')) || return $?
    clone_settings+=("OPENQA_INVESTIGATE_ORIGIN=$host_url/t$origin")
    out=$($clone_call "$host_url/tests/$id" "${clone_settings[@]}")
    [[ $dry_run = 1 ]] && echo "$out"
    # output is in $out, should change that text to a markdown list entry
    url=$(echo "$out" | sed -n 's/^Created job.*-> //p')
    echo "* **$name**: $url"
    clone_id=${out/:*/}; clone_id=${clone_id/*#/}
    "${client_call[@]}" --json --data "{\"priority\": $((base_prio + prio_add))}" -X PUT jobs/"$clone_id" >/dev/null
}

trigger_jobs() {
    id="${1:?"Need 'job_id'"}"
    # for 1. current job/build + current test -> check if reproducible/sporadic
    clone "$id" "$id" 'retry' '' "${@:2}"

    job_url="$host_url/tests/$id"
    investigation=$(runcurl "${curl_args[@]}" -s "$job_url"/investigation_ajax) || return $?
    last_good_exists=$(echo "$investigation" | runjq -r '.last_good') || return $?
    if [[ "$last_good_exists" = "null" || "$last_good_exists" = "not found" ]]; then
        echo "No last good recorded, skipping regression investigation jobs" && return 1
    fi
    last_good=$(echo "$investigation" | runjq -r '.last_good.text') || return $?
    [[ ! $last_good =~ ^[0-9]+$ ]] && echo ".last_good.text not found: investigation for test $id returned '$investigation'" >&2 && return 1

    # for 2. current job/build + last good test (+ last good needles) ->
    #   check for test (+needles) regression
    test_log=$(echo "$investigation" | runjq -r '.test_log') || return $?
    if echo "$test_log" | grep -q "No.*changes recorded"; then
        echo "$test_log. Skipping test regression investigation job."
        last_good_tests=''
    else
        vars_last_good=$(runcurl "${curl_args[@]}" -s "$host_url/tests/$last_good"/file/vars.json) || return $?
        last_good_tests=$(echo "$vars_last_good" | runjq -r '.TEST_GIT_HASH') || return $?
        # here we could apply the same approach for needles, not only tests
        # With https://github.com/os-autoinst/os-autoinst/pull/1358 we could
        # theoretically use TEST_GIT_REFSPEC but this would act on the shared
        # test case dir within either the common openQA folder or the common
        # worker cache and hence influence other tests.
        # So better we use CASEDIR with a git refspec, only slightly less
        # efficient and also needing to know which git repo to use
        #refspec_arg="TEST_GIT_REFSPEC=$last_good_tests"
        refspec_arg=$last_good_tests
        clone "$id" "$id" "last_good_tests:$last_good_tests" "$refspec_arg" "${@:2}"
    fi

    # 3. last good job/build + current test -> check for product regression
    if ! echo "$investigation" | grep -q '\<BUILD\>'; then
        echo "Current job has same build as last good, product regression unlikely. Skipping product regression investigation job."
        last_good_build=''
    else
        vars_last_good=${vars_last_good:-$(runcurl "${curl_args[@]}" -s "$host_url/tests/$last_good"/file/vars.json)} || return $?
        last_good_build=$(echo "$vars_last_good" | runjq -r '.BUILD') || return $?
        # here we clone with unspecified test refspec, i.e. this could be a
        # more recent tests version. As an alternative we could explicitly
        # checkout the git version from "first bad"
        clone "$id" "$last_good" "last_good_build:$last_good_build" '' "${@:2}"
    fi

    # 4. last good job/build + last good test -> check for other problem
    #    sources, e.g. infrastructure
    if [[ -z $last_good_tests ]]; then
        echo "No test regression expected. Not triggered 'good build+test' as it would be the same as 3., good build + current test"
    elif [[ -z $last_good_build ]]; then
        echo "No product regression expected. Not triggered 'good build+test' as it would be the same as 2., current build + good test"
    else
        clone "$id" "$last_good" "last_good_tests_and_build:$last_good_tests+$last_good_build" "$refspec_arg" "${@:2}"
    fi
}

# crosscheck
# 1. current job/build + current test -> check if reproducible/sporadic
# 2. current job/build + last good test (+ last good needles) -> check for
#    test (+needles) regression
# 3. last good job/build + current test -> check for product regression
# 4. last good job/build + last good test -> check for other problem
#    sources, e.g. infrastructure
investigate() {
    id="${1##*/}"

    job_data=$(openqa-cli "${client_args[@]}" --json jobs/"$id")
    # shellcheck disable=SC2181
    [[ $? != 0 ]] && echoerr "unable to query job data for $id: $job_data" && return 1
    old_name="$(echo "$job_data" | runjq -r '.job.test')" || return $?
    if [[ "$old_name" =~ ":investigate:" ]]; then
        [[ ! "$old_name" =~ investigate:retry$ ]] && echo "Job is ':investigate:' already, skipping investigation" && return 0
        investigate_origin="$(echo "$job_data" | runjq -r '.job.settings.OPENQA_INVESTIGATE_ORIGIN')" || return 1
        origin_job_id=${investigate_origin#$host_url/t}
        # meanwhile the original job might have been deleted already, handle
        # gracefully
        out=$("${client_call[@]}" -X POST jobs/"$origin_job_id"/comments text="Investigate retry job: $host_url/t$id failed, likely not a sporadic test" 2>/dev/null) || \
            if [[ $(echo "$out" | runjq .error_status) != "404" ]]; then
                echoerr "Unexpected error encountered when posting comments: '$out'"
                return 2
            fi
        return 0
    fi
    [[ "$old_name" =~ $exclude_name_regex ]] && echo "Job name '$old_name' matches \$exclude_name_regex '$exclude_name_regex', skipping investigation" && return 0
    group="$(echo "$job_data" | runjq -r '.job.parent_group + " / " + .job.group')" || return $?
    [[ "$group" = " / " ]] && [[ "$exclude_no_group" = "true" ]] && echo "Job w/o job group, \$exclude_no_group is set, skipping investigation" && return 0
    [[ "$group" =~ $exclude_group_regex ]] && echo "Job group '$group' matches \$exclude_group_regex '$exclude_group_regex', skipping investigation" && return 0

    # Optionally we can find "first failed", could extend openQA investigation
    # method instead for we are just working based on supplied job which can
    # have more, ambiguous potential changes that we need to bisect on

    out=$(trigger_jobs "$id" "${@:2}")
    $verbose && echo "$0, id: '$id', out: '$out'"
    [[ $out ]] || return 0
    comment="Automatic investigation jobs:

$out"
    "${client_call[@]}" -X POST jobs/"$id"/comments text="$comment"
}

main() {
    client_prefix=''
    [ "$dry_run" = "1" ] && client_prefix="echo"
    set +u
    if [[ -z "$client_call" ]]; then
        client_call=(openqa-cli "${client_args[@]}")
        if [[ -n "$client_prefix" ]]; then
            client_call=("$client_prefix" "${client_call[@]}")
        fi
    fi
    set -u
    clone_call="${clone_call:-"$client_prefix openqa-clone-job --skip-chained-deps --within-instance"}"
    error_count=0
    # shellcheck disable=SC2013
    for i in $(cat - | sed 's/ .*$//'); do
        investigate "$i" "$@" || ((error_count++)) ||:
    done
    exit "$error_count"
}

main "$@"