From 13903203c34b1e44fd5e9bbbd842d8dfea655e82 Mon Sep 17 00:00:00 2001 From: Adam Farley Date: Fri, 22 Dec 2023 13:23:05 +0000 Subject: [PATCH] Creating regex logic for build autotriager (#3576) This change allows us to guess at the cause for each failure, enabling rapid prioritisation when triaging. Also includes various fixes, comments, and a new readme. Signed-off-by: Adam Farley --- .github/workflows/build-autotriage.yml | 3 +- tooling/build_autotriage/README.md | 57 ++++++++ .../build_autotriage/autotriage_regexes.sh | 136 +++++++++++++++++- tooling/build_autotriage/build_autotriage.sh | 130 ++++++++++++----- 4 files changed, 287 insertions(+), 39 deletions(-) create mode 100644 tooling/build_autotriage/README.md diff --git a/.github/workflows/build-autotriage.yml b/.github/workflows/build-autotriage.yml index 296807426..5d4ff3d59 100644 --- a/.github/workflows/build-autotriage.yml +++ b/.github/workflows/build-autotriage.yml @@ -14,10 +14,11 @@ jobs: Label: runs-on: ubuntu-latest name: Run Build Triage + if: github.repository == 'adoptium/temurin-build' steps: - uses: actions/checkout@v3 - name: "Run Build Auto Triage" - run: bash "${PWD}/${TRIAGE_SCRIPT}" jdk8u jdk11u jdk17u jdk21u jdk22head + run: bash "${PWD}/${TRIAGE_SCRIPT}" jdk8u jdk11u jdk17u jdk21u jdk22 jdk23head - name: Create Issue From File env: diff --git a/tooling/build_autotriage/README.md b/tooling/build_autotriage/README.md new file mode 100644 index 000000000..3065503e7 --- /dev/null +++ b/tooling/build_autotriage/README.md @@ -0,0 +1,57 @@ +# Readme for Build Auto-Triage Tool + +## Summary + +This tool generates links to all of the latest build failures for Eclipse Temurin™ at Adoptium. + +It also includes the likely cause of each failure, allowing for efficient triage. 
+
+## Details
+
+When passed one or more jdk major versions, this script identifies the latest attempts to build Eclipse Temurin™ at
+the Adoptium project, and it returns links and triage information for all the failed/aborted-state builds.
+
+Benefits of using this script include:
+
+- The ability to efficiently focus your time on failures that are the most important to you.
+- The ability to share a community-wide view of the current build health.
+- The ability to quickly identify and resolve failures.
+- The ability to spot missing platforms for a specific major version (see the "Script Issues" section in the created issue/file).
+- Conversely, the ability to know when you're building platforms you shouldn't be building.
+- The ability to quickly identify the latest Temurin pipelines.
+
+## Instructions
+
+bash build_autotriage.sh jdk8u jdk11u jdk17u jdk21u jdk22 jdk23head
+
+## Output
+
+This script generates a file in Markdown format.
+
+The output is designed to be used by a GitHub action to populate a new GitHub issue.
+
+## Developer tips
+
+Developers should add the following temporary code snippet into the build-autotriage.yml file while developing a change:
+
+```YAML
+  push:
+    paths:
+      - '**build-autotriage.yml'
+      - '**build_autotriage.sh'
+      - '**autotriage_regexes.sh'
+```
+
+This should begin at the line immediately after the cron command.
+
+This temporary change will automatically run the GitHub action every time you push a change set, allowing easy testing.
+
+For this to work, you need to have GitHub actions and issues enabled in your repository.
+
+*Make sure to remove this before pushing your change upstream.*
+
+## Associated file breakdown
+
+- build_autotriage.sh: This is the main script, containing most of the logic.
+- autotriage_regexes.sh: This contains all of the regular expressions used to identify failures.
+- build-autotriage.yml: This is the GitHub action that runs the main script and generates an issue from the output.
diff --git a/tooling/build_autotriage/autotriage_regexes.sh b/tooling/build_autotriage/autotriage_regexes.sh
index 9a2a5999c..2556809b5 100644
--- a/tooling/build_autotriage/autotriage_regexes.sh
+++ b/tooling/build_autotriage/autotriage_regexes.sh
@@ -22,8 +22,136 @@
 #
 ################################################################################
 
-# declare -a arrayOfRegexes
-# declare -a arrayOfRegexMetadata
-# declare -a arrayOfRegexPreventability
+# Regular expressions to match a single line of jenkins job console output.
+declare -a arrayOfRegexes
+# A short description of the sort of error we're dealing with. Can contain URLs. Markdown format.
+declare -a arrayOfRegexMetadata
+# 0 = This issue was preventable, and 1 = This issue was not preventable.
+declare -a arrayOfRegexPreventability
+# 0 = This issue was probably a build failure, and 1 = This issue was probably a test failure.
+declare -a arrayOfFailureSources
 
-# TODO.
\ No newline at end of file
+storeInArrays() {
+  arrayOfRegexes+=("${1}")
+  arrayOfRegexMetadata+=("${2}")
+  arrayOfRegexPreventability+=("${3}")
+  arrayOfFailureSources+=("${4}")
+}
+
+echo "Generating regex arrays to match against failures."
+
+r="SIGSEGV"
+m="Segmentation error."
+p="1"
+s="0"
+storeInArrays "${r}" "${m}" "${p}" "${s}"
+
+r="No.space.left.on.device"
+m="Out of disk space."
+p="0"
+s="0"
+storeInArrays "${r}" "${m}" "${p}" "${s}"
+
+r="(insufficient.memory|Out.of.system.resources|Out.?of.?Memory.?Error)"
+m="Out of memory."
+p="1"
+s="0"
+storeInArrays "${r}" "${m}" "${p}" "${s}"
+
+r="Read\-only\sfile\ssystem"
+m="Read-only file system."
+p="0"
+s="0"
+storeInArrays "${r}" "${m}" "${p}" "${s}"
+
+r="(was.marked.offline\:.Connection.was.broken|Unexpected.termination.of.the.channel)"
+m="Lost connection to machine."
+p="1"
+s="0"
+storeInArrays "${r}" "${m}" "${p}" "${s}"
+
+r="(Failed.to.connect.to.github\.com|archive.is.not.a.ZIP.archive)"
+m="Download failed."
+p="1"
+s="0"
+storeInArrays "${r}" "${m}" "${p}" "${s}"
+
+r="(Program.*timed.out|Agent.[0-9]+.timed.out.with.a.timeout.of)"
+m="Timeout."
+p="1"
+s="0"
+storeInArrays "${r}" "${m}" "${p}" "${s}"
+
+r="there.are.rogue.processes.kicking.about"
+m="ProcessCatch found something."
+p="1"
+s="0"
+storeInArrays "${r}" "${m}" "${p}" "${s}"
+
+r="No.such.device"
+m="No such device."
+p="0"
+s="0"
+storeInArrays "${r}" "${m}" "${p}" "${s}"
+
+r="Build.Test_openjdk.*completed\:.(FAILURE|ABORTED)"
+m="Post-build AQATest subjob failed."
+p="1"
+s="1"
+storeInArrays "${r}" "${m}" "${p}" "${s}"
+
+r="Build.*SmokeTests.*completed\:.(FAILURE|ABORTED)"
+m="Smoke test failed."
+p="1"
+s="1"
+storeInArrays "${r}" "${m}" "${p}" "${s}"
+
+r="Build.*create_installer.*\#[0-9]+.completed\:.(FAILURE|ABORTED)"
+m="Installer subjob failed."
+p="1"
+s="0"
+storeInArrays "${r}" "${m}" "${p}" "${s}"
+
+r="Build.*sign_.*\#[0-9]+.completed\:.(FAILURE|ABORTED)" # Literal "sign_" (not "\s" + "ign_"); build numbers may have several digits.
+m="Signing subjob failed."
+p="1"
+s="0"
+storeInArrays "${r}" "${m}" "${p}" "${s}"
+
+r="Build.*\#[0-9]+.completed\:.(FAILURE|ABORTED)" # Catch-all for subjobs; must stay after the more specific subjob regexes above.
+m="Subjob failed. It was not a test, installer, or signing job."
+p="1"
+s="0"
+storeInArrays "${r}" "${m}" "${p}" "${s}"
+
+r="(permission.denied|AccessDeniedException)"
+m="AccessDeniedException or Permission Denied"
+p="0"
+s="0"
+storeInArrays "${r}" "${m}" "${p}" "${s}"
+
+r="Error.creating.temporary.file"
+m="Error creating temporary file."
+p="0"
+s="0"
+storeInArrays "${r}" "${m}" "${p}" "${s}"
+
+r="(Unable.to.delete|Could.not.create.(file|directory))"
+m="Error creating/deleting a file"
+p="0"
+s="0"
+storeInArrays "${r}" "${m}" "${p}" "${s}"
+
+r="return\scode\s[1-9]+"
+m="Non-zero return code."
+p="1"
+s="0"
+storeInArrays "${r}" "${m}" "${p}" "${s}"
+
+r="(Error\:\s|Exception\:\s)"
+m="Error/exception found."
+p="1"
+s="0"
+storeInArrays "${r}" "${m}" "${p}" "${s}"
+
+echo "Regex arrays ready."
diff --git a/tooling/build_autotriage/build_autotriage.sh b/tooling/build_autotriage/build_autotriage.sh
index 8ab8d2e74..430ddd59a 100644
--- a/tooling/build_autotriage/build_autotriage.sh
+++ b/tooling/build_autotriage/build_autotriage.sh
@@ -16,22 +16,24 @@
 
 ################################################################################
 #
-# This script turns a list of Temurin build jobs into two things:
-# 1. A markdown summary table that gives pass and fail numbers.
-# 2. A list of each failing job/subjob link, plus information that can
-#    help identify the specific issue causing the failure.
+# This script takes a list of JDK major versions and outputs a list of
+# the latest failed attempts to build Temurin at the Eclipse Adoptium project.
+# We then use a series of regular expressions to identify the cause of each
+# failure, and to output useful information to aid triage.
 #
 ################################################################################
 
 declare -a arrayOfFailedJobs
-# declare -a arrayOfFailedJobRegexs
+declare -a arrayOfRegexsForFailedJobs
+declare -a arrayOfErrorLinesForFailedJobs
 declare -a arrayOfAllJDKVersions
 declare -a arrayOfUs
 declare -a buildIssues
 
 headJDKVersion=9999
 
-# Imports arrayOfRegexes, arrayOfRegexMetadata, and arrayOfRegexPreventability
+# Imports a series of arrays related to the regular expressions we use to recognise failures:
+# arrayOfRegexes, arrayOfRegexMetadata, arrayOfRegexPreventability, and arrayOfFailureSources
 . ./tooling/build_autotriage/autotriage_regexes.sh
 
 # All temurin-available platforms.
@@ -41,6 +43,9 @@ declare -a platformStart
 # The last jdk major version on that platform ("99" for ongoing).
 declare -a platformEnd
 
+totalBuildFailures=0
+totalTestFailures=0
+
 temurinPlatforms+=("aix-ppc64");            platformStart+=(8);  platformEnd+=(99)
 temurinPlatforms+=("alpine-linux-aarch64"); platformStart+=(21); platformEnd+=(99)
 temurinPlatforms+=("alpine-linux-x64");     platformStart+=(8);  platformEnd+=(99)
@@ -56,6 +61,7 @@ temurinPlatforms+=("solaris-x64");          platformStart+=(8);  platformEnd+=(8
 temurinPlatforms+=("windows-x64");          platformStart+=(8);  platformEnd+=(99)
 temurinPlatforms+=("windows-x86-32");       platformStart+=(8);  platformEnd+=(17)
 
+# This stores any error messages that did not terminate the triage script altogether.
 errorLog() {
   buildIssues+=("$1")
   echo "ERROR FOUND: Issue ${#buildIssues[@]}: $1"
@@ -111,6 +117,10 @@ identifyFailedBuildsInTimerPipelines() {
     latestTimerJenkinsJobID=""
     oldIFS=$IFS
     IFS=","
+
+    # Here we identify the latest pipeline that wasn't run by a user.
+    # This is to avoid triaging a pipeline that uses a non-standard framework, and is
+    # therefore not representative of the quality of Temurin pipelines during a release.
     for jsonEntry in $latestTimerPipelineRaw
     do
       if [[ $jsonEntry =~ ^\[\{\"_id\".* ]]; then
@@ -171,7 +181,8 @@ identifyFailedBuildsInTimerPipelines() {
 
     IFS=$oldIFS
 
-    # Now iterate over platforms to make sure we're launching every platform we should.
+    # Now iterate over platforms to make sure we're launching every platform we should,
+    # and that we're not running builds for any platform we shouldn't be.
     triageThesePlatforms=","
     for p in "${!temurinPlatforms[@]}"
     do
@@ -199,12 +210,14 @@ identifyFailedBuildsInTimerPipelines() {
       triageThesePlatforms+="${jdkJenkinsJobVersion}-${temurinPlatforms[p]}-temurin,"
     done
 
-    if [[ ${#triageThesePlatforms[@]} -gt 1 ]]; then
+    if [[ ${triageThesePlatforms} = "," ]]; then # Still only the initial "," means no platform was appended.
       errorLog "Cannot find any valid build platforms launched by jdk ${arrayOfAllJDKVersions[v]}${arrayOfUs[v]} pipeline ${latestTimerJenkinsJobID}. Skipping to the next jdk version."
       continue
     fi
     echo "Platforms validated. Identifying build numbers for these platforms: ${triageThesePlatforms:1:-1}"
 
+    # Iterate over the platforms we need to triage and find the build numbers for
+    # any build that failed or was aborted (includes propagated test failures).
     for b in "${!listOfBuildNames[@]}"
     do
       if [[ $triageThesePlatforms =~ .*,${listOfBuildNames[$b]},.* ]]; then
@@ -227,15 +240,45 @@ identifyFailedBuildsInTimerPipelines() {
 
 # Takes a single failed jenkins build job URL as a string, and identifies the source of
 # the failure if possible.
+# Uses: arrayOfFailedJobs, arrayOfRegexes, and arrayOfFailureSources (takes no arguments).
 buildFailureTriager() {
-  echo "Attempting to triage a job: ${1}"
-  echo "- Failed job: ${1}" >> build_triage_output.md
-  # Todo: Iterate over the failures found and triage them against the pending array of regexes.
-  # For now we'll put them in a tidy md-style file for issue inclusion.
-
+  echo "Triaging jobs now."
+  # Iterate over the failures found and triage them against the pending array of regexes.
+  for failedJob in "${arrayOfFailedJobs[@]}"; do
+    wget -q -O - "${failedJob}/consoleText" > ./jobOutput.txt
+    # If the file size is beyond 50m bytes, then report script error and do not triage, for efficiency.
+    fileSize=$(wc -c < ./jobOutput.txt)
+    if [[ ${fileSize} -gt 52500000 ]]; then
+      arrayOfRegexsForFailedJobs+=("Unmatched")
+      arrayOfErrorLinesForFailedJobs+=("Output size was ${fileSize} bytes")
+      totalBuildFailures=$((totalBuildFailures+1))
+      continue
+    fi
+    while IFS= read -r jobOutputLine; do
+      for regexIndex in "${!arrayOfRegexes[@]}"; do
+        # When a regex matches, store the id of the regex we matched against, and also the line of output that matched the regex.
+        if [[ "$jobOutputLine" =~ ${arrayOfRegexes[regexIndex]} ]]; then
+          arrayOfRegexsForFailedJobs+=("$regexIndex")
+          arrayOfErrorLinesForFailedJobs+=("$jobOutputLine")
+          if [[ ${arrayOfFailureSources[regexIndex]} = 0 ]]; then
+            totalBuildFailures=$((totalBuildFailures+1))
+          else
+            totalTestFailures=$((totalTestFailures+1))
+          fi
+          continue 3
+        fi
+      done
+    done < ./jobOutput.txt
+    # If we reach this line, then we have not matched any of the regexs
+    arrayOfRegexsForFailedJobs+=("Unmatched")
+    arrayOfErrorLinesForFailedJobs+=("No error found")
+    totalBuildFailures=$((totalBuildFailures+1))
+  done
+  echo "Triage has ended."
 }
 
-startOutputFile() {
+# Stores everything we've found in a markdown-formatted file.
+generateOutputFile() {
   { echo "---";
     echo "name: Build Issue Summary";
     echo "about: For triaging the nightly and weekend build failures";
@@ -243,6 +286,43 @@ startOutputFile() {
     echo "labels: 'weekly-build-triage'";
     echo "---";
     echo "";
+    echo "# Summary"
+    echo "Build failures: ${totalBuildFailures}"
+    echo "Test failures: ${totalTestFailures}"
+    echo ""
+    if [[ ${#arrayOfFailedJobs[@]} -gt 0 ]]; then
+      echo "# Failed Builds"
+      for failedJobIndex in "${!arrayOfFailedJobs[@]}"
+      do
+        regexID="${arrayOfRegexsForFailedJobs[failedJobIndex]}"
+        echo "Failure: ${arrayOfFailedJobs[failedJobIndex]}"
+        if [[ ${regexID} =~ Unmatched ]]; then
+          echo "Cause: ${arrayOfErrorLinesForFailedJobs[failedJobIndex]}"
+        else
+          echo "Cause: ${arrayOfRegexMetadata[regexID]}"
+          preventable="yes"
+          if [[ "${arrayOfRegexPreventability[regexID]}" -gt 0 ]]; then
+            preventable="no"
+          fi
+          echo "Preventable: ${preventable}"
+          echo "\`\`\`"
+          echo "${arrayOfErrorLinesForFailedJobs[failedJobIndex]}"
+          echo "\`\`\`"
+        fi
+        echo ""
+      done
+      echo "# End of list"
+    else
+      echo "All build jobs passed. Huzzah!"
+    fi
+    if [[ ${#buildIssues[@]} -gt 0 ]]; then
+      echo "# Script Issues"
+      for issueID in "${!buildIssues[@]}"
+      do
+        echo "- Issue ${issueID}: ${buildIssues[issueID]}"
+      done
+      echo "# End of Issues"
+    fi
   } >> build_triage_output.md
 }
 
@@ -254,26 +334,8 @@ argumentParser "$@"
 
 identifyFailedBuildsInTimerPipelines
 
-startOutputFile
+buildFailureTriager
 
-if [[ ${#arrayOfFailedJobs[@]} -gt 0 ]]; then
-  echo "# Failed Builds" >> build_triage_output.md
-  for failedJob in "${arrayOfFailedJobs[@]}"
-  do
-    buildFailureTriager "$failedJob"
-  done
-  echo "# End of list" >> build_triage_output.md
-else
-  echo "All build jobs passed. Huzzah!"
-fi
-
-if [[ ${#buildIssues[@]} -gt 0 ]]; then
-  echo "# Script Issues" >> build_triage_output.md
-  for issueID in "${!buildIssues[@]}"
-  do
-    echo "- Issue ${issueID}: ${buildIssues[issueID]}\n" >> build_triage_output.md
-  done
-  echo "# End of Issues" >> build_triage_output.md
-fi
+generateOutputFile
 
 echo "Build AutoTriage is complete."