diff --git a/.github/workflows/ur-build-hw.yml b/.github/workflows/ur-build-hw.yml
index 7338135140551..5e713af23be4a 100644
--- a/.github/workflows/ur-build-hw.yml
+++ b/.github/workflows/ur-build-hw.yml
@@ -76,6 +76,12 @@ env:
   UR_LOG_OPENCL: "level:error;flush:error"
 
 jobs:
+  health-check:
+    name: Health Check
+    uses: ./.github/workflows/ur-health-check.yml
+    with:
+      runner_name: ${{ inputs.runner_name }}
+
   adapter_build_hw:
     name: Build & CTS
     # run only on upstream; forks won't have the HW
diff --git a/.github/workflows/ur-health-check.yml b/.github/workflows/ur-health-check.yml
new file mode 100644
index 0000000000000..84ecc2b41463a
--- /dev/null
+++ b/.github/workflows/ur-health-check.yml
@@ -0,0 +1,137 @@
+# This workflow monitors runners' health by checking their load average.
+# When called from another workflow it records the load average of the given
+# runner as an artifact; on its own daily schedule it aggregates the previous
+# day's artifacts into a single JSON file.
+
+name: UR Health Monitoring
+
+on:
+  workflow_call:
+    inputs:
+      runner_name:
+        required: true
+        type: string
+  schedule:
+    - cron: '0 0 * * *'  # Runs daily at midnight (UTC)
+
+jobs:
+  health-check:
+    # In a called workflow the github context belongs to the caller, so
+    # github.event_name is never 'workflow_call'. The required input is only
+    # set when this file is used as a reusable workflow, so key off that.
+    if: inputs.runner_name != ''
+    runs-on: ${{ inputs.runner_name }}
+    steps:
+      - name: Check load average
+        id: check
+        run: |
+          # UTC keeps the date prefix consistent with the aggregation job.
+          DATE=$(date -u +"%Y-%m-%d_%H-%M-%S")
+          # RUNNER_NAME is a default variable; using it (rather than an
+          # inlined expression) keeps the runner name out of the script text.
+          FILE="artifacts/${DATE}_${RUNNER_NAME}.txt"
+          echo "DATE=${DATE}" >> "$GITHUB_OUTPUT"
+          echo "FILE=${FILE}" >> "$GITHUB_OUTPUT"
+          mkdir -p artifacts
+          echo "Directory created, verifying..."
+          ls -la artifacts/
+          uptime | awk -F'load average:' '{ print $2 }' > "$FILE"
+          echo "File created, verifying contents..."
+          cat "$FILE"
+          echo "File path: $FILE"
+          ls -lh "$FILE"
+      - name: Upload load average artifact
+        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
+        with:
+          name: ${{ steps.check.outputs.DATE }}_${{ runner.name }}_load-average
+          path: ${{ steps.check.outputs.FILE }}
+
+  aggregate-load-data:
+    # Only aggregate on this workflow's own schedule; a scheduled caller also
+    # reports event_name 'schedule', hence the extra check on the input.
+    if: github.event_name == 'schedule' && inputs.runner_name == ''
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+
+      - name: Calculate yesterday's date
+        id: date
+        run: |
+          YESTERDAY=$(date -u -d "yesterday" +'%Y-%m-%d')
+          echo "YESTERDAY=$YESTERDAY" >> "$GITHUB_OUTPUT"
+          echo "Date for artifact search: $YESTERDAY"
+
+      - name: Download all artifacts from yesterday's health-check runs
+        env:
+          GH_TOKEN: ${{ github.token }}
+          YESTERDAY: ${{ steps.date.outputs.YESTERDAY }}
+        run: |
+          echo "Searching for all workflow runs from $YESTERDAY"
+
+          # Artifacts uploaded by a reusable workflow belong to the caller's
+          # run, so list every run of the day that referenced this workflow
+          # instead of only the runs of ur-health-check.yml itself. Failed
+          # runs are included: the health check may have passed regardless.
+          gh api --paginate \
+            "repos/${GITHUB_REPOSITORY}/actions/runs?created=${YESTERDAY}&per_page=100" \
+            --jq '.workflow_runs[]
+              | select(any(.referenced_workflows[]?; .path | contains("ur-health-check.yml")))
+              | .id' > run_ids.txt
+
+          mkdir -p artifacts
+
+          # Download artifacts from each run
+          while read -r run_id; do
+            [ -n "$run_id" ] || continue
+            echo "Downloading artifacts from run $run_id"
+            gh run download "$run_id" \
+              --dir artifacts \
+              --pattern "${YESTERDAY}*" \
+              --repo "$GITHUB_REPOSITORY" || echo "No matching artifacts in run $run_id"
+          done < run_ids.txt
+
+          echo "All artifacts downloaded:"
+          ls -laR artifacts/
+
+      - name: Process and aggregate data
+        env:
+          YESTERDAY: ${{ steps.date.outputs.YESTERDAY }}
+        run: |
+          OUT="aggregated_${YESTERDAY}.json"
+          echo "Processing artifacts from ${YESTERDAY}"
+          ls -laR artifacts/
+
+          # Build the object with jq so keys and values are escaped properly.
+          json='{}'
+          for file in artifacts/*/*.txt; do
+            # An unmatched glob stays literal; skip anything that is not a file.
+            [[ -f "$file" ]] || continue
+            filename=$(basename "$file" .txt)
+            load=$(xargs < "$file")
+            json=$(jq --arg key "$filename" --arg load "$load" \
+              '. + {($key): $load}' <<< "$json")
+          done
+
+          printf '%s\n' "$json" > "$OUT"
+          cat "$OUT"
+
+      - name: Check if data exists
+        id: check_data
+        env:
+          YESTERDAY: ${{ steps.date.outputs.YESTERDAY }}
+        run: |
+          if jq -e 'length > 0' "aggregated_${YESTERDAY}.json" > /dev/null; then
+            echo "has_data=true" >> "$GITHUB_OUTPUT"
+            echo "Data found, will upload artifact"
+          else
+            echo "has_data=false" >> "$GITHUB_OUTPUT"
+            echo "No data found, skipping artifact upload"
+          fi
+
+      - name: Upload aggregated data artifact
+        if: steps.check_data.outputs.has_data == 'true'
+        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
+        with:
+          name: aggregated_data_${{ steps.date.outputs.YESTERDAY }}
+          path: aggregated_${{ steps.date.outputs.YESTERDAY }}.json