From 8ff79936622fe8860385e5424811f37d33b782e1 Mon Sep 17 00:00:00 2001 From: Michael Sarahan Date: Wed, 13 Nov 2024 22:00:24 -0600 Subject: [PATCH] Use OpenTelemetry python API; simplify actions --- .github/workflows/test-artifact-cleanup.yml | 24 ++ README.md | 133 ++++++-- .../action.yml | 29 -- telemetry-dispatch-setup/action.yml | 27 ++ .../action.yml | 18 +- .../action.yml | 20 ++ telemetry-dispatch-summarize/action.yml | 15 + telemetry-dispatch-write-summary/action.yml | 31 -- telemetry-impls/clean-up-artifacts/action.yml | 52 ++++ .../ensure-otel-cli-available/action.yml | 30 -- .../github-actions-job-info/action.yml | 83 ++--- telemetry-impls/load-base-env-vars/action.yml | 2 +- telemetry-impls/load-then-clone/action.yml | 47 +++ telemetry-impls/sanity-checks/action.yml | 24 -- .../set-otel-service-name/action.yml | 2 +- .../stash-base-env-vars/action.yml | 18 +- .../stash-job-attributes/action.yml | 50 +++ telemetry-impls/summarize/action.yml | 160 ++-------- telemetry-impls/summarize/bump_time.py | 40 +++ telemetry-impls/summarize/requirements.txt | 4 + telemetry-impls/summarize/send_trace.py | 292 ++++++++++++++++++ telemetry-impls/traceparent.sh | 42 +++ telemetry-impls/traceparent/action.yml | 43 --- 23 files changed, 802 insertions(+), 384 deletions(-) create mode 100644 .github/workflows/test-artifact-cleanup.yml delete mode 100644 telemetry-dispatch-load-base-env-vars/action.yml create mode 100644 telemetry-dispatch-setup/action.yml create mode 100644 telemetry-dispatch-stash-job-attributes/action.yml create mode 100644 telemetry-dispatch-summarize/action.yml delete mode 100644 telemetry-dispatch-write-summary/action.yml create mode 100644 telemetry-impls/clean-up-artifacts/action.yml delete mode 100644 telemetry-impls/ensure-otel-cli-available/action.yml create mode 100644 telemetry-impls/load-then-clone/action.yml delete mode 100644 telemetry-impls/sanity-checks/action.yml create mode 100644 telemetry-impls/stash-job-attributes/action.yml create 
mode 100644 telemetry-impls/summarize/bump_time.py create mode 100644 telemetry-impls/summarize/requirements.txt create mode 100644 telemetry-impls/summarize/send_trace.py create mode 100755 telemetry-impls/traceparent.sh delete mode 100644 telemetry-impls/traceparent/action.yml diff --git a/.github/workflows/test-artifact-cleanup.yml b/.github/workflows/test-artifact-cleanup.yml new file mode 100644 index 00000000..6b4e0b8f --- /dev/null +++ b/.github/workflows/test-artifact-cleanup.yml @@ -0,0 +1,24 @@ +name: test-artifact-cleanup +# During this workflow, we upload a file whose artifact name follows the 'telemetry-tools-*' +# naming pattern. After running the clean-up-artifacts action, the artifact should no longer show up +# in the web UI. + +on: + workflow_dispatch: + +jobs: + telemetry-setup: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + - name: Create dummy file + shell: bash + run: echo "Dumbo" > file.txt + - name: Upload dummy file + uses: actions/upload-artifact@v4 + with: + name: telemetry-tools-attrs-1234 + path: file.txt + - name: Clean up telemetry intermediary artifacts + uses: ./telemetry-impls/clean-up-artifacts diff --git a/README.md b/README.md index aecbad81..0046847e 100644 --- a/README.md +++ b/README.md @@ -1,66 +1,79 @@ # shared-actions -Contains all of the shared composite actions used by RAPIDS. +Contains all of the shared composite actions used by RAPIDS. Several of these actions, +especially the telemetry actions, use a pattern that we refer to as "dispatch actions." +The general idea of a dispatch action is to make it easier to depend on other actions +at a specific revision, and also to simplify using files beyond a given action .yml file. 
+ +A dispatch action is one that: +* clones the shared-actions repository (repo/ref changeable using env vars) +* runs (dispatches to) another action within the clone, using a relative path + +There can be more complicated arrangements of more actions, but the idea is to +have the local clone of the shared-actions repository be the first step of an action. Actions that refer to each other assume that they have been checked out to the ./shared-actions folder. This *should* be the root of the GitHub Actions workspace. This assumption is what allow code reuse between actions. -In general, we should try to never call "implementation actions" here. Instead, -we should prefer to create "dispatch actions" that clone shared-actions from a particular repo -at a particular ref, and then dispatch to an implementation action from that repo. -This adds complexity, but has other advantages: - -* simplifies specifying a custom branch for actions for development and testing -* changes all shared-actions calls in a workflow at once, instead of changing each one -* allows reuse of shared-actions within the shared-actions repo. Trying to use these - without the clone and relative path would not otherwise keep the repo and ref - consistent, leading to great confusion over why changes aren't being reflected. +Actions that use this pattern should include "dispatch" in their folder name, so +that they can be readily distinguished from any actions that are either +standalone or otherwise implementations that assume that the ./shared-actions +folder is already cloned, so that they can use relative paths to reference other +actions and files. ## Example dispatch action ```yaml -name: 'Example dispatch action' +name: 'dispatch-example-action' description: | The purpose of this wrapper is to keep it easy for external consumers to switch branches of the shared-actions repo when they are changing something about shared-actions and need to test it in their pipelines. 
- Inputs here are all assumed to be env vars set outside of this script. - Set them in your main repo's workflows. - runs: using: 'composite' steps: - name: Clone shared-actions repo uses: actions/checkout@v4 with: - repository: ${{ env.SHARED_ACTIONS_REPO}} - ref: ${{ env.SHARED_ACTIONS_REF}} + repository: ${{ env.SHARED_ACTIONS_REPO }} + ref: ${{ env.SHARED_ACTIONS_REF }} path: ./shared-actions - - name: Stash base env vars - uses: ./shared-actions/_stash-base-env-vars + - name: Run local implementation action + uses: ./shared-actions/impls/example-action ``` In this action, the "implementation action" is the -`./shared-actions/_stash-base-env-vars`. You can have inputs in your +`./shared-actions/impls/example-action`. You can have inputs in your dispatch actions. You would just pass them through to the implementation action. Environment variables do carry through from the parent workflow through the -dispatch action, into the implemetation action. In most cases, it is simpler +dispatch action, and then into the implementation action. In most cases, it is simpler (though less explicit) to set environment variables instead of plumbing inputs through each action. -Environment variables are hard-coded, not detected. If you want to pass a different -environment variable through, you need to add it to implementation stash action, -like `telemetry-impls/stash-base-env-vars/action.yml`. You do not need to -explicitly specify it on the loading side. - ## Implementation action These are similar to dispatch actions, except that they should not clone shared-actions. They can depend on other actions from the shared-actions repository using the `./shared-actions` relative path. +```yaml +name: 'example-action' +description: | + An example of calling a python script in an action. Both the action + and the python file are part of the shared-actions repo. 
+ +runs: + using: 'composite' + steps: + - name: Run local action + uses: ./shared-actions/impls/another-action + - name: Run local script file + run: python ./shared-actions/impls/hello.py + shell: bash +``` + ## Example calling workflow The key detail here is that the presence of the SHARED_ACTIONS_REPO and/or @@ -76,9 +89,69 @@ env: jobs: actions-user: runs-on: ubuntu-latest + steps: + - name: Call dispatch example + # DO NOT change the branch here (@main) in PRs + uses: rapidsai/shared-actions/dispatch-example-action@main +``` + +This works because the environment variables get passed into the shared action. They are then +used by the `actions/checkout` action, taking priority over the default values. + +## Calling in child shared workflows + +Shared workflows complicate matters because environment variables do not get +passed through. If you set the `SHARED_ACTIONS_REPO` and/or `SHARED_ACTIONS_REF` +variables in the top-level parent workflow, they will not take effect in any +dispatch actions that you may call in child workflows. You can pass them as inputs +to child shared workflows, but that ends up being very verbose. + +To carry this information into child workflows, we use a scheme that writes a +file with environment variables, uploads this file as an artifact, then downloads +and loads the file at the start of the child workflow. + +The general scheme is: + +### Top-level workflow +```yaml +jobs: + setup-env-vars: + runs-on: ubuntu-latest + steps: + # implicitly picks up env vars for SHARED_ACTIONS_REPO and _REF + - uses: rapidsai/shared-actions/telemetry-dispatch-stash-base-env-vars@main + + + + summarize-telemetry: + needs: + # private networks will affect your choice here. 
If your tempo server or + # forwarder/collector is only accessible on some instances, then use one of + # those instances here + runs-on: + steps: + - uses: rapidsai/shared-actions/telemetry-dispatch-summarize@main + # if you use mTLS, this is probably the right place to pass in the certificates +``` + +### Child workflows +```yaml +jobs: + tests: + strategy: + matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }} + runs-on: "linux-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.DRIVER }}-1" steps: - name: Telemetry setup - id: telemetry-setup - # DO NOT change this in PRs - uses: rapidsai/shared-actions/dispatch-script@main -``` \ No newline at end of file + uses: rapidsai/shared-actions/telemetry-dispatch-setup@main + continue-on-error: true + extra_attributes: "rapids.cuda=${{ matrix.CUDA_VER }},rapids.py=${{ matrix.PY_VER }}" + + +``` + +Behind the scenes, the implementation actions are: +* ./telemetry-impls/stash-base-env-vars: storing base environment variables (including setting default values): +* ./telemetry-impls/load-then-clone: Downloads base env var file, loads it, then + clones shared-actions according to env vars that were just loaded +* ./telemetry-impls/summarize: Runs Python script to parse GitHub logs and send OpenTelemetry spans to endpoint diff --git a/telemetry-dispatch-load-base-env-vars/action.yml b/telemetry-dispatch-load-base-env-vars/action.yml deleted file mode 100644 index b3e3b4a5..00000000 --- a/telemetry-dispatch-load-base-env-vars/action.yml +++ /dev/null @@ -1,29 +0,0 @@ -name: dispatch-load-base-env-vars -description: | - Wrapper that clones a specific branch/ref of the shared-actions repo, then - calls the `load-base-env-vars` action to download the base environment - variables and load them into the current environment. - - This does not overwrite any environment variables that are already set. -inputs: - load_service_name: - description: | - If true, loads OTEL_SERVICE_NAME from the stashed env vars. 
This is used for top-level workflows. - Otherwise the telemetry service name is obtained from Github job metadata. - Getting the service name from Github job metadata is for child workflows. - default: 'false' - -runs: - using: 'composite' - steps: - - name: Clone shared-actions repo - uses: actions/checkout@v4 - with: - repository: ${{ env.SHARED_ACTIONS_REPO || 'rapidsai/shared-actions' }} - ref: ${{ env.SHARED_ACTIONS_REF || 'main' }} - path: ./shared-actions - - name: Set OTEL_SERVICE_NAME from job if not loading from stash - if: ${{ inputs.load_service_name != 'true' }} - uses: ./shared-actions/telemetry-impls/set-otel-service-name - - name: Load base env vars - uses: ./shared-actions/telemetry-impls/load-base-env-vars diff --git a/telemetry-dispatch-setup/action.yml b/telemetry-dispatch-setup/action.yml new file mode 100644 index 00000000..c4173517 --- /dev/null +++ b/telemetry-dispatch-setup/action.yml @@ -0,0 +1,27 @@ +name: telemetry-dispatch-setup +description: | + This script sets important environment variables that may be used by tools that + implement OpenTelemetry. This script also stores attributes (metadata) for the + current job, so that this metadata can be associated with spans during the final + parsing of job metadata. + + This action should be called at the beginning of child workflows, generally as the first + step in any job other than computing the matrix. + +inputs: + extra_attributes: + description: "comma-separated key=value attributes to associate with the current job" + +runs: + using: 'composite' + steps: + - uses: rapidsai/shared-actions/telemetry-impls/load-then-clone@empty-certs + # overrides loaded value + - name: Set OTEL_SERVICE_NAME from job + uses: ./shared-actions/telemetry-impls/set-otel-service-name + - name: Store attributes to use as metadata when creating spans + # This also sets OTEL_RESOURCE_ATTRIBUTES, for any subsequent steps + # in the calling workflow that might use it. 
+ uses: ./shared-actions/telemetry-impls/stash-job-attributes + with: + extra_attributes: ${{ inputs.extra_attributes }} diff --git a/telemetry-dispatch-stash-base-env-vars/action.yml b/telemetry-dispatch-stash-base-env-vars/action.yml index 085ba35d..4d1eed22 100644 --- a/telemetry-dispatch-stash-base-env-vars/action.yml +++ b/telemetry-dispatch-stash-base-env-vars/action.yml @@ -1,22 +1,24 @@ -name: dispatch-stash-base-env-vars +name: telemetry-dispatch-stash-base-env-vars description: | - Clones a particular branch/ref of a shared-actions repo, then - call the stash-base-env-vars implementation script, which writes - some environment variables so that downstream jobs can refer to them. + Stores base environment variables in a file and uploads that file + as an artifact. - Inputs here are all assumed to be env vars set outside of this script. - Set them in your main repo's workflows. + This action should only be called once in a build, + at the start of the top-level workflow. All other jobs in the top + level workflow should come after this job. It is generally enough + to have only the checks and devcontainers jobs explicitly depend on + it and have everything else be downstream of them. runs: using: 'composite' steps: + # We can't use the load-then-clone action because the env vars file + # that it needs is something that we create here. 
- name: Clone shared-actions repo uses: actions/checkout@v4 with: repository: ${{ env.SHARED_ACTIONS_REPO || 'rapidsai/shared-actions' }} ref: ${{ env.SHARED_ACTIONS_REF || 'main' }} path: ./shared-actions - - name: Get traceparent representation of current workflow - uses: ./shared-actions/telemetry-impls/traceparent - name: Stash base env vars uses: ./shared-actions/telemetry-impls/stash-base-env-vars diff --git a/telemetry-dispatch-stash-job-attributes/action.yml b/telemetry-dispatch-stash-job-attributes/action.yml new file mode 100644 index 00000000..cc93053d --- /dev/null +++ b/telemetry-dispatch-stash-job-attributes/action.yml @@ -0,0 +1,20 @@ +name: dispatch-stash-attributes +description: | + Clones a particular branch/ref of a shared-actions repo, then + call the stash-attributes implementation script, which writes + some environment variables so that downstream jobs can refer to them. + + Inputs here are all assumed to be env vars set outside of this script. + Set them in your main repo's workflows. +inputs: + extra_attributes: + description: "comma-separated key=value attributes to associate with the current job" + +runs: + using: 'composite' + steps: + - uses: rapidsai/shared-actions/telemetry-impls/load-then-clone@empty-certs + - name: Stash current job's OTEL_RESOURCE_ATTRIBUTES + uses: ./shared-actions/telemetry-impls/stash-job-attributes + with: + extra_attributes: ${{ inputs.extra_attributes }} diff --git a/telemetry-dispatch-summarize/action.yml b/telemetry-dispatch-summarize/action.yml new file mode 100644 index 00000000..fa1e8015 --- /dev/null +++ b/telemetry-dispatch-summarize/action.yml @@ -0,0 +1,15 @@ +name: telemetry-dispatch-summarize +description: | + This action is run in a final job on the top-level workflow, after all other + jobs are completed. This action downloads the JSON records of all jobs from + the current run. 
It then associates metadata records that were uploaded with + the telemetry-dispatch-stash-job-attributes action with jobs. This is + effectively label metadata. Finally, this action creates OpenTelemetry spans + with the timing and label metadata, and sends it to the configured Tempo + endpoint (or forwarder). + +runs: + using: 'composite' + steps: + - uses: rapidsai/shared-actions/telemetry-impls/load-then-clone@empty-certs + - uses: ./shared-actions/telemetry-impls/summarize diff --git a/telemetry-dispatch-write-summary/action.yml b/telemetry-dispatch-write-summary/action.yml deleted file mode 100644 index c48d1de7..00000000 --- a/telemetry-dispatch-write-summary/action.yml +++ /dev/null @@ -1,31 +0,0 @@ -name: dispatch-summarize -description: | - Clones a particular branch/ref of a shared-actions repo, then calls its telemetry summarize - action. The summarize action downloads and parses Github job metadata, and creates - OpenTelemetry spans from the job metadata. These are sent to the configured OTLP receiver/endpoint. -inputs: - cert_concat: - description: Concatenation of certs (CA;Client;ClientKey) - extra_attributes: - description: - Additional attributes to add to OTEL_RESOURCE_ATTRIBUTES. 
- See https://opentelemetry.io/docs/languages/sdk-configuration/general/#otel_resource_attributes - -runs: - using: 'composite' - steps: - - name: Clone shared-actions repo - uses: actions/checkout@v4 - with: - repository: ${{ env.SHARED_ACTIONS_REPO || 'rapidsai/shared-actions' }} - ref: ${{ env.SHARED_ACTIONS_REF || 'main' }} - path: ./shared-actions - # This is necessary because this action will generally be run in a job separately from - # where the env vars are set - - name: Load base environment variables - uses: ./shared-actions/telemetry-impls/load-base-env-vars - - name: Run summarize action - uses: ./shared-actions/telemetry-impls/summarize - with: - cert_concat: ${{ inputs.cert_concat }} - extra_attributes: ${{ inputs.extra_attributes }} diff --git a/telemetry-impls/clean-up-artifacts/action.yml b/telemetry-impls/clean-up-artifacts/action.yml new file mode 100644 index 00000000..3d401d1f --- /dev/null +++ b/telemetry-impls/clean-up-artifacts/action.yml @@ -0,0 +1,52 @@ +name: 'clean-up-artifacts' +description: | + Artifact cleanup removes all of the temporary "carrier" files that are created + by each build job to transmit attribute values. Otherwise, these would be + noisy and make it harder to find "real," interesting artifacts. 
+ +runs: + using: 'composite' + steps: + - name: Clean up telemetry files + uses: actions/github-script@v7 + if: runner.debug != '1' + with: + retries: 3 + script: | + const runAttempt = parseInt(process.env.GITHUB_RUN_ATTEMPT, 10) + const get_artifacts = async ({github, context, process}) => { + const opts = github.rest.actions.listWorkflowRunArtifacts.endpoint.merge( + { + attempt_number: runAttempt, + owner: context.repo.owner, + repo: context.repo.repo, + run_id: context.runId, + per_page: 100 + }); + const artifacts = await github.paginate(opts); + return artifacts; + }; + + var artifacts = undefined; + var retryCount = 0; + const maxRetries = 5; + const initialDelay = 2000; + while (artifacts === undefined) { + artifacts = await get_artifacts({github, context, process}); + if (artifacts === undefined && retryCount < maxRetries) { + retryCount++; + await new Promise((res) => setTimeout(res, initialDelay * Math.pow(2, retryCount - 1))); + } else { + break; + } + } + + for (const artifact of artifacts) { /* await each delete so a failed request fails this step instead of being dropped */ + if (artifact.name.startsWith('telemetry-tools-')) { + await github.rest.actions.deleteArtifact({ + owner: context.repo.owner, + repo: context.repo.repo, + artifact_id: artifact.id + }); + } + } diff --git a/telemetry-impls/ensure-otel-cli-available/action.yml b/telemetry-impls/ensure-otel-cli-available/action.yml deleted file mode 100644 index b25d825e..00000000 --- a/telemetry-impls/ensure-otel-cli-available/action.yml +++ /dev/null @@ -1,30 +0,0 @@ -name: 'ensure-otel-cli-available' -description: 'Ensure otel-cli is available for sending spans' -inputs: - otel_cli_version: - description: "version of otel-cli to download" - default: "0.4.5" -runs: - using: 'composite' - steps: - - shell: bash - id: install-otel-cli - run: - - mkdir -p bin; - echo $(pwd)/bin >> $GITHUB_PATH; - if ! 
otel-cli --help 2>&1 >/dev/null; then - ARCH=$(uname -m); - if [ "$(uname -m)" = "x86_64" ]; then - ARCH="amd64"; - else - ARCH="arm64"; - fi; - curl -L -o otel-cli-${ARCH}.tar.gz https://github.com/equinix-labs/otel-cli/releases/download/v${{ inputs.otel_cli_version}}/otel-cli_${{ inputs.otel_cli_version}}_linux_${ARCH}.tar.gz; - tar -zxf otel-cli-${ARCH}.tar.gz; - mv otel-cli ./bin/; - rm -rf otel-cli-${ARCH}.tar.gz; - fi; - curl -LO https://github.com/rapidsai/gha-tools/releases/latest/download/tools.tar.gz; - tar -xzf tools.tar.gz -C ./bin; - echo "$(pwd)/bin" >> ${GITHUB_PATH}; diff --git a/telemetry-impls/github-actions-job-info/action.yml b/telemetry-impls/github-actions-job-info/action.yml index bfbde3e4..86b08eee 100644 --- a/telemetry-impls/github-actions-job-info/action.yml +++ b/telemetry-impls/github-actions-job-info/action.yml @@ -1,17 +1,27 @@ name: 'Github Actions Job info' description: | Obtains job metadata from the GitHub Actions API. - Provides this as a file on disk, job_info.json. + Provides one of two files on disk: + - job_info.json: metadata for the job that is calling this script. Jobs use this to know their + unique ID, which is used to associate attribute metadata with job timing information. + - all_jobs.json: metadata for all jobs under the top-level workflow. This is used by the + final job, which parses this file, associates the span information with attribute metadata, + and sends the spans to the OpenTelemetry receiver endpoint. - If debug logging is enabled, this action will save an additional file, - all_jobs.json, that captures all jobs in the pipeline. This is useful - for troubleshooting misbehaving spans. +inputs: + all_jobs: + description: | + When true, saves a JSON file with all jobs. Otherwise, saves a JSON file + from only the current job. 
+ default: "false" runs: using: 'composite' steps: - uses: actions/github-script@v7 id: get-job-info + env: + ALL_JOBS: "${{ inputs.all_jobs }}" with: retries: 3 script: | @@ -28,30 +38,16 @@ runs: per_page: 100 }); const jobs = await github.paginate(opts); - - if (core.getBooleanInput('debug') || core.isDebug()) { - try { - fs.writeFileSync('all_jobs.json', JSON.stringify(jobs)); - } catch(err) { - console.error(err) - throw(err) - } - } - - // We know what the run ID is, but we don't know which specific job we're being run from. - // https://github.com/orgs/community/discussions/8945 - return jobs.find((job) => { - return job.runner_name === process.env.RUNNER_NAME && job.run_attempt === runAttempt; - }); + return jobs; }; - var this_job = undefined; + var jobs = undefined; var retryCount = 0; maxRetries = 5; initialDelay = 2000; - while (this_job === undefined) { - this_job = await get_job({github, context, process}); - if (this_job === undefined && retryCount < maxRetries) { + while (jobs === undefined) { + jobs = await get_job({github, context, process}); + if (jobs === undefined && retryCount < maxRetries) { retryCount++; await new Promise(res => setTimeout(res, initialDelay * Math.pow(2, retryCount - 1))); } else { @@ -59,23 +55,28 @@ runs: } } - if (this_job === undefined){ - github.log.error("Telemetry values were not available. Please see debug logs for more info."); - github.log.error("All jobs:"); - const jobs = await get_job({github, context, process}); - github.log.error(JSON.stringify(jobs)); + if (process.env.ALL_JOBS === 'true') { + try { + fs.writeFileSync('all_jobs.json', JSON.stringify(jobs)); + } catch(err) { + console.error(err) + throw(err) + } + } else { + // We know what the run ID is, but we don't know which specific job we're being run from. 
+ // https://github.com/orgs/community/discussions/8945 + const this_job = jobs.find((job) => { + return job.runner_name === process.env.RUNNER_NAME && job.run_attempt === runAttempt; + }); - throw "Telemetry values were not available. Please see debug logs for more info." - } - try { - fs.writeFileSync('job_info.json', JSON.stringify(this_job)); - } catch(err) { - console.error(err) - throw(err) + + if (this_job === undefined){ + throw "Could not identify current job in workflow. Please see debug logs for more info." + } + try { + fs.writeFileSync('job_info.json', JSON.stringify(this_job)); + } catch(err) { + console.error(err) + throw(err) + } } - - name: Upload job_info_json if in debug mode - if: runner.debug == '1' - uses: actions/upload-artifact@v4 - with: - name: github-job-info - path: all_jobs.json diff --git a/telemetry-impls/load-base-env-vars/action.yml b/telemetry-impls/load-base-env-vars/action.yml index 31c04425..d25671b9 100644 --- a/telemetry-impls/load-base-env-vars/action.yml +++ b/telemetry-impls/load-base-env-vars/action.yml @@ -12,7 +12,7 @@ runs: - name: Download base environment variables file uses: actions/download-artifact@v4 with: - name: telemetry-env-vars + name: telemetry-tools-env-vars - name: Set environment variables from file into GITHUB_ENV shell: bash # Only set the env var if it is not already set diff --git a/telemetry-impls/load-then-clone/action.yml b/telemetry-impls/load-then-clone/action.yml new file mode 100644 index 00000000..09720bcb --- /dev/null +++ b/telemetry-impls/load-then-clone/action.yml @@ -0,0 +1,47 @@ +name: load-then-clone +description: | + This is a kind of bootstrapping action. Environment variables do not + transfer between top-level workflows and child workflows. We have to + pass them another way. We could use inputs and outputs, but that would + proliferate lots of boilerplate. 
+ + We use a scheme where the top-level workflows store environment variables + in files, and child workflows download those files and re-hydrate the + environment in their context. There are two variables that are especially + tricky, though. We allow users to specify their own repo and ref for where + to get the `shared-actions` repo. That info is in the shared file, but we + have to clone a `shared-actions` repo to use its script to load the variables. + As a result, we clone the code twice: first to learn how to load the variables, + then another time when the variables are actually set. + +runs: + using: 'composite' + steps: + - name: Download base environment variables file + uses: actions/download-artifact@v4 + with: + name: telemetry-tools-env-vars + # We can't use ./telemetry-implementation/load-base-env-vars here + # because at this point we have not cloned the repo. + - name: Set environment variables from file into GITHUB_ENV + shell: bash + # Only set the env var if it is not already set + # the ${!VARIABLE} syntax is called "indirect expansion" and it is kind of equivalent to ${${env_var_name}} + # in other words, expand to find the variable name, then dereference that variable name + # The goofy env_var_value filtering through tr is to ensure that the strings don't include quotes. + run: | + while read LINE; do + env_var_name="$( cut -d '=' -f 1 <<< "$LINE" )"; + if [ "${!env_var_name}" = "" ]; then + env_var_value="$(echo ${LINE#*=} | tr -d '"')" + echo "${env_var_name}=$(echo "${env_var_value}" | sed 's/^,//')" >> ${GITHUB_ENV}; + else + echo "Load base env info: ignoring new value for "${env_var_name}" in loading base env vars. It is already set to "${!env_var_name}"." >&2; + fi + done &1 >/dev/null; then - echo "otel-cli is missing"; - exit 1; - fi - - if ! 
type -P rapids-get-telemetry-trace-id 2>&1 >/dev/null; then - echo "rapidsai/gha-tools is missing or lacks telemetry scripts"; - exit 1; - fi diff --git a/telemetry-impls/set-otel-service-name/action.yml b/telemetry-impls/set-otel-service-name/action.yml index 570c7a0e..c06a4bde 100644 --- a/telemetry-impls/set-otel-service-name/action.yml +++ b/telemetry-impls/set-otel-service-name/action.yml @@ -28,4 +28,4 @@ runs: - shell: bash id: set-otel-service-name-env run: | - echo OTEL_SERVICE_NAME="${{ steps.get-job-name.outputs.JOB_NAME}}" >> ${GITHUB_ENV}; + echo OTEL_SERVICE_NAME="${{ steps.get-job-name.outputs.JOB_NAME }}" >> ${GITHUB_ENV}; diff --git a/telemetry-impls/stash-base-env-vars/action.yml b/telemetry-impls/stash-base-env-vars/action.yml index 8764b74c..e6026027 100644 --- a/telemetry-impls/stash-base-env-vars/action.yml +++ b/telemetry-impls/stash-base-env-vars/action.yml @@ -15,32 +15,30 @@ description: | runs: using: 'composite' steps: - - name: Compute traceparent - # This sets TRACEPARENT env var, which we store below. - # TRACEPARENT implicity depends on OTEL_SERVICE_NAME being set. This will have one value - # for the top-level build (e.g. 
pr.yaml), and one value for each of the child workflows - uses: ./shared-actions/telemetry-impls/traceparent - name: Write base env vars to a file shell: bash run: | + TRACEPARENT=$(./shared-actions/telemetry-impls/traceparent.sh "${OTEL_SERVICE_NAME}") OTEL_RESOURCE_ATTRIBUTES="${OTEL_RESOURCE_ATTRIBUTES},git.repository=${GITHUB_REPOSITORY}" OTEL_RESOURCE_ATTRIBUTES="${OTEL_RESOURCE_ATTRIBUTES},git.ref=${GITHUB_REF}" OTEL_RESOURCE_ATTRIBUTES="${OTEL_RESOURCE_ATTRIBUTES},git.sha=${GITHUB_SHA}" OTEL_RESOURCE_ATTRIBUTES="${OTEL_RESOURCE_ATTRIBUTES},git.job_url=${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}" cat < telemetry-env-vars OTEL_EXPORTER_OTLP_ENDPOINT=${OTEL_EXPORTER_OTLP_ENDPOINT:-https://fb.local.gha-runners.nvidia.com:4318} + OTEL_TRACES_EXPORTER=${OTEL_TRACES_EXPORTER:-otlp_proto_http} OTEL_EXPORTER_OTLP_PROTOCOL=${OTEL_EXPORTER_OTLP_PROTOCOL:-http/protobuf} OTEL_RESOURCE_ATTRIBUTES="$(echo "${OTEL_RESOURCE_ATTRIBUTES}" | sed 's/^,//')" OTEL_SERVICE_NAME="${OTEL_SERVICE_NAME}" - SHARED_ACTIONS_REPO=${SHARED_ACTIONS_REPO} - SHARED_ACTIONS_REF=${SHARED_ACTIONS_REF} - START_TIME="${START_TIME:-$(date --rfc-3339=ns | sed "s/ /T/g" | sed "s/+00:00/Z/g")}" - TRACEPARENT=${TRACEPARENT} + SHARED_ACTIONS_REPO=${SHARED_ACTIONS_REPO:-rapidsai/shared-actions} + SHARED_ACTIONS_REF=${SHARED_ACTIONS_REF:-main} + START_TIME="${START_TIME:-$(date +'%s')}" + TRACEPARENT="${TRACEPARENT}" + TELEMETRY_ENABLED="${TELEMETRY_ENABLED}" EOF - name: Upload env vars file uses: actions/upload-artifact@v4 with: - name: telemetry-env-vars + name: telemetry-tools-env-vars path: telemetry-env-vars - name: Re-export env vars so that defaults take effect uses: ./shared-actions/telemetry-impls/load-base-env-vars diff --git a/telemetry-impls/stash-job-attributes/action.yml b/telemetry-impls/stash-job-attributes/action.yml new file mode 100644 index 00000000..ac966326 --- /dev/null +++ b/telemetry-impls/stash-job-attributes/action.yml @@ -0,0 +1,50 @@ + +name: 
stash-job-attributes +description: | + Obtains GitHub Actions job list and matches current job using runner name and attempt number. + Saves and uploads a file with telemetry attributes that should be attached to spans from this run. + + We stash only the attributes here because we retrieve the rest of the timing + info later. We get info for all jobs at once, so we wait to retrieve that info + at the very end of the top-level job. +inputs: + extra_attributes: + description: "comma-separated key=value attributes to associate with the current job" + +runs: + using: 'composite' + steps: + - uses: ./shared-actions/telemetry-impls/github-actions-job-info + id: github-job-info + - shell: bash + id: get-job-id + run: + echo JOB_ID="$(cat job_info.json | jq -r '.id')" >> ${GITHUB_ENV}; + + - name: Add attribute metadata beyond the stashed basic stuff + shell: bash + run: + attributes="${OTEL_RESOURCE_ATTRIBUTES}"; + labels="$(jq -r '.labels | join(" ")' job_info.json)"; + if [ "${labels}" != "" ]; then + attributes="${attributes},rapids.labels=${labels}"; + fi; + if [ "${{ inputs.extra_attributes }}" != "" ]; then + attributes="${attributes},${{ inputs.extra_attributes }}"; + fi; + attributes=$(echo "${attributes}" | sed 's/^,//'); + attributes=$(echo "${attributes}" | sed 's/,$//'); + attributes=$(echo "${attributes}" | sed -r "s/(git.job_url=[^,]+)/\1\/job\/${JOB_ID}/"); + echo OTEL_RESOURCE_ATTRIBUTES="${attributes}" >> ${GITHUB_ENV}; + + - name: Write attributes to file, one per line + shell: bash + run: + IFS=, read -ra values <<< "$OTEL_RESOURCE_ATTRIBUTES"; + printf "%s\n" "${values[@]}" > attrs-${JOB_ID}; + + - name: Upload attr file + uses: actions/upload-artifact@v4 + with: + name: telemetry-tools-attrs-${{ env.JOB_ID }} + path: attrs-${{ env.JOB_ID }} diff --git a/telemetry-impls/summarize/action.yml b/telemetry-impls/summarize/action.yml index 16ba17d6..10da5693 100644 --- a/telemetry-impls/summarize/action.yml +++ b/telemetry-impls/summarize/action.yml @@ 
-1,149 +1,37 @@ name: 'Telemetry summarize' description: | Consumes job info, parses into spans, and pushes spans. -inputs: - cert_concat: - description: Concatenation of certs (CA;Client;ClientKey) - extra_attributes: - description: | - Additional attributes to add to OTEL_RESOURCE_ATTRIBUTES. - See https://opentelemetry.io/docs/languages/sdk-configuration/general/#otel_resource_attributes - - Do not include any leading or trailing join characters (,) runs: using: 'composite' steps: - - uses: ./shared-actions/telemetry-impls/ensure-otel-cli-available + - name: Setup python + uses: actions/setup-python@v5 + with: + python-version: '3.11' # quoted on purpose: a bare 3.11 is a YAML float, and the same pattern would turn 3.10 into 3.1 + cache: 'pip' + cache-dependency-path: './shared-actions/telemetry-impls/summarize/requirements.txt' + - name: Install dependencies + run: pip install -r './shared-actions/telemetry-impls/summarize/requirements.txt' + shell: bash # Writes JSON file that jobs below consume - uses: ./shared-actions/telemetry-impls/github-actions-job-info id: github-job-info - - - name: Add attribute metadata beyond the stashed basic stuff - shell: bash - run: - attributes="${OTEL_RESOURCE_ATTRIBUTES}"; - labels="$(jq -r '.labels | join(" ")' job_info.json)"; - if [ "${labels}" != "" ]; then - attributes="${attributes},rapids.labels=${labels}"; - fi; - if [ "${{ inputs.extra_attributes }}" != "" ]; then - attributes="${attributes},${{ inputs.extra_attributes }}"; - fi; - echo OTEL_RESOURCE_ATTRIBUTES="$(echo "${attributes}" | sed 's/^,//')" >> ${GITHUB_ENV}; - - - uses: ./shared-actions/telemetry-impls/set-otel-service-name - if: env.OTEL_SERVICE_NAME == '' - - # OpenTelemetry implementations look for these variable names to be set to the paths - # of the files with the mTLS certs and keys. We base64 encode them when storing them - # to avoid issues with quoting and multiline text. - # If these env vars are not set, then otel-cli will not attempt to use mTLS. 
- - name: Write certificate files for mTLS - if: "${{ inputs.cert_concat }} != ''" + with: + all_jobs: true + - name: Upload job JSON file if debugging + uses: actions/upload-artifact@v4 + if: runner.debug == '1' + with: + name: telemetry-tools-all_jobs.json + path: all_jobs.json + # This downloads ALL of the files that we have collected from each job. + - uses: actions/download-artifact@v4 + + - name: Run parse and send trace/spans to endpoint shell: bash run: | - mkdir -p /tmp/certs - IFS='; ' read -r -a CERT_PARTS <<< "${{ inputs.cert_concat }}" - - OTEL_EXPORTER_OTLP_CERTIFICATE=/tmp/certs/ca.crt.pem - echo "${CERT_PARTS[0]}" | base64 --decode > ${OTEL_EXPORTER_OTLP_CERTIFICATE} - echo OTEL_EXPORTER_OTLP_CERTIFICATE=${OTEL_EXPORTER_OTLP_CERTIFICATE} >> ${GITHUB_ENV} - - OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE=/tmp/certs/client.crt.pem - echo "${CERT_PARTS[1]}" | base64 --decode > ${OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE} - echo OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE=${OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE} >> ${GITHUB_ENV} - - OTEL_EXPORTER_OTLP_CLIENT_KEY=/tmp/certs/client.key.pem - echo "${CERT_PARTS[2]}" | base64 --decode > ${OTEL_EXPORTER_OTLP_CLIENT_KEY} - echo OTEL_EXPORTER_OTLP_CLIENT_KEY=${OTEL_EXPORTER_OTLP_CLIENT_KEY} >> ${GITHUB_ENV} - - - uses: ./shared-actions/telemetry-impls/sanity-checks - - - shell: bash - run: | - set -e - - TOP_LEVEL_TRACEPARENT=${TRACEPARENT} - - job_traceparent="$(rapids-get-telemetry-traceparent "${OTEL_SERVICE_NAME}")" - export TRACEPARENT=${job_traceparent} - - # The reporting of the completion time is earlier than the actual last step's completion. - # We compensate for that by picking up the last known completion time of any step. 
- last_timestamp=$(date +'%s') - - # Base64 encoding is to avoid issues with spaces/newlines/whatever funny business - for row in $(jq -r '.steps[] | @base64' job_info.json); do - name="$(echo ${row} | base64 --decode | jq -r ".name")" - conclusion="$(echo ${row} | base64 --decode | jq -r ".conclusion")" - - step_traceparent="$(rapids-get-telemetry-traceparent "${OTEL_SERVICE_NAME}" "$name")"; - - timestamp_as_date=$last_timestamp - case $timestamp_as_date in - ''|*[!0-9]*) echo "Date is not an integer" ;; - *) timestamp_as_date="$(date -d @${last_timestamp} --rfc-3339=ns | sed "s/ /T/g" | sed "s/+00:00/Z/g")" ;; - esac - - otel-cli span create \ - --name="$name" \ - --force-trace-id="$(cut -d'-' -f2 <<<"$job_traceparent")" \ - --force-span-id="$(cut -d'-' -f3 <<<"$step_traceparent")" \ - --start="$(echo ${row} | base64 --decode | jq -r ".started_at // \"${timestamp_as_date}\"")" \ - --end="$(echo ${row} | base64 --decode | jq -r ".completed_at // \"${timestamp_as_date}\"")" \ - --verbose --fail - - # Compare timestamps; keep the latest one - step_end_timestamp="$(echo ${row} | base64 --decode | jq -r ".completed_at")"; - if [ "$step_end_timestamp" != "null" ]; then - step_end_timestamp=$(date -d "$step_end_timestamp" +'%s'); - if [ ${step_end_timestamp} -ge ${last_timestamp} ]; then - last_timestamp=${step_end_timestamp}; - fi - fi - done - - echo "Final timestamp is ${last_timestamp}" - case $last_timestamp in - ''|*[!0-9]*) echo "Date is not an integer" ;; - *) last_timestamp="$(date -d @${last_timestamp} --rfc-3339=ns | sed "s/ /T/g" | sed "s/+00:00/Z/g")" ;; - esac - - if [ "$status_description" != "" ] && [ "$status_description" != "null" ]; then - status_description="--status-description ${status_description}" - else - status_description= - fi - - # unset this so that the parent does not automatically get picked up - export TRACEPARENT= - if [ "${TOP_LEVEL_TRACEPARENT}" = "${job_traceparent}" ]; then - otel-cli span create \ - --name "workflow root" \ - 
--force-trace-id "$(cut -d'-' -f2 <<<"$job_traceparent")" \ - --force-span-id "$(cut -d'-' -f3 <<<"$job_traceparent")" \ - --force-parent-span-id "" \ - --start "${START_TIME}" \ - --end "${last_timestamp}" \ - --verbose --fail \ - $status_description; - else - otel-cli span create \ - --name "Start delay time" \ - --force-trace-id "$(cut -d'-' -f2 <<<"$job_traceparent")" \ - --force-parent-span-id "$(cut -d'-' -f3 <<<"$job_traceparent")" \ - --start "$(jq -r '.created_at // "now"' job_info.json)" \ - --end "$(jq -r '.started_at // "now"' job_info.json)" \ - --verbose --fail + timeout 5m python3 ./shared-actions/telemetry-impls/summarize/send_trace.py - otel-cli span create \ - --name "child workflow root" \ - --force-trace-id "$(cut -d'-' -f2 <<<"$job_traceparent")" \ - --force-span-id "$(cut -d'-' -f3 <<<"$job_traceparent")" \ - --force-parent-span-id "$(cut -d'-' -f3 <<<"$TOP_LEVEL_TRACEPARENT")" \ - --start "$(jq -r '.created_at // "now"' job_info.json)" \ - --end "${last_timestamp}" \ - --verbose --fail \ - $status_description; - fi + - name: Clean up attributes artifacts from all jobs + uses: ./shared-actions/telemetry-impls/clean-up-artifacts diff --git a/telemetry-impls/summarize/bump_time.py b/telemetry-impls/summarize/bump_time.py new file mode 100644 index 00000000..446b8e62 --- /dev/null +++ b/telemetry-impls/summarize/bump_time.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python + +# This script is meant to act on an 'all_jobs.json' file that comes from +# the summarize job when debug info is enabled. 
Bumping the time makes +# it easier to re-run the span-sending python script and check results +# in either Jaeger or Grafana + +import json +import datetime + +with open('all_jobs.json') as f: + jobs = json.load(f) + +parse_time = lambda x: int(datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%SZ").timestamp() * 1e9) + +start_time = parse_time(jobs[0]['created_at']) +needed_time = parse_time(jobs[-3]['completed_at']) - parse_time(jobs[0]['created_at']) +new_start_time = datetime.datetime.utcnow() - datetime.timedelta(minutes=60) + +for idx, job in enumerate(jobs): + if job['created_at']: + job['created_at'] = (new_start_time + datetime.timedelta(seconds=(parse_time(job['created_at']) - start_time)/1e9)).strftime("%Y-%m-%dT%H:%M:%SZ") + if job['started_at']: + job['started_at'] = (new_start_time + datetime.timedelta(seconds=(parse_time(job['started_at']) - start_time)/1e9)).strftime("%Y-%m-%dT%H:%M:%SZ") + if job['completed_at']: + job['completed_at'] = (new_start_time + datetime.timedelta(seconds=(parse_time(job['completed_at']) - start_time)/1e9)).strftime("%Y-%m-%dT%H:%M:%SZ") + steps = [] + for step in job['steps']: + if step['started_at']: + step['started_at'] = (new_start_time + datetime.timedelta(seconds=(parse_time(step['started_at']) - start_time)/1e9)).strftime("%Y-%m-%dT%H:%M:%SZ") + if step['completed_at']: + step['completed_at'] = (new_start_time + datetime.timedelta(seconds=(parse_time(step['completed_at']) - start_time)/1e9)).strftime("%Y-%m-%dT%H:%M:%SZ") + steps.append(step) + job['steps'] = steps + + jobs[idx] = job + + +with open("all_jobs.json", "w") as f: + json.dump(jobs, f) diff --git a/telemetry-impls/summarize/requirements.txt b/telemetry-impls/summarize/requirements.txt new file mode 100644 index 00000000..214625f4 --- /dev/null +++ b/telemetry-impls/summarize/requirements.txt @@ -0,0 +1,4 @@ +opentelemetry-api==1.29.* +opentelemetry-sdk==1.29.* +opentelemetry-exporter-otlp-proto-http==1.29.* +googleapis-common-protos<2.0.dev0,>=1.56.2 diff 
--git a/telemetry-impls/summarize/send_trace.py b/telemetry-impls/summarize/send_trace.py new file mode 100644 index 00000000..1137ef00 --- /dev/null +++ b/telemetry-impls/summarize/send_trace.py @@ -0,0 +1,292 @@ +# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Processes a GitHub Actions workflow log record and outputs OpenTelemetry span data.""" + + +from __future__ import annotations +from datetime import datetime, timezone +import hashlib +import json +import logging +import os +from pathlib import Path +from typing import Optional, Dict + +from opentelemetry import trace +from opentelemetry.context import attach, detach +from opentelemetry.propagate import extract +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.resources import Resource +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.trace.status import StatusCode +from opentelemetry.sdk.trace.id_generator import IdGenerator + +match os.getenv("OTEL_EXPORTER_OTLP_PROTOCOL"): + case "http/protobuf": + from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter + case "grpc": + from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter + case _: + from opentelemetry.sdk.trace.export import ConsoleSpanExporter as OTLPSpanExporter + + +import logging +logging.basicConfig(level=logging.WARNING) + +SpanProcessor = BatchSpanProcessor + + +def 
parse_attribute_file(filename: str) -> Dict[str, str]: + attributes = {} + with open(filename, "r") as attribute_file: + for line in (entry for entry in attribute_file if '=' in entry): # skip blank/malformed lines (no '=') instead of raising ValueError on unpack + key, value = line.strip().split('=', 1) + attributes[key] = value + return attributes + + +def date_str_to_epoch(date_str: str, value_if_not_set: Optional[int] = 0) -> int: + if date_str: + # replace bit is to attach the UTC timezone to our datetime object, so + # that it doesn't "help" us by adjusting our string value, which is + # already in UTC + timestamp_ns = int(datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc).timestamp() * 1e9) + else: + timestamp_ns = value_if_not_set or 0 + return timestamp_ns + + +def map_conclusion_to_status_code(conclusion: str) -> StatusCode: + if conclusion == "success": + return StatusCode.OK + elif conclusion == "failure": + return StatusCode.ERROR + else: + return StatusCode.UNSET + +def load_env_vars(): + env_vars = {} + with open('telemetry-tools-env-vars/telemetry-env-vars') as f: + for line in (entry for entry in f if '=' in entry): # tolerate blank lines in the stashed env-var file + k, v = line.split("=", 1) + env_vars[k] = v.strip().strip('"') + return env_vars + +class LoadTraceParentGenerator(IdGenerator): + def __init__(self, traceparent) -> None: + # purpose of this is to keep the trace ID constant if the same data is sent different times, + # which mainly happens during testing. Having the trace ID be something that we control + # will also be useful for tying together logs and metrics with our traces. + ctx = extract( + carrier={'traceparent': traceparent}, + ) + self.context = list(ctx.values())[0].get_span_context() + + + def generate_span_id(self) -> int: + """Get a new span ID. + + Returns: + A 64-bit int for use as a span ID + """ + return self.context.span_id + + def generate_trace_id(self) -> int: + """Get a new trace ID. + + Implementations should at least make the 64 least significant bits + uniformly random. 
Samplers like the `TraceIdRatioBased` sampler rely on + this randomness to make sampling decisions. + + See `the specification on TraceIdRatioBased `_. + + Returns: + A 128-bit int for use as a trace ID + """ + return self.context.trace_id + +class RapidsSpanIdGenerator(IdGenerator): + def __init__(self, trace_id, job_name) -> None: + self.trace_id = trace_id + self.job_name = job_name + self.step_name = None + + def update_step_name(self, step_name): + self.step_name = step_name + + def generate_span_id(self) -> int: + """Get a new span ID. + + Returns: + A 64-bit int for use as a span ID + """ + span_id = hashlib.sha256() + span_id.update(str(self.trace_id).encode()) + span_id.update(bytes(self.job_name.encode())) + if self.step_name: + span_id.update(bytes(self.step_name.encode())) + return int(span_id.hexdigest()[:16], 16) + + def generate_trace_id(self) -> int: + """Get a new trace ID. + + Implementations should at least make the 64 least significant bits + uniformly random. Samplers like the `TraceIdRatioBased` sampler rely on + this randomness to make sampling decisions. + + See `the specification on TraceIdRatioBased `_. + + Returns: + A 128-bit int for use as a trace ID + """ + return self.trace_id + + +class GithubActionsParserGenerator(IdGenerator): + def __init__(self, traceparent) -> None: + # purpose of this is to keep the trace ID constant if the same data is sent different times, + # which mainly happens during testing. Having the trace ID be something that we control + # will also be useful for tying together logs and metrics with our traces. + ctx = extract( + carrier={'traceparent': traceparent}, + ) + self.context = list(ctx.values())[0].get_span_context() + + def update_span_job_name(self, new_name): + self.job_name = new_name + + def update_span_step_name(self, new_name): + self.step_name = new_name + + + def generate_span_id(self) -> int: + """Get a new span ID. 
+ + Returns: + A 64-bit int for use as a span ID + """ + return self.context.span_id + + def generate_trace_id(self) -> int: + """Get a new trace ID. + + Implementations should at least make the 64 least significant bits + uniformly random. Samplers like the `TraceIdRatioBased` sampler rely on + this randomness to make sampling decisions. + + See `the specification on TraceIdRatioBased `_. + + Returns: + A 128-bit int for use as a trace ID + """ + return self.context.trace_id + + +def main(args): + with open("all_jobs.json") as f: + jobs = json.loads(f.read()) + + env_vars = load_env_vars() + + first_timestamp = date_str_to_epoch(jobs[0]["created_at"]) + # track the latest timestamp observed and use it for any unavailable times. + last_timestamp = date_str_to_epoch(jobs[0]["completed_at"]) + + attribute_files = list(Path.cwd().glob(f"telemetry-tools-attrs-*/*")) + if attribute_files: + attribute_file = attribute_files[0] + attributes = parse_attribute_file(attribute_file.as_posix()) + else: + attributes = {} + global_attrs = {} + for k, v in attributes.items(): + if k.startswith('git.'): + global_attrs[k] = v + + global_attrs['service.name'] = env_vars['OTEL_SERVICE_NAME'] + + provider = TracerProvider(resource=Resource(global_attrs), id_generator=LoadTraceParentGenerator(env_vars["TRACEPARENT"])) + provider.add_span_processor(span_processor=SpanProcessor(OTLPSpanExporter())) + tracer = trace.get_tracer("GitHub Actions parser", "0.0.1", tracer_provider=provider) + + with tracer.start_as_current_span("workflow root", start_time=first_timestamp, end_on_exit=False) as root_span: + for job in jobs: + job_name = job["name"] + job_id = job["id"] + logging.info(f"Processing job '{job_name}'") + job_create = date_str_to_epoch(job["created_at"], first_timestamp) + job_start = date_str_to_epoch(job["started_at"], first_timestamp) + # this may get later in time as we progress through the steps. 
It is + # common to have the job completion time be earlier than the end of + # the final cleanup steps + job_last_timestamp = date_str_to_epoch(job["completed_at"], job_start) + + if job_start == 0: + logging.info(f"Job is empty (no start time) - bypassing") + continue + + attribute_file = Path.cwd() / f"telemetry-tools-attrs-{job_id}/attrs-{job_id}" + attributes = {} + if attribute_file.exists(): + logging.debug(f"Found attribute file for job '{job_id}'") + attributes = parse_attribute_file(attribute_file.as_posix()) + else: + logging.debug(f"No attribute metadata found for job '{job_id}'") + + attributes["service.name"] = job_name + + job_id_generator = RapidsSpanIdGenerator(trace_id=root_span.get_span_context().trace_id, job_name=job_name) + + job_provider = TracerProvider(resource=Resource(attributes=attributes), id_generator=job_id_generator) + job_provider.add_span_processor(span_processor=SpanProcessor(OTLPSpanExporter())) + job_tracer = trace.get_tracer("GitHub Actions parser", "0.0.1", tracer_provider=job_provider) + + with job_tracer.start_as_current_span(job['name'], start_time=job_create, end_on_exit=False) as job_span: + job_span.set_status(map_conclusion_to_status_code(job["conclusion"])) + + job_id_generator.update_step_name('start delay time') + with job_tracer.start_as_current_span( + name="start delay time", + start_time=job_create, + end_on_exit=False, + ) as delay_span: + delay_span.end(job_start) + + for step in job["steps"]: + start = date_str_to_epoch(step["started_at"], job_last_timestamp) + end = date_str_to_epoch(step["completed_at"], start) + job_id_generator.update_step_name(step['name']) + + if (end - start) / 1e9 > 1: + logging.info(f"processing step: '{step['name']}'") + with job_tracer.start_as_current_span( + name=step['name'], + start_time=start, + end_on_exit=False, + ) as step_span: + step_span.set_status(map_conclusion_to_status_code(step["conclusion"])) + step_span.end(end) + + job_last_timestamp = max(end, job_last_timestamp) 
+ + job_end = max(date_str_to_epoch(job["completed_at"], job_last_timestamp), job_last_timestamp) + last_timestamp = max(job_end, last_timestamp) + job_span.end(job_end) + root_span.end(last_timestamp) + + +if __name__ == "__main__": + import sys + + main(sys.argv[1:]) diff --git a/telemetry-impls/traceparent.sh b/telemetry-impls/traceparent.sh new file mode 100755 index 00000000..7bba8e47 --- /dev/null +++ b/telemetry-impls/traceparent.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# This emits a TRACEPARENT, which follows the w3c trace context standard. +# https://www.w3.org/TR/trace-context/ +# +# This script can operate for two purposes: +# 1. The top level of a job, whether it is the job at the source repo (e.g. rmm) level, or +# the matrix job level +# 2. The steps level within a job, which uses both the job name and the step name +# +# The job name must always be provided as the first argument. +# A step name MAY be provided as the second argument. If it is specified, the output corresponds to +# the step within the context of its job. +# +# This is a shell script instead of an action because we need to call it +# in loops that iterate through job metadata. + +JOB_NAME=$1 +STEP_NAME=${2:-} + +if [ "$JOB_NAME" = "" ]; then + echo "ERROR: JOB_NAME (first parameter) is empty. This means your trace doesn't identify anything." 
+ exit 1 +fi + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) + +sha="$(echo "${GITHUB_REPOSITORY}+${GITHUB_RUN_ID}+${GITHUB_RUN_ATTEMPT}" | sha256sum | cut -f1 -d' ')" +TRACE_ID="${sha:0:32}" +JOB_SPAN_ID="${TRACE_ID}-${JOB_NAME}" +STEP_SPAN_ID="${JOB_SPAN_ID}-${STEP_NAME}" + +# echo "JOB_SPAN_ID pre-hash: \"$JOB_SPAN_ID\"" 1>&2 +# echo "STEP_SPAN_ID pre-hash: \"$STEP_SPAN_ID\"" 1>&2 + +JOB_TRACEPARENT=$(echo -n "${JOB_SPAN_ID}" | sha256sum | cut -f1 -d' ') +STEP_TRACEPARENT=$(echo -n "${STEP_SPAN_ID}" | sha256sum | cut -f1 -d' ') + +if [ "${STEP_NAME}" != "" ]; then + echo "00-${TRACE_ID}-${STEP_TRACEPARENT:0:16}-01" +else + echo "00-${TRACE_ID}-${JOB_TRACEPARENT:0:16}-01" +fi diff --git a/telemetry-impls/traceparent/action.yml b/telemetry-impls/traceparent/action.yml deleted file mode 100644 index 2f597364..00000000 --- a/telemetry-impls/traceparent/action.yml +++ /dev/null @@ -1,43 +0,0 @@ -name: traceparent -description: | - Outputs a trace identifier computed from the GitHub runtime environment - - Trace identifier background: - https://www.w3.org/TR/trace-context/ - - Note that the TRACEPARENT env var is special to otel-cli. It will be picked up - automatically and used as a parent to whatever span otel-cli is created. 
-outputs: - traceparent: - description: The W3C-format traceparent, which identifies the current context - value: ${{ steps.shell.outputs.TRACEPARENT }} - -runs: - using: 'composite' - steps: - - uses: ./shared-actions/telemetry-impls/ensure-otel-cli-available - # this ensures that OTEL_SERVICE_NAME is either set before this action, or sets it from the github job metadata - - uses: ./shared-actions/telemetry-impls/set-otel-service-name - if: env.OTEL_SERVICE_NAME == '' - - uses: ./shared-actions/telemetry-impls/sanity-checks - - - shell: bash - id: output-inputs - if: runner.debug == '1' - run: | - echo "::debug::trace ID input: '${GITHUB_REPOSITORY}+${GITHUB_RUN_ID}+${GITHUB_RUN_ATTEMPT}'" - echo "::debug::$GITHUB_REPOSITORY="${GITHUB_REPOSITORY}"" - echo "::debug::$GITHUB_RUN_ID="${GITHUB_RUN_ID}"" - echo "::debug::$GITHUB_RUN_ATTEMPT="${GITHUB_RUN_ATTEMPT}"" - echo "::debug::Evaluated trace ID input (pre-hash): "${GITHUB_REPOSITORY}+${GITHUB_RUN_ID}+${GITHUB_RUN_ATTEMPT}"" - export TRACE_ID="$(rapids-get-telemetry-trace-id)" - echo "::debug::Computed trace ID: ${TRACE_ID}" - - echo "::debug::JOB TRACEPARENT input (step empty): '00-\${TRACE_ID}-hash(\${TRACE_ID}-\$\{\OTEL_SERVICE_NAME}\})-01'" - echo "::debug::evaluated job traceparent input: "00-${TRACE_ID}-hash\(${TRACE_ID}-${OTEL_SERVICE_NAME}\)-01"" - - - shell: bash - id: shell - run: | - echo "TRACEPARENT=$(rapids-get-telemetry-traceparent "${OTEL_SERVICE_NAME}")" >> ${GITHUB_OUTPUT} - echo "TRACEPARENT=$(rapids-get-telemetry-traceparent "${OTEL_SERVICE_NAME}")" >> ${GITHUB_ENV}