diff --git a/.github/workflows/sycl_linux_build_and_test.yml b/.github/workflows/sycl_linux_build_and_test.yml
index bac87cc5eea5e..9785a561f41e5 100644
--- a/.github/workflows/sycl_linux_build_and_test.yml
+++ b/.github/workflows/sycl_linux_build_and_test.yml
@@ -40,6 +40,10 @@ on:
         type: string
         required: false
         default: ""
+      lts_aws_matrix:
+        type: string
+        required: false
+        default: ""
       lts_cmake_extra_args:
         type: string
         required: false
@@ -155,9 +159,34 @@ jobs:
         name: sycl_lit_${{ inputs.build_artifact_suffix }}
         path: lit.tar.xz
 
-  llvm_test_suite:
+  # Start the AWS EC2 self-hosted runners described by lts_aws_matrix.
+  aws-start:
+    name: Start AWS
     needs: build
-    if: ${{ inputs.lts_matrix != '' }}
+    if: ${{ inputs.lts_aws_matrix != '' }}
+    runs-on: ubuntu-latest
+    environment: aws
+    steps:
+      - name: Setup script
+        run: |
+          mkdir -p ./aws-ec2
+          wget https://raw.githubusercontent.com/intel/llvm/sycl/devops/actions/aws-ec2/action.yml -P ./aws-ec2
+          wget https://raw.githubusercontent.com/intel/llvm/sycl/devops/actions/aws-ec2/aws-ec2.js -P ./aws-ec2
+          wget https://raw.githubusercontent.com/intel/llvm/sycl/devops/actions/aws-ec2/package.json -P ./aws-ec2
+          npm install ./aws-ec2
+      - name: Start AWS EC2 runners
+        uses: ./aws-ec2
+        with:
+          runs-on-list: ${{ inputs.lts_aws_matrix }}
+          GH_PERSONAL_ACCESS_TOKEN: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          AWS_ACCESS_KEY: ${{ secrets.AWS_ACCESS_KEY }}
+          AWS_SECRET_KEY: ${{ secrets.AWS_SECRET_KEY }}
+
+  llvm_test_suite:
+    needs: [build, aws-start]
+    # !failure() lets this job run when aws-start was skipped (no AWS configs
+    # requested) while still blocking it when build or aws-start failed.
+    if: ${{ !failure() && inputs.lts_matrix != '' }}
     strategy:
       fail-fast: false
       max-parallel: ${{ inputs.max_parallel }}
@@ -203,3 +232,28 @@ jobs:
         check_sycl_all: ${{ matrix.check_sycl_all }}
         results_name_suffix: ${{ matrix.config }}_${{ inputs.build_artifact_suffix }}
         cmake_args: '${{ matrix.cmake_args }} ${{ inputs.lts_cmake_extra_args }}'
+
+  aws-stop:
+    name: Stop AWS
+    needs: [ aws-start, llvm_test_suite ]
+    # always() so the AWS runners are torn down even when the jobs above fail;
+    # the input name must match the one that gates aws-start.
+    if: ${{ always() && inputs.lts_aws_matrix != '' }}
+    runs-on: ubuntu-latest
+    environment: aws
+    steps:
+      - name: Setup script
+        run: |
+          mkdir -p ./aws-ec2
+          wget https://raw.githubusercontent.com/intel/llvm/sycl/devops/actions/aws-ec2/action.yml -P ./aws-ec2
+          wget https://raw.githubusercontent.com/intel/llvm/sycl/devops/actions/aws-ec2/aws-ec2.js -P ./aws-ec2
+          wget https://raw.githubusercontent.com/intel/llvm/sycl/devops/actions/aws-ec2/package.json -P ./aws-ec2
+          npm install ./aws-ec2
+      - name: Stop AWS EC2 runners
+        uses: ./aws-ec2
+        with:
+          runs-on-list: ${{ inputs.lts_aws_matrix }}
+          mode: stop
+          GH_PERSONAL_ACCESS_TOKEN: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          AWS_ACCESS_KEY: ${{ secrets.AWS_ACCESS_KEY }}
+          AWS_SECRET_KEY: ${{ secrets.AWS_SECRET_KEY }}
diff --git a/.github/workflows/sycl_nightly.yml b/.github/workflows/sycl_nightly.yml
index 61f8fa531b032..9299ac11ad490 100644
--- a/.github/workflows/sycl_nightly.yml
+++ b/.github/workflows/sycl_nightly.yml
@@ -20,6 +20,7 @@ jobs:
     if: github.repository == 'intel/llvm'
     uses: ./.github/workflows/sycl_linux_build_and_test.yml
     needs: resolve_matrix
+    secrets: inherit
     with:
       build_cache_root: "/__w/"
       build_artifact_suffix: default
@@ -29,6 +30,7 @@ jobs:
     if: github.repository == 'intel/llvm'
     uses: ./.github/workflows/sycl_linux_build_and_test.yml
     needs: resolve_matrix
+    secrets: inherit
     with:
       build_cache_root: "/__w/"
       build_cache_suffix: opaque_pointers
diff --git a/.github/workflows/sycl_post_commit.yml b/.github/workflows/sycl_post_commit.yml
index db8bcd1560f29..cef878bb8331d 100644
--- a/.github/workflows/sycl_post_commit.yml
+++ b/.github/workflows/sycl_post_commit.yml
@@ -21,13 +21,16 @@ jobs:
     name: Linux Default
     needs: resolve_matrix
     uses: ./.github/workflows/sycl_linux_build_and_test.yml
+    secrets: inherit
     with:
       build_cache_root: "/__w/llvm"
       build_artifact_suffix: "post_commit"
       lts_matrix: ${{ needs.resolve_matrix.outputs.lts_matrix }}
+      lts_aws_matrix: ${{ needs.resolve_matrix.outputs.lts_aws_matrix }}
   linux_no_assert:
     name: Linux (no assert)
     uses: ./.github/workflows/sycl_linux_build_and_test.yml
+    secrets: inherit
     with:
       build_cache_root: "/__w/llvm"
       build_cache_suffix: gcc_no_assertions
diff --git a/.github/workflows/sycl_precommit.yml b/.github/workflows/sycl_precommit.yml
index 1a30ee7dece42..e9539f9a0d13a 100644
--- a/.github/workflows/sycl_precommit.yml
+++ b/.github/workflows/sycl_precommit.yml
@@ -1,7 +1,9 @@
 name: SYCL
 
 on:
-  pull_request:
+  # NOTE(review): pull_request_target runs with repository secrets in scope;
+  # confirm no job executes untrusted PR code while those secrets are available.
+  pull_request_target:
     branches:
     - sycl
     # Do not run builds if changes are only in the following locations
@@ -25,6 +27,7 @@ jobs:
     steps:
     - uses: actions/checkout@v2
       with:
+        persist-credentials: false
         fetch-depth: 2
     - name: Run clang-format
       uses: ./devops/actions/clang-format
@@ -43,9 +46,11 @@ jobs:
     needs: [lint, resolve_matrix]
     if: always() && (success() || contains(github.event.pull_request.labels.*.name, 'ignore-lint'))
     uses: ./.github/workflows/sycl_linux_build_and_test.yml
+    secrets: inherit
     with:
       build_cache_root: "/__w/"
       build_cache_size: "8G"
       build_artifact_suffix: "default"
       build_cache_suffix: "default"
       lts_matrix: ${{ needs.resolve_matrix.outputs.lts_matrix }}
+      lts_aws_matrix: ${{ needs.resolve_matrix.outputs.lts_aws_matrix }}
diff --git a/.github/workflows/sycl_resolve_test_matrix.yml b/.github/workflows/sycl_resolve_test_matrix.yml
index 6d20fe9dedbf7..8feeee04860b3 100644
--- a/.github/workflows/sycl_resolve_test_matrix.yml
+++ b/.github/workflows/sycl_resolve_test_matrix.yml
@@ -19,10 +19,18 @@ on:
         type: string
         required: true
         default: ""
+      uniq:
+        description: Unique string to name dynamic runners in AWS
+        type: string
+        required: false
+        default: ${{ github.run_id }}-${{ github.run_attempt }}
     outputs:
       lts_matrix:
         description: "Generated Matrix"
         value: ${{ jobs.resolve_matrix.outputs.lts_matrix }}
+      lts_aws_matrix:
+        description: "Generated Matrix AWS subset"
+        value: ${{ jobs.resolve_matrix.outputs.lts_aws_matrix }}
 jobs:
   resolve_matrix:
     name: Resolve Test Matrix
diff --git a/devops/actions/aws-ec2/action.yml b/devops/actions/aws-ec2/action.yml
new file mode 100644
index 0000000000000..9e2730fdc4107
--- /dev/null
+++ b/devops/actions/aws-ec2/action.yml
@@ -0,0 +1,66 @@
+name: aws-ec2
+description: Start AWS EC2 instances with Github actions runner agent in it
+
+inputs:
+  runs-on-list:
+    description: "JSON string with array of objects with aws-type, runs-on, aws-ami, aws-spot, aws-disk, aws-timebomb, one-job properties"
+    required: true
+    # aws-type: AWS EC2 instance type. This property must be present if you want to trigger AWS EC2 instance start/stop.
+    # runs-on: Name of the unique label assigned to the runner used as 'runs-on' property for the following jobs. Mandatory presence required.
+    # aws-ami: AWS AMI id. Makes sense only for start mode. Default "ami-0966bccbb521ccb24".
+
+    # ami-0966bccbb521ccb24: Ubuntu 22.04 (ami-02f3416038bdb17fb with /dev/sda1 disk) with docker installed and gh_runner (1001) like this:
+    #   sudo -s
+    #   apt-get update
+    #   curl -fsSL https://get.docker.com -o /tmp/get-docker.sh
+    #   sh /tmp/get-docker.sh
+    #   groupadd -g 1001 gh_runner; useradd gh_runner -u 1001 -g 1001 -m -s /bin/bash; usermod -aG docker gh_runner; usermod -aG video gh_runner
+    #   sync; shutdown -h now
+
+    # ami-02ec0f344128253f9: Amazon Linux 2 AMI with NVIDIA TESLA GPU Driver (ami-06bf0a3f89fe08f0a with /dev/xvda disk) with docker installed and gh_runner (1001) like this:
+    #   sudo -s
+    #   yum update -y
+    #   amazon-linux-extras install docker
+    #   sudo systemctl --now enable docker
+    #   distribution=$(. /etc/os-release;echo $ID$VERSION_ID) && curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.repo | sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo
+    #   yum-config-manager --disable amzn2-graphics; yum clean expire-cache; yum install -y nvidia-docker2; systemctl restart docker
+    #   groupadd -g 1001 gh_runner; useradd gh_runner -u 1001 -g 1001 -m -s /bin/bash; usermod -aG docker gh_runner; usermod -aG video gh_runner
+    #   sync; shutdown -h now
+
+    # ami-0ccda708841dde988: Amazon Linux 2 AMI with AMD Radeon Pro Driver (ami-0bb1072e787242eb6 with /dev/xvda disk) with docker installed and gh_runner (1001) like this:
+    #   sudo -s
+    #   amazon-linux-extras install docker
+    #   sudo systemctl --now enable docker
+    #   groupadd -g 1001 gh_runner; useradd gh_runner -u 1001 -g 1001 -m -s /bin/bash; usermod -aG docker gh_runner; usermod -aG video gh_runner
+    #   sync; shutdown -h now
+
+    # aws-spot: Enable usage of spot instances to save money (less reliable). Makes sense only for start mode. Default true.
+    # aws-disk: AWS EC2 instance AMI specific disk device path and size in GB (8 by default). Makes sense only for start mode. Default "/dev/sda1:16".
+    # aws-timebomb: AWS EC2 instance maximum live time. Makes sense only for start mode. Default "1h".
+    # one-job: Will terminate AWS EC2 instance after one job (not waiting for stop job) saving money. Makes sense only for start mode. Default true.
+
+  mode:
+    description: "Mode of operation: start or stop"
+    required: false
+    default: start
+
+  GH_PERSONAL_ACCESS_TOKEN:
+    description: "Github personal access token with repo permission"
+    required: true
+
+  AWS_ACCESS_KEY:
+    description: "AWS access id"
+    required: true
+
+  AWS_SECRET_KEY:
+    description: "AWS access secret key"
+    required: true
+
+  aws-region:
+    description: "AWS EC2 region"
+    required: false
+    default: "us-east-2" # Ohio
+
+runs:
+  using: node20
+  main: ./aws-ec2.js
diff --git a/devops/actions/aws-ec2/aws-ec2.js b/devops/actions/aws-ec2/aws-ec2.js
new file mode 100644
index 0000000000000..bf7cedea673de
--- /dev/null
+++ b/devops/actions/aws-ec2/aws-ec2.js
@@ -0,0 +1,254 @@
+const core = require('@actions/core');
+const github = require('@actions/github');
+const AWS = require('aws-sdk');
+
+// shortcut to reference current repo
+const repo = `${github.context.repo.owner}/${github.context.repo.repo}`;
+
+// get github registration token that allows to register new runner based on
+// GH_PERSONAL_ACCESS_TOKEN github user api key
+async function getGithubRegToken() {
+  core.info("Preparing Github SDK API");
+  const octokit = github.getOctokit(core.getInput("GH_PERSONAL_ACCESS_TOKEN"));
+
+  try {
+    core.info(`Getting Github Actions Runner registration token for ${repo} repo`);
+    const response = await octokit.request(`POST /repos/${repo}/actions/runners/registration-token`);
+    core.info("Got Github Actions Runner registration token");
+    return response.data.token;
+  } catch (error) {
+    core.error("Error getting Github Actions Runner registration token");
+    throw error;
+  }
+}
+
+// resolve after the given number of milliseconds
+function delay(ms) {
+  return new Promise(function(resolve) { setTimeout(resolve, ms); });
+}
+
+// run async action() up to `attempts` times with 10 seconds delay between
+// attempts; on_retry(error) is called before every retry and the error of the
+// last attempt is rethrown to the caller
+async function retry(action, attempts, on_retry) {
+  for (let attempt = 1;; attempt++) {
+    try {
+      return await action();
+    } catch (error) {
+      if (attempt >= attempts) throw error;
+      on_retry(error);
+      await delay(10 * 1000);
+    }
+  }
+}
+
+// we better keep GH_PERSONAL_ACCESS_TOKEN here and do not pass it to AWS EC2
+// userscript so it will keep secret
+let reg_token;
+
+// starts AWS EC2 instance that will spawn Github runner for a given label
+async function start(param_type, param_label, param_ami, param_spot, param_disk, param_timebomb, param_onejob) {
+  const ec2 = new AWS.EC2();
+
+  reg_token = reg_token ? reg_token : await getGithubRegToken();
+  const ec2types = typeof param_type === 'string' ? [ param_type ] : param_type;
+  const label = typeof param_label === 'string' ? param_label : param_label[0];
+  const ec2ami = typeof param_ami !== 'undefined' ? param_ami : "ami-0966bccbb521ccb24";
+  const ec2spot = typeof param_spot !== 'undefined' ? param_spot : true;
+  const ec2disk = typeof param_disk !== 'undefined' ? param_disk : "/dev/sda1:16";
+  const timebomb = typeof param_timebomb !== 'undefined' ? param_timebomb : "1h";
+  const onejob = typeof param_onejob !== 'undefined' ? param_onejob : true;
+  // ephemeral runner will exit after one job so we will terminate instance sooner
+  const ephemeral_str = onejob ? "--ephemeral" : "";
+
+  let ec2id; // AWS EC2 instance id
+  // last error that will be thrown in case all our attempts in instance creation fail
+  let last_error;
+  // loop for spot/ondemand instances
+  for (let spot of (ec2spot ? [ 1, 0 ] : [ 0 ])) {
+    const spot_str = spot ? "spot" : "on-demand";
+    for (let ec2type of ec2types) { // iterate for provided instance types
+      const setup_github_actions_runner = [
+        `#!/bin/bash -x`, `mkdir actions-runner`, `cd actions-runner`,
+        // we can not place runner into AMI image since it is updated often and
+        // latest version is required to connect to github
+        `export RUNNER_VERSION=$(curl -s https://api.github.com/repos/actions/runner/releases/latest | sed -n \'s,.*"tag_name": "v\\(.*\\)".*,\\1,p\')`,
+        `curl -O -L https://github.com/actions/runner/releases/download/v$RUNNER_VERSION/actions-runner-linux-x64-$RUNNER_VERSION.tar.gz || shutdown -h now`,
+        `tar xf ./actions-runner-linux-x64-$RUNNER_VERSION.tar.gz || shutdown -h now`,
+        `su gh_runner -c "./config.sh --unattended ${ephemeral_str} --url https://github.com/${repo} --token ${reg_token} --name ${label}_${ec2type}_${spot_str} --labels ${label} --replace || shutdown -h now"`,
+        // timebomb to avoid paying for stale AWS instances
+        `(sleep ${timebomb}; su gh_runner -c "./config.sh remove --token ${reg_token}"; shutdown -h now) &`,
+        `su gh_runner -c "./run.sh"`,
+        `su gh_runner -c "./config.sh remove --token ${reg_token}"`,
+        // in case we launch instance with InstanceInitiatedShutdownBehavior = "terminate" it will terminate instance here as well
+        `shutdown -h now`
+      ];
+      try {
+        let params = {
+          ImageId: ec2ami,
+          InstanceType: ec2type,
+          UserData: Buffer.from(setup_github_actions_runner.join('\n')).toString('base64'),
+          MinCount: 1,
+          MaxCount: 1,
+          InstanceInitiatedShutdownBehavior: "terminate",
+          TagSpecifications: [
+            { ResourceType: "instance", Tags: [ {Key: "Label", Value: label} ] }
+          ]
+        };
+        if (spot) params.InstanceMarketOptions = { MarketType: "spot" };
+        if (ec2disk) {
+          const items = ec2disk.split(':');
+          params.BlockDeviceMappings = [ {DeviceName: items[0], Ebs: {VolumeSize: items[1]}} ];
+        }
+        const result = await ec2.runInstances(params).promise();
+        ec2id = result.Instances[0].InstanceId;
+        core.info(`Created AWS EC2 ${spot_str} instance ${ec2id} of ${ec2type} type with ${label} label`);
+        break;
+      } catch (error) {
+        core.warning(`Error creating AWS EC2 ${spot_str} instance of ${ec2type} type with ${label} label`);
+        last_error = error;
+      }
+    }
+    // we already created instance and do not need to iterate these loops
+    if (ec2id) break;
+  }
+  // fail only when no instance was created at all: an error from an earlier
+  // attempt (e.g. a rejected spot request) must not fail a later successful one
+  if (!ec2id) {
+    core.error(`Error creating AWS EC2 instance with ${label} label`);
+    throw last_error || new Error(`No AWS EC2 instance types provided for ${label} label`);
+  }
+
+  // wait until instance will be found running before continuing (spot instance
+  // can be created but never run and will be in pending state until
+  // termination); the wait is awaited so that a failure reaches the caller and
+  // the request is really re-issued on retry (up to 2 retries)
+  try {
+    await retry(function() {
+      return ec2.waitFor("instanceRunning", {
+        Filters: [ { Name: "tag:Label", Values: [ label ] } ]
+      }).promise();
+    }, 3, function() {
+      core.warning(`Error searching for running AWS EC2 instance ${ec2id} with ${label} label. Will retry.`);
+    });
+    core.info(`Found running AWS EC2 instance ${ec2id} with ${label} label`);
+  } catch (error) {
+    core.error(`Error searching for running AWS EC2 instance ${ec2id} with ${label} label`);
+    throw error;
+  }
+}
+
+// terminate (completely remove) AWS EC instances (normally one instance) with
+// given tag label and also remove all Github actions runners (normally one
+// runner) with that label
+async function stop(param_label) {
+  // last error that will be thrown in case something will break here
+  let last_error;
+  const ec2 = new AWS.EC2();
+
+  const label = typeof param_label === 'string' ? param_label : param_label[0];
+
+  // find AWS EC2 instances with tag label
+  let instances;
+  try {
+    instances = await ec2.describeInstances({
+      Filters: [ { Name: "tag:Label", Values: [ label ] } ]
+    }).promise();
+    core.info(`Searched for AWS EC2 instance with label ${label}`);
+  } catch (error) {
+    core.error(`Error searching for AWS EC2 instance with label ${label}: ${error}`);
+    last_error = error;
+  }
+
+  // remove all found AWS EC2 instances
+  if (instances)
+    for (const reservation of instances.Reservations) {
+      for (const instance of reservation.Instances) {
+        try {
+          await ec2.terminateInstances({ InstanceIds: [ instance.InstanceId ] }).promise();
+          core.info(`Terminated AWS EC2 instance ${instance.InstanceId} with label ${label}`);
+        } catch (error) {
+          core.error(`Error terminating AWS EC2 instance ${instance.InstanceId} with label ${label}: ${error}`);
+          last_error = error;
+        }
+      }
+    }
+
+  // find all Github action runners
+  core.info("Preparing Github SDK API");
+  const octokit = github.getOctokit(core.getInput("GH_PERSONAL_ACCESS_TOKEN"));
+  let runners;
+  try {
+    runners = await octokit.paginate(`GET /repos/${repo}/actions/runners`);
+    core.info(`Searched for Github action runners with label ${label}`);
+  } catch (error) {
+    core.error(`Error searching for Github action runners with label ${label}`);
+    last_error = error;
+  }
+
+  // remove Github action runners with specified label
+  if (runners)
+    for (const runner of runners) {
+      const label_found = runner.labels.some(function(label_obj) {
+        return label_obj.name == label;
+      });
+      if (!label_found) continue;
+      try {
+        // retry deletion up to 5 times (with 10 seconds delay) since Github can
+        // not remove runners still marked as active (with running job); every
+        // attempt is awaited so a final failure is recorded before last_error
+        // is checked below
+        await retry(function() {
+          return octokit.request(`DELETE /repos/${repo}/actions/runners/${runner.id}`);
+        }, 6, function() {
+          core.warning(`Error removing Github self-hosted runner ${runner.id} with ${label}. Will retry.`);
+        });
+        core.info(`Removed Github self-hosted runner ${runner.id} with ${label}`);
+      } catch (error) {
+        core.error(`Error removing Github self-hosted runner ${runner.id} with ${label}: ${error}`);
+        last_error = error;
+      }
+    }
+
+  if (last_error) throw last_error;
+}
+
+(async function() {
+  try {
+    // provide AWS credentials
+    AWS.config.update({
+      accessKeyId: core.getInput("AWS_ACCESS_KEY"),
+      secretAccessKey: core.getInput("AWS_SECRET_KEY"),
+      region: core.getInput("aws-region")
+    });
+    // mode is start or stop
+    const mode = core.getInput("mode");
+    const runs_on_list = core.getInput("runs-on-list") ? JSON.parse(core.getInput("runs-on-list")) : [];
+
+    if (mode == "start") {
+      for (let c of runs_on_list) {
+        const raw_label = c["runs-on"];
+        if (c["aws-type"]) {
+          await start(c["aws-type"], raw_label, c["aws-ami"], c["aws-spot"], c["aws-disk"], c["aws-timebomb"], c["one-job"]);
+        } else core.info(`Skipping ${raw_label} config`);
+      }
+    } else if (mode == "stop") {
+      // last error that will be thrown in case something will break here
+      let last_error;
+      for (let c of runs_on_list) {
+        const raw_label = c["runs-on"];
+        try {
+          if (c["aws-type"]) {
+            await stop(raw_label);
+          } else core.info(`Skipping ${raw_label} config`);
+        } catch (error) {
+          core.error(`Error removing runner with ${raw_label}: ${error}`);
+          last_error = error;
+        }
+      }
+      if (last_error) throw last_error;
+    }
+  } catch (error) {
+    core.error(error);
+    core.setFailed(error.message);
+  }
+})();
diff --git a/devops/actions/aws-ec2/package.json b/devops/actions/aws-ec2/package.json
new file mode 100644
index 0000000000000..8dc4f1087946c
--- /dev/null
+++ b/devops/actions/aws-ec2/package.json
@@ -0,0 +1,9 @@
+{
+  "name": "aws-ec2",
+  "description": "Start AWS EC2 spot instances with Github actions runner agent in it",
+  "dependencies": {
+    "@actions/core": "1.9.0",
+    "@actions/github": "5.0.3",
+    "aws-sdk": "2.1179.0"
+  }
+}
diff --git a/devops/actions/cached_checkout/action.yml b/devops/actions/cached_checkout/action.yml
index c9c4633e9513b..c4ed22704ebcf 100644
--- a/devops/actions/cached_checkout/action.yml
+++ b/devops/actions/cached_checkout/action.yml
@@ -36,6 +36,7 @@ runs:
       GIT_ALTERNATE_OBJECT_DIRECTORIES: ${{ inputs.cache_path }}/${{ inputs.repository }}/.git/objects
     uses: actions/checkout@v2
     with:
+      persist-credentials: false
       repository: ${{ inputs.repository }}
       ref: ${{ inputs.ref }}
       path: ${{ inputs.path }}
diff --git a/devops/scripts/generate_test_matrix.js b/devops/scripts/generate_test_matrix.js
index dd6cdafcb689d..154af1eb49240 100644
--- a/devops/scripts/generate_test_matrix.js
+++ b/devops/scripts/generate_test_matrix.js
@@ -27,6 +27,7 @@ module.exports = ({core, process}) => {
       const ltsConfigs = inputs.lts_config.split(';');
 
       const enabledLTSConfigs = [];
+      const enabledLTSAWSConfigs = [];
 
       testConfigs.lts.forEach(v => {
         if (ltsConfigs.includes(v.config)) {
@@ -44,22 +45,30 @@ module.exports = ({core, process}) => {
             v["env"] = {};
           }
           enabledLTSConfigs.push(v);
+          if (v["aws-type"]) enabledLTSAWSConfigs.push(v);
         }
       });
 
       let ltsString = JSON.stringify(enabledLTSConfigs);
+      let ltsAWSString = JSON.stringify(enabledLTSAWSConfigs);
       console.log(ltsString);
+      console.log(ltsAWSString);
 
       for (let [key, value] of Object.entries(inputs)) {
         ltsString = ltsString.replaceAll("${{ inputs." + key + " }}", value);
+        ltsAWSString = ltsAWSString.replaceAll("${{ inputs." + key + " }}", value);
       }
       if (needsDrivers) {
         ltsString = ltsString.replaceAll(
             "ghcr.io/intel/llvm/ubuntu2004_intel_drivers:latest",
             "ghcr.io/intel/llvm/ubuntu2004_base:latest");
+        ltsAWSString = ltsAWSString.replaceAll(
+            "ghcr.io/intel/llvm/ubuntu2004_intel_drivers:latest",
+            "ghcr.io/intel/llvm/ubuntu2004_base:latest");
       }
 
       core.setOutput('lts_matrix', ltsString);
+      core.setOutput('lts_aws_matrix', ltsAWSString);
     }
   });
 }