diff --git a/.github/workflows/sycl_linux_build_and_test.yml b/.github/workflows/sycl_linux_build_and_test.yml
index 6e933ff87308c..8cedcef3fad8b 100644
--- a/.github/workflows/sycl_linux_build_and_test.yml
+++ b/.github/workflows/sycl_linux_build_and_test.yml
@@ -64,6 +64,11 @@ on:
         type: number
         required: false
         default: 4
+      uniq:
+        description: Unique string to name dynamic runners in AWS
+        type: string
+        required: false
+        default: ${{ github.run_id }}-${{ github.run_attempt }}
 
 jobs:
   build:
@@ -191,8 +196,40 @@ jobs:
             const script = require('./generate_test_matrix.js');
             script({core, process});
 
+  aws-start-matrix:
+    name: Start AWS Matrix
+    needs: [ build, resolve_matrix ]
+    if: ${{ inputs.lts_config != '' }}
+    strategy:
+      fail-fast: false
+      max-parallel: ${{ inputs.max_parallel }}
+      matrix:
+        include: ${{ fromJSON(needs.resolve_matrix.outputs.lts) }}
+    runs-on: ubuntu-latest
+    environment: aws
+    steps:
+      - uses: actions/checkout@v3
+        if: ${{ matrix.aws-type }}
+        with:
+          path: llvm
+      - run: npm install ./llvm/devops/actions/aws-ec2
+        if: ${{ matrix.aws-type }}
+      - name: Start AWS EC2 runner
+        if: ${{ matrix.aws-type }}
+        uses: ./llvm/devops/actions/aws-ec2
+        with:
+          label: ${{ matrix.runs-on }}
+          GH_PERSONAL_ACCESS_TOKEN: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          AWS_ACCESS_KEY: ${{ secrets.AWS_ACCESS_KEY }}
+          AWS_SECRET_KEY: ${{ secrets.AWS_SECRET_KEY }}
+          aws-ami: ${{ matrix.aws-ami }}
+          aws-spot: ${{ matrix.aws-spot }}
+          aws-type: ${{ matrix.aws-type }}
+          aws-disk: ${{ matrix.aws-disk }}
+          one-job: ${{ matrix.one-job }}
+
   llvm_test_suite:
-    needs: [build, resolve_matrix]
+    needs: [build, resolve_matrix, aws-start-matrix]
     if: ${{ inputs.lts_config != '' }}
     strategy:
       fail-fast: false
@@ -240,3 +277,30 @@ jobs:
         results_name_suffix: ${{ matrix.config }}_${{ inputs.build_artifact_suffix }}
         cmake_args: '${{ matrix.cmake_args }} ${{ inputs.lts_cmake_extra_args }}'
 
+  aws-stop-matrix:
+    name: Stop AWS Matrix
+    needs: [ aws-start-matrix, resolve_matrix, llvm_test_suite ]
+    if: ${{ always() && inputs.lts_config != '' }}
+    strategy:
+      fail-fast: false
+      max-parallel: ${{ inputs.max_parallel }}
+      matrix:
+        include: ${{ fromJSON(needs.resolve_matrix.outputs.lts) }}
+    runs-on: ubuntu-latest
+    environment: aws
+    steps:
+      - uses: actions/checkout@v3
+        if: ${{ matrix.aws-type }}
+        with:
+          path: llvm
+      - run: npm install ./llvm/devops/actions/aws-ec2
+        if: ${{ matrix.aws-type }}
+      - name: Stop AWS EC2 runner
+        if: ${{ matrix.aws-type }}
+        uses: ./llvm/devops/actions/aws-ec2
+        with:
+          label: ${{ matrix.runs-on }}
+          mode: stop
+          GH_PERSONAL_ACCESS_TOKEN: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          AWS_ACCESS_KEY: ${{ secrets.AWS_ACCESS_KEY }}
+          AWS_SECRET_KEY: ${{ secrets.AWS_SECRET_KEY }}
diff --git a/.github/workflows/sycl_nightly.yml b/.github/workflows/sycl_nightly.yml
index 7e7cae192456b..38f53c5b4e605 100644
--- a/.github/workflows/sycl_nightly.yml
+++ b/.github/workflows/sycl_nightly.yml
@@ -13,6 +13,7 @@ jobs:
   ubuntu2004_build_test:
     if: github.repository == 'intel/llvm'
     uses: ./.github/workflows/sycl_linux_build_and_test.yml
+    secrets: inherit
     with:
       build_cache_root: "/__w/"
       build_artifact_suffix: default
@@ -22,6 +23,7 @@ jobs:
   ubuntu2004_opaque_pointers_build_test:
     if: github.repository == 'intel/llvm'
     uses: ./.github/workflows/sycl_linux_build_and_test.yml
+    secrets: inherit
     with:
       build_cache_root: "/__w/"
       build_cache_suffix: opaque_pointers
diff --git a/.github/workflows/sycl_post_commit.yml b/.github/workflows/sycl_post_commit.yml
index d54cfc67e69c9..5aae06728e175 100644
--- a/.github/workflows/sycl_post_commit.yml
+++ b/.github/workflows/sycl_post_commit.yml
@@ -14,12 +14,14 @@ jobs:
   linux_default:
     name: Linux Default
     uses: ./.github/workflows/sycl_linux_build_and_test.yml
+    secrets: inherit
     with:
       build_cache_root: "/__w/llvm"
       build_artifact_suffix: default
   linux_no_assert:
     name: Linux (no assert)
     uses: ./.github/workflows/sycl_linux_build_and_test.yml
+    secrets: inherit
     with:
       build_cache_root: "/__w/llvm"
       build_cache_suffix: gcc_no_assertions
diff --git a/.github/workflows/sycl_precommit.yml b/.github/workflows/sycl_precommit.yml
index 374f930d00cea..e0704c083b579 100644
--- a/.github/workflows/sycl_precommit.yml
+++ b/.github/workflows/sycl_precommit.yml
@@ -36,6 +36,7 @@ jobs:
     needs: lint
     if: always() && (success() || contains(github.event.pull_request.labels.*.name, 'ignore-lint'))
     uses: ./.github/workflows/sycl_linux_build_and_test.yml
+    secrets: inherit
     with:
       build_cache_root: "/__w/"
       build_cache_size: "8G"
diff --git a/devops/actions/aws-ec2/action.yml b/devops/actions/aws-ec2/action.yml
new file mode 100644
index 0000000000000..47defc4752464
--- /dev/null
+++ b/devops/actions/aws-ec2/action.yml
@@ -0,0 +1,80 @@
+name: aws-ec2
+description: Starts AWS EC2 instance with github actions runner agent in it to process one job
+
+inputs:
+  label:
+    description: "Name of the unique label assigned to the runner used as 'runs-on' property for the following jobs"
+    required: true
+  GH_PERSONAL_ACCESS_TOKEN:
+    description: "Github personal access token with repo permission"
+    required: true
+  AWS_ACCESS_KEY:
+    description: "AWS access key id"
+    required: true
+  AWS_SECRET_KEY:
+    description: "AWS secret access key"
+    required: true
+  aws-region:
+    description: "AWS EC2 region"
+    required: false
+    default: "us-east-2" # Ohio
+  aws-ami:
+    description: "AWS AMI id. Makes sense only for start mode"
+    required: false
+    default: "ami-0966bccbb521ccb24" # Ubuntu 22.04 (ami-02f3416038bdb17fb with /dev/sda1 disk) with docker installed and gh_runner (1001) like this:
+      # sudo -s
+      # apt-get update
+      # curl -fsSL https://get.docker.com -o /tmp/get-docker.sh
+      # sh /tmp/get-docker.sh # or "yum install -y docker" for Amazon Linux or RHEL/CentOS
+      # groupadd -g 1001 gh_runner; useradd gh_runner -u 1001 -g 1001 -m -s /bin/bash; usermod -aG docker gh_runner; usermod -aG video gh_runner
+      # sync; shutdown -h now
+
+    # "ami-02ec0f344128253f9" # Amazon Linux 2 AMI with NVIDIA TESLA GPU Driver (ami-06bf0a3f89fe08f0a with /dev/xvda disk) with docker installed and gh_runner (1001) like this:
+      # sudo -s
+      # yum update -y
+      # amazon-linux-extras install docker
+      # sudo systemctl --now enable docker
+      # distribution=$(. /etc/os-release;echo $ID$VERSION_ID) && curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.repo | sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo
+      # yum-config-manager --disable amzn2-graphics; yum clean expire-cache; yum install -y nvidia-docker2; systemctl restart docker
+      # groupadd -g 1001 gh_runner; useradd gh_runner -u 1001 -g 1001 -m -s /bin/bash; usermod -aG docker gh_runner; usermod -aG video gh_runner
+      # sync; shutdown -h now
+
+    # "ami-0ccda708841dde988" # Amazon Linux 2 AMI with AMD Radeon Pro Driver (ami-0bb1072e787242eb6 with /dev/xvda disk) with docker installed and gh_runner (1001) like this:
+      # sudo -s
+      # sh ./get-docker.sh # or "yum install -y docker" for Amazon Linux or RHEL/CentOS
+      # amazon-linux-extras install docker
+      # sudo systemctl --now enable docker
+      # groupadd -g 1001 gh_runner; useradd gh_runner -u 1001 -g 1001 -m -s /bin/bash; usermod -aG docker gh_runner; usermod -aG video gh_runner
+      # sync; shutdown -h now
+
+  aws-type:
+    description: "AWS EC2 instance type. Makes sense only for start mode"
+    required: false
+    default: "[ \"t2.micro\" ]"
+    # "[ \"g4dn.2xlarge\" ]": 1 NVIDIA T4 GPU, 8 CPU, 32 GB RAM
+    # "[ \"g4ad.4xlarge\" ]": 1 AMD Radeon Pro V520 GPU, 16 CPU, 64 GB RAM (overloaded pool so do not use spot here now)
+  aws-spot:
+    description: "Enable usage of spot instances to save money (less reliable). Makes sense only for start mode"
+    required: false
+    default: true
+  aws-disk:
+    description: "AWS EC2 instance AMI specific disk device path and size in GB (8 by default). Makes sense only for start mode"
+    required: false
+    default: "/dev/sda1:16" # Github actions container engine will fail with lack of disk space for 8GB
+  aws-timebomb:
+    description: "AWS EC2 instance maximum live time. Makes sense only for start mode"
+    required: false
+    default: 1h
+
+  mode:
+    description: "Mode of operation: start or stop"
+    required: false
+    default: start
+  one-job:
+    description: "Will terminate AWS EC2 instance after one job (not waiting for stop job) saving money"
+    required: false
+    default: true
+
+runs:
+  using: node20
+  main: ./aws-ec2.js
diff --git a/devops/actions/aws-ec2/aws-ec2.js b/devops/actions/aws-ec2/aws-ec2.js
new file mode 100644
index 0000000000000..50fe21f0abc46
--- /dev/null
+++ b/devops/actions/aws-ec2/aws-ec2.js
@@ -0,0 +1,256 @@
+const core = require('@actions/core');
+const github = require('@actions/github');
+const AWS = require('aws-sdk');
+
+// shortcut to reference current repo
+const repo = `${github.context.repo.owner}/${github.context.repo.repo}`;
+
+// delay between two attempts of an operation that is being retried
+const RETRY_DELAY_MS = 10 * 1000;
+
+// get github registration token that allows to register new runner based on
+// GH_PERSONAL_ACCESS_TOKEN github user api key
+async function getGithubRegToken() {
+  const octokit = github.getOctokit(core.getInput("GH_PERSONAL_ACCESS_TOKEN"));
+
+  try {
+    const response = await octokit.request(
+        `POST /repos/${repo}/actions/runners/registration-token`);
+    core.info("Got Github Actions Runner registration token");
+    return response.data.token;
+  } catch (error) {
+    core.error("Error getting Github Actions Runner registration token");
+    throw error;
+  }
+}
+
+// resolves after the given number of milliseconds
+function sleep(ms) {
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
+
+// runs async operation and, while it keeps failing, runs it again up to
+// `retries` more times waiting RETRY_DELAY_MS before every retry; onRetry is
+// called with the error before every retry; when all attempts fail the error
+// of the last attempt is thrown
+async function withRetries(operation, retries, onRetry) {
+  for (let attempt = 0;; attempt++) {
+    try {
+      return await operation();
+    } catch (error) {
+      if (attempt >= retries)
+        throw error;
+      onRetry(error);
+      await sleep(RETRY_DELAY_MS);
+    }
+  }
+}
+
+// converts "aws-type" input into an array of instance types; accepts JSON
+// array of strings, JSON string or plain (not JSON) instance type name
+function parseEc2Types(raw) {
+  let parsed;
+  try {
+    parsed = JSON.parse(raw);
+  } catch (error) {
+    parsed = raw.trim(); // not JSON: a single plain instance type name
+  }
+  return Array.isArray(parsed) ? parsed : [ parsed ];
+}
+
+// starts AWS EC2 instance that will spawn Github runner for a given label
+async function start(label) {
+  const ec2 = new AWS.EC2();
+
+  // we better keep GH_PERSONAL_ACCESS_TOKEN here and do not pass it to AWS EC2
+  // userscript so it will keep secret
+  const reg_token = await getGithubRegToken();
+  const timebomb = core.getInput("aws-timebomb");
+  const ec2types = parseEc2Types(core.getInput("aws-type"));
+  const ec2disk = core.getInput("aws-disk");
+  const ec2spot = core.getInput("aws-spot") !== "false";
+  const onejob = core.getInput("one-job") !== "false";
+
+  let ec2id;      // AWS EC2 instance id
+  let last_error; // last error that will be thrown in case all our attempts
+                  // of instance creation fail
+  // loop for spot/on-demand instances
+  for (const spot of (ec2spot ? [ true, false ] : [ false ])) {
+    const spot_str = spot ? "spot" : "on-demand";
+    for (const ec2type of ec2types) { // iterate for provided instance types
+      const setup_github_actions_runner = [
+        `#!/bin/bash -x`, `mkdir actions-runner`, `cd actions-runner`,
+        // we can not place runner into AMI image since it is updated often and
+        // latest version is required to connect to github
+        `export RUNNER_VERSION=$(curl -s https://api.github.com/repos/actions/runner/releases/latest | sed -n \'s,.*"tag_name": "v\\(.*\\)".*,\\1,p\')`,
+        `curl -O -L https://github.com/actions/runner/releases/download/v$RUNNER_VERSION/actions-runner-linux-x64-$RUNNER_VERSION.tar.gz || shutdown -h now`,
+        `tar xf ./actions-runner-linux-x64-$RUNNER_VERSION.tar.gz || shutdown -h now`,
+        `su gh_runner -c "./config.sh --unattended --url https://github.com/${repo} --token ${reg_token} --name ${label}_${ec2type}_${spot_str} --labels ${label} --replace || shutdown -h now"`,
+        // timebomb to avoid paying for stale AWS instances
+        `(sleep ${timebomb}; su gh_runner -c "./config.sh remove --token ${reg_token}"; shutdown -h now) &`,
+        // ephemeral runner will exit after one job so we will terminate
+        // instance sooner
+        onejob ? `su gh_runner -c "./run.sh --ephemeral"`
+               : `su gh_runner -c "./run.sh"`,
+        `su gh_runner -c "./config.sh remove --token ${reg_token}"`,
+        `shutdown -h now` // in case we launch instance with
+                          // InstanceInitiatedShutdownBehavior = "terminate" it
+                          // will terminate instance here as well
+      ];
+      try {
+        const params = {
+          ImageId : core.getInput("aws-ami"),
+          InstanceType : ec2type,
+          InstanceInitiatedShutdownBehavior : "terminate",
+          UserData : Buffer.from(setup_github_actions_runner.join('\n'))
+                         .toString('base64'),
+          MinCount : 1,
+          MaxCount : 1,
+          TagSpecifications : [ {
+            ResourceType : "instance",
+            Tags : [ {Key : "Label", Value : label} ]
+          } ]
+        };
+        if (spot)
+          params.InstanceMarketOptions = {MarketType : "spot"};
+        if (ec2disk) {
+          const [ device, size ] = ec2disk.split(':');
+          params.BlockDeviceMappings = [ {
+            DeviceName : device,
+            // AWS API expects volume size as an integer number of GB
+            Ebs : {VolumeSize : size ? Number.parseInt(size, 10) : undefined}
+          } ];
+        }
+        const result = await ec2.runInstances(params).promise();
+        ec2id = result.Instances[0].InstanceId;
+        core.info(`Created AWS EC2 ${spot_str} instance ${ec2id} of ${ec2type} type with ${label} label`);
+        break;
+      } catch (error) {
+        core.warning(`Error creating AWS EC2 ${spot_str} instance of ${ec2type} type with ${label} label`);
+        last_error = error;
+      }
+    }
+    // we already created instance and do not need to iterate these loops
+    if (ec2id)
+      break;
+  }
+  // fail only when no instance was created: errors of attempts that were
+  // followed by a successful one are already reported as warnings
+  if (!ec2id) {
+    core.error(`Error creating AWS EC2 instance with ${label} label`);
+    throw last_error || new Error("No AWS EC2 instance type was provided");
+  }
+
+  // wait until instance will be found running before continuing (spot instance
+  // can be created but never run and will be in pending state until
+  // termination)
+  try {
+    await withRetries(
+        () => ec2.waitFor("instanceRunning",
+                          {Filters : [ {Name : "tag:Label", Values : [ label ]} ]})
+                  .promise(),
+        2, () => core.warning(`Error searching for running AWS EC2 instance ${ec2id} with ${label} label. Will retry.`));
+    core.info(`Found running AWS EC2 instance ${ec2id} with ${label} label`);
+  } catch (error) {
+    core.error(`Error searching for running AWS EC2 instance ${ec2id} with ${label} label`);
+    throw error;
+  }
+}
+
+// terminate (completely remove) AWS EC2 instances (normally one instance) with
+// given tag label and also remove all Github actions runners (normally one
+// runner) with that label
+async function stop(label) {
+  // last error that will be thrown in case something will break here
+  let last_error;
+  const ec2 = new AWS.EC2();
+
+  // find AWS EC2 instances with tag label
+  let instances;
+  try {
+    instances =
+        await ec2
+            .describeInstances(
+                {Filters : [ {Name : "tag:Label", Values : [ label ]} ]})
+            .promise();
+    core.info(`Searched for AWS EC2 instance with label ${label}`);
+  } catch (error) {
+    core.error(`Error searching for AWS EC2 instance with label ${label}`);
+    last_error = error;
+  }
+
+  // remove all found AWS EC2 instances
+  if (instances) {
+    for (const reservation of instances.Reservations) {
+      for (const instance of reservation.Instances) {
+        try {
+          await ec2.terminateInstances({InstanceIds : [ instance.InstanceId ]})
+              .promise();
+          core.info(`Terminated AWS EC2 instance ${instance.InstanceId} with label ${label}`);
+        } catch (error) {
+          core.error(`Error terminating AWS EC2 instance ${instance.InstanceId} with label ${label}`);
+          last_error = error;
+        }
+      }
+    }
+  }
+
+  // find all Github action runners
+  const octokit = github.getOctokit(core.getInput("GH_PERSONAL_ACCESS_TOKEN"));
+  let runners;
+  try {
+    runners = await octokit.paginate(`GET /repos/${repo}/actions/runners`);
+    core.info(`Searched for Github action runners with label ${label}`);
+  } catch (error) {
+    core.error(`Error searching for Github action runners with label ${label}`);
+    last_error = error;
+  }
+
+  // remove Github action runners with specified label
+  if (runners) {
+    for (const runner of runners) {
+      if (!runner.labels.some((label_obj) => label_obj.name === label))
+        continue;
+      // retry deletion up to 5 times (with 10 seconds delay) since Github can
+      // not remove runners still marked as active (with running job)
+      try {
+        await withRetries(
+            () => octokit.request(
+                `DELETE /repos/${repo}/actions/runners/${runner.id}`),
+            5, () => core.warning(`Error removing Github self-hosted runner ${runner.id} with ${label}. Will retry.`));
+        core.info(`Removed Github self-hosted runner ${runner.id} with ${label}`);
+      } catch (error) {
+        core.error(`Error removing Github self-hosted runner ${runner.id} with ${label}`);
+        last_error = error;
+      }
+    }
+  }
+
+  if (last_error)
+    throw last_error;
+}
+
+(async function() {
+  try {
+    // provide AWS credentials
+    AWS.config.update({
+      accessKeyId : core.getInput("AWS_ACCESS_KEY"),
+      secretAccessKey : core.getInput("AWS_SECRET_KEY"),
+      region : core.getInput("aws-region")
+    });
+    // mode is start or stop
+    const mode = core.getInput("mode");
+    // label used to identify AWS EC2 instances and Github runners
+    const label = core.getInput("label");
+    if (mode === "start") {
+      await start(label);
+    } else if (mode === "stop") {
+      await stop(label);
+    } else {
+      throw new Error(`Unknown mode "${mode}": expected "start" or "stop"`);
+    }
+  } catch (error) {
+    core.error(error);
+    core.setFailed(error.message);
+  }
+})();
diff --git a/devops/actions/aws-ec2/package.json b/devops/actions/aws-ec2/package.json
new file mode 100644
index 0000000000000..0aa9f488bc77c
--- /dev/null
+++ b/devops/actions/aws-ec2/package.json
@@ -0,0 +1,9 @@
+{
+  "name": "aws-ec2",
+  "description": "Starts AWS EC2 instance with github actions runner agent in it to process one job",
+  "dependencies": {
+    "@actions/core": "^1.9.0",
+    "@actions/github": "^5.0.3",
+    "aws-sdk": "^2.1179.0"
+  }
+}