diff --git a/.ci/build-yarn.sh b/.ci/build-yarn.sh new file mode 100755 index 0000000000..1283d9ea8b --- /dev/null +++ b/.ci/build-yarn.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +# Build all the lambda's, output on the default place (inside the lambda module) + +lambdaSrcDirs=("modules/runner-binaries-syncer/lambdas/runner-binaries-syncer" "modules/runners/lambdas/runners" "modules/webhook/lambdas/webhook") +repoRoot=$(dirname $(dirname $(realpath ${BASH_SOURCE[0]}))) + +for lambdaDir in ${lambdaSrcDirs[@]}; do + cd "$repoRoot/${lambdaDir}" + yarn && yarn run dist +done diff --git a/README.md b/README.md index b38102056b..86354d19cd 100644 --- a/README.md +++ b/README.md @@ -4,9 +4,16 @@ This [Terraform](https://www.terraform.io/) module creates the required infrastructure needed to host [GitHub Actions](https://github.com/features/actions) self hosted, auto scaling runners on [AWS spot instances](https://aws.amazon.com/ec2/spot/). It provides the required logic to handle the life cycle for scaling up and down using a set of AWS Lambda functions. Runners are scaled down to zero to avoid costs when no workflows are active. +> NEW: Ephemeral runners available as beta feature. + +> NEW: Windows runners are available. + +> NEW: Examples for custom AMI are available. 
+ - [Motivation](#motivation) - [Overview](#overview) - - [ARM64 support via Graviton/Graviton2 instance-types](#arm64-support-via-gravitongraviton2-instance-types) + - [Major configuration options.](#major-configuration-options) + - [ARM64 support via Graviton/Graviton2 instance-types](#arm64-support-via-gravitongraviton2-instance-types) - [Usages](#usages) - [Setup GitHub App (part 1)](#setup-github-app-part-1) - [Setup terraform module](#setup-terraform-module) @@ -16,6 +23,7 @@ This [Terraform](https://www.terraform.io/) module creates the required infrastr - [Install app](#install-app) - [Encryption](#encryption) - [Idle runners](#idle-runners) + - [Ephemeral runners](#ephemeral-runners) - [Prebuilt Images](#prebuilt-images) - [Examples](#examples) - [Sub modules](#sub-modules) @@ -48,7 +56,6 @@ For receiving the `check_run` or `workflow_job` event by the webhook (lambda) a - `check_run`: create a webhook on enterprise, org, repo or app level. When using the app option, the app needs to be installed to repo's are using the self-hosted runners. - a Webhook needs to be created. The webhook hook can be defined on enterprise, org, repo, or app level. - In AWS a [API gateway](https://docs.aws.amazon.com/apigateway/index.html) endpoint is created that is able to receive the GitHub webhook events via HTTP post. The gateway triggers the webhook lambda which will verify the signature of the event. This check guarantees the event is sent by the GitHub App. The lambda only handles `workflow_job` or `check_run` events with status `queued` and matching the runner labels (only for `workflow_job`). The accepted events are posted on a SQS queue. Messages on this queue will be delayed for a configurable amount of seconds (default 30 seconds) to give the available runners time to pick up this build. The "scale up runner" lambda is listening to the SQS queue and picks up events. The lambda runs various checks to decide whether a new EC2 spot instance needs to be created. 
For example, the instance is not created if the build is already started by an existing runner, or the maximum number of runners is reached. @@ -71,7 +78,18 @@ Permission are managed on several places. Below the most important ones. For det Besides these permissions, the lambdas also need permission to CloudWatch (for logging and scheduling), SSM and S3. For more details about the required permissions see the [documentation](./modules/setup-iam-permissions/README.md) of the IAM module which uses permission boundaries. -### ARM64 support via Graviton/Graviton2 instance-types +### Major configuration options. + +To be able to support a number of use-cases the module has quite a lot of configuration options. We try to choose reasonable defaults. The several examples also show how to configure the runners for the main use cases. + +- Org vs Repo level. You can configure the module to connect the runners in GitHub on an org level and share the runners in your org. Or set the runners on repo level. The module will install the runner to the repo. This can be multiple repos, but runners are not shared between repos. +- Checkrun vs Workflow job event. You can configure the webhook in GitHub to send checkrun or workflow job events to the webhook. Workflow job events are introduced by GitHub in September 2021 and are designed to support scalable runners. We advise when possible to use the workflow job event, you can set `disable_check_wokflow_job_labels = true` to disable the label check. +- Linux vs Windows. You can configure the os types linux and win. Linux will be used by default. +- Re-use vs Ephemeral. By default runners are re-used until detected idle, once idle they will be removed from the pool. To improve security we are introducing ephemeral runners. Those runners are only used for one job. Ephemeral runners are only working in combination with the workflow job event. We also suggest to use a pre-built AMI to improve the start time of jobs. 
+- GitHub cloud vs GitHub enterprise server (GHES). The runners support GitHub Cloud as well as GitHub Enterprise Server. For GHES we rely on our community to test and support. We have no possibility to test ourselves on GHES. + + +#### ARM64 support via Graviton/Graviton2 instance-types When using the default example or top-level module, specifying an `instance_type` that matches a Graviton/Graviton 2 (ARM64) architecture (e.g. a1 or any 6th-gen `g` or `gd` type), the sub-modules will be automatically configured to provision with ARM64 AMIs and leverage GitHub's ARM64 action runner. See below for more details. @@ -268,6 +286,17 @@ idle_config = [{ _**Note**_: When using Windows runners it's recommended to keep a few runners warmed up due to the minutes-long cold start time. +### Ephemeral runners + +Currently a beta feature! You can configure runners to be ephemeral, runners will be used only for one job. The feature should be used in conjunction with listening for the workflow job event. Please consider the following: + +- The scale down lambda is still active, and should only remove orphan instances. But there is no strict check in place. So ensure you configure the `minimum_running_time_in_minutes` to a value that is high enough to get your runner booted and connected, to avoid it being terminated before executing a job. +- The messages sent from the webhook lambda to scale-up lambda are by default delayed by SQS, to give available runners the option to start the job before the decision is made to scale more runners. For ephemeral runners there is no need to wait. Set `delay_webhook_event` to `0`. +- To ensure runners are created in the same order GitHub sends the events we use by default a FIFO queue, this is mainly relevant for repo level runners. For ephemeral runners you can set `fifo_build_queue` to `false`. +- Error related to scaling should be retried via SQS. You can configure `job_queue_retention_in_seconds` and `redrive_build_queue` to tune the behavior. 
We have no mechanism to avoid events never being processed, which means potentially no runner could be created and the job in GitHub can time out in 6 hours. + +The example for [ephemeral runners](./examples/ephemeral) is based on the [default example](./examples/default). Have a look at the diff to see the major configuration differences. + ### Prebuilt Images This module also allows you to run agents from a prebuilt AMI to gain faster startup times. You can find more information in [the image README.md](/images/README.md) @@ -295,10 +324,11 @@ For time zones please check [TZ database name column](https://en.wikipedia.org/w Examples are located in the [examples](./examples) directory. The following examples are provided: - _[Default](examples/default/README.md)_: The default example of the module -- _[Permissions boundary](examples/permissions-boundary/README.md)_: Example usages of permissions boundaries. - _[Ubuntu](examples/ubuntu/README.md)_: Example usage of creating a runner using Ubuntu AMIs. -- _[Prebuilt Images](examples/prebuilt/README.md)_: Example usages of deploying runners with a custom prebuilt image. - _[Windows](examples/windows/README.md)_: Example usage of creating a runner using Windows as the OS. +- _[Ephemeral](examples/ephemeral/README.md)_: Example usages of ephemeral runners based on the default example. +- _[Prebuilt Images](examples/prebuilt/README.md)_: Example usages of deploying runners with a custom prebuilt image. +- _[Permissions boundary](examples/permissions-boundary/README.md)_: Example usages of permissions boundaries. 
## Sub modules @@ -367,6 +397,7 @@ In case the setup does not work as intended follow the trace of events: |------|------| | [aws_resourcegroups_group.resourcegroups_group](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/resourcegroups_group) | resource | | [aws_sqs_queue.queued_builds](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/sqs_queue) | resource | +| [aws_sqs_queue.queued_builds_dlq](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/sqs_queue) | resource | | [random_string.random](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/string) | resource | ## Inputs @@ -382,10 +413,12 @@ In case the setup does not work as intended follow the trace of events: | [delay\_webhook\_event](#input\_delay\_webhook\_event) | The number of seconds the event accepted by the webhook is invisible on the queue before the scale up lambda will receive the event. | `number` | `30` | no | | [disable\_check\_wokflow\_job\_labels](#input\_disable\_check\_wokflow\_job\_labels) | Disable the the check of workflow labels for received workflow job events. | `bool` | `false` | no | | [enable\_cloudwatch\_agent](#input\_enable\_cloudwatch\_agent) | Enabling the cloudwatch agent on the ec2 runner instances, the runner contains default config. Configuration can be overridden via `cloudwatch_config`. | `bool` | `true` | no | +| [enable\_ephemeral\_runners](#input\_enable\_ephemeral\_runners) | Enable ephemeral runners, runners will only be used once. | `bool` | `false` | no | | [enable\_organization\_runners](#input\_enable\_organization\_runners) | Register runners to organization, instead of repo level | `bool` | `false` | no | | [enable\_ssm\_on\_runners](#input\_enable\_ssm\_on\_runners) | Enable to allow access the runner instances for debugging purposes via SSM. Note that this adds additional permissions to the runner instances. 
| `bool` | `false` | no | | [enabled\_userdata](#input\_enabled\_userdata) | Should the userdata script be enabled for the runner. Set this to false if you are using your own prebuilt AMI | `bool` | `true` | no | | [environment](#input\_environment) | A name that identifies the environment, used as prefix and for tagging. | `string` | n/a | yes | +| [fifo\_build\_queue](#input\_fifo\_build\_queue) | Enable a FIFO queue to remain the order of events received by the webhook. Suggest to set to true for repo level runners. | `bool` | `false` | no | | [ghes\_ssl\_verify](#input\_ghes\_ssl\_verify) | GitHub Enterprise SSL verification. Set to 'false' when custom certificate (chains) is used for GitHub Enterprise Server (insecure). | `bool` | `true` | no | | [ghes\_url](#input\_ghes\_url) | GitHub Enterprise Server URL. Example: https://github.internal.co - DO NOT SET IF USING PUBLIC GITHUB | `string` | `null` | no | | [github\_app](#input\_github\_app) | GitHub app parameters, see your github app. Ensure the key is the base64-encoded `.pem` file (the output of `base64 app.private-key.pem`, not the content of `private-key.pem`). |
object({
key_base64 = string
id = string
webhook_secret = string
})
| n/a | yes | @@ -405,6 +438,7 @@ In case the setup does not work as intended follow the trace of events: | [logging\_retention\_in\_days](#input\_logging\_retention\_in\_days) | Specifies the number of days you want to retain log events for the lambda log group. Possible values are: 0, 1, 3, 5, 7, 14, 30, 60, 90, 120, 150, 180, 365, 400, 545, 731, 1827, and 3653. | `number` | `180` | no | | [market\_options](#input\_market\_options) | Market options for the action runner instances. Setting the value to `null` let the scaler create on-demand instances instead of spot instances. | `string` | `"spot"` | no | | [minimum\_running\_time\_in\_minutes](#input\_minimum\_running\_time\_in\_minutes) | The time an ec2 action runner should be running at minimum before terminated if not busy. | `number` | `null` | no | +| [redrive\_build\_queue](#input\_redrive\_build\_queue) | Set options to attach (optional) a dead letter queue to the build queue, the queue between the webhook and the scale up lambda. You have the following options. 1. Disable by setting `enabled` to `false`. 2. Enable by setting `enabled` to `true` and `maxReceiveCount` to the number of max retries. |
object({
enabled = bool
maxReceiveCount = number
})
|
{
"enabled": false,
"maxReceiveCount": null
}
| no | | [repository\_white\_list](#input\_repository\_white\_list) | List of repositories allowed to use the github app | `list(string)` | `[]` | no | | [role\_path](#input\_role\_path) | The path that will be added to role path for created roles, if not set the environment name will be used. | `string` | `null` | no | | [role\_permissions\_boundary](#input\_role\_permissions\_boundary) | Permissions boundary that will be added to the created roles. | `string` | `null` | no | @@ -428,7 +462,7 @@ In case the setup does not work as intended follow the trace of events: | [runners\_lambda\_zip](#input\_runners\_lambda\_zip) | File location of the lambda zip file for scaling runners. | `string` | `null` | no | | [runners\_maximum\_count](#input\_runners\_maximum\_count) | The maximum number of runners that will be created. | `number` | `3` | no | | [runners\_scale\_down\_lambda\_timeout](#input\_runners\_scale\_down\_lambda\_timeout) | Time out for the scale down lambda in seconds. | `number` | `60` | no | -| [runners\_scale\_up\_lambda\_timeout](#input\_runners\_scale\_up\_lambda\_timeout) | Time out for the scale up lambda in seconds. | `number` | `180` | no | +| [runners\_scale\_up\_lambda\_timeout](#input\_runners\_scale\_up\_lambda\_timeout) | Time out for the scale up lambda in seconds. | `number` | `30` | no | | [scale\_down\_schedule\_expression](#input\_scale\_down\_schedule\_expression) | Scheduler expression to check every x for scale down. | `string` | `"cron(*/5 * * * ? *)"` | no | | [scale\_up\_reserved\_concurrent\_executions](#input\_scale\_up\_reserved\_concurrent\_executions) | Amount of reserved concurrent executions for the scale-up lambda function. A value of 0 disables lambda from being triggered and -1 removes any concurrency limitations. | `number` | `1` | no | | [subnet\_ids](#input\_subnet\_ids) | List of subnets in which the action runners will be launched, the subnets needs to be subnets in the `vpc_id`. 
| `list(string)` | n/a | yes | @@ -450,6 +484,7 @@ In case the setup does not work as intended follow the trace of events: | Name | Description | |------|-------------| | [binaries\_syncer](#output\_binaries\_syncer) | n/a | +| [queues](#output\_queues) | SQS queues. | | [runners](#output\_runners) | n/a | | [ssm\_parameters](#output\_ssm\_parameters) | n/a | | [webhook](#output\_webhook) | n/a | diff --git a/examples/default/main.tf b/examples/default/main.tf index 2a5ec1edee..46e57fc183 100644 --- a/examples/default/main.tf +++ b/examples/default/main.tf @@ -30,11 +30,13 @@ module "runners" { webhook_secret = random_id.random.hex } + # Grab zip files via lambda_download webhook_lambda_zip = "lambdas-download/webhook.zip" runner_binaries_syncer_lambda_zip = "lambdas-download/runner-binaries-syncer.zip" runners_lambda_zip = "lambdas-download/runners.zip" - enable_organization_runners = false - runner_extra_labels = "default,example" + + enable_organization_runners = false + runner_extra_labels = "default,example" # enable access to the runners via SSM enable_ssm_on_runners = true @@ -61,7 +63,11 @@ module "runners" { instance_types = ["m5.large", "c5.large"] # override delay of events in seconds - delay_webhook_event = 5 + delay_webhook_event = 5 + runners_maximum_count = 1 + + # set up a fifo queue to remain order + fifo_build_queue = true # override scaling down scale_down_schedule_expression = "cron(* * * * ? *)" diff --git a/examples/ephemeral/.terraform.lock.hcl b/examples/ephemeral/.terraform.lock.hcl new file mode 100644 index 0000000000..d940521fcb --- /dev/null +++ b/examples/ephemeral/.terraform.lock.hcl @@ -0,0 +1,57 @@ +# This file is maintained automatically by "terraform init". +# Manual edits may be lost in future updates. 
+ +provider "registry.terraform.io/hashicorp/aws" { + version = "3.61.0" + constraints = ">= 3.27.0" + hashes = [ + "h1:fpZ14qQnn+uEOO2ZOlBFHgty48Ol8IOwd+ewxZ4z3zc=", + "zh:0483ca802ddb0ae4f73144b4357ba72242c6e2641aeb460b1aa9a6f6965464b0", + "zh:274712214ebeb0c1269cbc468e5705bb5741dc45b05c05e9793ca97f22a1baa1", + "zh:3c6bd97a2ca809469ae38f6893348386c476cb3065b120b785353c1507401adf", + "zh:53dd41a9aed9860adbbeeb71a23e4f8195c656fd15a02c90fa2d302a5f577d8c", + "zh:65c639c547b97bc880fd83e65511c0f4bbfc91b63cada3b8c0d5776444221700", + "zh:a2769e19137ff480c1dd3e4f248e832df90fb6930a22c66264d9793895161714", + "zh:a5897a99332cc0071e46a71359b86a8e53ab09c1453e94cd7cf45a0b577ff590", + "zh:bdc2353642d16d8e2437a9015cd4216a1772be9736645cc17d1a197480e2b5b7", + "zh:cbeace1deae938f6c0aca3734e6088f3633ca09611aff701c15cb6d42f2b918a", + "zh:d33ca19012aabd98cc03fdeccd0bd5ce56e28f61a1dfbb2eea88e89487de7fb3", + "zh:d548b29a864b0687e85e8a993f208e25e3ecc40fcc5b671e1985754b32fdd658", + ] +} + +provider "registry.terraform.io/hashicorp/local" { + version = "2.1.0" + hashes = [ + "h1:KfieWtVyGWwplSoLIB5usKAUnrIkDQBkWaR5TI+4WYg=", + "zh:0f1ec65101fa35050978d483d6e8916664b7556800348456ff3d09454ac1eae2", + "zh:36e42ac19f5d68467aacf07e6adcf83c7486f2e5b5f4339e9671f68525fc87ab", + "zh:6db9db2a1819e77b1642ec3b5e95042b202aee8151a0256d289f2e141bf3ceb3", + "zh:719dfd97bb9ddce99f7d741260b8ece2682b363735c764cac83303f02386075a", + "zh:7598bb86e0378fd97eaa04638c1a4c75f960f62f69d3662e6d80ffa5a89847fe", + "zh:ad0a188b52517fec9eca393f1e2c9daea362b33ae2eb38a857b6b09949a727c1", + "zh:c46846c8df66a13fee6eff7dc5d528a7f868ae0dcf92d79deaac73cc297ed20c", + "zh:dc1a20a2eec12095d04bf6da5321f535351a594a636912361db20eb2a707ccc4", + "zh:e57ab4771a9d999401f6badd8b018558357d3cbdf3d33cc0c4f83e818ca8e94b", + "zh:ebdcde208072b4b0f8d305ebf2bfdc62c926e0717599dcf8ec2fd8c5845031c3", + "zh:ef34c52b68933bedd0868a13ccfd59ff1c820f299760b3c02e008dc95e2ece91", + ] +} + +provider "registry.terraform.io/hashicorp/random" { + version = 
"3.1.0" + hashes = [ + "h1:rKYu5ZUbXwrLG1w81k7H3nce/Ys6yAxXhWcbtk36HjY=", + "zh:2bbb3339f0643b5daa07480ef4397bd23a79963cc364cdfbb4e86354cb7725bc", + "zh:3cd456047805bf639fbf2c761b1848880ea703a054f76db51852008b11008626", + "zh:4f251b0eda5bb5e3dc26ea4400dba200018213654b69b4a5f96abee815b4f5ff", + "zh:7011332745ea061e517fe1319bd6c75054a314155cb2c1199a5b01fe1889a7e2", + "zh:738ed82858317ccc246691c8b85995bc125ac3b4143043219bd0437adc56c992", + "zh:7dbe52fac7bb21227acd7529b487511c91f4107db9cc4414f50d04ffc3cab427", + "zh:a3a9251fb15f93e4cfc1789800fc2d7414bbc18944ad4c5c98f466e6477c42bc", + "zh:a543ec1a3a8c20635cf374110bd2f87c07374cf2c50617eee2c669b3ceeeaa9f", + "zh:d9ab41d556a48bd7059f0810cf020500635bfc696c9fc3adab5ea8915c1d886b", + "zh:d9e13427a7d011dbd654e591b0337e6074eef8c3b9bb11b2e39eaaf257044fd7", + "zh:f7605bd1437752114baf601bdf6931debe6dc6bfe3006eb7e9bb9080931dca8a", + ] +} diff --git a/examples/ephemeral/README.md b/examples/ephemeral/README.md new file mode 100644 index 0000000000..0eec98561d --- /dev/null +++ b/examples/ephemeral/README.md @@ -0,0 +1,30 @@ +# Action runners deployment ephemeral example + +This example is based on the default setup, but shows how runners can be used with the ephemeral flag enabled. Once enabled, ephemeral runners will be used for one job only. Each job requires a fresh instance. This feature should be used in combination with the `workflow_job` event. See GitHub webhook endpoint configuration(link needed here). It is also suggested to use a pre-build AMI to minimize runner launch times. +## Usages + +Steps for the full setup, such as creating a GitHub app can be found in the root module's [README](../../README.md). First download the Lambda releases from GitHub. Alternatively you can build the lambdas locally with Node or Docker, there is a simple build script in `/.ci/build.sh`. In the `main.tf` you can simply remove the location of the lambda zip files, the default location will work in this case. 
+ +> Ensure you have set the version in `lambdas-download/main.tf` for running the example. The version needs to be set to a GitHub release version, see https://github.com/philips-labs/terraform-aws-github-runner/releases + +```bash +cd lambdas-download +terraform init +terraform apply +cd .. +``` + +Before running Terraform, ensure the GitHub app is configured. See the [configuration details](../../README.md#usages) for more details. + +```bash +terraform init +terraform apply +``` + +You can receive the webhook details by running: + +```bash +terraform output -raw webhook_secret +``` + +Be-aware some shells will print some end of line character `%`. \ No newline at end of file diff --git a/examples/ephemeral/lambdas-download/main.tf b/examples/ephemeral/lambdas-download/main.tf new file mode 100644 index 0000000000..87f31bd8a9 --- /dev/null +++ b/examples/ephemeral/lambdas-download/main.tf @@ -0,0 +1,25 @@ +locals { + version = "" +} + +module "lambdas" { + source = "../../../modules/download-lambda" + lambdas = [ + { + name = "webhook" + tag = local.version + }, + { + name = "runners" + tag = local.version + }, + { + name = "runner-binaries-syncer" + tag = local.version + } + ] +} + +output "files" { + value = module.lambdas.files +} diff --git a/examples/ephemeral/main.tf b/examples/ephemeral/main.tf new file mode 100644 index 0000000000..b394034a3e --- /dev/null +++ b/examples/ephemeral/main.tf @@ -0,0 +1,71 @@ +locals { + environment = "ephemeraal" + aws_region = "eu-west-1" +} + +resource "random_id" "random" { + byte_length = 20 +} + +data "aws_caller_identity" "current" {} + +module "runners" { + source = "../../" + create_service_linked_role_spot = true + aws_region = local.aws_region + vpc_id = module.vpc.vpc_id + subnet_ids = module.vpc.private_subnets + + environment = local.environment + tags = { + Project = "ProjectX" + } + + github_app = { + key_base64 = var.github_app_key_base64 + id = var.github_app_id + webhook_secret = random_id.random.hex + } + 
+ # Grab the lambda packages from local directory. Must run /.ci/build.sh first + webhook_lambda_zip = "../../lambda_output/webhook.zip" + runner_binaries_syncer_lambda_zip = "../../lambda_output/runner-binaries-syncer.zip" + runners_lambda_zip = "../../lambda_output/runners.zip" + + enable_organization_runners = true + runner_extra_labels = "default,example" + + # enable access to the runners via SSM + enable_ssm_on_runners = true + + # Let the module manage the service linked role + # create_service_linked_role_spot = true + + instance_types = ["m5.large", "c5.large"] + + # override delay of events in seconds + delay_webhook_event = 0 + + # Ensure you set the number not too low, each build requires a new instance + runners_maximum_count = 20 + + # override scaling down + scale_down_schedule_expression = "cron(* * * * ? *)" + + enable_ephemeral_runners = true + + # configure your pre-built AMI + # enabled_userdata = false + # ami_filter = { name = ["github-runner-amzn2-x86_64-2021*"] } + # ami_owners = [data.aws_caller_identity.current.account_id] + + # Enable logging + # log_level = "debug" + + # Setup a dead letter queue, by default scale up lambda will keep retrying to process the event in case of a scaling error. 
+ # redrive_policy_build_queue = { + # enabled = true + # maxReceiveCount = 50 # 50 retries every 30 seconds => 25 minutes + # deadLetterTargetArn = null + # } +} diff --git a/examples/ephemeral/outputs.tf b/examples/ephemeral/outputs.tf new file mode 100644 index 0000000000..c50214f566 --- /dev/null +++ b/examples/ephemeral/outputs.tf @@ -0,0 +1,15 @@ +output "runners" { + value = { + lambda_syncer_name = module.runners.binaries_syncer.lambda.function_name + } +} + +output "webhook_endpoint" { + value = module.runners.webhook.endpoint +} + +output "webhook_secret" { + sensitive = true + value = random_id.random.hex +} + diff --git a/examples/ephemeral/providers.tf b/examples/ephemeral/providers.tf new file mode 100644 index 0000000000..b6c81d5415 --- /dev/null +++ b/examples/ephemeral/providers.tf @@ -0,0 +1,3 @@ +provider "aws" { + region = local.aws_region +} diff --git a/examples/ephemeral/variables.tf b/examples/ephemeral/variables.tf new file mode 100644 index 0000000000..1f4576b1b5 --- /dev/null +++ b/examples/ephemeral/variables.tf @@ -0,0 +1,5 @@ + +variable "github_app_key_base64" {} + +variable "github_app_id" {} + diff --git a/examples/ephemeral/versions.tf b/examples/ephemeral/versions.tf new file mode 100644 index 0000000000..c96d0eee84 --- /dev/null +++ b/examples/ephemeral/versions.tf @@ -0,0 +1,15 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 3.27" + } + local = { + source = "hashicorp/local" + } + random = { + source = "hashicorp/random" + } + } + required_version = ">= 0.14" +} diff --git a/examples/ephemeral/vpc.tf b/examples/ephemeral/vpc.tf new file mode 100644 index 0000000000..a7d21422f1 --- /dev/null +++ b/examples/ephemeral/vpc.tf @@ -0,0 +1,7 @@ +module "vpc" { + source = "git::https://github.com/philips-software/terraform-aws-vpc.git?ref=2.2.0" + + environment = local.environment + aws_region = local.aws_region + create_private_hosted_zone = false +} diff --git a/main.tf b/main.tf index 
48f5d7a88b..4d9f60dc97 100644 --- a/main.tf +++ b/main.tf @@ -19,13 +19,24 @@ resource "random_string" "random" { } resource "aws_sqs_queue" "queued_builds" { - name = "${var.environment}-queued-builds.fifo" + name = "${var.environment}-queued-builds${var.fifo_build_queue ? ".fifo" : ""}" delay_seconds = var.delay_webhook_event visibility_timeout_seconds = var.runners_scale_up_lambda_timeout message_retention_seconds = var.job_queue_retention_in_seconds - fifo_queue = true - receive_wait_time_seconds = 10 - content_based_deduplication = true + fifo_queue = var.fifo_build_queue + receive_wait_time_seconds = 0 + content_based_deduplication = var.fifo_build_queue + redrive_policy = var.redrive_build_queue.enabled ? jsonencode({ + deadLetterTargetArn = aws_sqs_queue.queued_builds_dlq[0].arn, + maxReceiveCount = var.redrive_build_queue.maxReceiveCount + }) : null + + tags = var.tags +} + +resource "aws_sqs_queue" "queued_builds_dlq" { + count = var.redrive_build_queue.enabled ? 1 : 0 + name = "${var.environment}-queued-builds_dead_letter" tags = var.tags } @@ -48,6 +59,7 @@ module "webhook" { kms_key_arn = var.kms_key_arn sqs_build_queue = aws_sqs_queue.queued_builds + sqs_build_queue_fifo = var.fifo_build_queue github_app_webhook_secret_arn = module.ssm.parameters.github_app_webhook_secret.arn lambda_s3_bucket = var.lambda_s3_bucket @@ -92,6 +104,7 @@ module "runners" { sqs_build_queue = aws_sqs_queue.queued_builds github_app_parameters = local.github_app_parameters enable_organization_runners = var.enable_organization_runners + enable_ephemeral_runners = var.enable_ephemeral_runners scale_down_schedule_expression = var.scale_down_schedule_expression minimum_running_time_in_minutes = var.minimum_running_time_in_minutes runner_boot_time_in_minutes = var.runner_boot_time_in_minutes diff --git a/modules/runners/README.md b/modules/runners/README.md index 05cb8b2c42..6f9772e561 100644 --- a/modules/runners/README.md +++ b/modules/runners/README.md @@ -81,6 +81,7 @@ No 
modules. | [aws_iam_role_policy.cloudwatch](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | | [aws_iam_role_policy.describe_tags](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | | [aws_iam_role_policy.dist_bucket](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | +| [aws_iam_role_policy.ec2](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | | [aws_iam_role_policy.runner_session_manager_aws_managed](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | | [aws_iam_role_policy.scale_down](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | | [aws_iam_role_policy.scale_down_logging](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | @@ -110,7 +111,7 @@ No modules. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [ami\_filter](#input\_ami\_filter) | Map of lists used to create the AMI filter for the action runner AMI. | `map(list(string))` |
{
"name": [
"amzn2-ami-hvm-2.*-x86_64-ebs"
]
}
| no | +| [ami\_filter](#input\_ami\_filter) | Map of lists used to create the AMI filter for the action runner AMI. | `map(list(string))` | `null` | no | | [ami\_owners](#input\_ami\_owners) | The list of owners used to select the AMI of action runner instances. | `list(string)` |
[
"amazon"
]
| no | | [aws\_region](#input\_aws\_region) | AWS region. | `string` | n/a | yes | | [block\_device\_mappings](#input\_block\_device\_mappings) | The EC2 instance block device configuration. Takes the following keys: `device_name`, `delete_on_termination`, `volume_type`, `volume_size`, `encrypted`, `iops` | `map(string)` | `{}` | no | @@ -118,8 +119,10 @@ No modules. | [create\_service\_linked\_role\_spot](#input\_create\_service\_linked\_role\_spot) | (optional) create the service linked role for spot instances that is required by the scale-up lambda. | `bool` | `false` | no | | [egress\_rules](#input\_egress\_rules) | List of egress rules for the GitHub runner instances. |
list(object({
cidr_blocks = list(string)
ipv6_cidr_blocks = list(string)
prefix_list_ids = list(string)
from_port = number
protocol = string
security_groups = list(string)
self = bool
to_port = number
description = string
}))
|
[
{
"cidr_blocks": [
"0.0.0.0/0"
],
"description": null,
"from_port": 0,
"ipv6_cidr_blocks": [
"::/0"
],
"prefix_list_ids": null,
"protocol": "-1",
"security_groups": null,
"self": null,
"to_port": 0
}
]
| no | | [enable\_cloudwatch\_agent](#input\_enable\_cloudwatch\_agent) | Enabling the cloudwatch agent on the ec2 runner instances, the runner contains default config. Configuration can be overridden via `cloudwatch_config`. | `bool` | `true` | no | +| [enable\_ephemeral\_runners](#input\_enable\_ephemeral\_runners) | Enable ephemeral runners, runners will only be used once. | `bool` | `false` | no | | [enable\_organization\_runners](#input\_enable\_organization\_runners) | n/a | `bool` | n/a | yes | | [enable\_ssm\_on\_runners](#input\_enable\_ssm\_on\_runners) | Enable to allow access to the runner instances for debugging purposes via SSM. Note that this adds additional permissions to the runner instances. | `bool` | n/a | yes | +| [enabled\_userdata](#input\_enabled\_userdata) | Should the userdata script be enabled for the runner. Set this to false if you are using your own prebuilt AMI | `bool` | `true` | no | | [environment](#input\_environment) | A name that identifies the environment, used as prefix and for tagging. | `string` | n/a | yes | | [ghes\_ssl\_verify](#input\_ghes\_ssl\_verify) | GitHub Enterprise SSL verification. Set to 'false' when custom certificate (chains) is used for GitHub Enterprise Server (insecure). | `bool` | `true` | no | | [ghes\_url](#input\_ghes\_url) | GitHub Enterprise Server URL. DO NOT SET IF USING PUBLIC GITHUB | `string` | `null` | no | @@ -127,7 +130,7 @@ No modules. | [idle\_config](#input\_idle\_config) | List of time period that can be defined as cron expression to keep a minimum amount of runners active instead of scaling down to 0. By defining this list you can ensure that in time periods that match the cron expression within 5 seconds a runner is kept idle. |
list(object({
cron = string
timeZone = string
idleCount = number
}))
| `[]` | no | | [instance\_profile\_path](#input\_instance\_profile\_path) | The path that will be added to the instance\_profile, if not set the environment name will be used. | `string` | `null` | no | | [instance\_type](#input\_instance\_type) | [DEPRECATED] See instance\_types. | `string` | `"m5.large"` | no | -| [instance\_types](#input\_instance\_types) | List of instance types for the action runner. | `list(string)` | `null` | no | +| [instance\_types](#input\_instance\_types) | List of instance types for the action runner. Defaults are based on runner\_os (amzn2 for linux and Windows Server Core for win). | `list(string)` | `null` | no | | [key\_name](#input\_key\_name) | Key pair name | `string` | `null` | no | | [kms\_key\_arn](#input\_kms\_key\_arn) | Optional CMK Key ARN to be used for Parameter Store. | `string` | `null` | no | | [lambda\_s3\_bucket](#input\_lambda\_s3\_bucket) | S3 bucket from which to specify lambda functions. This is an alternative to providing local files directly. | `any` | `null` | no | @@ -141,7 +144,7 @@ No modules. | [logging\_retention\_in\_days](#input\_logging\_retention\_in\_days) | Specifies the number of days you want to retain log events for the lambda log group. Possible values are: 0, 1, 3, 5, 7, 14, 30, 60, 90, 120, 150, 180, 365, 400, 545, 731, 1827, and 3653. | `number` | `180` | no | | [market\_options](#input\_market\_options) | Market options for the action runner instances. | `string` | `"spot"` | no | | [metadata\_options](#input\_metadata\_options) | Metadata options for the ec2 runner instances. | `map(any)` |
{
"http_endpoint": "enabled",
"http_put_response_hop_limit": 1,
"http_tokens": "optional"
}
| no | -| [minimum\_running\_time\_in\_minutes](#input\_minimum\_running\_time\_in\_minutes) | The time an ec2 action runner should be running at minimum before terminated if non busy. | `number` | `5` | no | +| [minimum\_running\_time\_in\_minutes](#input\_minimum\_running\_time\_in\_minutes) | The time an ec2 action runner should be running at minimum before terminated if non busy. If not set the default is calculated based on the OS. | `number` | `null` | no | | [overrides](#input\_overrides) | This map provides the possibility to override some defaults. The following attributes are supported: `name_sg` overrides the `Name` tag for all security groups created by this module. `name_runner_agent_instance` overrides the `Name` tag for the ec2 instance defined in the auto launch configuration. `name_docker_machine_runners` overrides the `Name` tag spot instances created by the runner agent. | `map(string)` |
{
"name_runner": "",
"name_sg": ""
}
| no | | [role\_path](#input\_role\_path) | The path that will be added to the role; if not set, the environment name will be used. | `string` | `null` | no | | [role\_permissions\_boundary](#input\_role\_permissions\_boundary) | Permissions boundary that will be added to the created role for the lambda. | `string` | `null` | no | @@ -153,7 +156,8 @@ No modules. | [runner\_extra\_labels](#input\_runner\_extra\_labels) | Extra labels for the runners (GitHub). Separate each label by a comma | `string` | `""` | no | | [runner\_group\_name](#input\_runner\_group\_name) | Name of the runner group. | `string` | `"Default"` | no | | [runner\_iam\_role\_managed\_policy\_arns](#input\_runner\_iam\_role\_managed\_policy\_arns) | Attach AWS or customer-managed IAM policies (by ARN) to the runner IAM role | `list(string)` | `[]` | no | -| [runner\_log\_files](#input\_runner\_log\_files) | (optional) List of logfiles to send to CloudWatch, will only be used if `enable_cloudwatch_agent` is set to true. Object description: `log_group_name`: Name of the log group, `prefix_log_group`: If true, the log group name will be prefixed with `/github-self-hosted-runners/`, `file_path`: path to the log file, `log_stream_name`: name of the log stream. |
list(object({
log_group_name = string
prefix_log_group = bool
file_path = string
log_stream_name = string
}))
|
[
{
"file_path": "/var/log/messages",
"log_group_name": "messages",
"log_stream_name": "{instance_id}",
"prefix_log_group": true
},
{
"file_path": "/var/log/user-data.log",
"log_group_name": "user_data",
"log_stream_name": "{instance_id}",
"prefix_log_group": true
},
{
"file_path": "/var/log/runner-startup.log",
"log_group_name": "runner-startup",
"log_stream_name": "{instance_id}",
"prefix_log_group": true
},
{
"file_path": "/home/ec2-user/actions-runner/_diag/Runner_**.log",
"log_group_name": "runner",
"log_stream_name": "{instance_id}",
"prefix_log_group": true
}
]
| no | +| [runner\_log\_files](#input\_runner\_log\_files) | (optional) List of logfiles to send to CloudWatch, will only be used if `enable_cloudwatch_agent` is set to true. Object description: `log_group_name`: Name of the log group, `prefix_log_group`: If true, the log group name will be prefixed with `/github-self-hosted-runners/`, `file_path`: path to the log file, `log_stream_name`: name of the log stream. |
list(object({
log_group_name = string
prefix_log_group = bool
file_path = string
log_stream_name = string
}))
| `null` | no | +| [runner\_os](#input\_runner\_os) | The EC2 Operating System type to use for action runner instances (linux,win). | `string` | `"linux"` | no | | [runners\_lambda\_s3\_key](#input\_runners\_lambda\_s3\_key) | S3 key for runners lambda function. Required if using S3 bucket to specify lambdas. | `any` | `null` | no | | [runners\_lambda\_s3\_object\_version](#input\_runners\_lambda\_s3\_object\_version) | S3 object version for runners lambda function. Useful if S3 versioning is enabled on source bucket. | `any` | `null` | no | | [runners\_maximum\_count](#input\_runners\_maximum\_count) | The maximum number of runners that will be created. | `number` | `3` | no | @@ -164,7 +168,6 @@ No modules. | [sqs\_build\_queue](#input\_sqs\_build\_queue) | SQS queue to consume accepted build events. |
object({
arn = string
})
| n/a | yes | | [subnet\_ids](#input\_subnet\_ids) | List of subnets in which the action runners will be launched, the subnets needs to be subnets in the `vpc_id`. | `list(string)` | n/a | yes | | [tags](#input\_tags) | Map of tags that will be added to created resources. By default resources will be tagged with name and environment. | `map(string)` | `{}` | no | -| [enabled\_userdata](#input\_enabled_userdata) | Should the userdata script be enabled for the runner. Set this to false if you are using your own prebuilt AMI | `bool` | `true` | no | | [userdata\_post\_install](#input\_userdata\_post\_install) | User-data script snippet to insert after GitHub action runner install | `string` | `""` | no | | [userdata\_pre\_install](#input\_userdata\_pre\_install) | User-data script snippet to insert before GitHub action runner install | `string` | `""` | no | | [userdata\_template](#input\_userdata\_template) | Alternative user-data template, replacing the default template. By providing your own user\_data you have to take care of installing all required software, including the action runner. Variables userdata\_pre/post\_install are ignored. 
| `string` | `null` | no | diff --git a/modules/runners/lambdas/runners/jest.config.js b/modules/runners/lambdas/runners/jest.config.js index 79ed0ba8aa..8c7a9f17c5 100644 --- a/modules/runners/lambdas/runners/jest.config.js +++ b/modules/runners/lambdas/runners/jest.config.js @@ -2,7 +2,7 @@ module.exports = { preset: 'ts-jest', testEnvironment: 'node', collectCoverage: true, - collectCoverageFrom: ['src/**/*.{ts,js,jsx}'], + collectCoverageFrom: ['src/**/*.{ts,js,jsx}','!src/**/*local*.ts'], coverageThreshold: { global: { branches: 80, diff --git a/modules/runners/lambdas/runners/src/lambda.test.ts b/modules/runners/lambdas/runners/src/lambda.test.ts new file mode 100644 index 0000000000..43f5ffdff3 --- /dev/null +++ b/modules/runners/lambdas/runners/src/lambda.test.ts @@ -0,0 +1,136 @@ +import { fail } from 'assert'; +import { Context, SQSEvent, SQSRecord } from 'aws-lambda'; +import { mocked } from 'ts-jest/utils'; +import { scaleUpHandler } from './lambda'; +import { ActionRequestMessage, scaleUp } from './scale-runners/scale-up'; +import ScaleError from './scale-runners/ScaleError'; +import { logger } from './scale-runners/logger'; +import { scaleDown } from './scale-runners/scale-down'; + +const body: ActionRequestMessage = { + eventType: 'workflow_job', + id: 1, + installationId: 1, + repositoryName: 'name', + repositoryOwner: 'owner', +}; + +const sqsRecord: SQSRecord = { + attributes: { + ApproximateFirstReceiveTimestamp: '', + ApproximateReceiveCount: '', + SenderId: '', + SentTimestamp: '', + }, + awsRegion: '', + body: JSON.stringify(body), + eventSource: 'aws:SQS', + eventSourceARN: '', + md5OfBody: '', + messageAttributes: {}, + messageId: '', + receiptHandle: '', +}; + +const sqsEvent: SQSEvent = { + Records: [sqsRecord], +}; + +const context: Context = { + awsRequestId: '1', + callbackWaitsForEmptyEventLoop: false, + functionName: '', + functionVersion: '', + getRemainingTimeInMillis: () => 0, + invokedFunctionArn: '', + logGroupName: '', + 
logStreamName: '', + memoryLimitInMB: '', + done: () => { + return; + }, + fail: () => { + return; + }, + succeed: () => { + return; + }, +}; + +jest.mock('./scale-runners/scale-up'); +jest.mock('./scale-runners/scale-down'); +jest.mock('./scale-runners/logger'); + +describe('Test scale up lambda wrapper.', () => { + it('Do not handle multiple record sets.', async () => { + await testInvalidRecords([sqsRecord, sqsRecord]); + }); + + it('Do not handle empty record sets.', async () => { + await testInvalidRecords([]); + }); + + it('Scale without error should resolve.', async () => { + const mock = mocked(scaleUp); + mock.mockImplementation(() => { + return new Promise((resolve, reject) => { + resolve(); + }); + }); + await expect(scaleUpHandler(sqsEvent, context)).resolves; + }); + + it('Non scale should resolve.', async () => { + const error = new Error('some error'); + const mock = mocked(scaleUp); + mock.mockRejectedValue(error); + await expect(scaleUpHandler(sqsEvent, context)).resolves; + }); + + it('Scale should be rejected', async () => { + const error = new ScaleError('some scale error'); + const mock = mocked(scaleUp); + + mock.mockRejectedValue(error); + await expect(scaleUpHandler(sqsEvent, context)).rejects.toThrow(error); + }); +}); + +async function testInvalidRecords(sqsRecords: SQSRecord[]) { + const mock = mocked(scaleUp); + const logWarnSpy = jest.spyOn(logger, 'warn'); + mock.mockImplementation(() => { + return new Promise((resolve) => { + resolve(); + }); + }); + const sqsEventMultipleRecords: SQSEvent = { + Records: sqsRecords, + }; + + await expect(scaleUpHandler(sqsEventMultipleRecords, context)).resolves; + + expect(logWarnSpy).toHaveBeenCalledWith( + 'Event ignored, only one record at the time can be handled, ensure the lambda batch size is set to 1.', + undefined, + ); +} + +describe('Test scale down lambda wrapper.', () => { + it('Scaling down no error.', async () => { + const mock = mocked(scaleDown); + mock.mockImplementation(() => { + 
return new Promise((resolve) => { + resolve(); + }); + }); + await expect(scaleDown()).resolves; + }); + + it('Scaling down with error.', async () => { + const error = new Error('some error'); + const mock = mocked(scaleDown); + mock.mockRejectedValue(error); + await expect(scaleDown()).resolves; + }); +}); diff --git a/modules/runners/lambdas/runners/src/lambda.ts b/modules/runners/lambdas/runners/src/lambda.ts index a784c0d059..20e1c40135 100644 --- a/modules/runners/lambdas/runners/src/lambda.ts +++ b/modules/runners/lambdas/runners/src/lambda.ts @@ -1,31 +1,38 @@ import { scaleUp } from './scale-runners/scale-up'; import { scaleDown } from './scale-runners/scale-down'; import { SQSEvent, ScheduledEvent, Context, Callback } from 'aws-lambda'; -import { logger } from './scale-runners/logger'; +import { LogFields, logger } from './scale-runners/logger'; +import ScaleError from './scale-runners/ScaleError'; import 'source-map-support/register'; -export async function scaleUpHandler(event: SQSEvent, context: Context, callback: Callback): Promise { +export async function scaleUpHandler(event: SQSEvent, context: Context): Promise { logger.setSettings({ requestId: context.awsRequestId }); logger.debug(JSON.stringify(event)); - try { - for (const e of event.Records) { - await scaleUp(e.eventSource, JSON.parse(e.body)); - } + if (event.Records.length !== 1) { + logger.warn( + 'Event ignored, only one record at the time can be handled, ensure the lambda batch size is set to 1.', + LogFields.print(), + ); + return new Promise((resolve) => resolve()); + } - callback(null); + try { + await scaleUp(event.Records[0].eventSource, JSON.parse(event.Records[0].body)); } catch (e) { - logger.error(e); - callback('Failed handling SQS event'); + if (e instanceof ScaleError) { + throw e; + } else { + logger.warn(`Ignoring error: ${(e as Error).message}`, LogFields.print()); + } } } -export async function scaleDownHandler(event: ScheduledEvent, context: Context, callback: Callback): 
Promise { +export async function scaleDownHandler(event: ScheduledEvent, context: Context): Promise { logger.setSettings({ requestId: context.awsRequestId }); + try { await scaleDown(); - callback(null); } catch (e) { logger.error(e); - callback('Failed'); } } diff --git a/modules/runners/lambdas/runners/src/scale-runners/ScaleError.ts b/modules/runners/lambdas/runners/src/scale-runners/ScaleError.ts new file mode 100644 index 0000000000..d7e71f8c33 --- /dev/null +++ b/modules/runners/lambdas/runners/src/scale-runners/ScaleError.ts @@ -0,0 +1,9 @@ +class ScaleError extends Error { + constructor(public message: string) { + super(message); + this.name = 'ScaleError'; + this.stack = new Error().stack; + } +} + +export default ScaleError; diff --git a/modules/runners/lambdas/runners/src/scale-runners/runners.ts b/modules/runners/lambdas/runners/src/scale-runners/runners.ts index 4453d9cd0b..9670f4e025 100644 --- a/modules/runners/lambdas/runners/src/scale-runners/runners.ts +++ b/modules/runners/lambdas/runners/src/scale-runners/runners.ts @@ -90,15 +90,17 @@ export async function createRunner(runnerParameters: RunnerInputParameters, laun LogFields.print(), ); const ssm = new SSM(); - runInstancesResponse.Instances?.forEach(async (i: EC2.Instance) => { - await ssm - .putParameter({ - Name: runnerParameters.environment + '-' + (i.InstanceId as string), - Value: runnerParameters.runnerServiceConfig, - Type: 'SecureString', - }) - .promise(); - }); + if (runInstancesResponse.Instances) { + for (let i = 0; i < runInstancesResponse.Instances?.length; i++) { + await ssm + .putParameter({ + Name: runnerParameters.environment + '-' + (runInstancesResponse.Instances[i].InstanceId as string), + Value: runnerParameters.runnerServiceConfig, + Type: 'SecureString', + }) + .promise(); + } + } } function getInstanceParams( diff --git a/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts b/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts index 
82191bc427..6bd0449f0b 100644 --- a/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts +++ b/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts @@ -4,6 +4,7 @@ import { listEC2Runners, createRunner, RunnerInputParameters } from './runners'; import * as ghAuth from './gh-auth'; import nock from 'nock'; import { Octokit } from '@octokit/rest'; +import ScaleError from './ScaleError'; const mockOctokit = { checks: { get: jest.fn() }, @@ -53,7 +54,7 @@ const cleanEnv = process.env; const EXPECTED_RUNNER_PARAMS: RunnerInputParameters = { environment: 'unit-test-environment', - runnerServiceConfig: `--url https://github.enterprise.something/${TEST_DATA.repositoryOwner} --token 1234abcd `, + runnerServiceConfig: `--url https://github.enterprise.something/${TEST_DATA.repositoryOwner} --token 1234abcd`, runnerType: 'Org', runnerOwner: TEST_DATA.repositoryOwner, }; @@ -226,7 +227,7 @@ describe('scaleUp with GHES', () => { process.env.RUNNER_GROUP_NAME = 'TEST_GROUP'; await scaleUpModule.scaleUp('aws:sqs', TEST_DATA); expectedRunnerParams.runnerServiceConfig = - expectedRunnerParams.runnerServiceConfig + `--labels label1,label2 --runnergroup TEST_GROUP`; + expectedRunnerParams.runnerServiceConfig + ` --labels label1,label2 --runnergroup TEST_GROUP`; expect(createRunner).toBeCalledWith(expectedRunnerParams, 'lt-1'); }); @@ -260,7 +261,7 @@ describe('scaleUp with GHES', () => { expectedRunnerParams.runnerServiceConfig = `--url ` + `https://github.enterprise.something/${TEST_DATA.repositoryOwner}/${TEST_DATA.repositoryName} ` + - `--token 1234abcd `; + `--token 1234abcd`; }); it('gets the current repo level runners', async () => { @@ -326,7 +327,7 @@ describe('scaleUp with GHES', () => { it('creates a runner with correct config and labels', async () => { process.env.RUNNER_EXTRA_LABELS = 'label1,label2'; await scaleUpModule.scaleUp('aws:sqs', TEST_DATA); - expectedRunnerParams.runnerServiceConfig = expectedRunnerParams.runnerServiceConfig + 
`--labels label1,label2`; + expectedRunnerParams.runnerServiceConfig = expectedRunnerParams.runnerServiceConfig + ` --labels label1,label2`; expect(createRunner).toBeCalledWith(expectedRunnerParams, 'lt-1'); }); @@ -334,7 +335,7 @@ describe('scaleUp with GHES', () => { process.env.RUNNER_EXTRA_LABELS = 'label1,label2'; process.env.RUNNER_GROUP_NAME = 'TEST_GROUP_IGNORED'; await scaleUpModule.scaleUp('aws:sqs', TEST_DATA); - expectedRunnerParams.runnerServiceConfig = expectedRunnerParams.runnerServiceConfig + `--labels label1,label2`; + expectedRunnerParams.runnerServiceConfig = expectedRunnerParams.runnerServiceConfig + ` --labels label1,label2`; expect(createRunner).toBeCalledWith(expectedRunnerParams, 'lt-1'); }); @@ -407,7 +408,7 @@ describe('scaleUp with public GH', () => { process.env.ENABLE_ORGANIZATION_RUNNERS = 'true'; expectedRunnerParams = { ...EXPECTED_RUNNER_PARAMS }; expectedRunnerParams.runnerServiceConfig = - `--url https://github.com/${TEST_DATA.repositoryOwner} ` + `--token 1234abcd `; + `--url https://github.com/${TEST_DATA.repositoryOwner} ` + `--token 1234abcd`; }); it('gets the current org level runners', async () => { @@ -449,7 +450,7 @@ describe('scaleUp with public GH', () => { process.env.RUNNER_GROUP_NAME = 'TEST_GROUP'; await scaleUpModule.scaleUp('aws:sqs', TEST_DATA); expectedRunnerParams.runnerServiceConfig = - expectedRunnerParams.runnerServiceConfig + `--labels label1,label2 --runnergroup TEST_GROUP`; + expectedRunnerParams.runnerServiceConfig + ` --labels label1,label2 --runnergroup TEST_GROUP`; expect(createRunner).toBeCalledWith(expectedRunnerParams, LAUNCH_TEMPLATE); }); @@ -470,7 +471,7 @@ describe('scaleUp with public GH', () => { expectedRunnerParams.runnerType = 'Repo'; expectedRunnerParams.runnerOwner = `${TEST_DATA.repositoryOwner}/${TEST_DATA.repositoryName}`; expectedRunnerParams.runnerServiceConfig = - `--url https://github.com/${TEST_DATA.repositoryOwner}/${TEST_DATA.repositoryName} ` + `--token 1234abcd `; + `--url 
https://github.com/${TEST_DATA.repositoryOwner}/${TEST_DATA.repositoryName} ` + `--token 1234abcd`; }); it('gets the current repo level runners', async () => { @@ -521,7 +522,7 @@ describe('scaleUp with public GH', () => { it('creates a runner with correct config and labels', async () => { process.env.RUNNER_EXTRA_LABELS = 'label1,label2'; await scaleUpModule.scaleUp('aws:sqs', TEST_DATA); - expectedRunnerParams.runnerServiceConfig = expectedRunnerParams.runnerServiceConfig + `--labels label1,label2`; + expectedRunnerParams.runnerServiceConfig = expectedRunnerParams.runnerServiceConfig + ` --labels label1,label2`; expect(createRunner).toBeCalledWith(expectedRunnerParams, LAUNCH_TEMPLATE); }); @@ -529,7 +530,7 @@ describe('scaleUp with public GH', () => { process.env.RUNNER_EXTRA_LABELS = 'label1,label2'; process.env.RUNNER_GROUP_NAME = 'TEST_GROUP_IGNORED'; await scaleUpModule.scaleUp('aws:sqs', TEST_DATA); - expectedRunnerParams.runnerServiceConfig = expectedRunnerParams.runnerServiceConfig + `--labels label1,label2`; + expectedRunnerParams.runnerServiceConfig = expectedRunnerParams.runnerServiceConfig + ` --labels label1,label2`; expect(createRunner).toBeCalledWith(expectedRunnerParams, LAUNCH_TEMPLATE); }); @@ -541,5 +542,28 @@ describe('scaleUp with public GH', () => { expect(createRunner).toHaveBeenNthCalledWith(1, expectedRunnerParams, 'lt-1'); expect(createRunner).toHaveBeenNthCalledWith(2, expectedRunnerParams, 'lt-2'); }); + + it('ephemeral runners only run with workflow_job event, others should fail.', async () => { + process.env.ENABLE_EPHEMERAL_RUNNERS = 'true'; + await expect( + scaleUpModule.scaleUp('aws:sqs', { + ...TEST_DATA, + eventType: 'check_run', + }), + ).rejects.toBeInstanceOf(Error); + }); + + it('creates a ephemeral runner.', async () => { + process.env.ENABLE_EPHEMERAL_RUNNERS = 'true'; + await scaleUpModule.scaleUp('aws:sqs', TEST_DATA); + expectedRunnerParams.runnerServiceConfig = expectedRunnerParams.runnerServiceConfig + ` 
--ephemeral`; + expect(createRunner).toBeCalledWith(expectedRunnerParams, LAUNCH_TEMPLATE); + }); + + it('Scaling error should cause reject so retry can be triggered.', async () => { + process.env.RUNNERS_MAXIMUM_COUNT = '1'; + process.env.ENABLE_EPHEMERAL_RUNNERS = 'true'; + await expect(scaleUpModule.scaleUp('aws:sqs', TEST_DATA)).rejects.toBeInstanceOf(ScaleError); + }); }); }); diff --git a/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts b/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts index 6e6c40f606..8422c09540 100644 --- a/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts +++ b/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts @@ -2,7 +2,8 @@ import { listEC2Runners, createRunner, RunnerInputParameters } from './runners'; import { createOctoClient, createGithubAppAuth, createGithubInstallationAuth } from './gh-auth'; import yn from 'yn'; import { Octokit } from '@octokit/rest'; -import { logger as rootLogger, LogFields } from './logger'; +import { LogFields, logger as rootLogger } from './logger'; +import ScaleError from './ScaleError'; const logger = rootLogger.getChildLogger({ name: 'scale-up' }); @@ -15,6 +16,11 @@ export interface ActionRequestMessage { } export async function scaleUp(eventSource: string, payload: ActionRequestMessage): Promise { + logger.info( + `Received ${payload.eventType} from ${payload.repositoryOwner}/${payload.repositoryName}`, + LogFields.print(), + ); + if (eventSource !== 'aws:sqs') throw Error('Cannot handle non-SQS events!'); const enableOrgLevel = yn(process.env.ENABLE_ORGANIZATION_RUNNERS, { default: true }); const maximumRunners = parseInt(process.env.RUNNERS_MAXIMUM_COUNT || '3'); @@ -22,7 +28,19 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage const runnerGroup = process.env.RUNNER_GROUP_NAME; const environment = process.env.ENVIRONMENT; const ghesBaseUrl = process.env.GHES_URL; + const ephemeralEnabled = 
yn(process.env.ENABLE_EPHEMERAL_RUNNERS, { default: false }); + if (ephemeralEnabled && payload.eventType !== 'workflow_job') { + logger.warn( + `${payload.eventType} event is not supported in combination with ephemeral runners.`, + LogFields.print(), + ); + throw Error( + `The event type ${payload.eventType} is not supported in combination with ephemeral runners.` + + `Please ensure you have enabled workflow_job events.`, + ); + } + const ephemeral = ephemeralEnabled && payload.eventType === 'workflow_job'; const runnerType = enableOrgLevel ? 'Org' : 'Repo'; const runnerOwner = enableOrgLevel ? payload.repositoryOwner : `${payload.repositoryOwner}/${payload.repositoryName}`; @@ -60,8 +78,7 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage const ghAuth = await createGithubInstallationAuth(installationId, ghesApiUrl); const githubInstallationClient = await createOctoClient(ghAuth.token, ghesApiUrl); - const isQueued = await getJobStatus(githubInstallationClient, payload); - if (isQueued) { + if (ephemeral || (await getJobStatus(githubInstallationClient, payload))) { const currentRunners = await listEC2Runners({ environment, runnerType, @@ -80,21 +97,25 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage }); const token = registrationToken.data.token; - const labelsArgument = runnerExtraLabels !== undefined ? `--labels ${runnerExtraLabels}` : ''; - const runnerGroupArgument = runnerGroup !== undefined ? ` --runnergroup ${runnerGroup}` : ''; + const labelsArgument = runnerExtraLabels !== undefined ? `--labels ${runnerExtraLabels} ` : ''; + const runnerGroupArgument = runnerGroup !== undefined ? `--runnergroup ${runnerGroup} ` : ''; const configBaseUrl = ghesBaseUrl ? ghesBaseUrl : 'https://github.com'; + const ephemeralArgument = ephemeral ? 
'--ephemeral ' : ''; + const runnerArgs = `--token ${token} ${labelsArgument}${ephemeralArgument}`; await createRunnerLoop({ environment, runnerServiceConfig: enableOrgLevel - ? `--url ${configBaseUrl}/${payload.repositoryOwner} --token ${token} ${labelsArgument}${runnerGroupArgument}` - : `--url ${configBaseUrl}/${payload.repositoryOwner}/${payload.repositoryName} ` + - `--token ${token} ${labelsArgument}`, + ? `--url ${configBaseUrl}/${payload.repositoryOwner} ${runnerArgs}${runnerGroupArgument}`.trim() + : `--url ${configBaseUrl}/${payload.repositoryOwner}/${payload.repositoryName} ${runnerArgs}`.trim(), runnerOwner, runnerType, }); } else { logger.info('No runner will be created, maximum number of runners reached.', LogFields.print()); + if (ephemeral) { + throw new ScaleError('No runners create: maximum of runners reached.'); + } } } } @@ -139,6 +160,6 @@ export async function createRunnerLoop(runnerParameters: RunnerInputParameters): } } if (launched == false) { - throw Error('All launch templates failed'); + throw new ScaleError('All launch templates failed'); } } diff --git a/modules/runners/policies-runner.tf b/modules/runners/policies-runner.tf index 2ac1b87454..dc90d47b0b 100644 --- a/modules/runners/policies-runner.tf +++ b/modules/runners/policies-runner.tf @@ -54,4 +54,11 @@ resource "aws_iam_role_policy_attachment" "managed_policies" { policy_arn = element(var.runner_iam_role_managed_policy_arns, count.index) } + +resource "aws_iam_role_policy" "ec2" { + name = "ec2" + role = aws_iam_role.runner.name + policy = templatefile("${path.module}/policies/instance-ec2.json", {}) +} + // see also logging.tf for logging and metrics policies diff --git a/modules/runners/policies/instance-ec2.json b/modules/runners/policies/instance-ec2.json new file mode 100644 index 0000000000..4a5bc578f5 --- /dev/null +++ b/modules/runners/policies/instance-ec2.json @@ -0,0 +1,15 @@ +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": 
"ec2:TerminateInstances", + "Resource": "*", + "Condition": { + "StringEquals": { + "aws:ARN": "$${ec2:SourceInstanceARN}" + } + } + } + ] +} diff --git a/modules/runners/runner-config.tf b/modules/runners/runner-config.tf index eb6370e58f..83ec7929cd 100644 --- a/modules/runners/runner-config.tf +++ b/modules/runners/runner-config.tf @@ -6,10 +6,9 @@ resource "aws_ssm_parameter" "runner_config_run_as" { } resource "aws_ssm_parameter" "runner_agent_mode" { - name = "/${var.environment}/runner/agent-mode" - type = "String" - # TODO: Update this to allow for ephemeral runners - value = "persistent" + name = "/${var.environment}/runner/agent-mode" + type = "String" + value = var.enable_ephemeral_runners ? "ephemeral" : "persistent" tags = local.tags } diff --git a/modules/runners/scale-up.tf b/modules/runners/scale-up.tf index 486ce3de9f..e217cf4b74 100644 --- a/modules/runners/scale-up.tf +++ b/modules/runners/scale-up.tf @@ -28,6 +28,7 @@ resource "aws_lambda_function" "scale_up" { RUNNER_GROUP_NAME = var.runner_group_name RUNNERS_MAXIMUM_COUNT = var.runners_maximum_count SUBNET_IDS = join(",", var.subnet_ids) + ENABLE_EPHEMERAL_RUNNERS = var.enable_ephemeral_runners } } @@ -49,6 +50,7 @@ resource "aws_cloudwatch_log_group" "scale_up" { resource "aws_lambda_event_source_mapping" "scale_up" { event_source_arn = var.sqs_build_queue.arn function_name = aws_lambda_function.scale_up.arn + batch_size = 1 } resource "aws_lambda_permission" "scale_runners_lambda" { diff --git a/modules/runners/templates/start-runner.sh b/modules/runners/templates/start-runner.sh index 6da70d1f9b..3cedc0862b 100644 --- a/modules/runners/templates/start-runner.sh +++ b/modules/runners/templates/start-runner.sh @@ -29,12 +29,11 @@ echo "Retrieved /$environment/runner/enable-cloudwatch parameter - ($enable_clou agent_mode=$(echo "$parameters" | jq --arg environment "$environment" -r '.[] | select(.Name == "/\($environment)/runner/agent-mode") | .Value') echo "Retrieved 
/$environment/runner/agent-mode parameter - ($agent_mode)" -if [[ -n "$enable_cloudwatch_agent" ]]; then - echo "Cloudwatch is enabled" +if [[ -n "$enable_cloudwatch_agent" ]]; then + echo "Cloudwatch is enabled" amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -s -c "ssm:$environment-cloudwatch_agent_config_runner" fi - ## Configure the runner echo "Get GH Runner config from AWS SSM" @@ -66,18 +65,18 @@ sudo --preserve-env=RUNNER_ALLOW_RUNASROOT -u "$run_as" -- ./config.sh --unatten echo "Starting runner after $(awk '{print int($1/3600)":"int(($1%3600)/60)":"int($1%60)}' /proc/uptime)" echo "Starting the runner as user $run_as" -if [[ $agent_mode = "ephemeral" ]]; then +if [[ $agent_mode = "ephemeral" ]]; then echo "Starting the runner in ephemeral mode" sudo --preserve-env=RUNNER_ALLOW_RUNASROOT -u "$run_as" -- ./run.sh echo "Runner has finished" - + echo "Stopping cloudwatch service" - service awslogsd stop + systemctl stop amazon-cloudwatch-agent.service echo "Terminating instance" aws ec2 terminate-instances --instance-ids "$instance_id" --region "$region" -else +else echo "Installing the runner as a service" ./svc.sh install "$run_as" echo "Starting the runner in persistent mode" ./svc.sh start -fi \ No newline at end of file +fi diff --git a/modules/runners/variables.tf b/modules/runners/variables.tf index 80416b7894..c46e88340a 100644 --- a/modules/runners/variables.tf +++ b/modules/runners/variables.tf @@ -416,5 +416,10 @@ variable "metadata_options" { http_tokens = "optional" http_put_response_hop_limit = 1 } +} +variable "enable_ephemeral_runners" { + description = "Enable ephemeral runners, runners will only be used once." 
+ type = bool + default = false } diff --git a/modules/webhook/README.md b/modules/webhook/README.md index ad9beca0b1..2430972b5e 100644 --- a/modules/webhook/README.md +++ b/modules/webhook/README.md @@ -38,66 +38,69 @@ yarn run dist | Name | Version | |------|---------| -| terraform | >= 0.14.1 | -| aws | >= 3.38 | +| [terraform](#requirement\_terraform) | >= 0.14.1 | +| [aws](#requirement\_aws) | >= 3.38 | ## Providers | Name | Version | |------|---------| -| aws | >= 3.38 | +| [aws](#provider\_aws) | >= 3.38 | ## Modules -No Modules. +No modules. ## Resources -| Name | -|------| -| [aws_apigatewayv2_api](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/apigatewayv2_api) | -| [aws_apigatewayv2_integration](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/apigatewayv2_integration) | -| [aws_apigatewayv2_route](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/apigatewayv2_route) | -| [aws_apigatewayv2_stage](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/apigatewayv2_stage) | -| [aws_cloudwatch_log_group](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | -| [aws_iam_policy_document](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | -| [aws_iam_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | -| [aws_iam_role_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | -| [aws_lambda_function](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_function) | -| [aws_lambda_permission](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | +| Name | Type | +|------|------| +| [aws_apigatewayv2_api.webhook](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/apigatewayv2_api) | 
resource | +| [aws_apigatewayv2_integration.webhook](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/apigatewayv2_integration) | resource | +| [aws_apigatewayv2_route.webhook](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/apigatewayv2_route) | resource | +| [aws_apigatewayv2_stage.webhook](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/apigatewayv2_stage) | resource | +| [aws_cloudwatch_log_group.webhook](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource | +| [aws_iam_role.webhook_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | +| [aws_iam_role_policy.webhook_logging](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | +| [aws_iam_role_policy.webhook_sqs](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | +| [aws_iam_role_policy.webhook_ssm](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | +| [aws_lambda_function.webhook](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_function) | resource | +| [aws_lambda_permission.webhook](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | resource | +| [aws_iam_policy_document.lambda_assume_role_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | ## Inputs | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| aws\_region | AWS region. | `string` | n/a | yes | -| disable\_check\_wokflow\_job\_labels | Disable the the check of workflow labels. | `bool` | `false` | no | -| environment | A name that identifies the environment, used as prefix and for tagging. 
| `string` | n/a | yes | -| github\_app\_webhook\_secret\_arn | n/a | `string` | n/a | yes | -| kms\_key\_arn | Optional CMK Key ARN to be used for Parameter Store. | `string` | `null` | no | -| lambda\_s3\_bucket | S3 bucket from which to specify lambda functions. This is an alternative to providing local files directly. | `any` | `null` | no | -| lambda\_timeout | Time out of the lambda in seconds. | `number` | `10` | no | -| lambda\_zip | File location of the lambda zip file. | `string` | `null` | no | -| log\_level | Logging level for lambda logging. Valid values are 'silly', 'trace', 'debug', 'info', 'warn', 'error', 'fatal'. | `string` | `"info"` | no | -| log\_type | Logging format for lambda logging. Valid values are 'json', 'pretty', 'hidden'. | `string` | `"pretty"` | no | -| logging\_retention\_in\_days | Specifies the number of days you want to retain log events for the lambda log group. Possible values are: 0, 1, 3, 5, 7, 14, 30, 60, 90, 120, 150, 180, 365, 400, 545, 731, 1827, and 3653. | `number` | `7` | no | -| repository\_white\_list | List of repositories allowed to use the github app | `list(string)` | `[]` | no | -| role\_path | The path that will be added to the role; if not set, the environment name will be used. | `string` | `null` | no | -| role\_permissions\_boundary | Permissions boundary that will be added to the created role for the lambda. | `string` | `null` | no | -| runner\_extra\_labels | Extra labels for the runners (GitHub). Separate each label by a comma | `string` | `""` | no | -| sqs\_build\_queue | SQS queue to publish accepted build events. |
object({
id = string
arn = string
})
| n/a | yes | -| tags | Map of tags that will be added to created resources. By default resources will be tagged with name and environment. | `map(string)` | `{}` | no | -| webhook\_lambda\_s3\_key | S3 key for webhook lambda function. Required if using S3 bucket to specify lambdas. | `any` | `null` | no | -| webhook\_lambda\_s3\_object\_version | S3 object version for webhook lambda function. Useful if S3 versioning is enabled on source bucket. | `any` | `null` | no | +| [aws\_region](#input\_aws\_region) | AWS region. | `string` | n/a | yes | +| [disable\_check\_wokflow\_job\_labels](#input\_disable\_check\_wokflow\_job\_labels) | Disable the the check of workflow labels. | `bool` | `false` | no | +| [environment](#input\_environment) | A name that identifies the environment, used as prefix and for tagging. | `string` | n/a | yes | +| [github\_app\_webhook\_secret\_arn](#input\_github\_app\_webhook\_secret\_arn) | n/a | `string` | n/a | yes | +| [kms\_key\_arn](#input\_kms\_key\_arn) | Optional CMK Key ARN to be used for Parameter Store. | `string` | `null` | no | +| [lambda\_s3\_bucket](#input\_lambda\_s3\_bucket) | S3 bucket from which to specify lambda functions. This is an alternative to providing local files directly. | `any` | `null` | no | +| [lambda\_timeout](#input\_lambda\_timeout) | Time out of the lambda in seconds. | `number` | `10` | no | +| [lambda\_zip](#input\_lambda\_zip) | File location of the lambda zip file. | `string` | `null` | no | +| [log\_level](#input\_log\_level) | Logging level for lambda logging. Valid values are 'silly', 'trace', 'debug', 'info', 'warn', 'error', 'fatal'. | `string` | `"info"` | no | +| [log\_type](#input\_log\_type) | Logging format for lambda logging. Valid values are 'json', 'pretty', 'hidden'. | `string` | `"pretty"` | no | +| [logging\_retention\_in\_days](#input\_logging\_retention\_in\_days) | Specifies the number of days you want to retain log events for the lambda log group. 
Possible values are: 0, 1, 3, 5, 7, 14, 30, 60, 90, 120, 150, 180, 365, 400, 545, 731, 1827, and 3653. | `number` | `7` | no | +| [repository\_white\_list](#input\_repository\_white\_list) | List of repositories allowed to use the github app | `list(string)` | `[]` | no | +| [role\_path](#input\_role\_path) | The path that will be added to the role; if not set, the environment name will be used. | `string` | `null` | no | +| [role\_permissions\_boundary](#input\_role\_permissions\_boundary) | Permissions boundary that will be added to the created role for the lambda. | `string` | `null` | no | +| [runner\_extra\_labels](#input\_runner\_extra\_labels) | Extra labels for the runners (GitHub). Separate each label by a comma | `string` | `""` | no | +| [sqs\_build\_queue](#input\_sqs\_build\_queue) | SQS queue to publish accepted build events. |
object({
id = string
arn = string
})
| n/a | yes | +| [sqs\_build\_queue\_fifo](#input\_sqs\_build\_queue\_fifo) | Enable a FIFO queue to remain the order of events received by the webhook. Suggest to set to true for repo level runners. | `bool` | `false` | no | +| [tags](#input\_tags) | Map of tags that will be added to created resources. By default resources will be tagged with name and environment. | `map(string)` | `{}` | no | +| [webhook\_lambda\_s3\_key](#input\_webhook\_lambda\_s3\_key) | S3 key for webhook lambda function. Required if using S3 bucket to specify lambdas. | `any` | `null` | no | +| [webhook\_lambda\_s3\_object\_version](#input\_webhook\_lambda\_s3\_object\_version) | S3 object version for webhook lambda function. Useful if S3 versioning is enabled on source bucket. | `any` | `null` | no | ## Outputs | Name | Description | |------|-------------| -| endpoint\_relative\_path | n/a | -| gateway | n/a | -| lambda | n/a | -| role | n/a | +| [endpoint\_relative\_path](#output\_endpoint\_relative\_path) | n/a | +| [gateway](#output\_gateway) | n/a | +| [lambda](#output\_lambda) | n/a | +| [role](#output\_role) | n/a | ## Philips Forest diff --git a/modules/webhook/lambdas/webhook/jest.config.js b/modules/webhook/lambdas/webhook/jest.config.js index 4a5b465ecb..02a6524ce9 100644 --- a/modules/webhook/lambdas/webhook/jest.config.js +++ b/modules/webhook/lambdas/webhook/jest.config.js @@ -1,4 +1,14 @@ module.exports = { preset: 'ts-jest', testEnvironment: 'node', + collectCoverage: true, + collectCoverageFrom: ['src/**/*.{ts,js,jsx}', '!src/**/*local*.ts'], + coverageThreshold: { + global: { + branches: 85, + functions: 85, + lines: 85, + statements: 85 + } + } }; diff --git a/modules/webhook/lambdas/webhook/src/sqs/index.test.ts b/modules/webhook/lambdas/webhook/src/sqs/index.test.ts new file mode 100644 index 0000000000..de8570157f --- /dev/null +++ b/modules/webhook/lambdas/webhook/src/sqs/index.test.ts @@ -0,0 +1,65 @@ +import { SQS } from 'aws-sdk'; +import { sendActionRequest, 
ActionRequestMessage } from '.'; + +const mockSQS = { + sendMessage: jest.fn(() => { + { + return { promise: jest.fn() }; + } + }), +}; +jest.mock('aws-sdk', () => ({ + SQS: jest.fn().mockImplementation(() => mockSQS), +})); + +describe('Test sending message to SQS.', () => { + const message: ActionRequestMessage = { + eventType: 'type', + id: 0, + installationId: 0, + repositoryName: 'test', + repositoryOwner: 'owner', + }; + const sqsMessage: SQS.Types.SendMessageRequest = { + QueueUrl: 'https://sqs.eu-west-1.amazonaws.com/123456789/queued-builds', + MessageBody: JSON.stringify(message), + }; + + it('no fifo queue, based on defaults', async () => { + // Arrange + process.env.SQS_URL_WEBHOOK = sqsMessage.QueueUrl; + + // Act + const result = await sendActionRequest(message); + + // Assert + expect(mockSQS.sendMessage).toBeCalledWith(sqsMessage); + expect(result).resolves; + }); + + it('no fifo queue', async () => { + // Arrange + process.env.SQS_URL_WEBHOOK = sqsMessage.QueueUrl; + process.env.SQS_IS_FIFO = 'false'; + + // Act + const result = await sendActionRequest(message); + + // Assert + expect(mockSQS.sendMessage).toBeCalledWith(sqsMessage); + expect(result).resolves; + }); + + it('use a fifo queue', async () => { + // Arrange + process.env.SQS_URL_WEBHOOK = sqsMessage.QueueUrl; + process.env.SQS_IS_FIFO = 'true'; + + // Act + const result = await sendActionRequest(message); + + // Assert + expect(mockSQS.sendMessage).toBeCalledWith({ ...sqsMessage, MessageGroupId: String(message.id) }); + expect(result).resolves; + }); +}); diff --git a/modules/webhook/lambdas/webhook/src/sqs/index.ts b/modules/webhook/lambdas/webhook/src/sqs/index.ts index 1a6e75e808..63a2a240f2 100644 --- a/modules/webhook/lambdas/webhook/src/sqs/index.ts +++ b/modules/webhook/lambdas/webhook/src/sqs/index.ts @@ -1,10 +1,5 @@ -import AWS, { SQS } from 'aws-sdk'; - -AWS.config.update({ - region: process.env.AWS_REGION, -}); - -const sqs = new SQS(); +import { SQS } from 'aws-sdk'; +import 
{ LogFields, logger as logger } from '../webhook/logger'; export interface ActionRequestMessage { id: number; @@ -15,11 +10,20 @@ export interface ActionRequestMessage { } export const sendActionRequest = async (message: ActionRequestMessage): Promise => { - await sqs - .sendMessage({ - QueueUrl: String(process.env.SQS_URL_WEBHOOK), - MessageBody: JSON.stringify(message), - MessageGroupId: String(message.id), - }) - .promise(); + const sqs = new SQS({ region: process.env.AWS_REGION }); + + const useFifoQueueEnv = process.env.SQS_IS_FIFO || 'false'; + const useFifoQueue = JSON.parse(useFifoQueueEnv) as boolean; + + const sqsMessage: SQS.Types.SendMessageRequest = { + QueueUrl: String(process.env.SQS_URL_WEBHOOK), + MessageBody: JSON.stringify(message), + }; + + logger.debug(`sending message to SQS: ${JSON.stringify(sqsMessage)}`, LogFields.print()); + if (useFifoQueue) { + sqsMessage.MessageGroupId = String(message.id); + } + + await sqs.sendMessage(sqsMessage).promise(); }; diff --git a/modules/webhook/lambdas/webhook/src/webhook/handler.test.ts b/modules/webhook/lambdas/webhook/src/webhook/handler.test.ts index 72e84219ec..76bbf737a3 100644 --- a/modules/webhook/lambdas/webhook/src/webhook/handler.test.ts +++ b/modules/webhook/lambdas/webhook/src/webhook/handler.test.ts @@ -98,6 +98,17 @@ describe('handler', () => { expect(sendActionRequest).not.toBeCalled(); }); + it('handles workflow_job events without installation id', async () => { + const event = JSON.stringify({ ...workflowjob_event, installation: null }); + process.env.REPOSITORY_WHITE_LIST = '["philips-labs/terraform-aws-github-runner"]'; + const resp = await handle( + { 'X-Hub-Signature': await webhooks.sign(event), 'X-GitHub-Event': 'workflow_job' }, + event, + ); + expect(resp.statusCode).toBe(201); + expect(sendActionRequest).toBeCalled(); + }); + it('handles workflow_job events from whitelisted repositories', async () => { const event = JSON.stringify(workflowjob_event); 
process.env.REPOSITORY_WHITE_LIST = '["philips-labs/terraform-aws-github-runner"]'; @@ -264,5 +275,15 @@ describe('handler', () => { expect(resp.statusCode).toBe(201); expect(sendActionRequest).toBeCalled(); }); + + it('handles check_run events with no installation id.', async () => { + const event = JSON.stringify({ ...checkrun_event, installation: { id: null } }); + const resp = await handle( + { 'X-Hub-Signature': await webhooks.sign(event), 'X-GitHub-Event': 'check_run' }, + event, + ); + expect(resp.statusCode).toBe(201); + expect(sendActionRequest).toBeCalled(); + }); }); }); diff --git a/modules/webhook/lambdas/webhook/src/webhook/handler.ts b/modules/webhook/lambdas/webhook/src/webhook/handler.ts index 11b9b1ec76..7e56480e2b 100644 --- a/modules/webhook/lambdas/webhook/src/webhook/handler.ts +++ b/modules/webhook/lambdas/webhook/src/webhook/handler.ts @@ -101,10 +101,7 @@ async function handleWorkflowJob(body: WorkflowJobEvent, githubEvent: string): P }; } - let installationId = body.installation?.id; - if (installationId == null) { - installationId = 0; - } + const installationId = getInstallationId(body); if (body.action === 'queued') { await sendActionRequest({ id: body.workflow_job.id, @@ -119,10 +116,7 @@ async function handleWorkflowJob(body: WorkflowJobEvent, githubEvent: string): P } async function handleCheckRun(body: CheckRunEvent, githubEvent: string): Promise { - let installationId = body.installation?.id; - if (installationId == null) { - installationId = 0; - } + const installationId = getInstallationId(body); if (body.action === 'created' && body.check_run.status === 'queued') { await sendActionRequest({ id: body.check_run.id, @@ -136,6 +130,14 @@ async function handleCheckRun(body: CheckRunEvent, githubEvent: string): Promise return { statusCode: 201 }; } +function getInstallationId(body: WorkflowJobEvent | CheckRunEvent) { + let installationId = body.installation?.id; + if (installationId == null) { + installationId = 0; + } + return 
installationId; +} + function isRepoNotAllowed(repo_full_name: string): boolean { const repositoryWhiteListEnv = process.env.REPOSITORY_WHITE_LIST || '[]'; const repositoryWhiteList = JSON.parse(repositoryWhiteListEnv) as Array; diff --git a/modules/webhook/variables.tf b/modules/webhook/variables.tf index 5a767fc1b8..1eb17c9d38 100644 --- a/modules/webhook/variables.tf +++ b/modules/webhook/variables.tf @@ -126,3 +126,9 @@ variable "disable_check_wokflow_job_labels" { type = bool default = false } + +variable "sqs_build_queue_fifo" { + description = "Enable a FIFO queue to remain the order of events received by the webhook. Suggest to set to true for repo level runners." + type = bool + default = false +} diff --git a/modules/webhook/webhook.tf b/modules/webhook/webhook.tf index 1115b985f1..8ea42b2d28 100644 --- a/modules/webhook/webhook.tf +++ b/modules/webhook/webhook.tf @@ -19,6 +19,7 @@ resource "aws_lambda_function" "webhook" { REPOSITORY_WHITE_LIST = jsonencode(var.repository_white_list) RUNNER_LABELS = jsonencode(split(",", var.runner_extra_labels)) SQS_URL_WEBHOOK = var.sqs_build_queue.id + SQS_IS_FIFO = var.sqs_build_queue_fifo } } diff --git a/outputs.tf b/outputs.tf index 1c2bef291a..8dcc418b77 100644 --- a/outputs.tf +++ b/outputs.tf @@ -32,3 +32,12 @@ output "webhook" { output "ssm_parameters" { value = module.ssm.parameters } + + +output "queues" { + description = "SQS queues." + value = { + build_queue_arn = aws_sqs_queue.queued_builds.arn + build_queue_dlq_arn = var.redrive_build_queue.enabled ? aws_sqs_queue.queued_builds_dlq[0].arn : null + } +} diff --git a/variables.tf b/variables.tf index 103b400937..2dbb2b80d7 100644 --- a/variables.tf +++ b/variables.tf @@ -96,7 +96,7 @@ variable "runners_lambda_zip" { variable "runners_scale_up_lambda_timeout" { description = "Time out for the scale up lambda in seconds." 
type = number - default = 180 + default = 30 } variable "runners_scale_down_lambda_timeout" { @@ -451,7 +451,12 @@ variable "runner_metadata_options" { http_tokens = "optional" http_put_response_hop_limit = 1 } +} +variable "enable_ephemeral_runners" { + description = "Enable ephemeral runners, runners will only be used once." + type = bool + default = false } variable "runner_os" { @@ -473,3 +478,25 @@ variable "lambda_principals" { })) default = [] } + +variable "fifo_build_queue" { + description = "Enable a FIFO queue to retain the order of events received by the webhook. Suggest to set to true for repo level runners." + type = bool + default = false +} + +variable "redrive_build_queue" { + description = "Set options to attach (optional) a dead letter queue to the build queue, the queue between the webhook and the scale up lambda. You have the following options. 1. Disable by setting `enabled` to `false`. 2. Enable by setting `enabled` to `true`, `maxReceiveCount` to a number of max retries." + type = object({ + enabled = bool + maxReceiveCount = number + }) + default = { + enabled = false + maxReceiveCount = null + } + validation { + condition = var.redrive_build_queue.enabled && var.redrive_build_queue.maxReceiveCount != null || !var.redrive_build_queue.enabled + error_message = "Ensure you have set the maxReceiveCount when enabled." + } +}