From 01b1dc1b93ff30f31be7ae46c1b0d33b5530241a Mon Sep 17 00:00:00 2001
From: Niek Palm <dev.npalm@gmail.com>
Date: Wed, 16 Mar 2022 09:28:40 +0100
Subject: [PATCH] feat: Add option for ephemeral to check builds status before
 scaling

---
 README.md                                      |  6 ++++--
 examples/ephemeral/main.tf                     |  3 +++
 main.tf                                        |  1 +
 modules/runners/README.md                      |  1 +
 .../runners/src/scale-runners/scale-up.test.ts | 18 ++++++++++++++++++
 .../runners/src/scale-runners/scale-up.ts      |  3 ++-
 modules/runners/main.tf                        |  2 ++
 modules/runners/scale-up.tf                    |  1 +
 modules/runners/variables.tf                   |  6 ++++++
 variables.tf                                   |  6 ++++++
 10 files changed, 44 insertions(+), 3 deletions(-)
diff --git a/README.md b/README.md
index 5f7e33beeb..28e7b54cf2 100644
--- a/README.md
+++ b/README.md
@@ -304,10 +304,11 @@ For time zones please check [TZ database name column](https://en.wikipedia.org/w
 Currently a beta feature! You can configure runners to be ephemeral, runners will be used only for one job. The feature should be used in conjunction with listening for the workflow job event. Please consider the following:
 
 - The scale down lambda is still active, and should only remove orphan instances. But there is no strict check in place. So ensure you configure the `minimum_running_time_in_minutes` to a value that is high enough to got your runner booted and connected to avoid it got terminated before executing a job.
-- The messages sent from the webhook lambda to scale-up lambda are by default delayed delayed by SQS, to give available runners to option to start the job before the decision is made to scale more runners. For ephemeral runners there is no need to wait. Set `delay_webhook_event` to `0`.
+- The messages sent from the webhook lambda to the scale-up lambda are by default delayed by SQS, to give available runners the option to start the job before the decision is made to scale more runners. For ephemeral runners there is no need to wait. Set `delay_webhook_event` to `0`.
+- All events on the queue will lead to a new runner created by the lambda. By setting `enable_job_queued_check` to `true` you can enforce that a runner is only created if the event has a correlated queued job. Setting this can avoid creating useless runners, for example when jobs got cancelled before a runner is created. We suggest to use this in combination with a pool.
 - To ensure runners are created in the same order GitHub sends the events we use by default a FIFO queue, this is mainly relevant for repo level runners. For ephemeral runners you can set `fifo_build_queue` to `false`.
 - Error related to scaling should be retried via SQS. You can configure `job_queue_retention_in_seconds` `redrive_build_queue` to tune the behavior. We have no mechanism to avoid events will never processed, which means potential no runner could be created and the job in GitHub can time out in 6 hours. 
- 
+
 The example for [ephemeral runners](./examples/ephemeral) is based on the [default example](./examples/default). Have look on the diff to see the major configuration differences.
 
 ### Prebuilt Images
@@ -407,6 +408,7 @@ In case the setup does not work as intended follow the trace of events:
 | <a name="input_disable_runner_autoupdate"></a> [disable\_runner\_autoupdate](#input\_disable\_runner\_autoupdate) | Disable the auto update of the github runner agent. Be-aware there is a grace period of 30 days, see also the [GitHub article](https://github.blog/changelog/2022-02-01-github-actions-self-hosted-runners-can-now-disable-automatic-updates/) | `bool` | `false` | no |
 | <a name="input_enable_cloudwatch_agent"></a> [enable\_cloudwatch\_agent](#input\_enable\_cloudwatch\_agent) | Enabling the cloudwatch agent on the ec2 runner instances, the runner contains default config. Configuration can be overridden via `cloudwatch_config`. | `bool` | `true` | no |
 | <a name="input_enable_ephemeral_runners"></a> [enable\_ephemeral\_runners](#input\_enable\_ephemeral\_runners) | Enable ephemeral runners, runners will only be used once. | `bool` | `false` | no |
+| <a name="input_enable_job_queued_check"></a> [enable\_job\_queued\_check](#input\_enable\_job\_queued\_check) | Only scale if the job event received by the scale up lambda is in the state queued. By default enabled for non ephemeral runners and disabled for ephemeral. Set this variable to override the default behavior. | `bool` | `null` | no |
 | <a name="input_enable_managed_runner_security_group"></a> [enable\_managed\_runner\_security\_group](#input\_enable\_managed\_runner\_security\_group) | Enabling the default managed security group creation. Unmanaged security groups can be specified via `runner_additional_security_group_ids`. | `bool` | `true` | no |
 | <a name="input_enable_organization_runners"></a> [enable\_organization\_runners](#input\_enable\_organization\_runners) | Register runners to organization, instead of repo level | `bool` | `false` | no |
 | <a name="input_enable_ssm_on_runners"></a> [enable\_ssm\_on\_runners](#input\_enable\_ssm\_on\_runners) | Enable to allow access the runner instances for debugging purposes via SSM. Note that this adds additional permissions to the runner instances. | `bool` | `false` | no |
diff --git a/examples/ephemeral/main.tf b/examples/ephemeral/main.tf
index 3bf14b00d6..9abaef9e8d 100644
--- a/examples/ephemeral/main.tf
+++ b/examples/ephemeral/main.tf
@@ -63,6 +63,9 @@ module "runners" {
   #   size                = 20
   #   schedule_expression = "cron(* * * * ? *)"
   # }]
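+  # Only create a runner when the event on the queue has a correlated queued job;
+  # for ephemeral runners this check is disabled by default, so enable it explicitly.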
+  enable_job_queued_check = true
 
   # configure your pre-built AMI
   # enabled_userdata = false
diff --git a/main.tf b/main.tf
index dbd32ac34c..6620da3fcf 100644
--- a/main.tf
+++ b/main.tf
@@ -148,6 +148,7 @@ module "runners" {
   github_app_parameters                = local.github_app_parameters
   enable_organization_runners          = var.enable_organization_runners
   enable_ephemeral_runners             = var.enable_ephemeral_runners
+  enable_job_queued_check              = var.enable_job_queued_check
   disable_runner_autoupdate            = var.disable_runner_autoupdate
   enable_managed_runner_security_group = var.enable_managed_runner_security_group
   scale_down_schedule_expression       = var.scale_down_schedule_expression
diff --git a/modules/runners/README.md b/modules/runners/README.md
index 208e675e4e..986722803c 100644
--- a/modules/runners/README.md
+++ b/modules/runners/README.md
@@ -124,6 +124,7 @@ yarn run dist
 | <a name="input_egress_rules"></a> [egress\_rules](#input\_egress\_rules) | List of egress rules for the GitHub runner instances. | <pre>list(object({<br>    cidr_blocks      = list(string)<br>    ipv6_cidr_blocks = list(string)<br>    prefix_list_ids  = list(string)<br>    from_port        = number<br>    protocol         = string<br>    security_groups  = list(string)<br>    self             = bool<br>    to_port          = number<br>    description      = string<br>  }))</pre> | <pre>[<br>  {<br>    "cidr_blocks": [<br>      "0.0.0.0/0"<br>    ],<br>    "description": null,<br>    "from_port": 0,<br>    "ipv6_cidr_blocks": [<br>      "::/0"<br>    ],<br>    "prefix_list_ids": null,<br>    "protocol": "-1",<br>    "security_groups": null,<br>    "self": null,<br>    "to_port": 0<br>  }<br>]</pre> | no |
 | <a name="input_enable_cloudwatch_agent"></a> [enable\_cloudwatch\_agent](#input\_enable\_cloudwatch\_agent) | Enabling the cloudwatch agent on the ec2 runner instances, the runner contains default config. Configuration can be overridden via `cloudwatch_config`. | `bool` | `true` | no |
 | <a name="input_enable_ephemeral_runners"></a> [enable\_ephemeral\_runners](#input\_enable\_ephemeral\_runners) | Enable ephemeral runners, runners will only be used once. | `bool` | `false` | no |
+| <a name="input_enable_job_queued_check"></a> [enable\_job\_queued\_check](#input\_enable\_job\_queued\_check) | Only scale if the job event received by the scale up lambda is in the state queued. By default enabled for non ephemeral runners and disabled for ephemeral. Set this variable to override the default behavior. | `bool` | `null` | no |
 | <a name="input_enable_managed_runner_security_group"></a> [enable\_managed\_runner\_security\_group](#input\_enable\_managed\_runner\_security\_group) | Enabling the default managed security group creation. Unmanaged security groups can be specified via `runner_additional_security_group_ids`. | `bool` | `true` | no |
 | <a name="input_enable_organization_runners"></a> [enable\_organization\_runners](#input\_enable\_organization\_runners) | n/a | `bool` | n/a | yes |
 | <a name="input_enable_ssm_on_runners"></a> [enable\_ssm\_on\_runners](#input\_enable\_ssm\_on\_runners) | Enable to allow access to the runner instances for debugging purposes via SSM. Note that this adds additional permissions to the runner instances. | `bool` | n/a | yes |
diff --git a/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts b/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts
index 35f9ae1534..37728771b0 100644
--- a/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts
+++ b/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts
@@ -362,6 +362,12 @@ describe('scaleUp with public GH', () => {
     });
   });
 
+  it('not checking queued workflows', async () => {
+    process.env.ENABLE_JOB_QUEUED_CHECK = 'false';
+    await scaleUpModule.scaleUp('aws:sqs', TEST_DATA);
+    expect(mockOctokit.actions.getJobForWorkflowRun).not.toBeCalled();
+  });
+
   it('does not retrieve installation id if already set', async () => {
     const appSpy = jest.spyOn(ghAuth, 'createGithubAppAuth');
     const installationSpy = jest.spyOn(ghAuth, 'createGithubInstallationAuth');
@@ -535,6 +541,7 @@ describe('scaleUp with public GH', () => {
 
     it('ephemeral runners only run with workflow_job event, others should fail.', async () => {
       process.env.ENABLE_EPHEMERAL_RUNNERS = 'true';
+      process.env.ENABLE_JOB_QUEUED_CHECK = 'false';
       await expect(
         scaleUpModule.scaleUp('aws:sqs', {
           ...TEST_DATA,
@@ -545,7 +552,18 @@ describe('scaleUp with public GH', () => {
 
     it('creates a ephemeral runner.', async () => {
       process.env.ENABLE_EPHEMERAL_RUNNERS = 'true';
+      process.env.ENABLE_JOB_QUEUED_CHECK = 'false';
+      await scaleUpModule.scaleUp('aws:sqs', TEST_DATA);
+      expectedRunnerParams.runnerServiceConfig = [...expectedRunnerParams.runnerServiceConfig, `--ephemeral`];
+      expect(mockOctokit.actions.getJobForWorkflowRun).not.toBeCalled();
+      expect(createRunner).toBeCalledWith(expectedRunnerParams);
+    });
+
+    it('creates a ephemeral runner after checking job is queued.', async () => {
+      process.env.ENABLE_EPHEMERAL_RUNNERS = 'true';
+      process.env.ENABLE_JOB_QUEUED_CHECK = 'true';
       await scaleUpModule.scaleUp('aws:sqs', TEST_DATA);
+      expect(mockOctokit.actions.getJobForWorkflowRun).toBeCalled();
       expectedRunnerParams.runnerServiceConfig = [...expectedRunnerParams.runnerServiceConfig, `--ephemeral`];
       expect(createRunner).toBeCalledWith(expectedRunnerParams);
     });
diff --git a/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts b/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts
index c154a6214f..3347d82076 100644
--- a/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts
+++ b/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts
@@ -158,6 +158,7 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage
   const launchTemplateName = process.env.LAUNCH_TEMPLATE_NAME;
   const instanceMaxSpotPrice = process.env.INSTANCE_MAX_SPOT_PRICE;
   const instanceAllocationStrategy = process.env.INSTANCE_ALLOCATION_STRATEGY || 'lowest-price'; // same as AWS default
+  const enableJobQueuedCheck = yn(process.env.ENABLE_JOB_QUEUED_CHECK, { default: true });
 
   if (ephemeralEnabled && payload.eventType !== 'workflow_job') {
     logger.warn(
@@ -190,7 +191,7 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage
   const ghAuth = await createGithubInstallationAuth(installationId, ghesApiUrl);
   const githubInstallationClient = await createOctoClient(ghAuth.token, ghesApiUrl);
 
-  if (ephemeral || (await isJobQueued(githubInstallationClient, payload))) {
+  if (!enableJobQueuedCheck || (await isJobQueued(githubInstallationClient, payload))) {
     const currentRunners = await listEC2Runners({
       environment,
       runnerType,
diff --git a/modules/runners/main.tf b/modules/runners/main.tf
index ff5c614f1a..eb9cb7067e 100644
--- a/modules/runners/main.tf
+++ b/modules/runners/main.tf
@@ -35,6 +35,8 @@ locals {
   }
 
   ami_filter = coalesce(var.ami_filter, local.default_ami[var.runner_os])
+
+  enable_job_queued_check = var.enable_job_queued_check == null ? !var.enable_ephemeral_runners : var.enable_job_queued_check
 }
 
 data "aws_ami" "runner" {
diff --git a/modules/runners/scale-up.tf b/modules/runners/scale-up.tf
index 0a4bde8f6b..00a1d7e122 100644
--- a/modules/runners/scale-up.tf
+++ b/modules/runners/scale-up.tf
@@ -17,6 +17,7 @@ resource "aws_lambda_function" "scale_up" {
     variables = {
       DISABLE_RUNNER_AUTOUPDATE            = var.disable_runner_autoupdate
       ENABLE_EPHEMERAL_RUNNERS             = var.enable_ephemeral_runners
+      ENABLE_JOB_QUEUED_CHECK              = local.enable_job_queued_check
       ENABLE_ORGANIZATION_RUNNERS          = var.enable_organization_runners
       ENVIRONMENT                          = var.environment
       GHES_URL                             = var.ghes_url
diff --git a/modules/runners/variables.tf b/modules/runners/variables.tf
index 8252f3c715..a5d3621f5e 100644
--- a/modules/runners/variables.tf
+++ b/modules/runners/variables.tf
@@ -481,6 +481,12 @@ variable "enable_ephemeral_runners" {
   default     = false
 }
 
+variable "enable_job_queued_check" {
+  description = "Only scale if the job event received by the scale up lambda is in the state queued. By default enabled for non ephemeral runners and disabled for ephemeral. Set this variable to override the default behavior."
+  type        = bool
+  default     = null
+}
+
 variable "pool_lambda_timeout" {
   description = "Time out for the pool lambda lambda in seconds."
   type        = number
diff --git a/variables.tf b/variables.tf
index b2f65d5a51..c02d6484e8 100644
--- a/variables.tf
+++ b/variables.tf
@@ -507,6 +507,12 @@ variable "enable_ephemeral_runners" {
   default     = false
 }
 
+variable "enable_job_queued_check" {
+  description = "Only scale if the job event received by the scale up lambda is in the state queued. By default enabled for non ephemeral runners and disabled for ephemeral. Set this variable to override the default behavior."
+  type        = bool
+  default     = null
+}
+
 variable "enable_managed_runner_security_group" {
   description = "Enabling the default managed security group creation. Unmanaged security groups can be specified via `runner_additional_security_group_ids`."
   type        = bool