diff --git a/.buildkite/docker-compose.yml b/.buildkite/docker-compose.yml
index ac75c64aa..1b9acfbe6 100644
--- a/.buildkite/docker-compose.yml
+++ b/.buildkite/docker-compose.yml
@@ -9,3 +9,12 @@ services:
     volumes:
       - ..:/code:ro
     command: go test -v ./...
+
+  ruby:
+    image: ruby:3.3
+    working_dir: /work
+    environment:
+      - DRY_RUN
+      - AWS_REGION
+    volumes:
+      - ..:/work:ro
diff --git a/.buildkite/pipeline.cleanamis.yaml b/.buildkite/pipeline.cleanamis.yaml
new file mode 100644
index 000000000..a441ff0f5
--- /dev/null
+++ b/.buildkite/pipeline.cleanamis.yaml
@@ -0,0 +1,19 @@
+steps:
+  - name: ":broom: Delete AMIs ({{matrix}})"
+    command: .buildkite/steps/clean-old-amis
+    agents:
+      queue: "oss-deploy"
+    env:
+      DRY_RUN: true
+      AWS_REGION: "{{matrix}}"
+    matrix:
+      - "us-east-1"
+      - "us-west-2"
+      - "ap-southeast-2"
+    plugins:
+      - aws-assume-role-with-web-identity#v1.1.0:
+          role-arn: arn:aws:iam::172840064832:role/pipeline-buildkite-elastic-stack-for-aws-ami-cleaner
+      - docker-compose#v5.4.1:
+          run: ruby
+          config: .buildkite/docker-compose.yml
+          propagate-aws-auth-tokens: true
diff --git a/.buildkite/steps/clean-old-amis b/.buildkite/steps/clean-old-amis
new file mode 100755
index 000000000..57ab5ebcd
--- /dev/null
+++ b/.buildkite/steps/clean-old-amis
@@ -0,0 +1,131 @@
+#!/usr/bin/env ruby
+
+# Deregisters elastic stack AMIs in one region that are unreleased, old and unused, and deletes
+# the snapshots behind them.
+#
+# The region is the first argument, falling back to AWS_REGION. When DRY_RUN is set to anything
+# other than 0, false, no or off, the script only prints what it would remove.
+
+require "bundler/inline"
+require "date"
+require "time"
+
+gemfile do
+  source "https://rubygems.org"
+
+  gem "oga" # an xml parser is required by aws-sdk
+  gem "aws-sdk-ec2"
+  gem "ostruct"
+  gem "logger"
+  gem "base64"
+end
+
+# Values of DRY_RUN that explicitly switch dry-run mode off
+DRY_RUN_OFF_VALUES = %w[0 false no off].freeze
+
+# Print an error to stderr and exit with a failure status
+def die(msg)
+  $stderr.puts msg
+  exit 1
+end
+
+# Print a message prefixed with the current time in ISO 8601 format
+def log(msg)
+  puts "#{Time.now.iso8601} #{msg}"
+end
+
+# Interpret the raw DRY_RUN environment value. Unset means a live run and any other value means a
+# dry run, except the explicit "off" spellings: someone setting DRY_RUN=false expects deletions to
+# happen, not another dry run
+def dry_run?(value)
+  return false if value.nil?
+
+  !DRY_RUN_OFF_VALUES.include?(value.strip.downcase)
+end
+
+region = ARGV[0] || ENV["AWS_REGION"]
+dry_run = dry_run?(ENV["DRY_RUN"])
+
+die("region not found") if region.nil? || region.empty?
+
+log("region: #{region}, dry run: #{dry_run}")
+
+client = Aws::EC2::Client.new(region: region)
+
+# Fetch all AMIs that we own in the current region
+res = client.describe_images(owners: ["self"], include_deprecated: true)
+
+# Filter the list of AMIs down to just those that were published by the elastic stack
+# pipeline. There might be other AMIs in this account, and we don't want to mess with them
+all_images = res.images.select { |image|
+  image.name.start_with?("buildkite-stack-") || # The name we used until mid 2019
+  image.name.start_with?("buildkite-stack-linux-") || # The name we used for linux amd64/arm64 from mid 2019
+  image.name.start_with?("buildkite-stack-windows-") # The name we used for windows amd64 from mid 2019
+}
+
+# We'd like to process the images oldest to newest
+all_images.sort_by!(&:creation_date)
+
+# Each AMI *can* be used in multiple elastic stack releases. It's rare, but it happens. This will extract one
+# of the versions - if any - from the tags. Enough to confirm this image is one we published in Cloud Formation
+# templates on github.com and customers might be using it
+def get_stack_version_from_tags(image)
+  image.tags.each do |tag|
+    # A "Version:" key with nothing after the colon carries no version, so keep looking
+    version = tag.key[/\AVersion:(.+)/, 1]
+    return version if version
+  end
+  nil
+end
+
+# If we only deregister the AMI then we'll be left with orphaned snapshots and keep paying for storage. This
+# extracts the snapshot IDs that the AMI is pointing at, so we can delete them as well
+def get_snapshot_ids(image)
+  image.block_device_mappings.filter_map { |blk| blk.ebs&.snapshot_id }
+end
+
+# Deregister an AMI, and delete any associated snapshots. When dry_run is truthy the actions are
+# only printed and no EC2 call is made
+def deregister_image(client, image, dry_run)
+  snapshot_ids = get_snapshot_ids(image)
+  tag = dry_run ? "[DRY RUN]" : ""
+
+  puts "- #{tag} deregistering image #{image.image_id}"
+  client.deregister_image({image_id: image.image_id}) unless dry_run
+
+  snapshot_ids.each do |snapshot_id|
+    puts "- #{tag} deleting snapshot #{snapshot_id}"
+    client.delete_snapshot({ snapshot_id: snapshot_id }) unless dry_run
+  end
+end
+
+# Work out why an image must be kept. Returns a short reason, or nil when the image can be deleted:
+# released versions are always kept, as is anything launched or created on or after cutoff (a Time)
+def keep_reason(image, cutoff)
+  if get_stack_version_from_tags(image)
+    "released version"
+  elsif image.last_launched_time && Time.parse(image.last_launched_time) >= cutoff
+    "launched recently"
+  elsif Time.parse(image.creation_date) >= cutoff
+    "created recently"
+  end
+end
+
+one_year_ago = Time.now - (60 * 60 * 24 * 365)
+
+# Time to get down to business.
+#
+# Loop over each elastic stack AMI, skip over any that we want to keep, and anything else we can deregister
+# and save some money
+all_images.each do |image|
+  puts "#{image.image_id}, #{image.name}, #{image.creation_date}, #{image.last_launched_time}, #{image.public}, #{get_stack_version_from_tags(image)}, #{get_snapshot_ids(image).join("|")}"
+
+  if (reason = keep_reason(image, one_year_ago))
+    puts "- keep (#{reason})"
+  else
+    puts "- delete (catch all)"
+    deregister_image(client, image, dry_run)
+  end
+  puts
+  puts
+end