From 52a77961ed466fb3c43b843ace94a010aa7ada1d Mon Sep 17 00:00:00 2001
From: tim
Date: Fri, 22 Aug 2025 16:36:29 +0300
Subject: [PATCH 1/5] asg_management_fix_and_improve

---
 roles/asg_management/defaults/main.yml |   2 +
 roles/asg_management/tasks/main.yml    |   2 +-
 roles/deploy_code/tasks/cleanup.yml    | 213 ++++++++++++++++---------
 3 files changed, 137 insertions(+), 80 deletions(-)

diff --git a/roles/asg_management/defaults/main.yml b/roles/asg_management/defaults/main.yml
index 67371acd..9ddecf69 100644
--- a/roles/asg_management/defaults/main.yml
+++ b/roles/asg_management/defaults/main.yml
@@ -2,6 +2,8 @@
 # AWS ASG variables to allow for the suspension of autoscaling during a code deployment.
 asg_management:
   name: "" # if the deploy is on an ASG put the name here
+  target_group_name: "{{ asg_management.name }}" # change if different from the ASG name (which may happen due to the 32-char limit)
+  refresh_asg_instances: true # runs only if squashFS image unmount failed and this is set to true.
   #profile: "example" # optional, the boto profile name to use if not the system default
   region: "eu-west-1"
   suspend_processes: "Launch Terminate HealthCheck" # space separated string, see https://docs.aws.amazon.com/autoscaling/ec2/userguide/as-suspend-resume-processes.html
diff --git a/roles/asg_management/tasks/main.yml b/roles/asg_management/tasks/main.yml
index 03ee8b44..05f4648a 100644
--- a/roles/asg_management/tasks/main.yml
+++ b/roles/asg_management/tasks/main.yml
@@ -11,7 +11,7 @@
     region: "{{ asg_management.region }}"
     profile: "{{ asg_management.profile | default(omit) }}"
     names:
-      - "{{ asg_management.name }}"
+      - "{{ asg_management.target_group_name }}"
   register: _target_group

 - name: Loop over target instances until they are all 'healthy'.
diff --git a/roles/deploy_code/tasks/cleanup.yml b/roles/deploy_code/tasks/cleanup.yml
index 796854e6..ce9f25a7 100644
--- a/roles/deploy_code/tasks/cleanup.yml
+++ b/roles/deploy_code/tasks/cleanup.yml
@@ -150,93 +150,148 @@
     failed_when: false
     register: _deploy_code_mount_check

-  - name: Get the current pts session.
-    ansible.builtin.shell:
-      cmd: "tty | sed 's#/dev/##'"
-    register: deploy_pts
-
-  - name: "Check for active sessions in {{ deploy_base_path }}."
-    ansible.builtin.shell:
-      cmd: "ps -eo pid,tty | awk '{print $1}' | xargs -n 1 pwdx 2>&1 | grep -v 'No such process' | grep {{ deploy_base_path }} | cut -d: -f1 | xargs -n 1 ps -o tty= -p | sort | uniq"
-    register: sessions_in_deploy_path
-    become: true
-
-  - name: Display active sessions.
-    ansible.builtin.debug:
-      msg: >
-        Deploy session: {{ deploy_pts.stdout | default('Unknown') }}.
-        Active sessions in {{ deploy_base_path }}: {{ sessions_in_deploy_path.stdout_lines | default([]) | join(', ') | default('None') }}.
-
-  - name: Kill sessions except the current one.
-    ansible.builtin.command:
-      cmd: "pkill -9 -t {{ item }}"
-    loop: "{{ sessions_in_deploy_path.stdout_lines }}"
-    when:
-      - "item != deploy_pts.stdout"
-      - "item is match('^pts/\\d+$')"
-    failed_when: false
-    register: kill_sessions_result
-    become: true
-
-  - name: Display killed sessions.
-    ansible.builtin.debug:
-      msg: >
-        Sessions terminated: {{ kill_sessions_result.results | selectattr('rc', 'defined') | selectattr('rc', 'equalto', 0) | map(attribute='item') | list | join(', ') | default('None') }}.
-
-  - name: Reload any services that might be keeping the loop device busy.
-    ansible.builtin.service:
-      name: "{{ www_service }}"
-      state: reloaded
-    with_items: "{{ deploy_code.services }}"
-    loop_control:
-      loop_var: www_service
-    become: true
-    when:
-      - _deploy_code_mount_check.rc == 0
-      - deploy_code.service_action == "reload"
-      - deploy_code.services | length > 0
-
-  - name: Stop any services that might be keeping the loop device busy.
-    ansible.builtin.service:
-      name: "{{ www_service }}"
-      state: stopped
-    with_items: "{{ deploy_code.services }}"
-    loop_control:
-      loop_var: www_service
-    become: true
-    when:
-      - _deploy_code_mount_check.rc == 0
-      - deploy_code.service_action == "stop"
-      - deploy_code.services | length > 0
-
-  - name: Unmount existing SquashFS image.
-    ansible.builtin.command:
-      cmd: "umount {{ deploy_base_path }}"
-    become: true
-    when:
-      - _deploy_code_mount_check.rc == 0
-    register: task_result
-    retries: "{{ deploy_code.unmount_retries }}"
-    delay: "{{ deploy_code.unmount_delay }}"
-    until: task_result.rc == 0
-
-  - name: Mount new SquashFS image.
-    ansible.builtin.command:
-      cmd: "mount {{ build_base_path }}/deploy.sqsh {{ deploy_base_path }} -t squashfs -o loop"
-    become: true
-
-  - name: Start any services we stopped.
-    ansible.builtin.service:
-      name: "{{ www_service }}"
-      state: started
-    with_items: "{{ deploy_code.services }}"
-    loop_control:
-      loop_var: www_service
-    become: true
-    when:
-      - _deploy_code_mount_check.rc == 0
-      - deploy_code.service_action == "stop"
-      - deploy_code.services | length > 0
+  - name: Mount a SquashFS image if there is no mount.
+    ansible.builtin.command:
+      cmd: "mount {{ build_base_path }}/deploy.sqsh {{ deploy_base_path }} -t squashfs -o loop"
+    become: true
+    when:
+      - _deploy_code_mount_check.rc != 0
+
+  - name: Mount a new SquashFS image instead of the current one.
+    when:
+      - _deploy_code_mount_check.rc == 0
+    block:
+      - name: Get the current pts session.
+        ansible.builtin.shell:
+          cmd: "tty | sed 's#/dev/##'"
+        register: deploy_pts
+
+      - name: "Check for active sessions in {{ deploy_base_path }}."
+        ansible.builtin.shell:
+          cmd: "ps -eo pid,tty | awk '{print $1}' | xargs -n 1 pwdx 2>&1 | grep -v 'No such process' | grep {{ deploy_base_path }} | cut -d: -f1 | xargs -n 1 ps -o tty= -p | sort | uniq"
+        register: sessions_in_deploy_path
+        become: true
+
+      - name: Display active sessions.
+        ansible.builtin.debug:
+          msg: >
+            Deploy session: {{ deploy_pts.stdout | default('Unknown') }}.
+            Active sessions in {{ deploy_base_path }}: {{ sessions_in_deploy_path.stdout_lines | default([]) | join(', ') | default('None') }}.
+
+      - name: Kill sessions except the current one.
+        ansible.builtin.command:
+          cmd: "pkill -9 -t {{ item }}"
+        loop: "{{ sessions_in_deploy_path.stdout_lines }}"
+        when:
+          - "item != deploy_pts.stdout"
+          - "item is match('^pts/\\d+$')"
+        failed_when: false
+        register: kill_sessions_result
+        become: true
+
+      - name: Display killed sessions.
+        ansible.builtin.debug:
+          msg: >
+            Sessions terminated: {{ kill_sessions_result.results | selectattr('rc', 'defined') | selectattr('rc', 'equalto', 0) | map(attribute='item') | list | join(', ') | default('None') }}.
+
+      - name: Reload any services that might be keeping the loop device busy.
+        ansible.builtin.service:
+          name: "{{ www_service }}"
+          state: reloaded
+        with_items: "{{ deploy_code.services }}"
+        loop_control:
+          loop_var: www_service
+        become: true
+        when:
+          - deploy_code.service_action == "reload"
+          - deploy_code.services | length > 0
+
+      - name: Stop any services that might be keeping the loop device busy.
+        ansible.builtin.service:
+          name: "{{ www_service }}"
+          state: stopped
+        with_items: "{{ deploy_code.services }}"
+        loop_control:
+          loop_var: www_service
+        become: true
+        when:
+          - deploy_code.service_action == "stop"
+          - deploy_code.services | length > 0
+
+      - name: Image unmount and autoscaling management handling.
+        block:
+          - name: Unmount existing SquashFS image.
+            ansible.builtin.command:
+              cmd: "umount {{ deploy_base_path }}"
+            become: true
+            register: task_result
+            retries: "{{ deploy_code.unmount_retries }}"
+            delay: "{{ deploy_code.unmount_delay }}"
+            until: task_result.rc == 0
+            failed_when: false
+
+          - name: If current image unmount succeeded.
+            when: task_result.rc == 0
+            block:
+              - name: Mount new SquashFS image.
+                ansible.builtin.command:
+                  cmd: "mount {{ build_base_path }}/deploy.sqsh {{ deploy_base_path }} -t squashfs -o loop"
+                become: true
+
+              - name: Starts any services that we stopped if the image re-mounting was successful.
+                ansible.builtin.service:
+                  name: "{{ www_service }}"
+                  state: started
+                with_items: "{{ deploy_code.services }}"
+                loop_control:
+                  loop_var: www_service
+                become: true
+                when:
+                  - deploy_code.service_action == "stop"
+                  - deploy_code.services | length > 0

+          - name: If current image unmount failed.
+            when: task_result.rc != 0
+            block:
+              - name: Resume all autoscale processes on ASG.
+                amazon.aws.autoscaling_group:
+                  name: "{{ asg_management.name }}"
+                  region: "{{ asg_management.region }}"
+                  profile: "{{ asg_management.profile | default(omit) }}"
+                  suspended_processes: []
+                delegate_to: localhost
+                run_once: true
+                when:
+                  - asg_management.name | length > 0
+
+              - name: Run ASG instance refresh.
+                amazon.aws.autoscaling_instance_refresh:
+                  name: "{{ asg_management.name }}"
+                  region: "{{ asg_management.region }}"
+                  profile: "{{ asg_management.profile | default(omit) }}"
+                  strategy: Rolling
+                  preferences:
+                    min_healthy_percentage: 51
+                    instance_warmup: 100
+                    skip_matching: false
+                  state: started
+                delegate_to: localhost
+                when:
+                  - asg_management.name | length > 0
+                  - asg_management.refresh_asg_instances
+
+              - name: Start any services we stopped if the image re-mounting failed and ASG management is disabled.
+                ansible.builtin.service:
+                  name: "{{ www_service }}"
+                  state: started
+                with_items: "{{ deploy_code.services }}"
+                loop_control:
+                  loop_var: www_service
+                become: true
+                when:
+                  - deploy_code.service_action == "stop"
+                  - deploy_code.services | length > 0
+                  - not asg_management.refresh_asg_instances or (asg_management.name | length) == 0
 # End of the squashFS block.

 - name: Trigger an infrastructure rebuild.

From 02da35add16706da23dd820c629137ec9036f0ad Mon Sep 17 00:00:00 2001
From: tim
Date: Fri, 22 Aug 2025 16:50:54 +0300
Subject: [PATCH 2/5] asg_management_fix_and_improve

---
 roles/deploy_code/tasks/cleanup.yml | 144 ++++++++++++++--------------
 1 file changed, 71 insertions(+), 73 deletions(-)

diff --git a/roles/deploy_code/tasks/cleanup.yml b/roles/deploy_code/tasks/cleanup.yml
index ce9f25a7..3f6097d6 100644
--- a/roles/deploy_code/tasks/cleanup.yml
+++ b/roles/deploy_code/tasks/cleanup.yml
@@ -150,14 +150,14 @@
     failed_when: false
     register: _deploy_code_mount_check

-  - name: Mount a SquashFS image if there is no mount.
+  - name: Mount SquashFS image if there is no mounted one.
     ansible.builtin.command:
       cmd: "mount {{ build_base_path }}/deploy.sqsh {{ deploy_base_path }} -t squashfs -o loop"
     become: true
     when:
       - _deploy_code_mount_check.rc != 0

-  - name: Mount a new SquashFS image instead of the current one.
+ - name: Mount new SquashFS image instead of the already mounted. when: - _deploy_code_mount_check.rc == 0 block: @@ -218,80 +218,78 @@ - deploy_code.service_action == "stop" - deploy_code.services | length > 0 - - name: Image unmount and autoscaling management handling. + - name: Unmount existing SquashFS image. + ansible.builtin.command: + cmd: "umount {{ deploy_base_path }}" + become: true + register: task_result + retries: "{{ deploy_code.unmount_retries }}" + delay: "{{ deploy_code.unmount_delay }}" + until: task_result.rc == 0 + failed_when: false + + - name: If current image unmount succeeded. + when: task_result.rc == 0 block: - - name: Unmount existing SquashFS image. + - name: Mount new SquashFS image. ansible.builtin.command: - cmd: "umount {{ deploy_base_path }}" + cmd: "mount {{ build_base_path }}/deploy.sqsh {{ deploy_base_path }} -t squashfs -o loop" + become: true + + - name: Start any services that we stopped if the image re-mounting was successful. + ansible.builtin.service: + name: "{{ www_service }}" + state: started + with_items: "{{ deploy_code.services }}" + loop_control: + loop_var: www_service + become: true + when: + - deploy_code.service_action == "stop" + - deploy_code.services | length > 0 + + - name: If current image unmount failed. + when: task_result.rc != 0 + block: + - name: Resume all autoscale processes on ASG. + amazon.aws.autoscaling_group: + name: "{{ asg_management.name }}" + region: "{{ asg_management.region }}" + profile: "{{ asg_management.profile | default(omit) }}" + suspended_processes: [] + delegate_to: localhost + run_once: true + when: + - asg_management.name | length > 0 + + - name: Run ASG instance refresh. + amazon.aws.autoscaling_instance_refresh: + name: "{{ asg_management.name }}" + region: "{{ asg_management.region }}" + profile: "{{ asg_management.profile | default(omit) }}" + strategy: Rolling + preferences: + min_healthy_percentage: 51 + instance_warmup: 100 + skip_matching: false + state: started + delegate_to: localhost + when: + - asg_management.name | length > 0 + - asg_management.refresh_asg_instances + + - name: Start any services we stopped if the image re-mounting failed and ASG management is disabled. + ansible.builtin.service: + name: "{{ www_service }}" + state: started + with_items: "{{ deploy_code.services }}" + loop_control: + loop_var: www_service become: true - register: task_result - retries: "{{ deploy_code.unmount_retries }}" - delay: "{{ deploy_code.unmount_delay }}" - until: task_result.rc == 0 - failed_when: false - - - name: If current image unmount succeeded. - when: task_result.rc == 0 - block: - - name: Mount new SquashFS image. - ansible.builtin.command: - cmd: "mount {{ build_base_path }}/deploy.sqsh {{ deploy_base_path }} -t squashfs -o loop" - become: true - - - name: Starts any services that we stopped if the image re-mounting was successful. - ansible.builtin.service: - name: "{{ www_service }}" - state: started - with_items: "{{ deploy_code.services }}" - loop_control: - loop_var: www_service - become: true - when: - - deploy_code.service_action == "stop" - - deploy_code.services | length > 0 - - - name: If current image unmount failed. - when: task_result.rc != 0 - block: - - name: Resume all autoscale processes on ASG. 
-                amazon.aws.autoscaling_group:
-                  name: "{{ asg_management.name }}"
-                  region: "{{ asg_management.region }}"
-                  profile: "{{ asg_management.profile | default(omit) }}"
-                  suspended_processes: []
-                delegate_to: localhost
-                run_once: true
-                when:
-                  - asg_management.name | length > 0
-
-              - name: Run ASG instance refresh.
-                amazon.aws.autoscaling_instance_refresh:
-                  name: "{{ asg_management.name }}"
-                  region: "{{ asg_management.region }}"
-                  profile: "{{ asg_management.profile | default(omit) }}"
-                  strategy: Rolling
-                  preferences:
-                    min_healthy_percentage: 51
-                    instance_warmup: 100
-                    skip_matching: false
-                  state: started
-                delegate_to: localhost
-                when:
-                  - asg_management.name | length > 0
-                  - asg_management.refresh_asg_instances
-
-              - name: Start any services we stopped if the image re-mounting failed and ASG management is disabled.
-                ansible.builtin.service:
-                  name: "{{ www_service }}"
-                  state: started
-                with_items: "{{ deploy_code.services }}"
-                loop_control:
-                  loop_var: www_service
-                become: true
-                when:
-                  - deploy_code.service_action == "stop"
-                  - deploy_code.services | length > 0
-                  - not asg_management.refresh_asg_instances or (asg_management.name | length) == 0
+        when: 
+          - deploy_code.service_action == "stop"
+          - deploy_code.services | length > 0
+          - not asg_management.refresh_asg_instances or (asg_management.name | length) == 0
 # End of the squashFS block.

 - name: Trigger an infrastructure rebuild.

From 1c4aa52b78aae73a8614494d4229e439be260808 Mon Sep 17 00:00:00 2001
From: tim
Date: Fri, 22 Aug 2025 16:54:30 +0300
Subject: [PATCH 3/5] asg_management_fix_and_improve

---
 roles/deploy_code/tasks/cleanup.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/roles/deploy_code/tasks/cleanup.yml b/roles/deploy_code/tasks/cleanup.yml
index 3f6097d6..d3b1b8db 100644
--- a/roles/deploy_code/tasks/cleanup.yml
+++ b/roles/deploy_code/tasks/cleanup.yml
@@ -244,7 +244,7 @@
             loop_control:
              loop_var: www_service
            become: true
-            when: 
+            when:
              - deploy_code.service_action == "stop"
              - deploy_code.services | length > 0

@@ -286,7 +286,7 @@
         loop_control:
           loop_var: www_service
         become: true
-        when: 
+        when:
           - deploy_code.service_action == "stop"
           - deploy_code.services | length > 0
           - not asg_management.refresh_asg_instances or (asg_management.name | length) == 0

From 0d54400694b2f039ca96b557f2efa1f06881d558 Mon Sep 17 00:00:00 2001
From: tim
Date: Fri, 22 Aug 2025 17:25:33 +0300
Subject: [PATCH 4/5] asg_management_fix_and_improve

---
 roles/asg_management/defaults/main.yml | 2 +-
 roles/asg_management/tasks/main.yml    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/roles/asg_management/defaults/main.yml b/roles/asg_management/defaults/main.yml
index 9ddecf69..ab54916e 100644
--- a/roles/asg_management/defaults/main.yml
+++ b/roles/asg_management/defaults/main.yml
@@ -2,7 +2,7 @@
 # AWS ASG variables to allow for the suspension of autoscaling during a code deployment.
 asg_management:
   name: "" # if the deploy is on an ASG put the name here
-  target_group_name: "{{ asg_management.name }}" # change if different from the ASG name (which may happen due to the 32-char limit)
+  #target_group_name: "example" # matches the ASG name by default, specify if your TargetGroup name is different (for example due to the 32-char name length limit in AWS)
   refresh_asg_instances: true # runs only if squashFS image unmount failed and this is set to true.
#profile: "example" # optional, the boto profile name to use if not the system default region: "eu-west-1" diff --git a/roles/asg_management/tasks/main.yml b/roles/asg_management/tasks/main.yml index 05f4648a..885eb564 100644 --- a/roles/asg_management/tasks/main.yml +++ b/roles/asg_management/tasks/main.yml @@ -11,7 +11,7 @@ region: "{{ asg_management.region }}" profile: "{{ asg_management.profile | default(omit) }}" names: - - "{{ asg_management.target_group_name }}" + - "{{ asg_management.target_group_name | default(asg_management.name) }}" register: _target_group - name: Loop over target instances until they are all 'healthy'. From e198865218397a5c536f97c8b35a17475035ddf3 Mon Sep 17 00:00:00 2001 From: tim Date: Fri, 22 Aug 2025 18:21:02 +0300 Subject: [PATCH 5/5] asg_management_fix_and_improve --- roles/deploy_code/tasks/cleanup.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/roles/deploy_code/tasks/cleanup.yml b/roles/deploy_code/tasks/cleanup.yml index d3b1b8db..9db5403f 100644 --- a/roles/deploy_code/tasks/cleanup.yml +++ b/roles/deploy_code/tasks/cleanup.yml @@ -274,6 +274,7 @@ skip_matching: false state: started delegate_to: localhost + run_once: true when: - asg_management.name | length > 0 - asg_management.refresh_asg_instances