diff --git a/ansible/README.md b/ansible/README.md index d7e7d965..5b1257a7 100644 --- a/ansible/README.md +++ b/ansible/README.md @@ -182,7 +182,11 @@ repmgr standby switchover -f /etc/repmgr.conf --siblings-follow This is used in the case where the primary has gone down (e.g. unplanned downtime of a data room). Make sure you know which one you want to promote! -TODO: can (should?) this be automated? +```terminal +ansible-playbook -i inventory.yml -e primary=lard-a -e standby=lard-b rejoin.yml +``` + +This can also be done manually by following these steps: #### A. Promote standby node to primary @@ -194,7 +198,7 @@ TODO: can (should?) this be automated? postgres@lard-b:~$ repmgr -f /etc/repmgr.conf cluster show ``` - The primary should say it's **uncreachable**. + The primary should say it's **unreachable**. 1. Then promote the standby to primary: @@ -207,6 +211,12 @@ TODO: can (should?) this be automated? 1. Then move the IP in the OpenStack GUI (`Network → Floating IPs`, dissasociate it then associated it with the ipalias port on the other VM). +1. Restart the LARD ingestion service on the new primary + + ```terminal + ubuntu@lard-b:~$ sudo systemctl start lard_ingestion.service + ``` + #### B. Rejoin old primary The cluster will be in a slightly confused state, because this VM still thinks @@ -234,12 +244,10 @@ be no data loss. 1. With a **playbook**: ```terminal - ansible-playbook -i inventory.yml -e rejoin=lard-a -e primary=lard-b rejoin.yml + ansible-playbook -i inventory.yml -e primary=lard-a -e standby=lard-b rejoin.yml --skip-tags promote ``` - where `rejoin` is the host name of the primary node that has been down and should now be a standby. - -If you want to do this **manually** you can follow the steps in the `rejoin` role tasks. + where `primary` is the host name of the old primary node that has been down and should now rejoin as a standby, and `standby` is the host name of the node that was promoted to primary. 
#### Testing diff --git a/ansible/rejoin.yml b/ansible/rejoin.yml index c98bb691..db3600e6 100644 --- a/ansible/rejoin.yml +++ b/ansible/rejoin.yml @@ -1,14 +1,46 @@ --- - name: Rejoin - hosts: "{{ rejoin }}" + hosts: localhost remote_user: ubuntu vars: # Old primary host that went down - rejoin: # provide via cmd - # New primary host after it was promoted primary: # provide via cmd + # Old standby that will be promoted to primary + standby: # provide via cmd - roles: - - role: rejoin + tasks: + - name: Promote standby + ansible.builtin.include_role: + name: rejoin + tasks_from: promote.yml + apply: + delegate_to: "{{ standby }}" + tags: "promote" + + - name: Perform IP switchover + ansible.builtin.include_role: + name: ostack + tasks_from: move_floating_ip.yml + vars: + ostack_primary: "{{ primary }}" + ostack_standby: "{{ standby }}" + tags: "promote" + + # TODO: should this happen before or after rejoining the old primary + - name: Restart LARD ingestion service + ansible.builtin.systemd_service: + name: lard_ingestion + state: restarted + become: true + delegate_to: "{{ standby }}" + tags: "promote" + + - name: Rejoin old primary + ansible.builtin.include_role: + name: rejoin + tasks_from: rejoin.yml + apply: + delegate_to: "{{ primary }}" vars: - rejoin_primary_ip: "{{ hostvars[primary].ansible_host }}" + # TODO: this should be done via DNS once we have those set up + rejoin_ip: "{{ hostvars[standby].ansible_host }}" diff --git a/ansible/roles/rejoin/default/main.yml b/ansible/roles/rejoin/default/main.yml index 7c7866b6..9374b296 100644 --- a/ansible/roles/rejoin/default/main.yml +++ b/ansible/roles/rejoin/default/main.yml @@ -1,2 +1,2 @@ --- -rejoin_primary_ip: +rejoin_ip: diff --git a/ansible/roles/rejoin/tasks/promote.yml b/ansible/roles/rejoin/tasks/promote.yml new file mode 100644 index 00000000..0814d9db --- /dev/null +++ b/ansible/roles/rejoin/tasks/promote.yml @@ -0,0 +1,40 @@ +--- +- name: Check cluster + ansible.builtin.command: repmgr -f 
/etc/repmgr.conf cluster show + become: true + become_user: postgres + register: cluster_status + changed_when: false + + # TODO: check that primary says "unreachable"? +- name: Print cluster status + ansible.builtin.debug: + msg: "{{ cluster_status }}" + +- name: Dry run of standby promotion + ansible.builtin.command: repmgr -f /etc/repmgr.conf standby promote --dry-run + become: true + become_user: postgres + changed_when: false + register: dry_run_promote + +- name: Print result of dry-run + ansible.builtin.debug: + msg: "{{ dry_run_promote }}" + +# TODO: should postgres service be restarted? +# TODO: check that primary says "failed"? +- name: Promote standby + ansible.builtin.command: repmgr -f /etc/repmgr.conf standby promote + become: true + become_user: postgres + changed_when: true + # TODO: this will keep crashing until the IP alias is moved to the standby + # So probably best to restart after the IP switch + # - name: Start LARD ingestion service + # ansible.builtin.systemd_service: + # daemon_reload: true + # name: lard_ingestion + # state: restarted + # enabled: true + # become: true diff --git a/ansible/roles/rejoin/tasks/main.yml b/ansible/roles/rejoin/tasks/rejoin.yml similarity index 87% rename from ansible/roles/rejoin/tasks/main.yml rename to ansible/roles/rejoin/tasks/rejoin.yml index 65ed311e..b33017f0 100644 --- a/ansible/roles/rejoin/tasks/main.yml +++ b/ansible/roles/rejoin/tasks/rejoin.yml @@ -8,7 +8,7 @@ - name: Dry run of rejoin ansible.builtin.command: > repmgr node rejoin - -f /etc/repmgr.conf -d 'host='{{ rejoin_primary_ip }}' user=repmgr dbname=repmgr connect_timeout=2' + -f /etc/repmgr.conf -d 'host='{{ rejoin_ip }}' user=repmgr dbname=repmgr connect_timeout=2' --force-rewind=/usr/lib/postgresql/16/bin/pg_rewind --verbose --dry-run become: true become_user: postgres @@ -24,7 +24,7 @@ - name: Rejoin old primary as standby ansible.builtin.command: > repmgr node rejoin - -f /etc/repmgr.conf -d 'host='{{ rejoin_primary_ip }}' 
user=repmgr dbname=repmgr connect_timeout=2' + -f /etc/repmgr.conf -d 'host='{{ rejoin_ip }}' user=repmgr dbname=repmgr connect_timeout=2' --force-rewind=/usr/lib/postgresql/16/bin/pg_rewind --verbose become: true become_user: postgres diff --git a/ansible/switchover.yml b/ansible/switchover.yml index fa68e546..bd032894 100644 --- a/ansible/switchover.yml +++ b/ansible/switchover.yml @@ -16,13 +16,13 @@ name: postgresql state: restarted become: true - delegate_to: primary + delegate_to: "{{ primary }}" - name: Perform Postgres switchover ansible.builtin.include_role: name: switchover apply: - delegate_to: standby + delegate_to: "{{ standby }}" - name: Perform IP switchover ansible.builtin.include_role: @@ -37,4 +37,4 @@ name: lard_ingestion state: restarted become: true - delegate_to: standby + delegate_to: "{{ standby }}"