From 91bd67beb1637d11677e8d835594eb26dbd07da3 Mon Sep 17 00:00:00 2001
From: Ian Boden <82514609+IanBoden@users.noreply.github.com>
Date: Mon, 9 Dec 2024 10:32:54 +0000
Subject: [PATCH] [patch] reinstate zenmetastore and couchdb workarounds for
 cpd 4.8 (#1580)

---
 .../roles/cp4d/tasks/install-cp4d.yml         |   9 ++
 .../cp4d/tasks/wait/wait-zenmetastore-edb.yml | 112 ++++++++++++++++++
 .../cp4d_service/tasks/wait/wait-ccs.yml      |  11 +-
 .../cp4d_service/tasks/wait/wait-couchdb.yml  | 108 +++++++++++++++++
 4 files changed, 239 insertions(+), 1 deletion(-)
 create mode 100644 ibm/mas_devops/roles/cp4d/tasks/wait/wait-zenmetastore-edb.yml
 create mode 100644 ibm/mas_devops/roles/cp4d_service/tasks/wait/wait-couchdb.yml

diff --git a/ibm/mas_devops/roles/cp4d/tasks/install-cp4d.yml b/ibm/mas_devops/roles/cp4d/tasks/install-cp4d.yml
index 44e4fa78e..144b262a6 100644
--- a/ibm/mas_devops/roles/cp4d/tasks/install-cp4d.yml
+++ b/ibm/mas_devops/roles/cp4d/tasks/install-cp4d.yml
@@ -160,6 +160,15 @@
   retries: 20 # Approximately 20 minutes before we give up
   delay: 60 # 1 minute
 
+# 3-pre. Wait for Zen Metastore Cluster to be ready
+# -----------------------------------------------------------------------------
+# There have been issues with Zen Metastore Cluster not starting due to Persistent Storage,
+# This task restarts any failing pods
+- name: "install-cp4d : Wait for zen-metadata to be ready again (60s delay)"
+  include_tasks: "tasks/wait/wait-zenmetastore-edb.yml"
+  when:
+    - cpd_48_or_higher
+
 # 3. Wait for zenStatus
 # -----------------------------------------------------------------------------
 # oc get ZenService lite-cr -o jsonpath="{.status.zenStatus}{'\n'}"
diff --git a/ibm/mas_devops/roles/cp4d/tasks/wait/wait-zenmetastore-edb.yml b/ibm/mas_devops/roles/cp4d/tasks/wait/wait-zenmetastore-edb.yml
new file mode 100644
index 000000000..22432104f
--- /dev/null
+++ b/ibm/mas_devops/roles/cp4d/tasks/wait/wait-zenmetastore-edb.yml
@@ -0,0 +1,112 @@
+---
+# 1. Wait for zen metastore cluster to start
+# -----------------------------------------------------------------------------
+- name: "wait-zenmetastore-edb : Wait for Zen Metastore EDB Cluster to be created"
+  k8s_info:
+    kind: Cluster
+    namespace: "{{ cpd_instance_namespace }}"
+    name: "zen-metastore-edb"
+  register: zenmetastoreCluster
+  retries: 120 # Give 60 minutes for the zenService to start Zen Metastore Pods (Logs show this taking ~20 minutes in a good run)
+  delay: 30
+  until: zenmetastoreCluster.resources[0].status is defined
+
+# 2. For V4.8, We need to patch the postgres licensing job according to https://www.ibm.com/support/pages/node/7158524
+- name: "wait-zenmetastore-edb : Check and display the create-postgres-license-config license expiry date"
+  when: cpd_48
+  block:
+    - name: "wait-zenmetastore-edb : Fetch the license expiry date"
+      kubernetes.core.k8s_info:
+        kind: Cluster
+        namespace: "{{ cpd_instance_namespace }}"
+        name: "zen-metastore-edb"
+      register: zenmetastoreCluster
+      until:
+        zenmetastoreCluster.resources[0].spec.instances is defined
+        and zenmetastoreCluster.resources[0].status.licenseStatus.licenseStatus is defined
+      retries: 40 # Give 20 minutes
+      delay: 30
+    - debug:
+        msg:
+          - "License Expiration .. {{ zenmetastoreCluster.resources[0].status.licenseStatus.licenseExpiration | default ('') }}"
+          - "License Status ...... {{ zenmetastoreCluster.resources[0].status.licenseStatus.licenseStatus}}"
+
+- name: "wait-zenmetastore-edb : Update create-postgres-license-config job"
+  when:
+    - cpd_48
+    - zenmetastoreCluster.resources[0].status.licenseStatus.licenseExpiration is not defined
+  block:
+    - name: "wait-zenmetastore-edb : Wait for create-postgres-license-config Job to be Completed or Failed (10s delay)"
+      kubernetes.core.k8s_info:
+        api_version: batch/v1
+        kind: Job
+        name: "create-postgres-license-config"
+        namespace: "{{ cpd_operators_namespace }}"
+      register: _job_info
+      until:
+        - _job_info.resources is defined
+        - _job_info.resources | length > 0
+        - (_job_info.resources | json_query('[*].status.conditions[?type==`Complete`][].status') | select ('match','True') | list | length == 1) or
+          (_job_info.resources | json_query('[*].status.conditions[?type==`Failed`][].status') | select ('match','True') | list | length == 1)
+      retries: 30
+      delay: 10
+
+    - name: "wait-zenmetastore-edb : Recreate the job with up to date license image"
+      shell: >-
+        oc get job create-postgres-license-config -n {{ cpd_operators_namespace }} -o yaml | \
+        sed -e 's/operator.ibm.com\/opreq-control: "true"/operator.ibm.com\/opreq-control: "false"/' \
+        -e 's|\(image: \).*|\1"cp.icr.io/cp/cpd/edb-postgres-license-provider@sha256:c1670e7dd93c1e65a6659ece644e44aa5c2150809ac1089e2fd6be37dceae4ce"|' \
+        -e '/controller-uid:/d' | \
+        oc replace --force -f -
+      register: _job_recreate_output
+
+    - debug:
+        msg: "Recreate the job: {{ _job_recreate_output }}"
+
+    - name: "wait-zenmetastore-edb : Wait for create-postgres-license-config Job to be Completed or Failed (10s delay)"
+      kubernetes.core.k8s_info:
+        api_version: batch/v1
+        kind: Job
+        name: "create-postgres-license-config"
+        namespace: "{{ cpd_operators_namespace }}"
+      register: _job_info
+      until:
+        - _job_info.resources is defined
+        - _job_info.resources | length > 0
+        - (_job_info.resources | json_query('[*].status.conditions[?type==`Complete`][].status') | select ('match','True') | list | length == 1) or
+          (_job_info.resources | json_query('[*].status.conditions[?type==`Failed`][].status') | select ('match','True') | list | length == 1)
+      retries: 30
+      delay: 10
+
+    - name: "wait-zenmetastore-edb : Check and display the license expiry date"
+      kubernetes.core.k8s_info:
+        kind: Cluster
+        namespace: "{{ cpd_instance_namespace }}"
+        name: "zen-metastore-edb"
+      register: newzenmetastoreCluster
+      until:
+        newzenmetastoreCluster.resources[0].spec.instances is defined
+        and newzenmetastoreCluster.resources[0].status.licenseStatus.licenseStatus is defined
+        and "Valid license" in newzenmetastoreCluster.resources[0].status.licenseStatus.licenseStatus
+      retries: 40 # Give 20 minutes
+      delay: 30
+
+    - debug:
+        msg:
+          - "License Expiration .. {{ newzenmetastoreCluster.resources[0].status.licenseStatus.licenseExpiration | default ('') }}"
+          - "License Status ...... {{ newzenmetastoreCluster.resources[0].status.licenseStatus.licenseStatus | default ('')}}"
+
+# 3. Wait for zen metastore replica pods to become ready
+# -----------------------------------------------------------------------------
+- name: "wait-zenmetastore-edb : Wait for ZenMetastore pods to be become ready"
+  k8s_info:
+    kind: Cluster
+    namespace: "{{ cpd_instance_namespace }}"
+    name: "zen-metastore-edb"
+  register: zenmetastoreCluster
+  retries: 40 # Give 20 minutes for the pods to become ready
+  delay: 30
+  until: >-
+    zenmetastoreCluster.resources[0].spec.instances is defined
+    and zenmetastoreCluster.resources[0].status.readyInstances is defined
+    and zenmetastoreCluster.resources[0].spec.instances == zenmetastoreCluster.resources[0].status.readyInstances
diff --git a/ibm/mas_devops/roles/cp4d_service/tasks/wait/wait-ccs.yml b/ibm/mas_devops/roles/cp4d_service/tasks/wait/wait-ccs.yml
index f304fa5be..9645398c7 100644
--- a/ibm/mas_devops/roles/cp4d_service/tasks/wait/wait-ccs.yml
+++ b/ibm/mas_devops/roles/cp4d_service/tasks/wait/wait-ccs.yml
@@ -109,7 +109,16 @@
     - cpd_48_or_higher # elastic search operator was just introduced with cpd 4.8
     - not skip_ibm_entitlement_injection # eventually we hope to be able to skip patching the elastic search cr with image pull secret, but not for now
 
-# 5. Wait for CCS CR to be ready
+# 5. Wait for CouchDB Stateful Set to be ready
+# -----------------------------------------------------------------------------
+# There have been issues with CouchDB not starting due to Persistent Storage,
+# This task restarts any failing pods
+- include_tasks: "tasks/wait/wait-couchdb.yml"
+  when:
+    - cpd_48
+
+
+# 6. Wait for CCS CR to be ready
 # -----------------------------------------------------------------------------
 # Note: We can't fail early when we see Failed status, as the operator will
 # report failed multiple times during initial reconcile.
diff --git a/ibm/mas_devops/roles/cp4d_service/tasks/wait/wait-couchdb.yml b/ibm/mas_devops/roles/cp4d_service/tasks/wait/wait-couchdb.yml
new file mode 100644
index 000000000..8b7cf0939
--- /dev/null
+++ b/ibm/mas_devops/roles/cp4d_service/tasks/wait/wait-couchdb.yml
@@ -0,0 +1,108 @@
+---
+# 1. Wait for couch-db stateful set to start all the replica pods
+# -----------------------------------------------------------------------------
+- name: "wait-couchdb: Wait for CouchDB pods to be created"
+  k8s_info:
+    kind: StatefulSet
+    namespace: "{{ cpd_instance_namespace }}"
+    name: "wdp-couchdb"
+  register: couchdbStatefulSet
+  retries: 40 # Give 20 minutes for the ccs Operator to start CouchDB Pods (Logs show this taking ~7 minutes in a good run)
+  delay: 30
+  until: >-
+    (( couchdbStatefulSet.resources[0].status is defined
+    and couchdbStatefulSet.resources[0].status.replicas is defined
+    and couchdbStatefulSet.resources[0].status.replicas == 0 )
+    or ( couchdbStatefulSet.resources[0].status is defined
+    and couchdbStatefulSet.resources[0].status.updatedReplicas is defined
+    and couchdbStatefulSet.resources[0].status.replicas == couchdbStatefulSet.resources[0].status.updatedReplicas ))
+
+
+# 2. Wait for couchdb replica pods to become ready
+# -----------------------------------------------------------------------------
+- name: "wait-couchdb: Wait for CouchDB pods to be become ready"
+  k8s_info:
+    kind: StatefulSet
+    namespace: "{{ cpd_instance_namespace }}"
+    name: "wdp-couchdb"
+  register: couchdbStatefulSet
+  retries: 10 # Give 5 minutes for the pods to become ready
+  delay: 30
+  until: >-
+    couchdbStatefulSet.resources[0].status.readyReplicas is defined
+    and couchdbStatefulSet.resources[0].status.replicas == couchdbStatefulSet.resources[0].status.readyReplicas
+  #ignore-errors: true # If this fails then we restart pending pods below
+  failed_when: false
+
+# 3. Restart any couchDB pods that are still Pending
+# -----------------------------------------------------------------------------
+- set_fact:
+    is_couchdb_ready: true
+  when:
+    couchdbStatefulSet.resources[0].status.readyReplicas is defined
+    and couchdbStatefulSet.resources[0].status.replicas == couchdbStatefulSet.resources[0].status.readyReplicas
+
+- name: "wait-couchdb: Detecting and restarting pending CouchDB Pods"
+  block:
+    - name: "install-cp4d : Get pending CouchDB Pods"
+      kubernetes.core.k8s_info:
+        api_version: v1
+        kind: Pod
+        label_selectors:
+          - "app=couchdb"
+        field_selectors:
+          - "status.phase=Pending"
+        namespace: "{{ cpd_instance_namespace }}"
+      register: pending_pod_lookup
+
+    - set_fact:
+        pending_pod_names: "{{ pending_pod_lookup.resources | map(attribute='metadata.name') }}"
+
+    - debug:
+        msg: "Restarting pending CouchDB Pods: {{ pending_pod_names }}"
+
+    - name: "wait-couchdb: Restarting pending CouchDB Pods"
+      kubernetes.core.k8s:
+        state: absent
+        api_version: v1
+        kind: Pod
+        namespace: "{{ cpd_instance_namespace }}"
+        name: "{{ item }}"
+      loop: "{{ pending_pod_names }}"
+
+    # 4. Wait again couchdb replica pods to become ready
+    # -----------------------------------------------------------------------------
+    - name: "wait-couchdb: Wait for CouchDB pods to be become ready"
+      k8s_info:
+        kind: StatefulSet
+        namespace: "{{ cpd_instance_namespace }}"
+        name: "wdp-couchdb"
+      register: couchdbStatefulSet
+      retries: 10 # Give another 5 minutes for the pods to become ready
+      delay: 30
+      until: >-
+        couchdbStatefulSet.resources[0].status.readyReplicas is defined
+        and couchdbStatefulSet.resources[0].status.replicas == couchdbStatefulSet.resources[0].status.readyReplicas
+      failed_when: false # We handle and log the failure below.
+
+    - name: "wait-couchdb: Fail if CouchDB pods are not ready"
+      block:
+        - name: "install-cp4d : Get Pending CouchDB Pods"
+          kubernetes.core.k8s_info:
+            api_version: v1
+            kind: Pod
+            label_selectors:
+              - "app=couchdb"
+            field_selectors:
+              - "status.phase=Pending"
+            namespace: "{{ cpd_instance_namespace }}"
+          register: pending_pod_lookup
+
+        - fail:
+            msg:
+              - "CouchDB pods are not ready {{ couchdbStatefulSet.resources[0].status }}"
+              - "Pending CouchDB Pods: {{ pending_pod_lookup.resources | map(attribute='metadata.name') }}"
+          when:
+            couchdbStatefulSet.resources[0].status.replicas != couchdbStatefulSet.resources[0].status.readyReplicas
+
+  when: is_couchdb_ready is not defined