From 69d73b7463a072ab3364d4c8be0bb8fe8a0003c6 Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Fri, 14 Oct 2022 11:54:25 -0400 Subject: [PATCH 01/95] Fixes for 17.0 ir script (#380) --- tests/infrared/17.0/infrared-openstack.sh | 2 +- tests/infrared/17.0/stf-connectors.yaml.template | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/infrared/17.0/infrared-openstack.sh b/tests/infrared/17.0/infrared-openstack.sh index 87257d323..17ff67a35 100755 --- a/tests/infrared/17.0/infrared-openstack.sh +++ b/tests/infrared/17.0/infrared-openstack.sh @@ -95,7 +95,7 @@ ir_create_undercloud() { } stf_create_config() { - sed -r "s/<>/${AMQP_HOST}/;s/<>/${AMQP_PORT}/;s/<>/${CLOUD_NAME}/;s%<>%${CA_CERT_FILE_CONTENT//$'\n'/<@@@>}%;s/<@@@>/\n /g" ${ENVIRONMENT_TEMPLATE} > outputs/stf-connectors.yaml + sed -r "s/<>/${AMQP_HOST}/;s/<>/${AMQP_PORT}/;s/<>/${CLOUD_NAME}/;s%<>%${CA_CERT_FILE_CONTENT//$'\n'/<@@@>}%;s/<@@@>/\n /g" ${STF_ENVIRONMENT_TEMPLATE} > outputs/stf-connectors.yaml } gnocchi_create_config() { diff --git a/tests/infrared/17.0/stf-connectors.yaml.template b/tests/infrared/17.0/stf-connectors.yaml.template index b667fd835..c29b518e7 100644 --- a/tests/infrared/17.0/stf-connectors.yaml.template +++ b/tests/infrared/17.0/stf-connectors.yaml.template @@ -10,15 +10,15 @@ custom_templates: # set parameter defaults to match stable-1.3 documentation parameter_defaults: MetricsQdrConnectors: - - host: <> - port: <> - role: edge - verifyHostname: false - sslProfile: sslProfile + - host: <> + port: <> + role: edge + verifyHostname: false + sslProfile: sslProfile MetricsQdrSSLProfiles: - - name: sslProfile - caCertFileContent: | + - name: sslProfile + caCertFileContent: | <> CeilometerQdrEventsConfig: From 1dc0808ace835f2410c35b1b523398ed35f797f8 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Thu, 20 Oct 2022 15:18:45 -0400 Subject: [PATCH 02/95] Move the SNMP trap delivery checks (#381) * Move the SNMP trap delivery checks Move the SNMP trap 
delivery checks as where they are situated now seems to cause false positives. Moves the checks closer to the end of the smoketest run seems to result in a better change that the logs the check is looking for have been provided. * Use a loop to check for SNMP status with break and max time --- tests/smoketest/smoketest.sh | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tests/smoketest/smoketest.sh b/tests/smoketest/smoketest.sh index eb1a30cd5..89135ba32 100755 --- a/tests/smoketest/smoketest.sh +++ b/tests/smoketest/smoketest.sh @@ -75,8 +75,18 @@ done oc delete pod curl SNMP_WEBHOOK_POD=$(oc get pod -l "app=default-snmp-webhook" -ojsonpath='{.items[0].metadata.name}') -oc logs "$SNMP_WEBHOOK_POD" | grep 'Sending SNMP trap' -SNMP_WEBHOOK_STATUS=$? +SNMP_WEBHOOK_CHECK_MAX_TRIES=5 +SNMP_WEBHOOK_CHECK_TIMEOUT=30 +SNMP_WEBHOOK_CHECK_COUNT=0 +while [ $SNMP_WEBHOOK_CHECK_COUNT -lt $SNMP_WEBHOOK_CHECK_MAX_TRIES ]; do + oc logs "$SNMP_WEBHOOK_POD" | grep 'Sending SNMP trap' + SNMP_WEBHOOK_STATUS=$? + (( SNMP_WEBHOOK_CHECK_COUNT=SNMP_WEBHOOK_CHECK_COUNT+1 )) + if [ $SNMP_WEBHOOK_STATUS -eq 0 ]; then + break + fi + sleep $SNMP_WEBHOOK_CHECK_TIMEOUT +done echo "*** [INFO] Showing oc get all..." 
oc get all From 1560d3cad81cba367d31858d0c180ff761d937c8 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Fri, 28 Oct 2022 16:35:43 -0400 Subject: [PATCH 03/95] Lock the bundle to OCP v4.10 (#385) --- deploy/olm-catalog/service-telemetry-operator/Dockerfile.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in b/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in index c6488bfea..eaa25d334 100644 --- a/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in +++ b/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in @@ -13,7 +13,7 @@ LABEL operators.operatorframework.io.metrics.mediatype.v1=metrics+v1 LABEL operators.operatorframework.io.metrics.builder=operator-sdk-v0.19.4 LABEL operators.operatorframework.io.metrics.project_layout=ansible LABEL com.redhat.delivery.operator.bundle=true -LABEL com.redhat.openshift.versions="v4.10" +LABEL com.redhat.openshift.versions="=v4.10" LABEL com.redhat.delivery.backport=false LABEL com.redhat.component="service-telemetry-operator-bundle-container" \ From af35714b5d08720063b6dc0e6a3e46962ff1ec64 Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Mon, 7 Nov 2022 15:16:00 -0500 Subject: [PATCH 04/95] Make all certs 8yr expiry --- roles/servicetelemetry/tasks/_local_signing_authority.yml | 4 ++++ roles/servicetelemetry/tasks/component_qdr.yml | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/roles/servicetelemetry/tasks/_local_signing_authority.yml b/roles/servicetelemetry/tasks/_local_signing_authority.yml index 346e2c770..6c689fb65 100644 --- a/roles/servicetelemetry/tasks/_local_signing_authority.yml +++ b/roles/servicetelemetry/tasks/_local_signing_authority.yml @@ -8,6 +8,7 @@ name: '{{ ansible_operator_meta.namespace }}-selfsigned' namespace: '{{ ansible_operator_meta.namespace }}' spec: + duration: 70080h selfSigned: {} - name: Create CA certificate @@ -19,6 +20,7 @@ name: '{{ ansible_operator_meta.namespace }}-ca' 
namespace: '{{ ansible_operator_meta.namespace }}' spec: + duration: 70080h secretName: '{{ ansible_operator_meta.namespace }}-ca' commonName: '{{ ansible_operator_meta.namespace }}-ca' isCA: true @@ -34,6 +36,7 @@ name: '{{ ansible_operator_meta.namespace }}-ca' namespace: '{{ ansible_operator_meta.namespace }}' spec: + duration: 70080h ca: secretName: '{{ ansible_operator_meta.namespace }}-ca' @@ -47,6 +50,7 @@ name: elasticsearch-es-http namespace: '{{ ansible_operator_meta.namespace }}' spec: + duration: 70080h commonName: elasticsearch-es-http secretName: 'elasticsearch-es-cert' dnsNames: diff --git a/roles/servicetelemetry/tasks/component_qdr.yml b/roles/servicetelemetry/tasks/component_qdr.yml index 2247db84a..85a49cc36 100644 --- a/roles/servicetelemetry/tasks/component_qdr.yml +++ b/roles/servicetelemetry/tasks/component_qdr.yml @@ -13,6 +13,7 @@ name: "{{ ansible_operator_meta.name }}-interconnect-selfsigned" namespace: "{{ ansible_operator_meta.namespace }}" spec: + duration: 70080h selfSigned: {} - name: Create self-signed interconnect certificate @@ -25,6 +26,7 @@ name: "{{ ansible_operator_meta.name }}-interconnect-selfsigned" namespace: "{{ ansible_operator_meta.namespace }}" spec: + duration: 70080h commonName: "{{ ansible_operator_meta.name }}-interconnect.{{ ansible_operator_meta.namespace }}.svc.cluster.local" isCA: true issuerRef: @@ -42,6 +44,7 @@ name: "{{ ansible_operator_meta.name }}-interconnect-ca" namespace: "{{ ansible_operator_meta.namespace }}" spec: + duration: 70080h ca: secretName: "{{ ansible_operator_meta.name }}-interconnect-selfsigned" @@ -55,6 +58,7 @@ name: "{{ ansible_operator_meta.name }}-interconnect-openstack-ca" namespace: "{{ ansible_operator_meta.namespace }}" spec: + duration: 70080h commonName: "{{ ansible_operator_meta.name }}-interconnect-openstack-ca" isCA: true issuerRef: @@ -71,6 +75,7 @@ name: "{{ ansible_operator_meta.name }}-interconnect-openstack-credentials" namespace: "{{ ansible_operator_meta.namespace }}" 
spec: + duration: 70080h commonName: "{{ ansible_operator_meta.name }}-interconnect" dnsNames: - "{{ ansible_operator_meta.name }}-interconnect.{{ ansible_operator_meta.namespace }}.svc.cluster.local" @@ -88,6 +93,7 @@ name: "{{ ansible_operator_meta.name }}-interconnect-inter-router-ca" namespace: "{{ ansible_operator_meta.namespace }}" spec: + duration: 70080h ca: secretName: "{{ ansible_operator_meta.name }}-interconnect-inter-router-ca" @@ -101,6 +107,7 @@ name: "{{ ansible_operator_meta.name }}-interconnect-inter-router-ca" namespace: "{{ ansible_operator_meta.namespace }}" spec: + duration: 70080h commonName: "{{ ansible_operator_meta.name }}-interconnect-inter-router-ca" isCA: true issuerRef: @@ -117,6 +124,7 @@ name: "{{ ansible_operator_meta.name }}-interconnect-inter-router-credentials" namespace: "{{ ansible_operator_meta.namespace }}" spec: + duration: 70080h commonName: "{{ ansible_operator_meta.name }}-interconnect" dnsNames: - "{{ ansible_operator_meta.name }}-interconnect.{{ ansible_operator_meta.namespace }}.svc.cluster.local" From bc7138483aabc03365649302b00068fb96d7fd66 Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Mon, 7 Nov 2022 15:19:49 -0500 Subject: [PATCH 05/95] Revert "Make all certs 8yr expiry" This reverts commit af35714b5d08720063b6dc0e6a3e46962ff1ec64. 
--- roles/servicetelemetry/tasks/_local_signing_authority.yml | 4 ---- roles/servicetelemetry/tasks/component_qdr.yml | 8 -------- 2 files changed, 12 deletions(-) diff --git a/roles/servicetelemetry/tasks/_local_signing_authority.yml b/roles/servicetelemetry/tasks/_local_signing_authority.yml index 6c689fb65..346e2c770 100644 --- a/roles/servicetelemetry/tasks/_local_signing_authority.yml +++ b/roles/servicetelemetry/tasks/_local_signing_authority.yml @@ -8,7 +8,6 @@ name: '{{ ansible_operator_meta.namespace }}-selfsigned' namespace: '{{ ansible_operator_meta.namespace }}' spec: - duration: 70080h selfSigned: {} - name: Create CA certificate @@ -20,7 +19,6 @@ name: '{{ ansible_operator_meta.namespace }}-ca' namespace: '{{ ansible_operator_meta.namespace }}' spec: - duration: 70080h secretName: '{{ ansible_operator_meta.namespace }}-ca' commonName: '{{ ansible_operator_meta.namespace }}-ca' isCA: true @@ -36,7 +34,6 @@ name: '{{ ansible_operator_meta.namespace }}-ca' namespace: '{{ ansible_operator_meta.namespace }}' spec: - duration: 70080h ca: secretName: '{{ ansible_operator_meta.namespace }}-ca' @@ -50,7 +47,6 @@ name: elasticsearch-es-http namespace: '{{ ansible_operator_meta.namespace }}' spec: - duration: 70080h commonName: elasticsearch-es-http secretName: 'elasticsearch-es-cert' dnsNames: diff --git a/roles/servicetelemetry/tasks/component_qdr.yml b/roles/servicetelemetry/tasks/component_qdr.yml index 85a49cc36..2247db84a 100644 --- a/roles/servicetelemetry/tasks/component_qdr.yml +++ b/roles/servicetelemetry/tasks/component_qdr.yml @@ -13,7 +13,6 @@ name: "{{ ansible_operator_meta.name }}-interconnect-selfsigned" namespace: "{{ ansible_operator_meta.namespace }}" spec: - duration: 70080h selfSigned: {} - name: Create self-signed interconnect certificate @@ -26,7 +25,6 @@ name: "{{ ansible_operator_meta.name }}-interconnect-selfsigned" namespace: "{{ ansible_operator_meta.namespace }}" spec: - duration: 70080h commonName: "{{ ansible_operator_meta.name 
}}-interconnect.{{ ansible_operator_meta.namespace }}.svc.cluster.local" isCA: true issuerRef: @@ -44,7 +42,6 @@ name: "{{ ansible_operator_meta.name }}-interconnect-ca" namespace: "{{ ansible_operator_meta.namespace }}" spec: - duration: 70080h ca: secretName: "{{ ansible_operator_meta.name }}-interconnect-selfsigned" @@ -58,7 +55,6 @@ name: "{{ ansible_operator_meta.name }}-interconnect-openstack-ca" namespace: "{{ ansible_operator_meta.namespace }}" spec: - duration: 70080h commonName: "{{ ansible_operator_meta.name }}-interconnect-openstack-ca" isCA: true issuerRef: @@ -75,7 +71,6 @@ name: "{{ ansible_operator_meta.name }}-interconnect-openstack-credentials" namespace: "{{ ansible_operator_meta.namespace }}" spec: - duration: 70080h commonName: "{{ ansible_operator_meta.name }}-interconnect" dnsNames: - "{{ ansible_operator_meta.name }}-interconnect.{{ ansible_operator_meta.namespace }}.svc.cluster.local" @@ -93,7 +88,6 @@ name: "{{ ansible_operator_meta.name }}-interconnect-inter-router-ca" namespace: "{{ ansible_operator_meta.namespace }}" spec: - duration: 70080h ca: secretName: "{{ ansible_operator_meta.name }}-interconnect-inter-router-ca" @@ -107,7 +101,6 @@ name: "{{ ansible_operator_meta.name }}-interconnect-inter-router-ca" namespace: "{{ ansible_operator_meta.namespace }}" spec: - duration: 70080h commonName: "{{ ansible_operator_meta.name }}-interconnect-inter-router-ca" isCA: true issuerRef: @@ -124,7 +117,6 @@ name: "{{ ansible_operator_meta.name }}-interconnect-inter-router-credentials" namespace: "{{ ansible_operator_meta.namespace }}" spec: - duration: 70080h commonName: "{{ ansible_operator_meta.name }}-interconnect" dnsNames: - "{{ ansible_operator_meta.name }}-interconnect.{{ ansible_operator_meta.namespace }}.svc.cluster.local" From 4b5d7a156cf38cd862825f845d021630bc1b1c20 Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Thu, 10 Nov 2022 12:08:41 -0500 Subject: [PATCH 06/95] Make all certs 8yr expiry (#387) * Make all certs 8yr expiry * 
Use certificate_duration and test against generated cert * Better messages during CI cloning --- build/stf-run-ci/tasks/clone_repos.yml | 10 +++++----- roles/servicetelemetry/defaults/main.yml | 2 ++ .../tasks/_local_signing_authority.yml | 4 ++++ roles/servicetelemetry/tasks/component_qdr.yml | 8 ++++++++ tests/smoketest/smoketest.sh | 9 +++++++++ 5 files changed, 28 insertions(+), 5 deletions(-) diff --git a/build/stf-run-ci/tasks/clone_repos.yml b/build/stf-run-ci/tasks/clone_repos.yml index 883211090..97e351bdc 100644 --- a/build/stf-run-ci/tasks/clone_repos.yml +++ b/build/stf-run-ci/tasks/clone_repos.yml @@ -5,7 +5,7 @@ # of these separately rather than using a loop. - name: Get Smart Gateway Operator block: - - name: Try cloning same-named branch or override branch from specified repository + - name: Try cloning same-named branch or override branch from SGO repository git: repo: "{{ sgo_repository }}" dest: working/smart-gateway-operator @@ -19,7 +19,7 @@ - name: Get sg-core block: - - name: Try cloning same-named branch or override branch from specified repository + - name: Try cloning same-named branch or override branch from sg-core repository git: repo: "{{ sg_core_repository }}" dest: working/sg-core @@ -33,7 +33,7 @@ - name: Get sg-bridge block: - - name: Try cloning same-named branch or override branch from specified repository + - name: Try cloning same-named branch or override branch from sg-bridge repository git: repo: "{{ sg_bridge_repository }}" dest: working/sg-bridge @@ -47,7 +47,7 @@ - name: Get prometheus-webhook-snmp block: - - name: Try cloning same-named branch or override branch from specified repository + - name: Try cloning same-named branch or override branch from prometheus-webhook-snmp repository git: repo: "{{ prometheus_webhook_snmp_repository }}" dest: working/prometheus-webhook-snmp @@ -64,7 +64,7 @@ # branches there. 
- name: Get Loki Operator block: - - name: Try cloning same-named branch or override branch from specified repository + - name: Try cloning same-named branch or override branch from loki repository git: repo: "{{ loki_operator_repository }}" dest: working/loki diff --git a/roles/servicetelemetry/defaults/main.yml b/roles/servicetelemetry/defaults/main.yml index 95346f137..2ba6d4b43 100644 --- a/roles/servicetelemetry/defaults/main.yml +++ b/roles/servicetelemetry/defaults/main.yml @@ -6,6 +6,8 @@ clouds_remove_on_missing: false # default observability strategy (compatible with STF 1.3) observability_strategy: use_community +certificate_duration: 70080h + servicetelemetry_defaults: high_availability: enabled: false diff --git a/roles/servicetelemetry/tasks/_local_signing_authority.yml b/roles/servicetelemetry/tasks/_local_signing_authority.yml index 346e2c770..f4ae3fd62 100644 --- a/roles/servicetelemetry/tasks/_local_signing_authority.yml +++ b/roles/servicetelemetry/tasks/_local_signing_authority.yml @@ -8,6 +8,7 @@ name: '{{ ansible_operator_meta.namespace }}-selfsigned' namespace: '{{ ansible_operator_meta.namespace }}' spec: + duration: '{{ certificate_duration }}' selfSigned: {} - name: Create CA certificate @@ -19,6 +20,7 @@ name: '{{ ansible_operator_meta.namespace }}-ca' namespace: '{{ ansible_operator_meta.namespace }}' spec: + duration: '{{ certificate_duration }}' secretName: '{{ ansible_operator_meta.namespace }}-ca' commonName: '{{ ansible_operator_meta.namespace }}-ca' isCA: true @@ -34,6 +36,7 @@ name: '{{ ansible_operator_meta.namespace }}-ca' namespace: '{{ ansible_operator_meta.namespace }}' spec: + duration: '{{ certificate_duration }}' ca: secretName: '{{ ansible_operator_meta.namespace }}-ca' @@ -47,6 +50,7 @@ name: elasticsearch-es-http namespace: '{{ ansible_operator_meta.namespace }}' spec: + duration: '{{ certificate_duration }}' commonName: elasticsearch-es-http secretName: 'elasticsearch-es-cert' dnsNames: diff --git 
a/roles/servicetelemetry/tasks/component_qdr.yml b/roles/servicetelemetry/tasks/component_qdr.yml index 2247db84a..cf7cc937b 100644 --- a/roles/servicetelemetry/tasks/component_qdr.yml +++ b/roles/servicetelemetry/tasks/component_qdr.yml @@ -13,6 +13,7 @@ name: "{{ ansible_operator_meta.name }}-interconnect-selfsigned" namespace: "{{ ansible_operator_meta.namespace }}" spec: + duration: '{{ certificate_duration }}' selfSigned: {} - name: Create self-signed interconnect certificate @@ -25,6 +26,7 @@ name: "{{ ansible_operator_meta.name }}-interconnect-selfsigned" namespace: "{{ ansible_operator_meta.namespace }}" spec: + duration: '{{ certificate_duration }}' commonName: "{{ ansible_operator_meta.name }}-interconnect.{{ ansible_operator_meta.namespace }}.svc.cluster.local" isCA: true issuerRef: @@ -42,6 +44,7 @@ name: "{{ ansible_operator_meta.name }}-interconnect-ca" namespace: "{{ ansible_operator_meta.namespace }}" spec: + duration: '{{ certificate_duration }}' ca: secretName: "{{ ansible_operator_meta.name }}-interconnect-selfsigned" @@ -55,6 +58,7 @@ name: "{{ ansible_operator_meta.name }}-interconnect-openstack-ca" namespace: "{{ ansible_operator_meta.namespace }}" spec: + duration: '{{ certificate_duration }}' commonName: "{{ ansible_operator_meta.name }}-interconnect-openstack-ca" isCA: true issuerRef: @@ -71,6 +75,7 @@ name: "{{ ansible_operator_meta.name }}-interconnect-openstack-credentials" namespace: "{{ ansible_operator_meta.namespace }}" spec: + duration: '{{ certificate_duration }}' commonName: "{{ ansible_operator_meta.name }}-interconnect" dnsNames: - "{{ ansible_operator_meta.name }}-interconnect.{{ ansible_operator_meta.namespace }}.svc.cluster.local" @@ -88,6 +93,7 @@ name: "{{ ansible_operator_meta.name }}-interconnect-inter-router-ca" namespace: "{{ ansible_operator_meta.namespace }}" spec: + duration: '{{ certificate_duration }}' ca: secretName: "{{ ansible_operator_meta.name }}-interconnect-inter-router-ca" @@ -101,6 +107,7 @@ name: "{{ 
ansible_operator_meta.name }}-interconnect-inter-router-ca" namespace: "{{ ansible_operator_meta.namespace }}" spec: + duration: '{{ certificate_duration }}' commonName: "{{ ansible_operator_meta.name }}-interconnect-inter-router-ca" isCA: true issuerRef: @@ -117,6 +124,7 @@ name: "{{ ansible_operator_meta.name }}-interconnect-inter-router-credentials" namespace: "{{ ansible_operator_meta.namespace }}" spec: + duration: '{{ certificate_duration }}' commonName: "{{ ansible_operator_meta.name }}-interconnect" dnsNames: - "{{ ansible_operator_meta.name }}-interconnect.{{ ansible_operator_meta.namespace }}.svc.cluster.local" diff --git a/tests/smoketest/smoketest.sh b/tests/smoketest/smoketest.sh index 89135ba32..aa82145cf 100755 --- a/tests/smoketest/smoketest.sh +++ b/tests/smoketest/smoketest.sh @@ -73,6 +73,15 @@ for NAME in "${CLOUDNAMES[@]}"; do RET=$((RET || $?)) # Accumulate exit codes done +echo "*** [INFO] Checking that the qdr certificate has a long expiry" +EXPIRETIME=$(oc get secret default-interconnect-openstack-ca -o json | grep \"tls.crt\"\: | awk -F '": "' '{print $2}' | rev | cut -c3- | rev | base64 -d | openssl x509 -in - -text | grep "Not After" | awk -F " : " '{print $2}') +EXPIRETIME_UNIX=$(date -d "${EXPIRETIME}" "+%s") +TARGET_UNIX=$(date -d "now + 7 years" "+%s") +if [ ${EXPIRETIME_UNIX} -lt ${TARGET_UNIX} ]; then + echo "[FAILURE] Certificate expire time (${EXPIRETIME}) less than 7 years from now" +fi + +echo "*** [INFO] Waiting to see SNMP trap message in webhook pod" oc delete pod curl SNMP_WEBHOOK_POD=$(oc get pod -l "app=default-snmp-webhook" -ojsonpath='{.items[0].metadata.name}') SNMP_WEBHOOK_CHECK_MAX_TRIES=5 From 729f8411663d840d21a697bb965dcaf9eddb7c18 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Fri, 18 Nov 2022 11:28:54 -0500 Subject: [PATCH 07/95] Expand support for OCP 4.11 (#391) * Expand support for OCP 4.11 Allow installation to be done on OCP 4.11 while updating the smoketest jobs to support later versions of the client. 
Also migrate to using community-operators CatalogSource instead of OperatorHub.io. Only enable community-operators when the use_community strategy is enabled. Update the token request syntax when requesting a service account token. Add checks to look for oc client version and fail if we're using a version that's too old. * Make passwords safer in smoketest job template Encapsulate the password values with double quotes to help make them safer for consumption in the template. I had an odd situation where the password contained a bunch of extended characters and caused the smoketest to report an error on the template having an issue with yaml to json. The password contained several characters such as . and : which confused the template. Wrapping the contents in the double quotes allowed the smoketest to apply the job.batch template and result in a working smoketest run. --- build/stf-run-ci/tasks/setup_base.yml | 20 +++---------------- .../service-telemetry-operator/Dockerfile.in | 2 +- ...emetry-operator.clusterserviceversion.yaml | 2 +- tests/smoketest/smoketest.sh | 13 ++++++++++-- tests/smoketest/smoketest_job.yaml.template | 8 ++++---- 5 files changed, 20 insertions(+), 25 deletions(-) diff --git a/build/stf-run-ci/tasks/setup_base.yml b/build/stf-run-ci/tasks/setup_base.yml index 13944668d..9b0c838f9 100644 --- a/build/stf-run-ci/tasks/setup_base.yml +++ b/build/stf-run-ci/tasks/setup_base.yml @@ -13,6 +13,8 @@ name: certified-operators - disabled: false name: redhat-operators + - disabled: "{{ false if __service_telemetry_observability_strategy == 'use_community' else true }}" + name: community-operators - name: Create OperatorGroup k8s: @@ -63,22 +65,6 @@ source: redhat-operators sourceNamespace: openshift-marketplace -- name: Enable OperatorHub.io for Elastic Cloud on Kubernetes - k8s: - definition: - apiVersion: operators.coreos.com/v1alpha1 - kind: CatalogSource - metadata: - name: operatorhubio-operators - namespace: openshift-marketplace - spec: - 
sourceType: grpc - image: quay.io/operatorhubio/catalog:latest - displayName: OperatorHub.io Operators - publisher: OperatorHub.io - when: - - __service_telemetry_observability_strategy == "use_community" - - name: Subscribe to Elastic Cloud on Kubernetes Operator k8s: definition: @@ -123,7 +109,7 @@ channel: beta installPlanApproval: Automatic name: prometheus - source: operatorhubio-operators + source: community-operators sourceNamespace: openshift-marketplace when: - __service_telemetry_observability_strategy == "use_community" diff --git a/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in b/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in index eaa25d334..9639f4e1b 100644 --- a/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in +++ b/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in @@ -13,7 +13,7 @@ LABEL operators.operatorframework.io.metrics.mediatype.v1=metrics+v1 LABEL operators.operatorframework.io.metrics.builder=operator-sdk-v0.19.4 LABEL operators.operatorframework.io.metrics.project_layout=ansible LABEL com.redhat.delivery.operator.bundle=true -LABEL com.redhat.openshift.versions="=v4.10" +LABEL com.redhat.openshift.versions="v4.10-v4.11" LABEL com.redhat.delivery.backport=false LABEL com.redhat.component="service-telemetry-operator-bundle-container" \ diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml index c65b02bd9..84289e887 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml @@ -161,7 +161,7 @@ metadata: description: Service Telemetry Framework. 
Umbrella Operator for instantiating the required dependencies and configuration of various components to build a Service Telemetry platform for telco grade monitoring. - olm.properties: '[{"type": "olm.maxOpenShiftVersion", "value": "4.10"}]' + olm.properties: '[{"type": "olm.maxOpenShiftVersion", "value": "4.11"}]' olm.skipRange: '>=<> <<>' operatorframework.io/suggested-namespace: service-telemetry operators.openshift.io/valid-subscription: '["OpenStack Platform", "Cloud Infrastructure", diff --git a/tests/smoketest/smoketest.sh b/tests/smoketest/smoketest.sh index aa82145cf..fbd84c1af 100755 --- a/tests/smoketest/smoketest.sh +++ b/tests/smoketest/smoketest.sh @@ -17,6 +17,15 @@ NUMCLOUDS=${NUMCLOUDS:-1} CLOUDNAMES=() OCP_PROJECT=${OCP_PROJECT:-} +OC_CLIENT_VERSION_X=$(oc version --client | grep Client | cut -f2 -d: | tr -s -d "[:space:]" - | cut -d. -f1) +OC_CLIENT_VERSION_X_REQUIRED=4 +OC_CLIENT_VERSION_Y=$(oc version --client | grep Client | cut -f2 -d: | tr -s -d "[:space:]" - | cut -d. -f2) +OC_CLIENT_VERSION_Y_REQUIRED=10 + +if [ "${OC_CLIENT_VERSION_Y}" -lt "${OC_CLIENT_VERSION_Y_REQUIRED}" ] || [ "${OC_CLIENT_VERSION_X}" != "${OC_CLIENT_VERSION_X_REQUIRED}" ]; then + echo "*** Please install 'oc' client version ${OC_CLIENT_VERSION_X_REQUIRED}.${OC_CLIENT_VERSION_Y_REQUIRED} or later ***" + exit 1 +fi CLEANUP=${CLEANUP:-true} @@ -59,7 +68,7 @@ for NAME in "${CLOUDNAMES[@]}"; do done echo "*** [INFO] Triggering an alertmanager notification..." 
-PROMETHEUS_K8S_TOKEN=$(oc serviceaccounts get-token prometheus-k8s) +PROMETHEUS_K8S_TOKEN=$(oc create token prometheus-k8s) oc run curl --restart='Never' --image=quay.io/infrawatch/busyboxplus:curl -- sh -c "curl -k -H \"Content-Type: application/json\" -H \"Authorization: Bearer ${PROMETHEUS_K8S_TOKEN}\" -d '[{\"labels\":{\"alertname\":\"Testalert1\"}}]' https://default-alertmanager-proxy:9095/api/v1/alerts" # it takes some time to get the alert delivered, continuing with other tests @@ -146,7 +155,7 @@ oc logs "$(oc get pod -l app=default-snmp-webhook -o jsonpath='{.items[0].metada echo echo "*** [INFO] Logs from alertmanager..." -oc logs "$(oc get pod -l app=alertmanager -o jsonpath='{.items[0].metadata.name}')" -c alertmanager +oc logs "$(oc get pod -l app.kubernetes.io/name=alertmanager -o jsonpath='{.items[0].metadata.name}')" -c alertmanager echo echo "*** [INFO] Cleanup resources..." diff --git a/tests/smoketest/smoketest_job.yaml.template b/tests/smoketest/smoketest_job.yaml.template index dcf055faa..50735b6a5 100644 --- a/tests/smoketest/smoketest_job.yaml.template +++ b/tests/smoketest/smoketest_job.yaml.template @@ -21,9 +21,9 @@ spec: - name: CLOUDNAME value: <> - name: ELASTICSEARCH_AUTH_PASS - value: <> + value: "<>" - name: PROMETHEUS_AUTH_PASS - value: <> + value: "<>" volumeMounts: - name: collectd-config mountPath: /etc/minimal-collectd.conf.template @@ -48,9 +48,9 @@ spec: - name: CLOUDNAME value: <> - name: ELASTICSEARCH_AUTH_PASS - value: <> + value: "<>" - name: PROMETHEUS_AUTH_PASS - value: <> + value: "<>" volumeMounts: - name: ceilometer-publisher mountPath: /ceilometer_publish.py From f09797762f6db26e0a6c681767fc7c468c37c300 Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Tue, 22 Nov 2022 14:45:37 -0500 Subject: [PATCH 08/95] Force SGO checkout during build (#388) * Replacing the placeholder namespace during the build results in a "there are local changes" error on next build * This forces the checkout to discard that (and other!?) 
local changes * Quicker dev/test loop --- build/stf-run-ci/tasks/clone_repos.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/build/stf-run-ci/tasks/clone_repos.yml b/build/stf-run-ci/tasks/clone_repos.yml index 97e351bdc..73a00c8c9 100644 --- a/build/stf-run-ci/tasks/clone_repos.yml +++ b/build/stf-run-ci/tasks/clone_repos.yml @@ -10,6 +10,7 @@ repo: "{{ sgo_repository }}" dest: working/smart-gateway-operator version: "{{ sgo_branch | default(branch, true) }}" + force: yes rescue: - name: "Get {{ version_branches.sgo }} upstream branch because specified branch or repository doesn't exist" git: From 6811894b04b53266b836d6dfc384858c98f848b6 Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Tue, 22 Nov 2022 15:14:15 -0500 Subject: [PATCH 09/95] Update oc to 4.11 in jenkins agent (#393) * Update oc to 4.11 in jenkins agent Need 4.11 for new token handling changes --- .jenkins/agent/Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.jenkins/agent/Dockerfile b/.jenkins/agent/Dockerfile index 5e4b026d8..fe1a7fa70 100644 --- a/.jenkins/agent/Dockerfile +++ b/.jenkins/agent/Dockerfile @@ -6,3 +6,5 @@ RUN dnf install -y ansible golang python38 && \ alternatives --set python /usr/bin/python3.8 && \ python -m pip install openshift kubernetes "ansible-core~=2.12" && \ ansible-galaxy collection install -f 'kubernetes.core:>=2.2.0' community.general +RUN curl -LO "https://mirror.openshift.com/pub/openshift-v4/clients/ocp/latest-4.11/openshift-client-linux.tar.gz" && \ + tar -xv -C /usr/local/bin -f openshift-client-linux.tar.gz From e50f0573295847d238df547d07dab8a6e460892d Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Thu, 24 Nov 2022 16:21:59 -0500 Subject: [PATCH 10/95] Remove OperatorHub.io as a CatalogSource (#394) Remove the OperatorHub.io CatalogSource and instead use the community-operators CatalogSource which is available with an OCP installation. Ideally this will avoid some of the conflicts we've been seeing in our CI environment. 
This is a short term fix as future development will likely make use of Observability Operator to provide the metrics data store and alert delivery mechanism. --- build/stf-run-ci/tasks/pre-clean.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/build/stf-run-ci/tasks/pre-clean.yml b/build/stf-run-ci/tasks/pre-clean.yml index f443bfc2d..c370f5bf3 100644 --- a/build/stf-run-ci/tasks/pre-clean.yml +++ b/build/stf-run-ci/tasks/pre-clean.yml @@ -29,3 +29,22 @@ kind: clusterroles label_selectors: - "olm.owner.namespace = {{ namespace }}" + +# Clean the environment if it has OperatorHub.io CatalogSource still enabled as +# environment is using community-operators CatalogSource when use_community has +# been enabled. This avoids installing an additional CatalogSource which is no +# longer required. +- name: Remove OperatorHub.io CatalogSource if it installed + k8s: + state: absent + definition: + apiVersion: operators.coreos.com/v1alpha1 + kind: CatalogSource + metadata: + name: operatorhubio-operators + namespace: openshift-marketplace + spec: + sourceType: grpc + image: quay.io/operatorhubio/catalog:latest + displayName: OperatorHub.io Operators + publisher: OperatorHub.io From 37dceed7e55856820c10fb812da0ed9cd6551a3b Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Tue, 24 Jan 2023 16:54:43 -0500 Subject: [PATCH 11/95] Changes for 4.12 (#401) * Catalog changes * CI change to pre-clean cert-manager-operator * not 100% sure this is 4.12 related, but it's new and first seen during testing 4.12 --- build/stf-run-ci/tasks/pre-clean.yml | 11 +++++++++++ .../service-telemetry-operator/Dockerfile.in | 2 +- ...vice-telemetry-operator.clusterserviceversion.yaml | 2 +- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/build/stf-run-ci/tasks/pre-clean.yml b/build/stf-run-ci/tasks/pre-clean.yml index c370f5bf3..e5d7b2b16 100644 --- a/build/stf-run-ci/tasks/pre-clean.yml +++ b/build/stf-run-ci/tasks/pre-clean.yml @@ -48,3 +48,14 @@ image: 
quay.io/operatorhubio/catalog:latest displayName: OperatorHub.io Operators publisher: OperatorHub.io + +# Remove the cert manager since we install it as part of the CI/documented pre-install process +- name: Remove openshift-cert-manager-operator namespace + k8s: + state: absent + wait: yes + definition: + apiVersion: project.openshift.io/v1 + kind: Project + metadata: + name: openshift-cert-manager-operator \ No newline at end of file diff --git a/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in b/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in index 9639f4e1b..182dbf160 100644 --- a/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in +++ b/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in @@ -13,7 +13,7 @@ LABEL operators.operatorframework.io.metrics.mediatype.v1=metrics+v1 LABEL operators.operatorframework.io.metrics.builder=operator-sdk-v0.19.4 LABEL operators.operatorframework.io.metrics.project_layout=ansible LABEL com.redhat.delivery.operator.bundle=true -LABEL com.redhat.openshift.versions="v4.10-v4.11" +LABEL com.redhat.openshift.versions="v4.10-v4.12" LABEL com.redhat.delivery.backport=false LABEL com.redhat.component="service-telemetry-operator-bundle-container" \ diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml index 84289e887..eaff970ae 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml @@ -161,7 +161,7 @@ metadata: description: Service Telemetry Framework. Umbrella Operator for instantiating the required dependencies and configuration of various components to build a Service Telemetry platform for telco grade monitoring. 
- olm.properties: '[{"type": "olm.maxOpenShiftVersion", "value": "4.11"}]' + olm.properties: '[{"type": "olm.maxOpenShiftVersion", "value": "4.12"}]' olm.skipRange: '>=<> <<>' operatorframework.io/suggested-namespace: service-telemetry operators.openshift.io/valid-subscription: '["OpenStack Platform", "Cloud Infrastructure", From 73fbb736111d649b543d5b948ef773c7bc97ad87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jarom=C3=ADr=20Wysoglad?= Date: Thu, 23 Feb 2023 17:04:51 +0100 Subject: [PATCH 12/95] Remove Loki from stf-run-ci (#405) * Remove Loki from stf-run-ci * Return "Get new operator sdk" to stf-run-ci --- build/stf-run-ci/README.md | 3 - build/stf-run-ci/defaults/main.yml | 18 --- build/stf-run-ci/tasks/clone_repos.yml | 21 --- build/stf-run-ci/tasks/deploy_stf.yml | 2 +- build/stf-run-ci/tasks/main.yml | 73 +--------- build/stf-run-ci/tasks/pre-clean.yml | 3 +- .../tasks/setup_stf_local_build.yml | 129 ------------------ 7 files changed, 3 insertions(+), 246 deletions(-) diff --git a/build/stf-run-ci/README.md b/build/stf-run-ci/README.md index 4e654bb44..08a6d8086 100644 --- a/build/stf-run-ci/README.md +++ b/build/stf-run-ci/README.md @@ -41,12 +41,9 @@ choose to override: | `__service_telemetry_storage_ephemeral_enabled` | {true,false} | false | Whether to enable ephemeral storage support in ServiceTelemetry | | `__service_telemetry_storage_persistent_storage_class` | | | Set a custom storageClass to override the default provided by OpenShift platform | | `__service_telemetry_snmptraps_enabled` | {true,false} | true | Whether to enable snmptraps delivery via Alertmanager receiver (prometheus-webhook-snmp) | -| `__service_telemetry_logs_enabled` | {true,false} | false | Whether to enable logs support in ServiceTelemetry | | `__service_telemetry_observability_strategy` | | use_community | Which observability strategy to use for deployment. Default deployment is 'use_community'. 
Also supported is 'none' | | `__internal_registry_path` | | image-registry.openshift-image-registry.svc:5000 | Path to internal registry for image path | -| `__deploy_minio_enabled` | {true,false} | false | Whether to deploy minio while deploying loki-operator for logging development purposes | | `__deploy_loki_enabled` | {true,false} | false | Whether to deploy loki-operator and other systems for logging development purposes | -| `__loki_skip_tls_verify` | {true,false} | false | Whether to skip TLS verify for Loki S3 connection | | `__golang_image_path` | | quay.io/infrawatch/golang:1.16 | Golang image path for building the loki-operator image | | `__loki_image_path` | | quay.io/infrawatch/loki:2.2.1 | Loki image path for Loki microservices | diff --git a/build/stf-run-ci/defaults/main.yml b/build/stf-run-ci/defaults/main.yml index 880e0cc55..f41c89349 100644 --- a/build/stf-run-ci/defaults/main.yml +++ b/build/stf-run-ci/defaults/main.yml @@ -14,50 +14,32 @@ __service_telemetry_high_availability_enabled: false __service_telemetry_metrics_enabled: true __service_telemetry_storage_ephemeral_enabled: false __service_telemetry_snmptraps_enabled: true -__service_telemetry_logs_enabled: false __service_telemetry_observability_strategy: use_community __internal_registry_path: image-registry.openshift-image-registry.svc:5000 __service_telemetry_bundle_image_path: __smart_gateway_bundle_image_path: -__deploy_minio_enabled: false -__deploy_loki_enabled: false -__loki_skip_tls_verify: false -__golang_image_path: quay.io/infrawatch/golang:1.16 -__loki_image_path: quay.io/infrawatch/loki:2.4.2 - sgo_image_tag: latest sto_image_tag: latest sg_core_image_tag: latest sg_bridge_image_tag: latest prometheus_webhook_snmp_image_tag: latest -loki_operator_image_tag: latest new_operator_sdk_version: v1.11.0 -new_go_version: 1.16.3 namespace: service-telemetry pull_secret_registry: pull_secret_user: pull_secret_pass: -# Set a default commit hash to clone for loki-operator to freeze -# 
the operator developement. -loki_operator_branch: b8e9973 - - # used when building images to default to correct version branch for STF subcomponents per STF version version_branches: sgo: master sg_core: master sg_bridge: master prometheus_webhook_snmp: master - loki_operator: master sgo_repository: https://github.com/infrawatch/smart-gateway-operator sg_core_repository: https://github.com/infrawatch/sg-core sg_bridge_repository: https://github.com/infrawatch/sg-bridge prometheus_webhook_snmp_repository: https://github.com/infrawatch/prometheus-webhook-snmp -loki_operator_repository: https://github.com/grafana/loki - -loki_operator_folder: operator base_dir: '' diff --git a/build/stf-run-ci/tasks/clone_repos.yml b/build/stf-run-ci/tasks/clone_repos.yml index 73a00c8c9..11e5d84d6 100644 --- a/build/stf-run-ci/tasks/clone_repos.yml +++ b/build/stf-run-ci/tasks/clone_repos.yml @@ -60,24 +60,3 @@ dest: working/prometheus-webhook-snmp version: "{{ version_branches.prometheus_webhook_snmp }}" -# Branches for Loki Operator don't work the same as with other repositories. -# We don't have write access to the upstream repository to create our own -# branches there. 
-- name: Get Loki Operator - block: - - name: Try cloning same-named branch or override branch from loki repository - git: - repo: "{{ loki_operator_repository }}" - dest: working/loki - version: "{{ loki_operator_branch | default(branch, true) }}" - force: yes - rescue: - - name: "Get {{ version_branches.loki_operator }} upstream branch because specified branch or repository doesn't exist" - git: - repo: https://github.com/grafana/loki - dest: working/loki - version: "{{ version_branches.loki_operator }}" - force: yes - when: - - __deploy_loki_enabled | bool - - __service_telemetry_observability_strategy == "use_community" diff --git a/build/stf-run-ci/tasks/deploy_stf.yml b/build/stf-run-ci/tasks/deploy_stf.yml index 596509cf5..d165940e5 100644 --- a/build/stf-run-ci/tasks/deploy_stf.yml +++ b/build/stf-run-ci/tasks/deploy_stf.yml @@ -42,7 +42,7 @@ {% endif %} logs: loki: - enabled: {{ __service_telemetry_logs_enabled }} + enabled: false replicationFactor: 1 flavor: 1x.extra-small storage: diff --git a/build/stf-run-ci/tasks/main.yml b/build/stf-run-ci/tasks/main.yml index 5216d4acf..3041d22ea 100644 --- a/build/stf-run-ci/tasks/main.yml +++ b/build/stf-run-ci/tasks/main.yml @@ -12,8 +12,6 @@ sg_core_image_path: "{{ __internal_registry_path }}/{{ namespace }}/sg-core:{{ sg_core_image_tag }}" sg_bridge_image_path: "{{ __internal_registry_path }}/{{ namespace }}/sg-bridge:{{ sg_bridge_image_tag }}" prometheus_webhook_snmp_image_path: "{{ __internal_registry_path }}/{{ namespace }}/prometheus-webhook-snmp:{{ prometheus_webhook_snmp_image_tag }}" - loki_operator_image_path: "{{ __internal_registry_path }}/{{ namespace }}/loki-operator:{{ loki_operator_image_tag }}" - loki_operator_bundle_image_path: "{{ __internal_registry_path }}/{{ namespace }}/loki-operator-bundle:{{ loki_operator_image_tag }}" - name: Fail on mutually exclusive flags fail: @@ -43,7 +41,7 @@ when: base_dir | length == 0 - name: Get new operator sdk - when: __local_build_enabled | bool or 
__deploy_from_bundles_enabled | bool or __deploy_loki_enabled | bool + when: __local_build_enabled | bool or __deploy_from_bundles_enabled | bool command: "{{ base_dir }}/get_new_operator_sdk.sh {{ new_operator_sdk_version }}" - when: __local_build_enabled | bool @@ -53,61 +51,6 @@ tags: - clone - - block: - - name: Move loki-operator to loki-operator folder - command: rm -rf "{{ base_dir }}/working/loki-operator" - command: mv "{{ base_dir }}/working/loki/{{ loki_operator_folder }}" "{{ base_dir }}/working/loki-operator" - - - name: Get new go - command: "{{ base_dir }}/get_go.sh {{ new_go_version }}" - when: __deploy_loki_enabled | bool - - # TLS verification support doesn't seem to be implemented in the operator yet - - block: - - name: Prepare for skip Loki TLS patch - replace: - path: "{{ base_dir }}/working/loki-operator/internal/manifests/internal/config/loki-config.yaml" - regexp: "\ \ \ \ insecure: false\n -\ \ \ http_config:\n -\ \ \ \ \ insecure_skip_verify: true" - replace: "" - - - name: Skip Loki TLS verification - replace: - path: "{{ base_dir }}/working/loki-operator/internal/manifests/internal/config/loki-config.yaml" - regexp: "\ \ \ \ s3forcepathstyle: true" - replace: "\ \ \ \ s3forcepathstyle: true\n -\ \ \ insecure: false\n -\ \ \ http_config:\n -\ \ \ \ \ insecure_skip_verify: true" - when: - - __deploy_loki_enabled | bool - - __loki_skip_tls_verify | bool - - __service_telemetry_observability_strategy == "use_community" - - - name: Remove forced multi-tenancy from loki-operator config - replace: - path: "{{ base_dir }}/working/loki-operator/internal/manifests/internal/config/loki-config.yaml" - regexp: "auth_enabled: true" - replace: "auth_enabled: false" - when: __deploy_loki_enabled | bool - - - block: - - name: Replace loki-operator golang base image - replace: - path: "{{ base_dir }}/working/loki-operator/Dockerfile" - regexp: "FROM golang:1.16 as builder" - replace: "FROM {{ __golang_image_path }} as builder" - - - name: Replace Loki 
image - replace: - path: "{{ base_dir }}/working/loki-operator/internal/manifests/var.go" - regexp: "docker.io/grafana/loki:\\d\\.\\d\\.\\d" - replace: "{{ __loki_image_path }}" - when: - - __deploy_loki_enabled | bool - - __service_telemetry_observability_strategy == "use_community" - - name: Create base build list set_fact: build_list: @@ -117,20 +60,6 @@ - { name: sg-bridge, dockerfile_path: build/Dockerfile, image_reference_name: sg_bridge_image_path, working_build_dir: ./working/sg-bridge } - { name: prometheus-webhook-snmp, dockerfile_path: Dockerfile, image_reference_name: prometheus_webhook_snmp_image_path, working_build_dir: ./working/prometheus-webhook-snmp } - - block: - - name: Create Loki build list - set_fact: - loki_build_list: - - { name: loki-operator-bundle, dockerfile_path: bundle.Dockerfile, image_reference_name: loki_operator_bundle_image_path, working_build_dir: ./working/loki-operator } - - { name: loki-operator, dockerfile_path: Dockerfile, image_reference_name: loki_operator_image_path, working_build_dir: ./working/loki-operator } - - - name: Combine lists when community operators are enabled - set_fact: - build_list: "{{ build_list + loki_build_list }}" - when: - - __deploy_loki_enabled | bool - - __service_telemetry_observability_strategy == "use_community" - - debug: var: build_list diff --git a/build/stf-run-ci/tasks/pre-clean.yml b/build/stf-run-ci/tasks/pre-clean.yml index e5d7b2b16..d86093cce 100644 --- a/build/stf-run-ci/tasks/pre-clean.yml +++ b/build/stf-run-ci/tasks/pre-clean.yml @@ -8,7 +8,6 @@ loop: - smartgateways.smartgateway.infra.watch - servicetelemetrys.infra.watch - - lokistacks.loki.openshift.io tags: - clean-crds @@ -58,4 +57,4 @@ apiVersion: project.openshift.io/v1 kind: Project metadata: - name: openshift-cert-manager-operator \ No newline at end of file + name: openshift-cert-manager-operator diff --git a/build/stf-run-ci/tasks/setup_stf_local_build.yml b/build/stf-run-ci/tasks/setup_stf_local_build.yml index 
baf17138e..a7c3c2578 100644 --- a/build/stf-run-ci/tasks/setup_stf_local_build.yml +++ b/build/stf-run-ci/tasks/setup_stf_local_build.yml @@ -81,132 +81,3 @@ - name: Load Service Telemetry Operator CSV shell: oc apply -f working/service-telemetry-operator-bundle/manifests/service-telemetry-operator.clusterserviceversion.yaml -n "{{ namespace }}" -# --- Loki Operator --- -- block: - - name: Prevent Loki Operator from building operator-sdk - replace: - path: "{{ base_dir }}/working/loki-operator/.bingo/Variables.mk" - regexp: '^.*modfile=operator-sdk.mod.*$' - replace: '' - - - name: Prevent Loki Operator from replacing GOBIN - replace: - path: "{{ base_dir }}/working/loki-operator/.bingo/Variables.mk" - regexp: '^GOBIN.*$' - replace: 'GOBIN ?= $(shell go env GOBIN)' - - - name: Prevent Loki Operator from using system golang - replace: - path: "{{ base_dir }}/working/loki-operator/.bingo/Variables.mk" - regexp: '^GO .*$' - replace: 'GO ?= $(GOBIN)"/go"' - - - name: Prevent Loki Operator from putting authentication on /metrics - replace: - path: "{{ base_dir }}/working/loki-operator/config/overlays/openshift/kustomization.yaml" - regexp: '{{ item }}' - replace: '' - loop: - - "patchesStrategicMerge:" - - "- manager_auth_proxy_patch.yaml" - - "- manager_related_image_patch.yaml" - - "- manager_run_flags_patch.yaml" - - "- prometheus_service_monitor_patch.yaml" - - - name: Generate Loki Operator CSV - make: - chdir: "{{ base_dir }}/working/loki-operator" - target: bundle - params: - REGISTRY_ORG: infrawatch - OPERATOR_SDK: "{{ base_dir }}/working/operator-sdk" - GOROOT: "{{ base_dir }}/working/go" - GOTOOLDIR: "{{ base_dir }}/working/go/pkg/tool/linux_amd64" - GOBIN: "{{ base_dir }}/working/go/bin" - - - name: Replace namespace in loki-operator CSV - replace: - path: "{{ base_dir }}/working/loki-operator/bundle/manifests/loki-operator.clusterserviceversion.yaml" - regexp: 'placeholder' - replace: '{{ namespace }}' - - - name: Replace image path in loki-operator CSV - 
replace: - path: "{{ base_dir }}/working/loki-operator/bundle/manifests/loki-operator.clusterserviceversion.yaml" - regexp: '{{ item }}' - replace: '{{ loki_operator_image_path }}' - loop: - - quay.io/infrawatch/loki-operator:v0.0.1 - - quay.io/openshift-logging/loki-operator:v0.0.1 - - - name: Replace namespace in loki-operator - replace: - path: "{{ base_dir }}/working/loki-operator/config/overlays/development/kustomization.yaml" - regexp: 'default' - replace: '{{ namespace }}' - - - name: Remove additional manager deployment - replace: - path: "{{ base_dir }}/working/loki-operator/config/overlays/development/kustomization.yaml" - regexp: '^.*manager' - replace: '' - - - name: Remove unnecessary patches - replace: - path: "{{ base_dir }}/working/loki-operator/config/overlays/development/kustomization.yaml" - regexp: '.*patch.*' - replace: '' - - - name: Setup PVs for Loki in crc - shell: - cmd: ./create_standard_pvs.sh 4 - when: - - is_crc | bool - - __service_telemetry_logs_enabled | bool - - - name: Replace namespace in S3 secret - replace: - path: "{{ base_dir }}/working/loki-operator/config/overlays/development/minio/secret.yaml" - regexp: 'default' - replace: '{{ namespace }}' - when: - - __deploy_loki_enabled | bool - - __service_telemetry_observability_strategy == "use_community" - -- block: - - name: Remove minio deployment - replace: - path: "{{ base_dir }}/working/loki-operator/config/overlays/development/kustomization.yaml" - regexp: '^.*minio' - replace: '' - when: - - not __deploy_minio_enabled | bool or - not __service_telemetry_observability_strategy == "use_community" or - not __service_telemetry_logs_enabled | bool - - __deploy_loki_enabled | bool - -- block: - - name: Deploy Loki Operator - make: - chdir: "{{ base_dir }}/working/loki-operator" - target: deploy - params: - REGISTRY_ORG: infrawatch - OPERATOR_SDK: "{{ base_dir }}/working/operator-sdk" - GOROOT: "{{ base_dir }}/working/go" - GOTOOLDIR: "{{ base_dir 
}}/working/go/pkg/tool/linux_amd64" - GOBIN: "{{ base_dir }}/working/go/bin" - - - name: Load Loki Operator bundle manifests - command: oc apply -f working/loki-operator/bundle/manifests/{{ item }} -n "{{ namespace }}" - loop: - - loki.grafana.com_lokistacks.yaml - - loki-operator-controller-manager-metrics-service_v1_service.yaml - - loki-operator-manager-config_v1_configmap.yaml - - loki-operator-metrics-reader_rbac.authorization.k8s.io_v1_clusterrole.yaml - - loki-operator-prometheus_rbac.authorization.k8s.io_v1_rolebinding.yaml - - loki-operator-prometheus_rbac.authorization.k8s.io_v1_role.yaml - - loki-operator.clusterserviceversion.yaml - when: - - __deploy_loki_enabled | bool - - __service_telemetry_observability_strategy == "use_community" From 7687cd7744cf118a0abbbb550e1bbd148efeeffb Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Tue, 28 Feb 2023 09:09:20 -0500 Subject: [PATCH 13/95] GHA checkout action v2 is deprecated (#407) The GitHub Actions checkout action v2 is deprecated and needs to move to version 3. --- .github/workflows/main.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index d5e04ce09..d774c0e24 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -7,7 +7,7 @@ jobs: runs-on: ubuntu-20.04 steps: - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Install Ansible run: python -m pip install 'ansible <= 2.9' @@ -33,7 +33,7 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Verify image builds run: docker build --tag infrawatch/service-telemetry-operator:latest --file build/Dockerfile . 
@@ -44,7 +44,7 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Get operator-sdk image run: curl --output operator-sdk -JL https://github.com/operator-framework/operator-sdk/releases/download/$RELEASE_VERSION/operator-sdk-$RELEASE_VERSION-x86_64-linux-gnu From 16324bc2567f32e711f029c4a986997b792ac23c Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Tue, 28 Feb 2023 15:27:52 -0500 Subject: [PATCH 14/95] Implement SNMPtrap delivery controls (#404) * Implement SNMPtrap delivery controls Implement ability to override the default values for the SNMPtrap alertmanager receiver via prometheus-webhook-snmp component. Closes: STF-559 * Run operator-sdk generate bundle Run the following command to update the bundle artifacts: operator-sdk-0.19.4 generate bundle --metadata --manifests --channels unstable --default-channel unstable * Build out the remaining SNMP options Build out the remaining options for prometheus-webhook-snmp to allow for finer grained controls and delivery of SNMP traps via alertmanager alerts. 
* Generate bundle contents with operator-sdk --- build/stf-run-ci/README.md | 20 ++++++++--- build/stf-run-ci/defaults/main.yml | 10 ++++++ build/stf-run-ci/tasks/deploy_stf.yml | 9 +++++ .../infra.watch_servicetelemetrys_crd.yaml | 26 +++++++++++++- ...fra.watch_v1beta1_servicetelemetry_cr.yaml | 8 +++++ .../infra.watch_servicetelemetrys_crd.yaml | 34 ++++++++++++++++++- ...emetry-operator.clusterserviceversion.yaml | 10 +++++- roles/servicetelemetry/defaults/main.yml | 8 +++++ .../templates/manifest_snmp_traps.j2 | 16 +++++++-- 9 files changed, 130 insertions(+), 11 deletions(-) diff --git a/build/stf-run-ci/README.md b/build/stf-run-ci/README.md index 08a6d8086..add2f33dc 100644 --- a/build/stf-run-ci/README.md +++ b/build/stf-run-ci/README.md @@ -21,10 +21,10 @@ choose to override: | Parameter name | Values | Default | Description | | ------------------------------ | ------------ | --------- | ------------------------------------ | | `__deploy_stf` | {true,false} | true | Whether to deploy an instance of STF | -| `__local_build_enabled` | {true,false} | true | Whether to deploy STF from local built artifacts. Also see `working_branch`, `sg_branch`, `sgo_branch` | -| `__deploy_from_bundles_enabled` | {true,false} | false | Whether to deploy STF from OLM bundles (TODO: compat with __local_build_enabled) | -| `__service_telemetry_bundle_image_path` | | | Image path to Service Telemetry Operator bundle | -| `__smart_gateway_bundle_image_path` | | | Image path to Smart Gateway Operator bundle | +| `__local_build_enabled` | {true,false} | true | Whether to deploy STF from local built artifacts. 
Also see `working_branch`, `sg_branch`, `sgo_branch` | +| `__deploy_from_bundles_enabled` | {true,false} | false | Whether to deploy STF from OLM bundles (TODO: compat with `__local_build_enabled`) | +| `__service_telemetry_bundle_image_path` | | | Image path to Service Telemetry Operator bundle | +| `__smart_gateway_bundle_image_path` | | | Image path to Smart Gateway Operator bundle | | `prometheus_webhook_snmp_branch` | | master | Which Prometheus Webhook SNMP git branch to checkout | | `sgo_branch` | | master | Which Smart Gateway Operator git branch to checkout | | `sg_core_branch` | | master | Which Smart Gateway Core git branch to checkout | @@ -41,7 +41,17 @@ choose to override: | `__service_telemetry_storage_ephemeral_enabled` | {true,false} | false | Whether to enable ephemeral storage support in ServiceTelemetry | | `__service_telemetry_storage_persistent_storage_class` | | | Set a custom storageClass to override the default provided by OpenShift platform | | `__service_telemetry_snmptraps_enabled` | {true,false} | true | Whether to enable snmptraps delivery via Alertmanager receiver (prometheus-webhook-snmp) | -| `__service_telemetry_observability_strategy` | | use_community | Which observability strategy to use for deployment. Default deployment is 'use_community'. Also supported is 'none' | +| `__service_telemetry_snmptraps_community` | | `public` | Set the SNMP community to send traps to. Defaults to public | +| `__service_telemetry_snmptraps_target` | | `192.168.24.254` | Set the SNMP target to send traps to. Defaults to 192.168.24.254 | +| `__service_telemetry_snmptraps_retries` | | 5 | Set the SNMP retry count for traps. Defaults to 5 | +| `__service_telemetry_snmptraps_port` | | 162 | Set the SNMP target port for traps. Defaults to 162 | +| `__service_telemetry_snmptraps_timeout` | | 1 | Set the SNMP retry timeout (in seconds). Defaults to 1 | +| `__service_telemetry_alert_oid_label` | | oid | The alert label name to look for oid value. 
Default to oid. | +| `__service_telemetry_trap_oid_prefix` | | 1.3.6.1.4.1.50495.15 | The OID prefix for trap variable bindings. | +| `__service_telemetry_trap_default_oid` | | 1.3.6.1.4.1.50495.15.1.2.1 | The trap OID if none is found in the Prometheus alert labels. | +| `__service_telemetry_trap_default_severity` | | | The trap severity if none is found in the Prometheus alert labels. | +| `__service_telemetry_logs_enabled` | {true,false} | false | Whether to enable logs support in ServiceTelemetry | +| `__service_telemetry_observability_strategy` | | `use_community` | Which observability strategy to use for deployment. Default deployment is 'use_community'. Also supported is 'none' | | `__internal_registry_path` | | image-registry.openshift-image-registry.svc:5000 | Path to internal registry for image path | | `__deploy_loki_enabled` | {true,false} | false | Whether to deploy loki-operator and other systems for logging development purposes | | `__golang_image_path` | | quay.io/infrawatch/golang:1.16 | Golang image path for building the loki-operator image | diff --git a/build/stf-run-ci/defaults/main.yml b/build/stf-run-ci/defaults/main.yml index f41c89349..31089ba66 100644 --- a/build/stf-run-ci/defaults/main.yml +++ b/build/stf-run-ci/defaults/main.yml @@ -14,6 +14,16 @@ __service_telemetry_high_availability_enabled: false __service_telemetry_metrics_enabled: true __service_telemetry_storage_ephemeral_enabled: false __service_telemetry_snmptraps_enabled: true +__service_telemetry_snmptraps_target: "192.168.24.254" +__service_telemetry_snmptraps_community: "public" +__service_telemetry_snmptraps_retries: 5 +__service_telemetry_snmptraps_timeout: 1 +__service_telemetry_snmptraps_port: 162 +__service_telemetry_snmptraps_alert_oid_label: "oid" +__service_telemetry_snmptraps_trap_oid_prefix: "1.3.6.1.4.1.50495.15" +__service_telemetry_snmptraps_trap_default_oid: "1.3.6.1.4.1.50495.15.1.2.1" +__service_telemetry_snmptraps_trap_default_severity: "" 
+__service_telemetry_logs_enabled: false __service_telemetry_observability_strategy: use_community __internal_registry_path: image-registry.openshift-image-registry.svc:5000 __service_telemetry_bundle_image_path: diff --git a/build/stf-run-ci/tasks/deploy_stf.yml b/build/stf-run-ci/tasks/deploy_stf.yml index d165940e5..ba8746f1b 100644 --- a/build/stf-run-ci/tasks/deploy_stf.yml +++ b/build/stf-run-ci/tasks/deploy_stf.yml @@ -21,6 +21,15 @@ receivers: snmpTraps: enabled: {{ __service_telemetry_snmptraps_enabled }} + target: "{{ __service_telemetry_snmptraps_target }}" + community: "{{ __service_telemetry_snmptraps_community }}" + retries: {{ __service_telemetry_snmptraps_retries }} + port: {{ __service_telemetry_snmptraps_port }} + timeout: {{ __service_telemetry_snmptraps_timeout }} + alertOidLabel: "{{ __service_telemetry_snmptraps_alert_oid_label }}" + trapOidPrefix: "{{ __service_telemetry_snmptraps_trap_oid_prefix }}" + trapDefaultOid: "{{ __service_telemetry_snmptraps_trap_default_oid }}" + trapDefaultSeverity: "{{ __service_telemetry_snmptraps_trap_default_severity }}" backends: events: elasticsearch: diff --git a/deploy/crds/infra.watch_servicetelemetrys_crd.yaml b/deploy/crds/infra.watch_servicetelemetrys_crd.yaml index 2427b4eb6..496da5012 100644 --- a/deploy/crds/infra.watch_servicetelemetrys_crd.yaml +++ b/deploy/crds/infra.watch_servicetelemetrys_crd.yaml @@ -56,8 +56,32 @@ spec: enabled: description: Deploy container to send snmp traps type: boolean + community: + description: 'Target community for SNMP traps. Default is "public"' + type: string target: - description: Target address for SNMP traps to send to + description: 'Target address for SNMP traps to send to.' + type: string + retries: + description: 'SNMP trap delivery retry limit. Default is 5' + type: integer + timeout: + description: 'Response timeout, in seconds. Default is 1' + type: integer + port: + description: 'SNMP track delivery port. 
Default is 162' + type: integer + alertOidLabel: + description: 'Label for finding the OID. Default is "oid"' + type: string + trapOidPrefix: + description: 'OID prefix for the trap variable bindings. Default is "1.3.6.1.4.1.50495.15"' + type: string + trapDefaultOid: + description: 'The trap OID if none is found in the Prometheus alert labels. Default is "1.3.6.1.4.1.50495.15.1.2.1"' + type: string + trapDefaultSeverity: + description: 'The trap severity if none is found in the Prometheus alert labels. Default is empty.' type: string type: object type: object diff --git a/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml b/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml index 7b5cfb63e..5c202f5bb 100644 --- a/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml +++ b/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml @@ -10,7 +10,15 @@ spec: receivers: snmpTraps: enabled: false + community: public target: 192.168.24.254 + retries: 5 + port: 162 + timeout: 1 + alertOidLabel: oid + trapOidPrefix: "1.3.6.1.4.1.50495.15" + trapDefaultOid: "1.3.6.1.4.1.50495.15.1.2.1" + trapDefaultSeverity: "" storage: strategy: persistent persistent: diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml index 8a196cf74..486793cc9 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml @@ -46,12 +46,44 @@ spec: properties: snmpTraps: properties: + alertOidLabel: + description: Label for finding the OID. Default is + "oid" + type: string + community: + description: Target community for SNMP traps. Default + is "public" + type: string enabled: description: Deploy container to send snmp traps type: boolean + port: + description: SNMP track delivery port. 
Default is + 162 + type: integer + retries: + description: SNMP trap delivery retry limit. Default + is 5 + type: integer target: description: Target address for SNMP traps to send - to + to. + type: string + timeout: + description: Response timeout, in seconds. Default + is 1 + type: integer + trapDefaultOid: + description: The trap OID if none is found in the + Prometheus alert labels. Default is "1.3.6.1.4.1.50495.15.1.2.1" + type: string + trapDefaultSeverity: + description: The trap severity if none is found in + the Prometheus alert labels. Default is empty. + type: string + trapOidPrefix: + description: OID prefix for the trap variable bindings. + Default is "1.3.6.1.4.1.50495.15" type: string type: object type: object diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml index eaff970ae..c21b0909f 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml @@ -15,8 +15,16 @@ metadata: "alertmanager": { "receivers": { "snmpTraps": { + "alertOidLabel": "oid", + "community": "public", "enabled": false, - "target": "192.168.24.254" + "port": 162, + "retries": 5, + "target": "192.168.24.254", + "timeout": 1, + "trapDefaultOid": "1.3.6.1.4.1.50495.15.1.2.1", + "trapDefaultSeverity": "", + "trapOidPrefix": "1.3.6.1.4.1.50495.15" } }, "storage": { diff --git a/roles/servicetelemetry/defaults/main.yml b/roles/servicetelemetry/defaults/main.yml index 2ba6d4b43..70539d791 100644 --- a/roles/servicetelemetry/defaults/main.yml +++ b/roles/servicetelemetry/defaults/main.yml @@ -24,7 +24,15 @@ servicetelemetry_defaults: receivers: snmp_traps: enabled: false + community: public target: 192.168.24.254 + retries: 5 + 
timeout: 1 + port: 162 + alert_oid_label: "oid" + trap_oid_prefix: "1.3.6.1.4.1.50495.15" + trap_default_oid: "1.3.6.1.4.1.50495.15.1.2.1" + trap_default_severity: "" backends: metrics: diff --git a/roles/servicetelemetry/templates/manifest_snmp_traps.j2 b/roles/servicetelemetry/templates/manifest_snmp_traps.j2 index 64292a4d2..b4a48445b 100644 --- a/roles/servicetelemetry/templates/manifest_snmp_traps.j2 +++ b/roles/servicetelemetry/templates/manifest_snmp_traps.j2 @@ -20,10 +20,20 @@ spec: - containerPort: 9099 env: - name: SNMP_COMMUNITY - value: public + value: "{{ servicetelemetry_vars.alerting.alertmanager.receivers.snmp_traps.community }}" - name: SNMP_RETRIES - value: "1" + value: "{{ servicetelemetry_vars.alerting.alertmanager.receivers.snmp_traps.retries }}" - name: SNMP_HOST value: "{{ servicetelemetry_vars.alerting.alertmanager.receivers.snmp_traps.target }}" - name: SNMP_PORT - value: "162" + value: "{{ servicetelemetry_vars.alerting.alertmanager.receivers.snmp_traps.port }}" + - name: SNMP_TIMEOUT + value: "{{ servicetelemetry_vars.alerting.alertmanager.receivers.snmp_traps.timeout }}" + - name: ALERT_OID_LABEL + value: "{{ servicetelemetry_vars.alerting.alertmanager.receivers.snmp_traps.alert_oid_label }}" + - name: TRAP_OID_PREFIX + value: "{{ servicetelemetry_vars.alerting.alertmanager.receivers.snmp_traps.trap_oid_prefix }}" + - name: TRAP_DEFAULT_OID + value: "{{ servicetelemetry_vars.alerting.alertmanager.receivers.snmp_traps.trap_default_oid }}" + - name: TRAP_DEFAULT_SEVERITY + value: "{{ servicetelemetry_vars.alerting.alertmanager.receivers.snmp_traps.trap_default_severity }}" From aa12146341daae475255142e1349d9ae1b623871 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Fri, 3 Mar 2023 09:55:46 -0500 Subject: [PATCH 15/95] Implement changes for operator-sdk-1.26.0 testing (#411) * Implement changes for operator-sdk-1.26.0 testing Implement changes that allow testing validation via operator-sdk-1.26.0 without bumping the entire bundle generation 
process from operator-sdk-0.19.4 to post-operator-sdk-1.x. These are the same tests run for validation during product pipeline verification. * Adds test to verify building of the bundle image works. * Adds KinD deployment to allow executing scorecard checks. Related: STF-1252 * Fix properties.yaml * Simplify use of RELEASE_VERSION variable (#412) * Add note about why we're copying files in --- .github/workflows/main.yml | 70 +++++++++++++++---- build/generate_bundle.sh | 13 ++++ ...emetry-operator.clusterserviceversion.yaml | 1 - .../metadata/properties.yaml | 3 + .../tests/scorecard/config.yaml | 21 ++++++ 5 files changed, 94 insertions(+), 14 deletions(-) create mode 100644 deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml create mode 100644 deploy/olm-catalog/service-telemetry-operator/tests/scorecard/config.yaml diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index d774c0e24..5cbce3e9e 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -20,15 +20,9 @@ jobs: - name: Lint Ansible roles/servicetelemetry/ directory run: ${HOME}/.local/bin/ansible-lint roles/servicetelemetry -# TODO: requires a bunch of work on our bash scripts, or finesse -# - name: Run Super-Linter -# uses: github/super-linter@v3 -# env: -# DEFAULT_BRANCH: master -# GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - build-check: - name: Build check + + build-operator-check: + name: Build Operator check runs-on: ubuntu-20.04 steps: @@ -38,15 +32,46 @@ jobs: - name: Verify image builds run: docker build --tag infrawatch/service-telemetry-operator:latest --file build/Dockerfile . 
- bundle-check: - name: Bundle check + build-bundle-check: + name: Build bundle check runs-on: ubuntu-20.04 + env: + RELEASE_VERSION: v0.19.4 steps: - name: Checkout code uses: actions/checkout@v3 - - name: Get operator-sdk image + - name: Get operator-sdk image 0.19.4 + run: curl --output operator-sdk -JL https://github.com/operator-framework/operator-sdk/releases/download/$RELEASE_VERSION/operator-sdk-$RELEASE_VERSION-x86_64-linux-gnu + + - name: Make operator-sdk executable + run: chmod +x operator-sdk + + - name: Move operator-sdk binary + run: sudo mv operator-sdk /usr/local/bin + + - name: Create working directory + run: mkdir /tmp/bundle + + - name: Generate bundle + run: WORKING_DIR=/tmp/bundle ./build/generate_bundle.sh + + - name: Verify image builds + run: docker build --tag infrawatch/service-telemetry-operator:latest --file build/Dockerfile . + + check-bundle-validation-scorecard: + name: Validate the generated bundle and perform scorecard checks + runs-on: ubuntu-20.04 + env: + RELEASE_VERSION: v1.26.0 + + steps: + - name: Checkout code + uses: actions/checkout@v3 + + # prepare environment to buld the bundle + - name: Get operator-sdk image 0.19.4 run: curl --output operator-sdk -JL https://github.com/operator-framework/operator-sdk/releases/download/$RELEASE_VERSION/operator-sdk-$RELEASE_VERSION-x86_64-linux-gnu env: RELEASE_VERSION: v0.19.4 @@ -60,8 +85,27 @@ jobs: - name: Create working directory run: mkdir /tmp/bundle + # generate the bundle using operator-sdk-0.19.4 - name: Generate bundle run: WORKING_DIR=/tmp/bundle ./build/generate_bundle.sh + # prepare the environment to run bundle validation and bundle scorecard checks + - name: Get operator-sdk image 1.26.0 + run: curl --output operator-sdk-$RELEASE_VERSION -JL https://github.com/operator-framework/operator-sdk/releases/download/$RELEASE_VERSION/operator-sdk_linux_amd64 + + - name: Make operator-sdk executable + run: chmod +x operator-sdk-$RELEASE_VERSION + + - name: Move operator-sdk 
+      - name: Check scorecard validation
+ echo "-- Copy extra metadata in" + pushd "${REL}/../" + cp -r ./deploy/olm-catalog/service-telemetry-operator/tests/ "${WORKING_DIR}" + cp ./deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml "${WORKING_DIR}/metadata/" +} + build_bundle_instructions() { echo "-- Commands to create a bundle build" echo docker build -t "${OPERATOR_BUNDLE_IMAGE}:${OPERATOR_BUNDLE_VERSION}" -f "${WORKING_DIR}/Dockerfile" "${WORKING_DIR}" @@ -51,5 +63,6 @@ generate_version create_working_dir generate_dockerfile generate_bundle +copy_extra_metadata build_bundle_instructions echo "## End Bundle creation" diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml index c21b0909f..175b752dc 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml @@ -169,7 +169,6 @@ metadata: description: Service Telemetry Framework. Umbrella Operator for instantiating the required dependencies and configuration of various components to build a Service Telemetry platform for telco grade monitoring. 
+apiVersion: scorecard.operatorframework.io/v1alpha3
Set default value for those to 70080h (previous value) Removes the certificate_duration param from the Issuer resource since it's not actually needed (see [0]) [0] https://cert-manager.io/docs/reference/api-docs/#cert-manager.io/v1.IssuerConfig * Exposes CA and endpoint certificate duration config Exposes certificate duration config for both ElasticSearch and QDR Keeps the default value in use for now. Better default values should be discussed to be included in a follow up change. * Fix identation for certs duration param in servicetelemetry crd * Adds cert duration to the OLM catalog Includes cert duration params in the OLM catalog for both ElasticSearch and QDR * Changes snake_case to camelCase to yaml case Fix to match style convention * Adds pattern expresion for certs duration * Add certificates param to events and transport * Exposes duration parameter in the CI script Adds the duration parameter for both ElasticSearch and QDR in the CI script Also updates the OLM Catalog with the latest changes (certificates object) * Corrects naming to certificates params in CI script * Fix snake cae in the CI script params for cert duration * Fix identation for transports in the deploy_stf CI script --------- Co-authored-by: Chris Sibbitt --- build/stf-run-ci/README.md | 4 +++ build/stf-run-ci/defaults/main.yml | 4 +++ build/stf-run-ci/tasks/deploy_stf.yml | 9 +++++ .../infra.watch_servicetelemetrys_crd.yaml | 26 ++++++++++++++ ...fra.watch_v1beta1_servicetelemetry_cr.yaml | 6 ++++ .../infra.watch_servicetelemetrys_crd.yaml | 34 +++++++++++++++++++ ...emetry-operator.clusterserviceversion.yaml | 8 +++++ roles/servicetelemetry/defaults/main.yml | 8 +++-- .../tasks/_local_signing_authority.yml | 6 ++-- .../servicetelemetry/tasks/component_qdr.yml | 13 +++---- 10 files changed, 104 insertions(+), 14 deletions(-) diff --git a/build/stf-run-ci/README.md b/build/stf-run-ci/README.md index add2f33dc..87ca2f0aa 100644 --- a/build/stf-run-ci/README.md +++ 
b/build/stf-run-ci/README.md @@ -35,6 +35,8 @@ choose to override: | `sg_bridge_repository` | | https://github.com/infrawatch/sg-bridge | Which Smart Gateway Bridge git repository to clone | | `prometheus_webhook_snmp_repository` | | https://github.com/infrawatch/prometheus-webhook-snmp | Which Prometheus webhook snmp git repository to clone | | `loki_operator_repository` | | https://github.com/viaq/loki-operator | Which Loki-operator git repository to clone | +| `__service_telemetry_events_certificates_endpoint_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 2160h | Lifetime of the ElasticSearch endpoint certificate (minimum duration is 1h) | +| `__service_telemetry_events_certificates_ca_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the ElasticSearch CA certificate (minimum duration is 1h) | | `__service_telemetry_events_enabled` | {true,false} | true | Whether to enable events support in ServiceTelemetry | | `__service_telemetry_high_availability_enabled` | {true,false} | false | Whether to enable high availability support in ServiceTelemetry | | `__service_telemetry_metrics_enabled` | {true,false} | true | Whether to enable metrics support in ServiceTelemetry | @@ -52,6 +54,8 @@ choose to override: | `__service_telemetry_trap_default_severity` | | | The trap severity if none is found in the Prometheus alert labels. | | `__service_telemetry_logs_enabled` | {true,false} | false | Whether to enable logs support in ServiceTelemetry | | `__service_telemetry_observability_strategy` | | `use_community` | Which observability strategy to use for deployment. Default deployment is 'use_community'. 
Also supported is 'none' | +| `__service_telemetry_transports_certificates_endpoint_cert_duration`| [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 2160h | Lifetime of the QDR endpoint certificate (minimum duration is 1h) | +| `__service_telemetry_transports_certificates_ca_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the QDR CA certificate (minimum duration is 1h) | | `__internal_registry_path` | | image-registry.openshift-image-registry.svc:5000 | Path to internal registry for image path | | `__deploy_loki_enabled` | {true,false} | false | Whether to deploy loki-operator and other systems for logging development purposes | | `__golang_image_path` | | quay.io/infrawatch/golang:1.16 | Golang image path for building the loki-operator image | diff --git a/build/stf-run-ci/defaults/main.yml b/build/stf-run-ci/defaults/main.yml index 31089ba66..9b54f0ce3 100644 --- a/build/stf-run-ci/defaults/main.yml +++ b/build/stf-run-ci/defaults/main.yml @@ -9,6 +9,8 @@ __local_build_enabled: true __deploy_from_bundles_enabled: false __deploy_stf: true +__service_telemetry_events_certificates_endpoint_cert_duration: 70080h +__service_telemetry_events_certificates_ca_cert_duration: 2160h __service_telemetry_events_enabled: true __service_telemetry_high_availability_enabled: false __service_telemetry_metrics_enabled: true @@ -25,6 +27,8 @@ __service_telemetry_snmptraps_trap_default_oid: "1.3.6.1.4.1.50495.15.1.2.1" __service_telemetry_snmptraps_trap_default_severity: "" __service_telemetry_logs_enabled: false __service_telemetry_observability_strategy: use_community +__service_telemetry_transports_certificates_endpoint_cert_duration: 70080h +__service_telemetry_transports_certificates_ca_cert_duration: 2160h __internal_registry_path: image-registry.openshift-image-registry.svc:5000 __service_telemetry_bundle_image_path: __smart_gateway_bundle_image_path: diff --git a/build/stf-run-ci/tasks/deploy_stf.yml 
b/build/stf-run-ci/tasks/deploy_stf.yml index ba8746f1b..bc49897c0 100644 --- a/build/stf-run-ci/tasks/deploy_stf.yml +++ b/build/stf-run-ci/tasks/deploy_stf.yml @@ -40,6 +40,9 @@ persistent: storageClass: {{ __service_telemetry_storage_persistent_storage_class }} {% endif %} + certificates: + endpointCertDuration: {{ __service_telemetry_events_certificates_endpoint_cert_duration }} + caCertDuration: {{ __service_telemetry_events_certificates_ca_cert_duration }} metrics: prometheus: enabled: {{ __service_telemetry_metrics_enabled }} @@ -59,6 +62,12 @@ {% if __service_telemetry_storage_persistent_storage_class is defined %} storageClass: {{ __service_telemetry_storage_persistent_storage_class }} {% endif %} + transports: + qdr: + enabled: true + certificates: + endpointCertDuration: {{ __service_telemetry_transports_certificates_endpoint_cert_duration }} + caCertDuration: {{ __service_telemetry_transports_certificates_ca_cert_duration }} highAvailability: enabled: {{ __service_telemetry_high_availability_enabled }} when: diff --git a/deploy/crds/infra.watch_servicetelemetrys_crd.yaml b/deploy/crds/infra.watch_servicetelemetrys_crd.yaml index 496da5012..21e9f8652 100644 --- a/deploy/crds/infra.watch_servicetelemetrys_crd.yaml +++ b/deploy/crds/infra.watch_servicetelemetrys_crd.yaml @@ -199,6 +199,19 @@ spec: type: string type: object type: object + certificates: + properties: + endpointCertDuration: + description: The requested 'duration' (i.e. lifetime) of the ElasticSearch endpoint Certificate. + Minimum accepted duration is 1 hour. Value must be in units accepted by Go time.ParseDuration https://golang.org/pkg/time/#ParseDuration + pattern: ^((([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ + type: string + caCertDuration: + description: The requested 'duration' (i.e. lifetime) of the ElasticSearch CA Certificate. + Minimum accepted duration is 1 hour. 
Value must be in units accepted by Go time.ParseDuration https://golang.org/pkg/time/#ParseDuration + pattern: ^((([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ + type: string + type: object type: object type: object logs: @@ -286,6 +299,19 @@ spec: description: Enable web interface for QDR type: boolean type: object + certificates: + properties: + endpointCertDuration: + description: The requested 'duration' (i.e. lifetime) of the QDR endpoint Certificate. + Minimum accepted duration is 1 hour. Value must be in units accepted by Go time.ParseDuration https://golang.org/pkg/time/#ParseDuration + pattern: ^((([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ + type: string + caCertDuration: + description: The requested 'duration' (i.e. lifetime) of the QDR CA Certificate. + Minimum accepted duration is 1 hour. Value must be in units accepted by Go time.ParseDuration https://golang.org/pkg/time/#ParseDuration + pattern: ^((([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ + type: string + type: object type: object type: object graphing: diff --git a/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml b/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml index 5c202f5bb..9d324839c 100644 --- a/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml +++ b/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml @@ -41,6 +41,9 @@ spec: strategy: persistent persistent: pvcStorageRequest: 20Gi + certificates: + endpointCertDuration: 70080h + caCertDuration: 70080h logs: loki: enabled: false @@ -102,6 +105,9 @@ spec: enabled: true web: enabled: false + certificates: + endpointCertDuration: 70080h + caCertDuration: 70080h highAvailability: enabled: false # vim: set ft=yaml shiftwidth=2 tabstop=2 expandtab: diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml index 486793cc9..c275c943b 100644 --- 
a/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml @@ -124,6 +124,23 @@ spec: elasticsearch: description: Events storage backend ElasticSearch properties: + certificates: + properties: + caCertDuration: + description: The requested 'duration' (i.e. lifetime) + of the ElasticSearch CA Certificate. Minimum accepted + duration is 1 hour. Value must be in units accepted + by Go time.ParseDuration https://golang.org/pkg/time/#ParseDuration + pattern: ^((([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ + type: string + endpointCertDuration: + description: The requested 'duration' (i.e. lifetime) + of the ElasticSearch endpoint Certificate. Minimum + accepted duration is 1 hour. Value must be in units + accepted by Go time.ParseDuration https://golang.org/pkg/time/#ParseDuration + pattern: ^((([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ + type: string + type: object enabled: description: Enable ElasticSearch as a storage backend for events @@ -486,6 +503,23 @@ spec: qdr: description: QDR configuration for data transport properties: + certificates: + properties: + caCertDuration: + description: The requested 'duration' (i.e. lifetime) + of the QDR CA Certificate. Minimum accepted duration + is 1 hour. Value must be in units accepted by Go time.ParseDuration + https://golang.org/pkg/time/#ParseDuration + pattern: ^((([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ + type: string + endpointCertDuration: + description: The requested 'duration' (i.e. lifetime) + of the QDR endpoint Certificate. Minimum accepted duration + is 1 hour. 
Value must be in units accepted by Go time.ParseDuration + https://golang.org/pkg/time/#ParseDuration + pattern: ^((([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ + type: string + type: object enabled: description: Enable QDR data transort type: boolean diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml index 175b752dc..a04701500 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml @@ -39,6 +39,10 @@ metadata: "backends": { "events": { "elasticsearch": { + "certificates": { + "caCertDuration": "70080h", + "endpointCertDuration": "70080h" + }, "enabled": false, "storage": { "persistent": { @@ -152,6 +156,10 @@ metadata: "observabilityStrategy": "use_community", "transports": { "qdr": { + "certificates": { + "caCertDuration": "70080h", + "endpointCertDuration": "70080h" + }, "enabled": true, "web": { "enabled": false diff --git a/roles/servicetelemetry/defaults/main.yml b/roles/servicetelemetry/defaults/main.yml index 70539d791..a6de8d092 100644 --- a/roles/servicetelemetry/defaults/main.yml +++ b/roles/servicetelemetry/defaults/main.yml @@ -6,8 +6,6 @@ clouds_remove_on_missing: false # default observability strategy (compatible with STF 1.3) observability_strategy: use_community -certificate_duration: 70080h - servicetelemetry_defaults: high_availability: enabled: false @@ -56,6 +54,9 @@ servicetelemetry_defaults: persistent: storage_class: "" pvc_storage_request: 20Gi + certificates: + endpoint_cert_duration: 70080h + ca_cert_duration: 70080h logs: loki: enabled: false @@ -81,6 +82,9 @@ servicetelemetry_defaults: deployment_size: 1 web: enabled: false + certificates: + endpoint_cert_duration: 
70080h + ca_cert_duration: 70080h graphing: enabled: false diff --git a/roles/servicetelemetry/tasks/_local_signing_authority.yml b/roles/servicetelemetry/tasks/_local_signing_authority.yml index f4ae3fd62..c517a4041 100644 --- a/roles/servicetelemetry/tasks/_local_signing_authority.yml +++ b/roles/servicetelemetry/tasks/_local_signing_authority.yml @@ -8,7 +8,6 @@ name: '{{ ansible_operator_meta.namespace }}-selfsigned' namespace: '{{ ansible_operator_meta.namespace }}' spec: - duration: '{{ certificate_duration }}' selfSigned: {} - name: Create CA certificate @@ -20,7 +19,7 @@ name: '{{ ansible_operator_meta.namespace }}-ca' namespace: '{{ ansible_operator_meta.namespace }}' spec: - duration: '{{ certificate_duration }}' + duration: '{{ servicetelemetry_vars.backends.events.elasticsearch.certificates.ca_cert_duration }}' secretName: '{{ ansible_operator_meta.namespace }}-ca' commonName: '{{ ansible_operator_meta.namespace }}-ca' isCA: true @@ -36,7 +35,6 @@ name: '{{ ansible_operator_meta.namespace }}-ca' namespace: '{{ ansible_operator_meta.namespace }}' spec: - duration: '{{ certificate_duration }}' ca: secretName: '{{ ansible_operator_meta.namespace }}-ca' @@ -50,7 +48,7 @@ name: elasticsearch-es-http namespace: '{{ ansible_operator_meta.namespace }}' spec: - duration: '{{ certificate_duration }}' + duration: '{{ servicetelemetry_vars.backends.events.elasticsearch.certificates.endpoint_cert_duration }}' commonName: elasticsearch-es-http secretName: 'elasticsearch-es-cert' dnsNames: diff --git a/roles/servicetelemetry/tasks/component_qdr.yml b/roles/servicetelemetry/tasks/component_qdr.yml index cf7cc937b..84fcd1beb 100644 --- a/roles/servicetelemetry/tasks/component_qdr.yml +++ b/roles/servicetelemetry/tasks/component_qdr.yml @@ -13,7 +13,6 @@ name: "{{ ansible_operator_meta.name }}-interconnect-selfsigned" namespace: "{{ ansible_operator_meta.namespace }}" spec: - duration: '{{ certificate_duration }}' selfSigned: {} - name: Create self-signed interconnect 
certificate @@ -26,7 +25,7 @@ name: "{{ ansible_operator_meta.name }}-interconnect-selfsigned" namespace: "{{ ansible_operator_meta.namespace }}" spec: - duration: '{{ certificate_duration }}' + duration: '{{ servicetelemetry_vars.transports.qdr.certificates.ca_cert_duration }}' commonName: "{{ ansible_operator_meta.name }}-interconnect.{{ ansible_operator_meta.namespace }}.svc.cluster.local" isCA: true issuerRef: @@ -44,7 +43,6 @@ name: "{{ ansible_operator_meta.name }}-interconnect-ca" namespace: "{{ ansible_operator_meta.namespace }}" spec: - duration: '{{ certificate_duration }}' ca: secretName: "{{ ansible_operator_meta.name }}-interconnect-selfsigned" @@ -58,7 +56,7 @@ name: "{{ ansible_operator_meta.name }}-interconnect-openstack-ca" namespace: "{{ ansible_operator_meta.namespace }}" spec: - duration: '{{ certificate_duration }}' + duration: '{{ servicetelemetry_vars.transports.qdr.certificates.ca_cert_duration }}' commonName: "{{ ansible_operator_meta.name }}-interconnect-openstack-ca" isCA: true issuerRef: @@ -75,7 +73,7 @@ name: "{{ ansible_operator_meta.name }}-interconnect-openstack-credentials" namespace: "{{ ansible_operator_meta.namespace }}" spec: - duration: '{{ certificate_duration }}' + duration: '{{ servicetelemetry_vars.transports.qdr.certificates.endpoint_cert_duration }}' commonName: "{{ ansible_operator_meta.name }}-interconnect" dnsNames: - "{{ ansible_operator_meta.name }}-interconnect.{{ ansible_operator_meta.namespace }}.svc.cluster.local" @@ -93,7 +91,6 @@ name: "{{ ansible_operator_meta.name }}-interconnect-inter-router-ca" namespace: "{{ ansible_operator_meta.namespace }}" spec: - duration: '{{ certificate_duration }}' ca: secretName: "{{ ansible_operator_meta.name }}-interconnect-inter-router-ca" @@ -107,7 +104,7 @@ name: "{{ ansible_operator_meta.name }}-interconnect-inter-router-ca" namespace: "{{ ansible_operator_meta.namespace }}" spec: - duration: '{{ certificate_duration }}' + duration: '{{ 
servicetelemetry_vars.transports.qdr.certificates.ca_cert_duration }}' commonName: "{{ ansible_operator_meta.name }}-interconnect-inter-router-ca" isCA: true issuerRef: @@ -124,7 +121,7 @@ name: "{{ ansible_operator_meta.name }}-interconnect-inter-router-credentials" namespace: "{{ ansible_operator_meta.namespace }}" spec: - duration: '{{ certificate_duration }}' + duration: '{{ servicetelemetry_vars.transports.qdr.certificates.endpoint_cert_duration }}' commonName: "{{ ansible_operator_meta.name }}-interconnect" dnsNames: - "{{ ansible_operator_meta.name }}-interconnect.{{ ansible_operator_meta.namespace }}.svc.cluster.local" From e3a1125f8af59c4028e63f2080fea04eca9f5590 Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Wed, 8 Mar 2023 12:51:40 -0500 Subject: [PATCH 17/95] Fix default CA cert lifetime values in stf-run-ci (#414) --- build/stf-run-ci/defaults/main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build/stf-run-ci/defaults/main.yml b/build/stf-run-ci/defaults/main.yml index 9b54f0ce3..ab9dab7a6 100644 --- a/build/stf-run-ci/defaults/main.yml +++ b/build/stf-run-ci/defaults/main.yml @@ -10,7 +10,7 @@ __deploy_from_bundles_enabled: false __deploy_stf: true __service_telemetry_events_certificates_endpoint_cert_duration: 70080h -__service_telemetry_events_certificates_ca_cert_duration: 2160h +__service_telemetry_events_certificates_ca_cert_duration: 70080h __service_telemetry_events_enabled: true __service_telemetry_high_availability_enabled: false __service_telemetry_metrics_enabled: true @@ -28,7 +28,7 @@ __service_telemetry_snmptraps_trap_default_severity: "" __service_telemetry_logs_enabled: false __service_telemetry_observability_strategy: use_community __service_telemetry_transports_certificates_endpoint_cert_duration: 70080h -__service_telemetry_transports_certificates_ca_cert_duration: 2160h +__service_telemetry_transports_certificates_ca_cert_duration: 70080h __internal_registry_path: 
image-registry.openshift-image-registry.svc:5000 __service_telemetry_bundle_image_path: __smart_gateway_bundle_image_path: From 7c66ed859958abd77513f54f9bc47b2fa9d03271 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Wed, 8 Mar 2023 16:51:44 -0500 Subject: [PATCH 18/95] fix/client version mismatch (#415) * Fix PROMETHEUS_K8S_TOKEN to account for oc version mismatch * Fix to use correct variable name in test --- tests/smoketest/smoketest.sh | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/smoketest/smoketest.sh b/tests/smoketest/smoketest.sh index fbd84c1af..8a801c004 100755 --- a/tests/smoketest/smoketest.sh +++ b/tests/smoketest/smoketest.sh @@ -68,12 +68,18 @@ for NAME in "${CLOUDNAMES[@]}"; do done echo "*** [INFO] Triggering an alertmanager notification..." -PROMETHEUS_K8S_TOKEN=$(oc create token prometheus-k8s) + +# check if the oc client version is less than 4.11 and adjust the token command to match available commands +if [ 0${OC_CLIENT_VERSION_Y} -lt 011 ]; then + PROMETHEUS_K8S_TOKEN=$(oc serviceaccounts get-token prometheus-k8s) +else + PROMETHEUS_K8S_TOKEN=$(oc create token prometheus-k8s) +fi + oc run curl --restart='Never' --image=quay.io/infrawatch/busyboxplus:curl -- sh -c "curl -k -H \"Content-Type: application/json\" -H \"Authorization: Bearer ${PROMETHEUS_K8S_TOKEN}\" -d '[{\"labels\":{\"alertname\":\"Testalert1\"}}]' https://default-alertmanager-proxy:9095/api/v1/alerts" # it takes some time to get the alert delivered, continuing with other tests - # Trying to find a less brittle test than a timeout JOB_TIMEOUT=300s for NAME in "${CLOUDNAMES[@]}"; do From 4d2f34842eac2fbad71c2fb0d4f8fb2818b12d68 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Thu, 9 Mar 2023 09:26:26 -0500 Subject: [PATCH 19/95] Allow oc client version override for Jenkins agent (#416) --- .jenkins/agent/Dockerfile | 6 +++++- .jenkins/agent/README.md | 16 ++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git 
a/.jenkins/agent/Dockerfile b/.jenkins/agent/Dockerfile index fe1a7fa70..c41fb9c69 100644 --- a/.jenkins/agent/Dockerfile +++ b/.jenkins/agent/Dockerfile @@ -1,4 +1,8 @@ FROM quay.io/openshift/origin-jenkins-agent-base:latest + +# pass --build-arg OC_CLIENT_VERSION= to build stage to change client version +ARG OC_CLIENT_VERSION="4.12" + RUN curl -LO "https://github.com/operator-framework/operator-sdk/releases/download/v0.19.4/operator-sdk-v0.19.4-x86_64-linux-gnu" && \ chmod +x operator-sdk-v0.19.4-x86_64-linux-gnu && mv operator-sdk-v0.19.4-x86_64-linux-gnu /usr/local/bin/operator-sdk RUN dnf install -y ansible golang python38 && \ @@ -6,5 +10,5 @@ RUN dnf install -y ansible golang python38 && \ alternatives --set python /usr/bin/python3.8 && \ python -m pip install openshift kubernetes "ansible-core~=2.12" && \ ansible-galaxy collection install -f 'kubernetes.core:>=2.2.0' community.general -RUN curl -LO "https://mirror.openshift.com/pub/openshift-v4/clients/ocp/latest-4.11/openshift-client-linux.tar.gz" && \ +RUN curl -LO "https://mirror.openshift.com/pub/openshift-v4/clients/ocp/latest-$OC_CLIENT_VERSION/openshift-client-linux.tar.gz" && \ tar -xv -C /usr/local/bin -f openshift-client-linux.tar.gz diff --git a/.jenkins/agent/README.md b/.jenkins/agent/README.md index 5eaf16527..dbef269fc 100644 --- a/.jenkins/agent/README.md +++ b/.jenkins/agent/README.md @@ -1,13 +1,29 @@ The Jenkins agent pod is used to run all Jenkins pipelines for the Service Telemetry Framework. # Build in OpenShift + ```bash oc new-build --binary=true --name=jenkins-agent oc start-build jenkins-agent --from-dir . ``` + +You can override the default `oc` client version being installed by overriding the default argument `OC_CLIENT_VERSION` from the `Dockerfile`. + +```bash +oc new-build --build-arg OC_CLIENT_VERSION=4.10 --binary=true --name=jenkins-agent +oc start-build jenkins-agent --from-dir . 
+``` + Builds will be available in-cluster at the address: `image-registry.openshift-image-registry.svc:5000//jenkins-agent:latest` # Build with Podman/Docker + ```bash podman build -t jenkins-agent:latest . ``` + +You can override the default `oc` client version being installed by overriding the default argument `OC_CLIENT_VERSION` from the `Dockerfile`. + +```bash +podman build --build-arg OC_CLIENT_VERSION=4.10 -t jenkins-agent:latest . +``` From 007f0057df10445a04daba843be6c4570a92828a Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Wed, 5 Apr 2023 15:44:12 -0400 Subject: [PATCH 20/95] Update CatalogSource for Prometheus Operator deployment (#419) --- build/stf-run-ci/tasks/setup_base.yml | 6 +++--- tests/infrared/README.md | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/build/stf-run-ci/tasks/setup_base.yml b/build/stf-run-ci/tasks/setup_base.yml index 9b0c838f9..fb06457dd 100644 --- a/build/stf-run-ci/tasks/setup_base.yml +++ b/build/stf-run-ci/tasks/setup_base.yml @@ -103,13 +103,13 @@ apiVersion: operators.coreos.com/v1alpha1 kind: Subscription metadata: - name: prometheus + name: rhods-prometheus-operator namespace: "{{ namespace }}" spec: channel: beta installPlanApproval: Automatic - name: prometheus - source: community-operators + name: rhods-prometheus-operator + source: redhat-operators sourceNamespace: openshift-marketplace when: - __service_telemetry_observability_strategy == "use_community" diff --git a/tests/infrared/README.md b/tests/infrared/README.md index 9b15471dc..ce497e1a7 100644 --- a/tests/infrared/README.md +++ b/tests/infrared/README.md @@ -17,7 +17,7 @@ to an STF instance all on one (large) baremetal machine. 
Once the deployment is complete, you can check prometheus for data, like so: ```shells -$ PROM_HOST=$(oc get route prometheus -o jsonpath='{.spec.host}') +$ PROM_HOST=$(oc get route default-prometheus-proxy -o jsonpath='{.spec.host}') $ curl "http://${PROM_HOST}/api/v1/query?query=collectd_uptime\[10s\]" {"status":"success","data":{"resultType":"matrix","result":[{"metric":{"__name__":"collectd_uptime","endpoint":"prom-http","host":"compute-0.localdomain","service":"white-smartgateway","type":"base","uptime":"base"},"values":[[1566500715.207,"88719"],[1566500716.214,"88720"],[1566500717.207,"88721"],[1566500718.207,"88722"],[1566500720.207,"88724"],[1566500721.207,"88725"],[1566500722.207,"88726"],[1566500723.207,"88727"]]},{"metric":{"__name__":"collectd_uptime","endpoint":"prom-http","host":"controller-0.localdomain","service":"white-smartgateway","type":"base","uptime":"base"},"values":[[1566500715.207,"88700"],[1566500717.207,"88701"],[1566500718.207,"88702"],[1566500719.209,"88703"],[1566500721.207,"88704"],[1566500723.207,"88705"]]}]}} ``` From 45baeb49f70d2f914e39aa2e98c9c0667c73ab4a Mon Sep 17 00:00:00 2001 From: Joaquin Veira Date: Mon, 17 Apr 2023 21:26:53 +0200 Subject: [PATCH 21/95] Fix path to right collectd service template file for OSP 13 (#418) --- tests/infrared/13/stf-connectors.yaml.template | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/infrared/13/stf-connectors.yaml.template b/tests/infrared/13/stf-connectors.yaml.template index 98fca0847..c6cf36182 100644 --- a/tests/infrared/13/stf-connectors.yaml.template +++ b/tests/infrared/13/stf-connectors.yaml.template @@ -5,7 +5,7 @@ tripleo_heat_templates: custom_templates: # don't load collectd-write-qdr.yaml when using multi-cloud and instead load collectd service directly resource_registry: - OS::TripleO::Services::Collectd: /usr/share/openstack-tripleo-heat-templates/deployment/metrics/collectd-container-puppet.yaml + OS::TripleO::Services::Collectd: 
/usr/share/openstack-tripleo-heat-templates/docker/services/metrics/collectd.yaml # set parameter defaults to match stable-1.3 documentation parameter_defaults: From c1716a9dee1250f3528c4f68556857455fd21932 Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Tue, 25 Apr 2023 13:37:48 -0400 Subject: [PATCH 22/95] Latest oauth-proxy with bcrypt (#420) * Move to newest oauth-proxy container * Move to bcrypt for htpasswd * Up/Downstream image handling for new oauth-proxy container * Skip broken ansible lint for htpasswd * Hack to add EPEL in upstream builds --- build/Dockerfile | 8 ++++++++ build/generate_bundle.sh | 2 +- build/metadata.sh | 2 ++ ...ce-telemetry-operator.clusterserviceversion.yaml | 2 ++ deploy/operator.yaml | 2 ++ roles/servicetelemetry/defaults/main.yml | 13 ------------- roles/servicetelemetry/tasks/component_grafana.yml | 2 +- .../servicetelemetry/tasks/component_prometheus.yml | 4 +++- roles/servicetelemetry/tasks/pre.yml | 1 + 9 files changed, 20 insertions(+), 16 deletions(-) diff --git a/build/Dockerfile b/build/Dockerfile index af0e4c83d..c96f01ccf 100644 --- a/build/Dockerfile +++ b/build/Dockerfile @@ -1,5 +1,13 @@ FROM quay.io/openshift/origin-ansible-operator:4.10 +USER 0 +# Upstream CI builds need the additional EPEL sources for python3-passlib and python3-bcrypt but have no working repos to install epel-release +# NO_PROXY is undefined in upsream CI builds, but defined (usually blank) during openshift builds (a possibly brittle hack) +RUN bash -c -- 'if [ "${NO_PROXY-__ZZZZZ}" == "__ZZZZZ" ]; then echo "Applying upstream EPEL hacks" && echo -e "-----BEGIN PGP PUBLIC KEY 
BLOCK-----\n\nmQINBFz3zvsBEADJOIIWllGudxnpvJnkxQz2CtoWI7godVnoclrdl83kVjqSQp+2\ndgxuG5mUiADUfYHaRQzxKw8efuQnwxzU9kZ70ngCxtmbQWGmUmfSThiapOz00018\n+eo5MFabd2vdiGo1y+51m2sRDpN8qdCaqXko65cyMuLXrojJHIuvRA/x7iqOrRfy\na8x3OxC4PEgl5pgDnP8pVK0lLYncDEQCN76D9ubhZQWhISF/zJI+e806V71hzfyL\n/Mt3mQm/li+lRKU25Usk9dWaf4NH/wZHMIPAkVJ4uD4H/uS49wqWnyiTYGT7hUbi\necF7crhLCmlRzvJR8mkRP6/4T/F3tNDPWZeDNEDVFUkTFHNU6/h2+O398MNY/fOh\nyKaNK3nnE0g6QJ1dOH31lXHARlpFOtWt3VmZU0JnWLeYdvap4Eff9qTWZJhI7Cq0\nWm8DgLUpXgNlkmquvE7P2W5EAr2E5AqKQoDbfw/GiWdRvHWKeNGMRLnGI3QuoX3U\npAlXD7v13VdZxNydvpeypbf/AfRyrHRKhkUj3cU1pYkM3DNZE77C5JUe6/0nxbt4\nETUZBTgLgYJGP8c7PbkVnO6I/KgL1jw+7MW6Az8Ox+RXZLyGMVmbW/TMc8haJfKL\nMoUo3TVk8nPiUhoOC0/kI7j9ilFrBxBU5dUtF4ITAWc8xnG6jJs/IsvRpQARAQAB\ntChGZWRvcmEgRVBFTCAoOCkgPGVwZWxAZmVkb3JhcHJvamVjdC5vcmc+iQI4BBMB\nAgAiBQJc9877AhsPBgsJCAcDAgYVCAIJCgsEFgIDAQIeAQIXgAAKCRAh6kWrL4bW\noWagD/4xnLWws34GByVDQkjprk0fX7Iyhpm/U7BsIHKspHLL+Y46vAAGY/9vMvdE\n0fcr9Ek2Zp7zE1RWmSCzzzUgTG6BFoTG1H4Fho/7Z8BXK/jybowXSZfqXnTOfhSF\nalwDdwlSJvfYNV9MbyvbxN8qZRU1z7PEWZrIzFDDToFRk0R71zHpnPTNIJ5/YXTw\nNqU9OxII8hMQj4ufF11040AJQZ7br3rzerlyBOB+Jd1zSPVrAPpeMyJppWFHSDAI\nWK6x+am13VIInXtqB/Cz4GBHLFK5d2/IYspVw47Solj8jiFEtnAq6+1Aq5WH3iB4\nbE2e6z00DSF93frwOyWN7WmPIoc2QsNRJhgfJC+isGQAwwq8xAbHEBeuyMG8GZjz\nxohg0H4bOSEujVLTjH1xbAG4DnhWO/1VXLX+LXELycO8ZQTcjj/4AQKuo4wvMPrv\n9A169oETG+VwQlNd74VBPGCvhnzwGXNbTK/KH1+WRH0YSb+41flB3NKhMSU6dGI0\nSGtIxDSHhVVNmx2/6XiT9U/znrZsG5Kw8nIbbFz+9MGUUWgJMsd1Zl9R8gz7V9fp\nn7L7y5LhJ8HOCMsY/Z7/7HUs+t/A1MI4g7Q5g5UuSZdgi0zxukiWuCkLeAiAP4y7\nzKK4OjJ644NDcWCHa36znwVmkz3ixL8Q0auR15Oqq2BjR/fyog==\n=84m8\n-----END PGP PUBLIC KEY BLOCK-----" > /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-8 && echo -e "[epel]\nname=Extra Packages for Enterprise Linux 8 - \$basearch\nmetalink=https://mirrors.fedoraproject.org/metalink?repo=epel-8&arch=\$basearch&infra=\$infra&content=\$contentdir\nenabled=1\ngpgcheck=1\ngpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-8" > /etc/yum.repos.d/epel.repo; fi' +# Required for oauth-proxy 
+RUN dnf install -y python3-passlib python3-bcrypt +USER 1001 + COPY watches.yaml ${HOME}/watches.yaml COPY roles/ ${HOME}/roles/ COPY collections/ ${HOME}/.ansible/collections/ diff --git a/build/generate_bundle.sh b/build/generate_bundle.sh index 8c5b13934..f699f3559 100755 --- a/build/generate_bundle.sh +++ b/build/generate_bundle.sh @@ -27,7 +27,7 @@ generate_dockerfile() { generate_bundle() { echo "-- Generate bundle" - REPLACE_REGEX="s#<>#${CREATED_DATE}#g;s#<>#${OPERATOR_IMAGE}#g;s#<>#${OPERATOR_TAG}#g;s#<>#${RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP}#g;s#<>#${RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP_TAG}#g;s#<>#${OPERATOR_BUNDLE_VERSION}#g;s#1.99.0#${OPERATOR_BUNDLE_VERSION}#g;s#<>#${OPERATOR_DOCUMENTATION_URL}#g;s#<>#${BUNDLE_OLM_SKIP_RANGE_LOWER_BOUND}#g" + REPLACE_REGEX="s#<>#${CREATED_DATE}#g;s#<>#${OPERATOR_IMAGE}#g;s#<>#${OPERATOR_TAG}#g;s#<>#${RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP}#g;s#<>#${RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP_TAG}#g;s#<>#${RELATED_IMAGE_OAUTH_PROXY}#g;s#<>#${RELATED_IMAGE_OAUTH_PROXY_TAG}#g;s#<>#${OPERATOR_BUNDLE_VERSION}#g;s#1.99.0#${OPERATOR_BUNDLE_VERSION}#g;s#<>#${OPERATOR_DOCUMENTATION_URL}#g;s#<>#${BUNDLE_OLM_SKIP_RANGE_LOWER_BOUND}#g" pushd "${REL}/../" ${OPERATOR_SDK} generate bundle --channels ${BUNDLE_CHANNELS} --default-channel ${BUNDLE_DEFAULT_CHANNEL} --manifests --metadata --version "${OPERATOR_BUNDLE_VERSION}" --output-dir "${WORKING_DIR}" diff --git a/build/metadata.sh b/build/metadata.sh index 38c314c4d..759892400 100644 --- a/build/metadata.sh +++ b/build/metadata.sh @@ -19,6 +19,8 @@ BUNDLE_OLM_SKIP_RANGE_LOWER_BOUND=${BUNDLE_OLM_SKIP_RANGE_LOWER_BOUND:-1.3.0} CREATED_DATE=${CREATED_DATE:-$(date +'%Y-%m-%dT%H:%M:%SZ')} RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP=${RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP:-quay.io/infrawatch/prometheus-webhook-snmp} RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP_TAG=${RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP_TAG:-latest} 
+RELATED_IMAGE_OAUTH_PROXY=${RELATED_IMAGE_OAUTH_PROXY:-quay.io/openshift/origin-oauth-proxy} +RELATED_IMAGE_OAUTH_PROXY_TAG=${RELATED_IMAGE_OAUTH_PROXY_TAG:-latest} BUNDLE_PATH=${BUNDLE_PATH:-deploy/olm-catalog/service-telemetry-operator} BUNDLE_CHANNELS=${BUNDLE_CHANNELS:-unstable} BUNDLE_DEFAULT_CHANNEL=${BUNDLE_DEFAULT_CHANNEL:-unstable} diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml index a04701500..a33a7bc92 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml @@ -311,6 +311,8 @@ spec: value: explicit - name: RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP_IMAGE value: <>:<> + - name: RELATED_IMAGE_OAUTH_PROXY_IMAGE + value: <>:<> image: <>:<> imagePullPolicy: Always name: operator diff --git a/deploy/operator.yaml b/deploy/operator.yaml index 7b6879d4a..c56c11daa 100644 --- a/deploy/operator.yaml +++ b/deploy/operator.yaml @@ -35,6 +35,8 @@ spec: value: explicit - name: RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP_IMAGE value: <>:<> + - name: RELATED_IMAGE_OAUTH_PROXY_IMAGE + value: <>:<> volumes: - name: runner emptyDir: {} diff --git a/roles/servicetelemetry/defaults/main.yml b/roles/servicetelemetry/defaults/main.yml index a6de8d092..c1f112eed 100644 --- a/roles/servicetelemetry/defaults/main.yml +++ b/roles/servicetelemetry/defaults/main.yml @@ -160,19 +160,6 @@ servicetelemetry_defaults: is_k8s: false is_openshift: false -# - This image works on OCP 4.6, 4.7, and 4.8 (v4.4 tag instead of 'latest' as I'd prefer above) -oauth_proxy_image: image-registry.openshift-image-registry.svc:5000/openshift/oauth-proxy:v4.4 - -# - Downstream, this one works, but there is a big red "Red Hat strongly recommends 
updating to the newest image version..." (4.9) which doesn't work -# - See https://github.com/openshift/oauth-proxy/issues/229 for why 4.9 isn't working for us -# oauth_proxy_image: registry.redhat.io/openshift4/ose-oauth-proxy:v4.8 - -# - This image works (probably upstream and down!), but is pinned back to the version we're strongly recommended not to use: -# oauth_proxy_image: quay.io/openshift/origin-oauth-proxy:4.8 - -# - For reference this is the image line I see in openshift-monitoring for the equivalent image: -# oauth_proxy_image: quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:ca37f5fc57643a353dcdcd6ad3a7bc8bb40fd4276a555fe5de2b8c7167d64020 - _ephemeral_storage_enabled: false # set default smartgateway deployment size. You should not modify this. diff --git a/roles/servicetelemetry/tasks/component_grafana.yml b/roles/servicetelemetry/tasks/component_grafana.yml index 2b0f6eeea..25e1e981f 100644 --- a/roles/servicetelemetry/tasks/component_grafana.yml +++ b/roles/servicetelemetry/tasks/component_grafana.yml @@ -18,7 +18,7 @@ namespace: '{{ ansible_operator_meta.namespace }}' type: Opaque stringData: - auth: '{{ servicetelemetry_vars.graphing.grafana.admin_user }}:{{ servicetelemetry_vars.graphing.grafana.admin_password | htpasswd_sha1 }}' + auth: '{{ servicetelemetry_vars.graphing.grafana.admin_user }}:{{ servicetelemetry_vars.graphing.grafana.admin_password | password_hash("bcrypt") | replace("$2b$","$2y$", 1) }}' - name: Lookup template debug: diff --git a/roles/servicetelemetry/tasks/component_prometheus.yml b/roles/servicetelemetry/tasks/component_prometheus.yml index 2cb7573ff..9decbf765 100644 --- a/roles/servicetelemetry/tasks/component_prometheus.yml +++ b/roles/servicetelemetry/tasks/component_prometheus.yml @@ -62,8 +62,10 @@ namespace: '{{ ansible_operator_meta.namespace }}' type: Opaque stringData: - auth: 'internal:{{ prom_basicauth_passwd | htpasswd_sha1 }}' # SHA1 is deprecated, bcrypt is available in OCP 4.8+ only 
https://bugzilla.redhat.com/show_bug.cgi?id=1874322 + auth: 'internal:{{ prom_basicauth_passwd | password_hash("bcrypt") | replace("$2b$","$2y$", 1)}}' password: '{{ prom_basicauth_passwd }}' + tags: + - skip_ansible_lint - name: Re-register new object for use in the annotation k8s_info: diff --git a/roles/servicetelemetry/tasks/pre.yml b/roles/servicetelemetry/tasks/pre.yml index 6a6c5b494..2e780affd 100644 --- a/roles/servicetelemetry/tasks/pre.yml +++ b/roles/servicetelemetry/tasks/pre.yml @@ -34,6 +34,7 @@ - name: "Set supporting container image paths" set_fact: prometheus_webhook_snmp_container_image_path: "{{ lookup('env', 'RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP_IMAGE') | default('quay.io/infrawatch/prometheus-webhook-snmp:latest', true) }}" # noqa 204 + oauth_proxy_image: "{{ lookup('env', 'RELATED_IMAGE_OAUTH_PROXY_IMAGE') | default('quay.io/openshift/origin-oauth-proxy:latest', true) }}" # noqa 204 - name: Adjust defaults when highAvailability.enabled is true block: From ea5616c27f5be0bb394820520c82243c92a2d283 Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Mon, 8 May 2023 20:03:48 +0100 Subject: [PATCH 23/95] [jenkins] Add custom context labels for github build status notification (#402) * [jenkins] Add custom context labels for github build status notification Multiple jenkins deployment can now be run and report their build status separately instead of both reporting to the same ``continuous-integration/jenkins/pr-merge`` and overriding each other. 
There is now ``continuous-integration/jenkins/ocp-/pr-merge`` NOTE: the OCP_VERSION is hardcoded at the moment https://github.com/jenkinsci/github-scm-trait-notification-context-plugin * [jenkins] Make the build status label configurable Added the OCP_VERSION var to the casc-configmap.yaml, so that the correct label can be set for the jobs --------- Co-authored-by: Chris Sibbitt --- .jenkins/Dockerfile | 1 + .jenkins/README.md | 2 ++ .jenkins/deploy/casc-configmap.yaml | 5 +++++ 3 files changed, 8 insertions(+) diff --git a/.jenkins/Dockerfile b/.jenkins/Dockerfile index 7d946961d..9930d6c57 100644 --- a/.jenkins/Dockerfile +++ b/.jenkins/Dockerfile @@ -10,6 +10,7 @@ RUN jenkins-plugin-cli -p ant:latest \ email-ext:latest \ git:latest \ github-branch-source:latest \ + github-scm-trait-notification-context:latest \ gradle:latest \ ldap:latest \ mailer:latest \ diff --git a/.jenkins/README.md b/.jenkins/README.md index 64d768251..2085f21d3 100644 --- a/.jenkins/README.md +++ b/.jenkins/README.md @@ -39,6 +39,8 @@ oc apply -f deploy/service-route.yaml export SMEE_CHANNEL= #(just the slug, not the whole URL) export GH_ORG= export JENKINS_URL=$(oc get route jenkins -ojsonpath='{.spec.host}') +# This is for labelling the status that is returned to github +export OCP_VERSION= # e.g. 
4.12 for f in deploy/*; do envsubst < "${f}" | oc apply -f - diff --git a/.jenkins/deploy/casc-configmap.yaml b/.jenkins/deploy/casc-configmap.yaml index cb34dddb9..ab372d230 100644 --- a/.jenkins/deploy/casc-configmap.yaml +++ b/.jenkins/deploy/casc-configmap.yaml @@ -94,6 +94,11 @@ data: // 1 : Forks in the same account // 2 : Nobody } + // Custom Github Notification Context; https://github.com/jenkinsci/github-scm-trait-notification-context-plugin + traits << 'org.jenkinsci.plugins.githubScmTraitNotificationContext.NotificationContextTrait' { + contextLabel("continuous-integration/jenkins/ocp-${OCP_VERSION}") + typeSuffix(true) + } } // "Project Recognizers" From da4d091830fc4b4d8df030d83cbe5b90a9aaecc8 Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Mon, 8 May 2023 17:21:53 -0400 Subject: [PATCH 24/95] Observability Operator for Prometheus & Alertmanager (#421) Default to Observability Operator for Prometheus & Alertmanager - Adds two options to observability_strategy: * use_redhat (OBO Only) * use_hybrid (OBO + friends) - Default to redhat supported components in deployment - Use upstream source for ObO in CI - Added sensubility to deployment validation - Added required RBAC for OBO usage -When no explicit observability_strategy is set: * Existing STF objects get an explicit "use_community" added * New STF objects get the default ("use_redhat") explicitly added - Narrow the scope of the smoke test (#422) Co-authored-by: Leif Madsen --- Jenkinsfile | 4 +- build/stf-run-ci/README.md | 2 +- build/stf-run-ci/defaults/main.yml | 2 +- build/stf-run-ci/tasks/deploy_stf.yml | 6 +- build/stf-run-ci/tasks/setup_base.yml | 52 ++++++++++-- build/validate_deployment.sh | 15 +++- deploy/alerts/alerts.yaml | 2 +- .../infra.watch_servicetelemetrys_crd.yaml | 4 +- ...fra.watch_v1beta1_servicetelemetry_cr.yaml | 2 +- .../infra.watch_servicetelemetrys_crd.yaml | 8 +- ...emetry-operator.clusterserviceversion.yaml | 32 +++++++- deploy/remove_stf.sh | 13 +++ deploy/role.yaml | 
30 +++++++ roles/servicetelemetry/defaults/main.yml | 3 +- .../tasks/component_alertmanager.yml | 20 +++++ .../tasks/component_certificates.yml | 3 - .../tasks/component_clouds.yml | 4 +- ...rity.yml => component_es_certificates.yml} | 0 .../tasks/component_prometheus.yml | 80 ++++++++++++++++++- .../tasks/component_servicemonitor.yml | 2 +- roles/servicetelemetry/tasks/main.yml | 31 +++++-- roles/servicetelemetry/tasks/pre.yml | 56 +++++++++++-- .../templates/manifest_alertmanager.j2 | 3 +- .../templates/manifest_prometheus.j2 | 3 +- tests/smoketest/smoketest.sh | 12 ++- .../smoketest_ceilometer_entrypoint.sh | 56 +++++++------ .../smoketest_collectd_entrypoint.sh | 36 +++++---- tests/smoketest/smoketest_job.yaml.template | 4 + 28 files changed, 394 insertions(+), 91 deletions(-) rename roles/servicetelemetry/tasks/{_local_signing_authority.yml => component_es_certificates.yml} (100%) diff --git a/Jenkinsfile b/Jenkinsfile index 726a4bab4..f94b64b1e 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -15,7 +15,7 @@ metadata: name: default namespace: ${namespace} spec: - observabilityStrategy: use_community + observabilityStrategy: use_redhat alerting: alertmanager: storage: @@ -177,7 +177,7 @@ pipeline { openshift.withProject(namespace) { timeout(time: 800, unit: 'SECONDS') { openshift.create(stf_resource) - sh "OCP_PROJECT=${namespace} ./build/validate_deployment.sh" + sh "OCP_PROJECT=${namespace} VALIDATION_SCOPE=use_redhat ./build/validate_deployment.sh" } } } diff --git a/build/stf-run-ci/README.md b/build/stf-run-ci/README.md index 87ca2f0aa..7a45897fc 100644 --- a/build/stf-run-ci/README.md +++ b/build/stf-run-ci/README.md @@ -53,7 +53,7 @@ choose to override: | `__service_telemetry_trap_default_oid` | | 1.3.6.1.4.1.50495.15.1.2.1 | The trap OID if none is found in the Prometheus alert labels. | | `__service_telemetry_trap_default_severity` | | | The trap severity if none is found in the Prometheus alert labels. 
| | `__service_telemetry_logs_enabled` | {true,false} | false | Whether to enable logs support in ServiceTelemetry | -| `__service_telemetry_observability_strategy` | | `use_community` | Which observability strategy to use for deployment. Default deployment is 'use_community'. Also supported is 'none' | +| `__service_telemetry_observability_strategy` | | `use_hybrid` | Which observability strategy to use for deployment. Default is 'use_hybrid'. Also supported are 'use_redhat', 'use_community', and 'none' | | `__service_telemetry_transports_certificates_endpoint_cert_duration`| [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 2160h | Lifetime of the QDR endpoint certificate (minimum duration is 1h) | | `__service_telemetry_transports_certificates_ca_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the QDR CA certificate (minimum duration is 1h) | | `__internal_registry_path` | | image-registry.openshift-image-registry.svc:5000 | Path to internal registry for image path | diff --git a/build/stf-run-ci/defaults/main.yml b/build/stf-run-ci/defaults/main.yml index ab9dab7a6..e38c63caf 100644 --- a/build/stf-run-ci/defaults/main.yml +++ b/build/stf-run-ci/defaults/main.yml @@ -26,7 +26,7 @@ __service_telemetry_snmptraps_trap_oid_prefix: "1.3.6.1.4.1.50495.15" __service_telemetry_snmptraps_trap_default_oid: "1.3.6.1.4.1.50495.15.1.2.1" __service_telemetry_snmptraps_trap_default_severity: "" __service_telemetry_logs_enabled: false -__service_telemetry_observability_strategy: use_community +__service_telemetry_observability_strategy: use_hybrid __service_telemetry_transports_certificates_endpoint_cert_duration: 70080h __service_telemetry_transports_certificates_ca_cert_duration: 70080h __internal_registry_path: image-registry.openshift-image-registry.svc:5000 diff --git a/build/stf-run-ci/tasks/deploy_stf.yml b/build/stf-run-ci/tasks/deploy_stf.yml index bc49897c0..c7a1219d0 100644 --- 
a/build/stf-run-ci/tasks/deploy_stf.yml +++ b/build/stf-run-ci/tasks/deploy_stf.yml @@ -1,6 +1,6 @@ # NOTE: be aware that if the API version changes for the ServiceTelemetry # object that it'll need to be updated here -- name: Create default ServiceTelemetry manifest with observabilityStrategy use_community +- name: Create default ServiceTelemetry manifest with a observabilityStrategy other than none set_fact: service_telemetry_manifest: | apiVersion: infra.watch/v1beta1 @@ -9,7 +9,7 @@ name: default namespace: "{{ namespace }}" spec: - observabilityStrategy: "use_community" + observabilityStrategy: "{{ __service_telemetry_observability_strategy }}" alerting: alertmanager: storage: @@ -72,7 +72,7 @@ enabled: {{ __service_telemetry_high_availability_enabled }} when: - service_telemetry_manifest is not defined - - __service_telemetry_observability_strategy == "use_community" + - __service_telemetry_observability_strategy != "none" - name: Create default ServiceTelemetry manifest with observabilityStrategy none set_fact: diff --git a/build/stf-run-ci/tasks/setup_base.yml b/build/stf-run-ci/tasks/setup_base.yml index fb06457dd..779ee0533 100644 --- a/build/stf-run-ci/tasks/setup_base.yml +++ b/build/stf-run-ci/tasks/setup_base.yml @@ -13,7 +13,7 @@ name: certified-operators - disabled: false name: redhat-operators - - disabled: "{{ false if __service_telemetry_observability_strategy == 'use_community' else true }}" + - disabled: "{{ false if __service_telemetry_observability_strategy in ['use_community', 'use_hybrid'] else true }}" name: community-operators - name: Create OperatorGroup @@ -80,7 +80,7 @@ source: certified-operators sourceNamespace: openshift-marketplace when: - - __service_telemetry_observability_strategy == "use_community" + - __service_telemetry_observability_strategy in ['use_community', 'use_hybrid'] - name: Subscribe to AMQ Interconnect Operator k8s: @@ -103,13 +103,55 @@ apiVersion: operators.coreos.com/v1alpha1 kind: Subscription metadata: - 
name: rhods-prometheus-operator + name: prometheus namespace: "{{ namespace }}" spec: channel: beta installPlanApproval: Automatic - name: rhods-prometheus-operator - source: redhat-operators + name: prometheus + source: community-operators sourceNamespace: openshift-marketplace when: - __service_telemetry_observability_strategy == "use_community" + +- block: + # Upstream Source + Sub from https://github.com/rhobs/observability-operator/tree/main/hack/olm + - name: Create CatalogSource for Red Hat Observability Operator + k8s: + definition: + apiVersion: operators.coreos.com/v1alpha1 + kind: CatalogSource + metadata: + annotations: + name: observability-operator + namespace: openshift-marketplace + spec: + displayName: Observability Operator - Test + icon: + base64data: "" + mediatype: "" + image: quay.io/rhobs/observability-operator-catalog:latest + publisher: Sunil Thaha + sourceType: grpc + updateStrategy: + registryPoll: + interval: 10m0s + + - name: Subscribe to Red Hat Obervability Operator + k8s: + definition: + apiVersion: operators.coreos.com/v1alpha1 + kind: Subscription + metadata: + labels: + operators.coreos.com/observability-operator.openshift-operators: "" + name: observability-operator + namespace: openshift-operators + spec: + channel: development + installPlanApproval: Automatic + name: observability-operator + source: observability-operator + sourceNamespace: openshift-marketplace + when: + - __service_telemetry_observability_strategy in ['use_redhat', 'use_hybrid'] diff --git a/build/validate_deployment.sh b/build/validate_deployment.sh index 345ee3d7f..14ea741c7 100755 --- a/build/validate_deployment.sh +++ b/build/validate_deployment.sh @@ -12,7 +12,7 @@ echo -e "\n* [info] Waiting for QDR deployment to complete\n" until timeout 300 oc rollout status deployment.apps/default-interconnect; do sleep 3; done case "${VALIDATION_SCOPE}" in - "use_community") + "use_community" | "use_hybrid") echo -e "\n* [info] Waiting for prometheus deployment to 
complete\n" until timeout 300 oc rollout status statefulset.apps/prometheus-default; do sleep 3; done echo -e "\n* [info] Waiting for elasticsearch deployment to complete \n" @@ -30,12 +30,25 @@ case "${VALIDATION_SCOPE}" in until timeout 300 oc rollout status deployment.apps/default-cloud1-coll-event-smartgateway; do sleep 3; done until timeout 300 oc rollout status deployment.apps/default-cloud1-ceil-event-smartgateway; do sleep 3; done until timeout 300 oc rollout status deployment.apps/default-cloud1-ceil-meter-smartgateway; do sleep 3; done + until timeout 300 oc rollout status deployment.apps/default-cloud1-sens-meter-smartgateway; do sleep 3; done + ;; + + "use_redhat") + echo -e "\n* [info] Waiting for prometheus deployment to complete\n" + until timeout 300 oc rollout status statefulset.apps/prometheus-default; do sleep 3; done + echo -e "\n* [info] Waiting for alertmanager deployment to complete\n" + until timeout 300 oc rollout status statefulset.apps/alertmanager-default; do sleep 3; done + echo -e "\n* [info] Waiting for smart-gateway deployment to complete\n" + until timeout 300 oc rollout status deployment.apps/default-cloud1-coll-meter-smartgateway; do sleep 3; done + until timeout 300 oc rollout status deployment.apps/default-cloud1-ceil-meter-smartgateway; do sleep 3; done + until timeout 300 oc rollout status deployment.apps/default-cloud1-sens-meter-smartgateway; do sleep 3; done ;; "none") echo -e "\n* [info] Waiting for smart-gateway deployment to complete\n" until timeout 300 oc rollout status deployment.apps/default-cloud1-coll-meter-smartgateway; do sleep 3; done until timeout 300 oc rollout status deployment.apps/default-cloud1-ceil-meter-smartgateway; do sleep 3; done + until timeout 300 oc rollout status deployment.apps/default-cloud1-sens-meter-smartgateway; do sleep 3; done ;; esac diff --git a/deploy/alerts/alerts.yaml b/deploy/alerts/alerts.yaml index ea96d0be1..f7bed35ac 100644 --- a/deploy/alerts/alerts.yaml +++ 
b/deploy/alerts/alerts.yaml @@ -1,4 +1,4 @@ -apiVersion: monitoring.coreos.com/v1 +apiVersion: monitoring.rhobs/v1 kind: PrometheusRule metadata: creationTimestamp: null diff --git a/deploy/crds/infra.watch_servicetelemetrys_crd.yaml b/deploy/crds/infra.watch_servicetelemetrys_crd.yaml index 21e9f8652..5b4a2f9a0 100644 --- a/deploy/crds/infra.watch_servicetelemetrys_crd.yaml +++ b/deploy/crds/infra.watch_servicetelemetrys_crd.yaml @@ -36,10 +36,12 @@ spec: description: ServiceTelemetrySpec holds the specification of an STF instance. properties: observabilityStrategy: - description: 'The strategy to use for observability systems. Options are "none" (do not deploy any observability components), and "use_community" (community operators with administrator managed subscriptions).' + description: 'The strategy to use for observability systems. Options are "none" (do not deploy any observability components), "use_community" (community supported operators), "use_redhat" (Red Hat Observability Operator with no unsupported components), "use_hybrid" (Red Hat Observability Operator + community supported operators).' 
type: string enum: - use_community + - use_redhat + - use_hybrid - none alerting: properties: diff --git a/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml b/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml index 9d324839c..848e78190 100644 --- a/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml +++ b/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml @@ -3,7 +3,7 @@ kind: ServiceTelemetry metadata: name: default spec: - observabilityStrategy: use_community + observabilityStrategy: use_redhat alerting: enabled: true alertmanager: diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml index c275c943b..982b1710c 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml @@ -491,10 +491,14 @@ spec: type: object observabilityStrategy: description: The strategy to use for observability systems. Options - are "none" (do not deploy any observability components), and "use_community" - (community operators with administrator managed subscriptions). + are "none" (do not deploy any observability components), "use_community" + (community supported operators), "use_redhat" (Red Hat Observability + Operator with no unsupported components), "use_hybrid" (Red Hat + Observability Operator + community supported operators). 
enum: - use_community + - use_redhat + - use_hybrid - none type: string transports: diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml index a33a7bc92..be777dfec 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml @@ -153,7 +153,7 @@ metadata: "highAvailability": { "enabled": false }, - "observabilityStrategy": "use_community", + "observabilityStrategy": "use_redhat", "transports": { "qdr": { "certificates": { @@ -261,6 +261,7 @@ spec: - security.openshift.io resourceNames: - nonroot + - nonroot-v2 resources: - securitycontextconstraints verbs: @@ -381,6 +382,7 @@ spec: - interconnectedcloud.github.io - smartgateway.infra.watch - monitoring.coreos.com + - monitoring.rhobs - elasticsearch.k8s.elastic.co - integreatly.org - loki.grafana.com @@ -395,6 +397,13 @@ spec: verbs: - get - create + - apiGroups: + - monitoring.rhobs + resources: + - servicemonitors + verbs: + - get + - create - apiGroups: - apps resourceNames: @@ -421,6 +430,27 @@ spec: - '*' verbs: - '*' + - apiGroups: + - rbac.authorization.k8s.io + resources: + - roles + - rolebindings + verbs: + - create + - get + - list + - watch + - update + - patch + - apiGroups: + - extensions + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - list + - watch serviceAccountName: service-telemetry-operator strategy: deployment installModes: diff --git a/deploy/remove_stf.sh b/deploy/remove_stf.sh index 6ebb540e7..5286693e0 100755 --- a/deploy/remove_stf.sh +++ b/deploy/remove_stf.sh @@ -4,6 +4,7 @@ # REL=$(dirname "$0"); . 
"${REL}/../build/metadata.sh" REMOVE_CERTMANAGER=${REMOVE_CERTMANAGER:-true} +REMOVE_OBO=${REMOVE_OBO:-true} # The whole STF project (start this first since it's slow) oc delete project "${OCP_PROJECT}" @@ -45,6 +46,18 @@ if [ "${REMOVE_CERTMANAGER}" = "true" ]; then oc get crd | grep cert-manager.io | cut -d ' ' -f 1 | xargs oc delete crd fi +if [ "${REMOVE_OBO}" = "true" ]; then + oc delete subscription observability-operator -n openshift-operators + oc delete catalogsource observability-operator -n openshift-marketplace + + # CSV for OBO + OBO_CSV=$(oc get csv | grep observability-operator | cut -d ' ' -f 1) + oc delete csv "${OBO_CSV}" + + # OBO CRDs + oc get crd | grep monitoring.rhobs | cut -d ' ' -f 1 | xargs oc delete crd +fi + # Wait for namespace to actually disappear (this can take awhile) while oc get ns "${OCP_PROJECT}" > /dev/null; do echo "Waiting for ${OCP_PROJECT} to disappear"; sleep 5; done diff --git a/deploy/role.yaml b/deploy/role.yaml index c1c465969..1d0a841fa 100644 --- a/deploy/role.yaml +++ b/deploy/role.yaml @@ -32,6 +32,7 @@ rules: - security.openshift.io resourceNames: - nonroot + - nonroot-v2 resources: - securitycontextconstraints verbs: @@ -116,6 +117,7 @@ rules: - interconnectedcloud.github.io - smartgateway.infra.watch - monitoring.coreos.com + - monitoring.rhobs - elasticsearch.k8s.elastic.co - integreatly.org - loki.grafana.com @@ -130,6 +132,13 @@ rules: verbs: - get - create +- apiGroups: + - monitoring.rhobs + resources: + - servicemonitors + verbs: + - get + - create - apiGroups: - apps resourceNames: @@ -156,3 +165,24 @@ rules: - '*' verbs: - '*' +- apiGroups: + - rbac.authorization.k8s.io + resources: + - roles + - rolebindings + verbs: + - create + - get + - list + - watch + - update + - patch +- apiGroups: + - extensions + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - list + - watch \ No newline at end of file diff --git a/roles/servicetelemetry/defaults/main.yml 
b/roles/servicetelemetry/defaults/main.yml index c1f112eed..981273821 100644 --- a/roles/servicetelemetry/defaults/main.yml +++ b/roles/servicetelemetry/defaults/main.yml @@ -3,8 +3,7 @@ # # remove SmartGateway object when cloud no longer in current `clouds` object list. clouds_remove_on_missing: false -# default observability strategy (compatible with STF 1.3) -observability_strategy: use_community +observability_strategy: use_redhat servicetelemetry_defaults: high_availability: diff --git a/roles/servicetelemetry/tasks/component_alertmanager.yml b/roles/servicetelemetry/tasks/component_alertmanager.yml index edace5b1c..bcb63e44f 100644 --- a/roles/servicetelemetry/tasks/component_alertmanager.yml +++ b/roles/servicetelemetry/tasks/component_alertmanager.yml @@ -28,6 +28,26 @@ definition: '{{ alertmanager_manifest }}' +- name: Ensure no community Alertmanager is installed if not using community operator + k8s: + state: absent + api_version: monitoring.coreos.com/v1 + kind: alertmanager + name: '{{ ansible_operator_meta.name }}' + namespace: '{{ ansible_operator_meta.namespace }}' + when: + - observability_strategy != "use_community" + +- name: Ensure no rhobs Alertmanager is installed if not using it + k8s: + state: absent + api_version: monitoring.rhobs/v1 + kind: alertmanager + name: '{{ ansible_operator_meta.name }}' + namespace: '{{ ansible_operator_meta.namespace }}' + when: + - observability_strategy not in ['use_redhat', 'use_hybrid'] + # TODO: expand the management of alertmanager receivers and move this functionality to a common location # --> SNMP traps - name: Create SNMP traps instance diff --git a/roles/servicetelemetry/tasks/component_certificates.yml b/roles/servicetelemetry/tasks/component_certificates.yml index e8f034338..bc0178409 100644 --- a/roles/servicetelemetry/tasks/component_certificates.yml +++ b/roles/servicetelemetry/tasks/component_certificates.yml @@ -1,6 +1,3 @@ -- name: Create local signing authority - include_tasks: 
_local_signing_authority.yml - - name: Create configmap for OAUTH CA certs k8s: definition: diff --git a/roles/servicetelemetry/tasks/component_clouds.yml b/roles/servicetelemetry/tasks/component_clouds.yml index 99de80198..f1e9a4e5a 100644 --- a/roles/servicetelemetry/tasks/component_clouds.yml +++ b/roles/servicetelemetry/tasks/component_clouds.yml @@ -54,7 +54,7 @@ when: - has_elasticsearch_api | bool - servicetelemetry_vars.backends.events.elasticsearch.enabled - - observability_strategy == 'use_community' + - observability_strategy in ['use_community', 'use_hybrid'] - name: Deploy Logs Smart Gateway instance vars: @@ -68,7 +68,7 @@ label: "{{ this_collector.collector_type }}" when: - has_loki_api | bool - - observability_strategy == 'use_community' + - observability_strategy in ['use_community', 'use_hybrid'] - servicetelemetry_vars.backends.logs.loki.enabled - this_cloud.logs is defined - this_cloud.logs.collectors is defined diff --git a/roles/servicetelemetry/tasks/_local_signing_authority.yml b/roles/servicetelemetry/tasks/component_es_certificates.yml similarity index 100% rename from roles/servicetelemetry/tasks/_local_signing_authority.yml rename to roles/servicetelemetry/tasks/component_es_certificates.yml diff --git a/roles/servicetelemetry/tasks/component_prometheus.yml b/roles/servicetelemetry/tasks/component_prometheus.yml index 9decbf765..eb890c1be 100644 --- a/roles/servicetelemetry/tasks/component_prometheus.yml +++ b/roles/servicetelemetry/tasks/component_prometheus.yml @@ -18,7 +18,65 @@ annotations: serviceaccounts.openshift.io/oauth-redirectreference.prometheus: '{{ prom_oauth_redir_ref | to_json }}' -- name: Bind the local prometheus SA to prometheus cluster role +- block: + - name: Install RBAC Role for prometheus operations + k8s: + definition: + apiVersion: rbac.authorization.k8s.io/v1 + kind: Role + metadata: + name: prometheus-stf + namespace: '{{ ansible_operator_meta.namespace }}' + rules: + - apiGroups: + - "" + resources: + - 
services + - endpoints + - pods + verbs: + - get + - list + - watch + - apiGroups: + - extensions + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - list + - watch + - apiGroups: + - security.openshift.io + resourceNames: + - nonroot + - nonroot-v2 + resources: + - securitycontextconstraints + verbs: + - use + + - name: Bind the local prometheus SA to our new role + k8s: + definition: + apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + metadata: + name: prometheus-k8s-stf + namespace: '{{ ansible_operator_meta.namespace }}' + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-stf + subjects: + - kind: ServiceAccount + name: prometheus-k8s + namespace: '{{ ansible_operator_meta.namespace }}' + when: + - observability_strategy in ['use_redhat', 'use_hybrid'] + +- name: Bind the local prometheus SA to prometheus cluster role (for oauth perms) k8s: definition: apiVersion: rbac.authorization.k8s.io/v1 @@ -90,6 +148,26 @@ definition: '{{ prometheus_manifest }}' +- name: Ensure no community Prometheus is installed if not using community operator + k8s: + state: absent + api_version: monitoring.coreos.com/v1 + kind: prometheus + name: '{{ ansible_operator_meta.name }}' + namespace: '{{ ansible_operator_meta.namespace }}' + when: + - observability_strategy != "use_community" + +- name: Ensure no rhobs Prometheus is installed if not using it + k8s: + state: absent + api_version: monitoring.rhobs/v1 + kind: prometheus + name: '{{ ansible_operator_meta.name }}' + namespace: '{{ ansible_operator_meta.namespace }}' + when: + - observability_strategy not in ['use_redhat', 'use_hybrid'] + - name: Set default prometheus service template set_fact: prometheus_service_manifest: "{{ lookup('template', './manifest_prometheus_service.j2') | from_yaml }}" diff --git a/roles/servicetelemetry/tasks/component_servicemonitor.yml b/roles/servicetelemetry/tasks/component_servicemonitor.yml index 886d67df7..753116c46 100644 --- 
a/roles/servicetelemetry/tasks/component_servicemonitor.yml +++ b/roles/servicetelemetry/tasks/component_servicemonitor.yml @@ -1,7 +1,7 @@ - name: Create SG-specific Service Monitor manifest set_fact: sg_specific_servicemonitor_manifest: | - apiVersion: monitoring.coreos.com/v1 + apiVersion: {{ prometheus_operator_api_string }} kind: ServiceMonitor metadata: labels: diff --git a/roles/servicetelemetry/tasks/main.yml b/roles/servicetelemetry/tasks/main.yml index 96f5f2b9e..937fe7828 100644 --- a/roles/servicetelemetry/tasks/main.yml +++ b/roles/servicetelemetry/tasks/main.yml @@ -23,9 +23,26 @@ - has_certmanager_api | bool # --> backends.metrics +- name: Setup Certificates for metrics components + include_tasks: component_certificates.yml + when: + - has_certmanager_api | bool + +- name: Set community monitoring API string and labels + set_fact: + prometheus_operator_api_string: monitoring.coreos.com/v1 + prometheus_operator_label: + when: observability_strategy == 'use_community' + +- name: Set Red Hat monitoring API string + set_fact: + prometheus_operator_api_string: monitoring.rhobs/v1 + prometheus_operator_label: 'app.kubernetes.io/managed-by: observability-operator' + when: observability_strategy in ['use_redhat', 'use_hybrid'] + - name: Check if we have monitoring.coreos.com API set_fact: - has_monitoring_coreos_api: "{{ True if 'monitoring.coreos.com' in api_groups else False }}" + has_monitoring_api: "{{ True if (prometheus_operator_api_string | dirname) in api_groups else False }}" - block: - name: Create Prometheus instance @@ -35,8 +52,8 @@ - name: Create Alertmanager instance include_tasks: component_alertmanager.yml when: - - has_monitoring_coreos_api | bool - - observability_strategy == 'use_community' + - has_monitoring_api | bool + - observability_strategy != 'none' # --> backends.events - name: Check if we have elasticsearch API @@ -46,14 +63,14 @@ - name: Deploy ElasticSearch events backend block: - name: Setup Certificates for ElasticSearch - 
include_tasks: component_certificates.yml + include_tasks: component_es_certificates.yml - name: Setup ElasticSearch include_tasks: component_elasticsearch.yml when: - has_elasticsearch_api | bool - has_certmanager_api | bool - - observability_strategy == 'use_community' + - observability_strategy in ['use_community', 'use_hybrid'] # --> backends.logs - name: Check if we have loki API @@ -64,7 +81,7 @@ include_tasks: component_loki.yml when: - has_loki_api | bool - - observability_strategy == 'use_community' + - observability_strategy in ['use_community', 'use_hybrid'] # --> graphing - name: Check if we have integreatly.org API @@ -81,7 +98,7 @@ # include_tasks: component_dashboards.yml when: - has_integreatly_api | bool - - observability_strategy == 'use_community' + - observability_strategy in ['use_community', 'use_hybrid'] # --> clouds - name: Get data about clouds diff --git a/roles/servicetelemetry/tasks/pre.yml b/roles/servicetelemetry/tasks/pre.yml index 2e780affd..3e43729dc 100644 --- a/roles/servicetelemetry/tasks/pre.yml +++ b/roles/servicetelemetry/tasks/pre.yml @@ -79,6 +79,54 @@ namespace: "{{ ansible_operator_meta.namespace }}" register: smartgateways_loaded +- name: Get current STF object + k8s_info: + api_version: infra.watch/v1beta1 + kind: ServiceTelemetry + name: "{{ ansible_operator_meta.name }}" + namespace: "{{ ansible_operator_meta.namespace }}" + register: _stf_object + +- name: Get community Prometheus objects + k8s_info: + api_version: monitoring.coreos.com/v1 + kind: Prometheus + name: "{{ ansible_operator_meta.name }}" + namespace: "{{ ansible_operator_meta.namespace }}" + register: _community_prom_object + +- block: + - name: Apply community observabilityStrategy if missing on an STF object with an existing community prometheus + k8s: + definition: + apiVersion: infra.watch/v1beta1 + kind: ServiceTelemetry + metadata: + name: "{{ ansible_operator_meta.name }}" + namespace: "{{ ansible_operator_meta.namespace }}" + spec: + 
observabilityStrategy: use_community + - name: Set non-default community strategy for remainder of this run + set_fact: + observability_strategy: use_community + when: + - _community_prom_object.resources[0] is defined + - _stf_object.resources[0].spec.observability_strategy is not defined + +- name: Apply default observability_strategy if missing on a new STF object with no associated community prometheus + k8s: + definition: + apiVersion: infra.watch/v1beta1 + kind: ServiceTelemetry + metadata: + name: "{{ ansible_operator_meta.name }}" + namespace: "{{ ansible_operator_meta.namespace }}" + spec: + observabilityStrategy: "{{ observability_strategy }}" + when: + - _community_prom_object.resources[0] is not defined + - _stf_object.resources[0].spec.observability_strategy is not defined + - name: Set ephemeral_storage_enabled to true when storage strategy is ephemeral set_fact: _ephemeral_storage_enabled: true @@ -87,14 +135,6 @@ servicetelemetry_vars.backends.events.elasticsearch.storage.strategy == "ephemeral" or servicetelemetry_vars.alerting.alertmanager.storage.strategy == "ephemeral" -- name: Get current ephemeralStorageEnabled status - k8s_info: - api_version: infra.watch/v1beta1 - kind: ServiceTelemetry - name: "{{ ansible_operator_meta.name }}" - namespace: "{{ ansible_operator_meta.namespace }}" - register: _stf_object - - name: Set ServiceTelemetry object status to have ephemeralStorageEnabled status operator_sdk.util.k8s_status: api_version: infra.watch/v1beta1 diff --git a/roles/servicetelemetry/templates/manifest_alertmanager.j2 b/roles/servicetelemetry/templates/manifest_alertmanager.j2 index 0500772c9..2465ee43f 100644 --- a/roles/servicetelemetry/templates/manifest_alertmanager.j2 +++ b/roles/servicetelemetry/templates/manifest_alertmanager.j2 @@ -1,8 +1,9 @@ -apiVersion: monitoring.coreos.com/v1 +apiVersion: {{ prometheus_operator_api_string }} kind: Alertmanager metadata: labels: alertmanager: '{{ ansible_operator_meta.name }}' + {{ 
prometheus_operator_label }} name: '{{ ansible_operator_meta.name }}' namespace: '{{ ansible_operator_meta.namespace }}' spec: diff --git a/roles/servicetelemetry/templates/manifest_prometheus.j2 b/roles/servicetelemetry/templates/manifest_prometheus.j2 index 8b594cefa..8e6fa5fc1 100644 --- a/roles/servicetelemetry/templates/manifest_prometheus.j2 +++ b/roles/servicetelemetry/templates/manifest_prometheus.j2 @@ -1,8 +1,9 @@ -apiVersion: monitoring.coreos.com/v1 +apiVersion: {{ prometheus_operator_api_string }} kind: Prometheus metadata: labels: prometheus: '{{ ansible_operator_meta.name }}' + {{ prometheus_operator_label }} name: '{{ ansible_operator_meta.name }}' namespace: '{{ ansible_operator_meta.namespace }}' spec: diff --git a/tests/smoketest/smoketest.sh b/tests/smoketest/smoketest.sh index 8a801c004..2ec35d46f 100755 --- a/tests/smoketest/smoketest.sh +++ b/tests/smoketest/smoketest.sh @@ -29,6 +29,8 @@ fi CLEANUP=${CLEANUP:-true} +OBSERVABILITY_STRATEGY="${OBSERVABILITY_STRATEGY:-use_redhat}" + for ((i=1; i<=NUMCLOUDS; i++)); do NAME="smoke${i}" CLOUDNAMES+=("${NAME}") @@ -64,7 +66,7 @@ oc create configmap stf-smoketest-ceilometer-entrypoint-script --from-file "${RE echo "*** [INFO] Creating smoketest jobs..." oc delete job -l app=stf-smoketest for NAME in "${CLOUDNAMES[@]}"; do - oc create -f <(sed -e "s/<>/${NAME}/;s/<>/${ELASTICSEARCH_AUTH_PASS}/;s/<>/${PROMETHEUS_AUTH_PASS}/" ${REL}/smoketest_job.yaml.template) + oc create -f <(sed -e "s/<>/${NAME}/;s/<>/${ELASTICSEARCH_AUTH_PASS}/;s/<>/${PROMETHEUS_AUTH_PASS}/;s/<>/${OBSERVABILITY_STRATEGY}/" ${REL}/smoketest_job.yaml.template) done echo "*** [INFO] Triggering an alertmanager notification..." @@ -152,9 +154,11 @@ echo "*** [INFO] Logs from prometheus..." oc logs "$(oc get pod -l prometheus=default -o jsonpath='{.items[0].metadata.name}')" -c prometheus echo -echo "*** [INFO] Logs from elasticsearch..." 
-oc logs "$(oc get pod -l common.k8s.elastic.co/type=elasticsearch -o jsonpath='{.items[0].metadata.name}')" -echo +if [ "$OBSERVABILITY_STRATEGY" != "use_redhat" ]; then + echo "*** [INFO] Logs from elasticsearch..." + oc logs "$(oc get pod -l common.k8s.elastic.co/type=elasticsearch -o jsonpath='{.items[0].metadata.name}')" + echo +fi echo "*** [INFO] Logs from snmp webhook..." oc logs "$(oc get pod -l app=default-snmp-webhook -o jsonpath='{.items[0].metadata.name}')" diff --git a/tests/smoketest/smoketest_ceilometer_entrypoint.sh b/tests/smoketest/smoketest_ceilometer_entrypoint.sh index 8101b00a5..674a6e203 100644 --- a/tests/smoketest/smoketest_ceilometer_entrypoint.sh +++ b/tests/smoketest/smoketest_ceilometer_entrypoint.sh @@ -32,38 +32,42 @@ echo "[DEBUG] Query returned" metrics_result=$? echo "[DEBUG] Set metrics_result to $metrics_result" -echo "*** [INFO] Get documents for this test from ElasticSearch..." -DOCUMENT_HITS=$(curl -sk -u "elastic:${ELASTICSEARCH_AUTH_PASS}" -X GET "${ELASTICSEARCH}/_search" -H 'Content-Type: application/json' -d'{ - "query": { - "bool": { - "filter": [ - { "term" : { "labels.instance" : { "value" : "'${CLOUDNAME}'", "boost" : 1.0 } } }, - { "range" : { "startsAt" : { "gte" : "now-1m", "lt" : "now" } } } - ] +if [ "$OBSERVABILITY_STRATEGY" != "use_redhat" ]; then + echo "*** [INFO] Get documents for this test from ElasticSearch..." 
+ DOCUMENT_HITS=$(curl -sk -u "elastic:${ELASTICSEARCH_AUTH_PASS}" -X GET "${ELASTICSEARCH}/_search" -H 'Content-Type: application/json' -d'{ + "query": { + "bool": { + "filter": [ + { "term" : { "labels.instance" : { "value" : "'${CLOUDNAME}'", "boost" : 1.0 } } }, + { "range" : { "startsAt" : { "gte" : "now-1m", "lt" : "now" } } } + ] + } } - } -}' | python3 -c "import sys, json; parsed = json.load(sys.stdin); print(parsed['hits']['total']['value'])") + }' | python3 -c "import sys, json; parsed = json.load(sys.stdin); print(parsed['hits']['total']['value'])") -echo "*** [INFO] List of indices for debugging..." -curl -sk -u "elastic:${ELASTICSEARCH_AUTH_PASS}" -X GET "${ELASTICSEARCH}/_cat/indices/ceilometer_*?s=index" -echo + echo "*** [INFO] List of indices for debugging..." + curl -sk -u "elastic:${ELASTICSEARCH_AUTH_PASS}" -X GET "${ELASTICSEARCH}/_cat/indices/ceilometer_*?s=index" + echo -echo "*** [INFO] Get documents for this test from ElasticSearch..." -ES_INDEX=ceilometer_image -DOCUMENT_HITS=$(curl -sk -u "elastic:${ELASTICSEARCH_AUTH_PASS}" -X GET "${ELASTICSEARCH}/${ES_INDEX}/_search" -H 'Content-Type: application/json' -d'{ - "query": { - "match_all": {} - } -}'| python3 -c "import sys, json; parsed = json.load(sys.stdin); print(parsed['hits']['total']['value'])") + echo "*** [INFO] Get documents for this test from ElasticSearch..." 
+ ES_INDEX=ceilometer_image + DOCUMENT_HITS=$(curl -sk -u "elastic:${ELASTICSEARCH_AUTH_PASS}" -X GET "${ELASTICSEARCH}/${ES_INDEX}/_search" -H 'Content-Type: application/json' -d'{ + "query": { + "match_all": {} + } + }'| python3 -c "import sys, json; parsed = json.load(sys.stdin); print(parsed['hits']['total']['value'])") -echo "*** [INFO] Found ${DOCUMENT_HITS} documents" -echo; echo + echo "*** [INFO] Found ${DOCUMENT_HITS} documents" + echo; echo -# check if we got documents back for this test -events_result=1 -if [ "$DOCUMENT_HITS" -gt "0" ]; then - events_result=0 + # check if we got documents back for this test + events_result=1 + if [ "$DOCUMENT_HITS" -gt "0" ]; then + events_result=0 + fi +else + events_result=0 fi echo "[INFO] Verification exit codes (0 is passing, non-zero is a failure): events=${events_result} metrics=${metrics_result}" diff --git a/tests/smoketest/smoketest_collectd_entrypoint.sh b/tests/smoketest/smoketest_collectd_entrypoint.sh index 81c12d9fe..00fe92145 100755 --- a/tests/smoketest/smoketest_collectd_entrypoint.sh +++ b/tests/smoketest/smoketest_collectd_entrypoint.sh @@ -62,25 +62,29 @@ grep -E '"result":\[{"metric":{"__name__":"sensubility_container_health_status", metrics_result=$((metrics_result || $?)) echo; echo -echo "*** [INFO] Get documents for this test from ElasticSearch..." -DOCUMENT_HITS=$(curl -sk -u "elastic:${ELASTICSEARCH_AUTH_PASS}" -X GET "${ELASTICSEARCH}/_search" -H 'Content-Type: application/json' -d'{ - "query": { - "bool": { - "filter": [ - { "term" : { "labels.instance" : { "value" : "'${CLOUDNAME}'", "boost" : 1.0 } } }, - { "range" : { "generated" : { "gte" : "now-1m", "lt" : "now" } } } - ] +if [ "$OBSERVABILITY_STRATEGY" != "use_redhat" ]; then + echo "*** [INFO] Get documents for this test from ElasticSearch..." 
+ DOCUMENT_HITS=$(curl -sk -u "elastic:${ELASTICSEARCH_AUTH_PASS}" -X GET "${ELASTICSEARCH}/_search" -H 'Content-Type: application/json' -d'{ + "query": { + "bool": { + "filter": [ + { "term" : { "labels.instance" : { "value" : "'${CLOUDNAME}'", "boost" : 1.0 } } }, + { "range" : { "generated" : { "gte" : "now-1m", "lt" : "now" } } } + ] + } } - } -}' | python3 -c "import sys, json; parsed = json.load(sys.stdin); print(parsed['hits']['total']['value'])") + }' | python3 -c "import sys, json; parsed = json.load(sys.stdin); print(parsed['hits']['total']['value'])") -echo "*** [INFO] Found ${DOCUMENT_HITS} documents" -echo; echo + echo "*** [INFO] Found ${DOCUMENT_HITS} documents" + echo; echo -# check if we got documents back for this test -events_result=1 -if [ "$DOCUMENT_HITS" -gt "0" ]; then - events_result=0 + # check if we got documents back for this test + events_result=1 + if [ "$DOCUMENT_HITS" -gt "0" ]; then + events_result=0 + fi +else + events_result=0 fi echo "[INFO] Verification exit codes (0 is passing, non-zero is a failure): events=${events_result} metrics=${metrics_result}" diff --git a/tests/smoketest/smoketest_job.yaml.template b/tests/smoketest/smoketest_job.yaml.template index 50735b6a5..4a9c20cc9 100644 --- a/tests/smoketest/smoketest_job.yaml.template +++ b/tests/smoketest/smoketest_job.yaml.template @@ -24,6 +24,8 @@ spec: value: "<>" - name: PROMETHEUS_AUTH_PASS value: "<>" + - name: OBSERVABILITY_STRATEGY + value: "<>" volumeMounts: - name: collectd-config mountPath: /etc/minimal-collectd.conf.template @@ -51,6 +53,8 @@ spec: value: "<>" - name: PROMETHEUS_AUTH_PASS value: "<>" + - name: OBSERVABILITY_STRATEGY + value: "<>" volumeMounts: - name: ceilometer-publisher mountPath: /ceilometer_publish.py From 9785fa792023ac8b36c8bdb317d7c801b825edd2 Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Tue, 9 May 2023 12:56:47 -0400 Subject: [PATCH 25/95] Use stable-v1 cert-manager in CI for OCP >= 4.12 (#424) * Use stable-v1 cert-manager in CI for 
OCP >= 4.12 --- build/stf-run-ci/defaults/main.yml | 2 ++ build/stf-run-ci/tasks/setup_base.yml | 11 ++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/build/stf-run-ci/defaults/main.yml b/build/stf-run-ci/defaults/main.yml index e38c63caf..390f9cd0b 100644 --- a/build/stf-run-ci/defaults/main.yml +++ b/build/stf-run-ci/defaults/main.yml @@ -57,3 +57,5 @@ sg_bridge_repository: https://github.com/infrawatch/sg-bridge prometheus_webhook_snmp_repository: https://github.com/infrawatch/prometheus-webhook-snmp base_dir: '' + +cert_manager_channel: stable-v1 \ No newline at end of file diff --git a/build/stf-run-ci/tasks/setup_base.yml b/build/stf-run-ci/tasks/setup_base.yml index 779ee0533..3f5c790ee 100644 --- a/build/stf-run-ci/tasks/setup_base.yml +++ b/build/stf-run-ci/tasks/setup_base.yml @@ -50,6 +50,15 @@ namespace: openshift-cert-manager-operator spec: {} + - name: Get OCP version + shell: oc version -o yaml | grep openshiftVersion | awk '{print $2}' + register: ocp_ver + + - name: Use tech-preview channel for cert_manager in older OCP versions + set_fact: + cert_manager_channel: tech-preview + when: ocp_ver.stdout is version('4.12', '<') + - name: Subscribe to Cert Manager for OpenShift Operator k8s: definition: @@ -59,7 +68,7 @@ name: openshift-cert-manager-operator namespace: openshift-cert-manager-operator spec: - channel: tech-preview + channel: "{{ cert_manager_channel }}" installPlanApproval: Automatic name: openshift-cert-manager-operator source: redhat-operators From ee571fea2ae644463aea4c3ff45a12718d2bed5b Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Tue, 9 May 2023 15:35:12 -0400 Subject: [PATCH 26/95] Pinning prometheus version to ease migration to OBO (#425) * Currently we are installing :latest * We and OBO currently install v2.43.0 * This will mitigate any potential for a prometheus roll-back * We will remove the pin in STF 1.5.5 after migration to OBO is complete --- 
roles/servicetelemetry/templates/manifest_prometheus.j2 | 1 + 1 file changed, 1 insertion(+) diff --git a/roles/servicetelemetry/templates/manifest_prometheus.j2 b/roles/servicetelemetry/templates/manifest_prometheus.j2 index 8e6fa5fc1..4e8651def 100644 --- a/roles/servicetelemetry/templates/manifest_prometheus.j2 +++ b/roles/servicetelemetry/templates/manifest_prometheus.j2 @@ -7,6 +7,7 @@ metadata: name: '{{ ansible_operator_meta.name }}' namespace: '{{ ansible_operator_meta.namespace }}' spec: + version: v2.43.0 replicas: {{ servicetelemetry_vars.backends.metrics.prometheus.deployment_size }} ruleSelector: {} securityContext: {} From ffd3dbfcb7b82bcfe0d2da66b589c0f75e96e6f9 Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Mon, 15 May 2023 10:05:16 -0400 Subject: [PATCH 27/95] Fix auto-reversion to use_community (#427) * The check for a missing observabilityStrategy was faulty * STF object would be updated to `observabilityStrategy: use_community` any time there was a community Prometheus deployed --- roles/servicetelemetry/tasks/pre.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/roles/servicetelemetry/tasks/pre.yml b/roles/servicetelemetry/tasks/pre.yml index 3e43729dc..0fd1bb59b 100644 --- a/roles/servicetelemetry/tasks/pre.yml +++ b/roles/servicetelemetry/tasks/pre.yml @@ -111,9 +111,9 @@ observability_strategy: use_community when: - _community_prom_object.resources[0] is defined - - _stf_object.resources[0].spec.observability_strategy is not defined + - _stf_object.resources[0].spec.observabilityStrategy is not defined -- name: Apply default observability_strategy if missing on a new STF object with no associated community prometheus +- name: Apply default observabilityStrategy if missing on a new STF object with no associated community prometheus k8s: definition: apiVersion: infra.watch/v1beta1 @@ -125,7 +125,7 @@ observabilityStrategy: "{{ observability_strategy }}" when: - _community_prom_object.resources[0] is not defined - 
- _stf_object.resources[0].spec.observability_strategy is not defined + - _stf_object.resources[0].spec.observabilityStrategy is not defined - name: Set ephemeral_storage_enabled to true when storage strategy is ephemeral set_fact: From c5f7c7b4455d0c827516c77806e122840764c558 Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Mon, 15 May 2023 12:47:36 -0400 Subject: [PATCH 28/95] Fix duplicate alarm expressions (#426) --- deploy/alerts/alerts.yaml | 43 +++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/deploy/alerts/alerts.yaml b/deploy/alerts/alerts.yaml index f7bed35ac..229c5c385 100644 --- a/deploy/alerts/alerts.yaml +++ b/deploy/alerts/alerts.yaml @@ -55,7 +55,6 @@ spec: severity: warn annotations: summary: IO read (warning) - - alert: disk:time:read critical expr: >- (abs(job:disk:time:read:rate_5m - job:disk:time:read:rate_5m:avg_over_time_1h) / job:disk:time:read:rate_5m:stddev_over_time_1h) >6 @@ -64,6 +63,7 @@ spec: severity: critical annotations: summary: IO read (critical) + - expr: 'rate(collectd_disk_disk_time_write_total[5m])' record: 'job:disk:time:write:rate_5m' - expr: 'stddev_over_time(job:disk:time:write:rate_5m[1h])' @@ -78,7 +78,6 @@ spec: severity: warn annotations: summary: IO write (warning) - - alert: disk:time:write critical expr: >- (abs(job:disk:time:write:rate_5m - job:disk:time:write:rate_5m:avg_over_time_1h) / job:disk:time:write:rate_5m:stddev_over_time_1h) >6 @@ -87,47 +86,47 @@ spec: severity: critical annotations: summary: IO write (critical) + - expr: 'rate(collectd_disk_disk_ops_read_total[5m])' - record: 'job:disk:time:read:rate_5m' - - expr: 'stddev_over_time(job:disk:time:read:rate_5m[1h])' - record: 'job:disk:time:read:rate_5m:stddev_over_time_1h' - - expr: 'avg_over_time(job:disk:time:read:rate_5m[1h])' - record: 'job:disk:time:read:rate_5m:avg_over_time_1h' - - alert: disk:time:read warn + record: 'job:disk:ops:read:rate_5m' + - expr: 
'stddev_over_time(job:disk:ops:read:rate_5m[1h])' + record: 'job:disk:ops:read:rate_5m:stddev_over_time_1h' + - expr: 'avg_over_time(job:disk:ops:read:rate_5m[1h])' + record: 'job:disk:ops:read:rate_5m:avg_over_time_1h' + - alert: disk:ops:read warn expr: >- - (abs(job:disk:time:read:rate_5m - job:disk:time:read:rate_5m:avg_over_time_1h) / job:disk:time:read:rate_5m:stddev_over_time_1h) >3 + (abs(job:disk:ops:read:rate_5m - job:disk:ops:read:rate_5m:avg_over_time_1h) / job:disk:ops:read:rate_5m:stddev_over_time_1h) >3 for: 10m labels: severity: warn annotations: summary: disk ops read (warning) - - - alert: disk:time:read critical + - alert: disk:ops:read critical expr: >- - (abs(job:disk:time:read:rate_5m - job:disk:time:read:rate_5m:avg_over_time_1h) / job:disk:time:read:rate_5m:stddev_over_time_1h) >6 + (abs(job:disk:ops:read:rate_5m - job:disk:ops:read:rate_5m:avg_over_time_1h) / job:disk:ops:read:rate_5m:stddev_over_time_1h) >6 for: 10m labels: severity: critical annotations: summary: disk ops read (critical) + - expr: 'rate(collectd_disk_disk_ops_write_total[5m])' - record: 'job:disk:time:write:rate_5m' - - expr: 'stddev_over_time(job:disk:time:write:rate_5m[1h])' - record: 'job:disk:time:write:rate_5m:stddev_over_time_1h' - - expr: 'avg_over_time(job:disk:time:write:rate_5m[1h])' - record: 'job:disk:time:write:rate_5m:avg_over_time_1h' - - alert: disk:time:write warn + record: 'job:disk:ops:write:rate_5m' + - expr: 'stddev_over_time(job:disk:ops:write:rate_5m[1h])' + record: 'job:disk:ops:write:rate_5m:stddev_over_time_1h' + - expr: 'avg_over_time(job:disk:ops:write:rate_5m[1h])' + record: 'job:disk:ops:write:rate_5m:avg_over_time_1h' + - alert: disk:ops:write warn expr: >- - (abs(job:disk:time:write:rate_5m - job:disk:time:write:rate_5m:avg_over_time_1h) / job:disk:time:write:rate_5m:stddev_over_time_1h) >3 + (abs(job:disk:ops:write:rate_5m - job:disk:ops:write:rate_5m:avg_over_time_1h) / job:disk:ops:write:rate_5m:stddev_over_time_1h) >3 for: 10m labels: 
severity: warn annotations: summary: disk ops write (warning) - - - alert: disk:time:write critical + - alert: disk:ops:write critical expr: >- - (abs(job:disk:time:write:rate_5m - job:disk:time:write:rate_5m:avg_over_time_1h) / job:disk:time:write:rate_5m:stddev_over_time_1h) >6 + (abs(job:disk:ops:write:rate_5m - job:disk:ops:write:rate_5m:avg_over_time_1h) / job:disk:ops:write:rate_5m:stddev_over_time_1h) >6 for: 10m labels: severity: critical From 13370ad8496955ed74c1cfbf24f58734edc33df2 Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Fri, 2 Jun 2023 13:56:58 +0100 Subject: [PATCH 29/95] [zuul] Create zuul no-op job to test integration (#431) --- .zuul.yaml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .zuul.yaml diff --git a/.zuul.yaml b/.zuul.yaml new file mode 100644 index 000000000..28a9499a6 --- /dev/null +++ b/.zuul.yaml @@ -0,0 +1,6 @@ +--- +- project: + name: infrawatch/service-telemetry-operator + github-check: + jobs: + - noop From 153bdb17d6d918ae7b0c3a1a8975ac83c284d3bc Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Wed, 7 Jun 2023 15:00:44 -0400 Subject: [PATCH 30/95] Allow bundle deploy from unauth'd repo (#437) Allow bundle deployments to happen from unauthorized container repositories. 
--- build/stf-run-ci/tasks/pre-clean.yml | 20 +++++++++++++++++++ .../tasks/setup_stf_from_bundles.yml | 4 ++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/build/stf-run-ci/tasks/pre-clean.yml b/build/stf-run-ci/tasks/pre-clean.yml index d86093cce..af096f98b 100644 --- a/build/stf-run-ci/tasks/pre-clean.yml +++ b/build/stf-run-ci/tasks/pre-clean.yml @@ -48,6 +48,26 @@ displayName: OperatorHub.io Operators publisher: OperatorHub.io +- name: Remove service-telemetry-operator-bundle CatalogSource (bundle deploy) + k8s: + state: absent + definition: + apiVersion: operators.coreos.com/v1alpha1 + kind: CatalogSource + metadata: + name: service-telemetry-operator-catalog + namespace: "{{ namespace }}" + +- name: Remove smart-gateway-operator-bundle CatalogSource (bundle deploy) + k8s: + state: absent + definition: + apiVersion: operators.coreos.com/v1alpha1 + kind: CatalogSource + metadata: + name: smart-gateway-operator-catalog + namespace: "{{ namespace }}" + # Remove the cert manager since we install it as part of the CI/documented pre-install process - name: Remove openshift-cert-manager-operator namespace k8s: diff --git a/build/stf-run-ci/tasks/setup_stf_from_bundles.yml b/build/stf-run-ci/tasks/setup_stf_from_bundles.yml index a3305440a..fd5423dd3 100644 --- a/build/stf-run-ci/tasks/setup_stf_from_bundles.yml +++ b/build/stf-run-ci/tasks/setup_stf_from_bundles.yml @@ -82,8 +82,8 @@ - name: Deploy SGO via OLM bundle shell: - cmd: "{{ base_dir }}/working/operator-sdk run bundle {{__smart_gateway_bundle_image_path}} --pull-secret-name=pull-secret --ca-secret-name=registry-tls-ca --namespace={{ namespace }} --timeout 600s" + cmd: "{{ base_dir }}/working/operator-sdk run bundle {{__smart_gateway_bundle_image_path}} {% if pull_secret is defined %}--pull-secret-name=pull-secret --ca-secret-name=registry-tls-ca{% endif %} --namespace={{ namespace }} --timeout 600s" - name: Deploy STO via OLM bundle shell: - cmd: "{{ base_dir }}/working/operator-sdk run 
bundle {{ __service_telemetry_bundle_image_path}} --pull-secret-name=pull-secret --ca-secret-name=registry-tls-ca --namespace={{ namespace }} --timeout 600s" + cmd: "{{ base_dir }}/working/operator-sdk run bundle {{ __service_telemetry_bundle_image_path}} {% if pull_secret is defined %}--pull-secret-name=pull-secret --ca-secret-name=registry-tls-ca{% endif %} --namespace={{ namespace }} --timeout 600s" From e30efa047d02c9eb21755097d229925cce314aa9 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Mon, 12 Jun 2023 16:12:52 -0400 Subject: [PATCH 31/95] Manage Operator dependencies with properties.yaml (#423) * Manage Operator dependencies with properties.yaml Use the properties.yaml to manage the packages we require when deploying Service Telemetry Operator. Allows us to reference the Operator name (which you can find via 'oc get packagemanifests' and reviewing the packageName value of the packagemanifest (which should just match the name listed in the oc get packagemanifests output). Constraints allow the use of versions as well, setting a target of >= the current version(ish). Per https://olm.operatorframework.io/docs/concepts/olm-architecture/dependency-resolution/#nested-compound-constraints: > A nested compound constraint, one that contains at least one child compound constraint along with zero or more simple constraints, is evaluated from the bottom up following the procedures described for each above. For Prometheus, we set our ordered list from bottom to top, with a preference for Observability Operator, followed by RHODS Prometheus Operator, and finally Prometheus Operator from the Community Catalog. 
Closes: STF-1356 * Move smart-gateway-operator dependency to properties.yaml * Update deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml Co-authored-by: Chris Sibbitt * Add index-based deployment method to stf-run-ci (#428) * Create bundle and index builds for OLM deployment Create bundle images using BuildConfigs through the internal registry after generating contents using generate_bundles. Create a file-based index image from the bundles created and available from the internal registry. Deploy STF using a local CatalogSource that references the internal index image which allows OLM to stand up dependencies using properties.yaml within the Service Teleletry Operator metadata. Skips over pre-deployment artifacts and uses only data available via CatalogSource for dependency validation. * Add bundle builds to stf-run-ci for index-based deployments Initial working code to allow bundle builds to be created in support of index-based (CatalogSource) deployments of STF using local builds. Updates the create_builds.yml logic so that it allows deployments to proceed when builds have already been created. If local builds are enabled then if BuildConfigs have already been created, then the role will lookup the latest Build object and set the internal image path so that the deployment can continue. Primary function is for iterative development. Stubbed out functionality to start creating the index image is available, but still needs to be developed fully. * Update generate_bundle.sh to return a JSON map Update generate_bundle.sh to return a JSON map so that it can be consumed by Ansible in stf-run-ci. * Working development for index-based deployment Completed the initial implementation of a local bundle build and index-image created from bundle images. Generated via opm and loaded in a CatalogSource that allows stf-run-ci to Subscribe to the service-telemetry-operator package. 
Created to allow testing of the properties.yaml now included in Service Telemetry Operator to allow dependencies to be resolved without pre-Subscribing to the Operators. Closes: STF-1362 * Test and Tune - clean up some ordering of object creation - update index name in a couple of spots (service-telemetry-framework vs service-telemetry-operator) - more checks to allow better idempotency when running stf-run-ci multiple times - create CatalogSource - syntax error on a couple of plays - add some more clean up to make re-running deployments without full artifact builds possible - always need OperatorGroup from CLI... had a check because testing was done from UI incorrectly * Fix syntax error for pre-Subscription * Remove unused template file * Move OCP version lookup to top of plays (#433) * Make debug output of generate_bundle consistent * Clean up block usage for BuildConfig creation * Set operator base image and tag as parameters * Fix typo in annotation * Add check if index and bundle deploys both enabled (#438) Add a fail check to make sure we're not deploying from both index images and bundle images at the same time. * Drop rhods-prometheus-operator from satisfying STO Don't allow rhods-prometheus-operator package to satisfy for installation of Service Telemetry Operator as it is expected to go away. 
--------- Co-authored-by: Chris Sibbitt --- build/generate_bundle.sh | 27 ++-- build/stf-run-ci/README.md | 136 ++++++++-------- build/stf-run-ci/defaults/main.yml | 9 +- build/stf-run-ci/tasks/create_builds.yml | 62 ++++---- build/stf-run-ci/tasks/create_catalog.yml | 145 ++++++++++++++++++ build/stf-run-ci/tasks/main.yml | 85 +++++++++- build/stf-run-ci/tasks/pre-clean.yml | 41 +++++ build/stf-run-ci/tasks/setup_base.yml | 143 +++++++++-------- .../tasks/setup_stf_local_build.yml | 60 +++++--- build/stf-run-ci/templates/config-json.j2 | 1 + build/stf-run-ci/templates/index-yaml.j2 | 20 +++ ...emetry-operator.clusterserviceversion.yaml | 6 - .../metadata/properties.yaml | 38 +++++ 13 files changed, 552 insertions(+), 221 deletions(-) create mode 100644 build/stf-run-ci/tasks/create_catalog.yml create mode 100644 build/stf-run-ci/templates/config-json.j2 create mode 100644 build/stf-run-ci/templates/index-yaml.j2 diff --git a/build/generate_bundle.sh b/build/generate_bundle.sh index f699f3559..3145eba13 100755 --- a/build/generate_bundle.sh +++ b/build/generate_bundle.sh @@ -6,36 +6,27 @@ REL=$(dirname "$0") . 
"${REL}/metadata.sh" generate_version() { - echo "-- Generating operator version" UNIXDATE=$(date '+%s') OPERATOR_BUNDLE_VERSION=${OPERATOR_CSV_MAJOR_VERSION}.${UNIXDATE} - echo "---- Operator Version: ${OPERATOR_BUNDLE_VERSION}" } create_working_dir() { - echo "-- Create working directory" WORKING_DIR=${WORKING_DIR:-"/tmp/${OPERATOR_NAME}-bundle-${OPERATOR_BUNDLE_VERSION}"} mkdir -p "${WORKING_DIR}" - echo "---- Created working directory: ${WORKING_DIR}" } generate_dockerfile() { - echo "-- Generate Dockerfile for bundle" sed -E "s#<>#${OPERATOR_BUNDLE_VERSION}#g;s#<>#${BUNDLE_CHANNELS}#g;s#<>#${BUNDLE_DEFAULT_CHANNEL}#g" "${REL}/../${BUNDLE_PATH}/Dockerfile.in" > "${WORKING_DIR}/Dockerfile" - echo "---- Generated Dockerfile complete" } generate_bundle() { - echo "-- Generate bundle" REPLACE_REGEX="s#<>#${CREATED_DATE}#g;s#<>#${OPERATOR_IMAGE}#g;s#<>#${OPERATOR_TAG}#g;s#<>#${RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP}#g;s#<>#${RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP_TAG}#g;s#<>#${RELATED_IMAGE_OAUTH_PROXY}#g;s#<>#${RELATED_IMAGE_OAUTH_PROXY_TAG}#g;s#<>#${OPERATOR_BUNDLE_VERSION}#g;s#1.99.0#${OPERATOR_BUNDLE_VERSION}#g;s#<>#${OPERATOR_DOCUMENTATION_URL}#g;s#<>#${BUNDLE_OLM_SKIP_RANGE_LOWER_BOUND}#g" - pushd "${REL}/../" - ${OPERATOR_SDK} generate bundle --channels ${BUNDLE_CHANNELS} --default-channel ${BUNDLE_DEFAULT_CHANNEL} --manifests --metadata --version "${OPERATOR_BUNDLE_VERSION}" --output-dir "${WORKING_DIR}" - popd + pushd "${REL}/../" > /dev/null 2>&1 + ${OPERATOR_SDK} generate bundle --channels ${BUNDLE_CHANNELS} --default-channel ${BUNDLE_DEFAULT_CHANNEL} --manifests --metadata --version "${OPERATOR_BUNDLE_VERSION}" --output-dir "${WORKING_DIR}" > /dev/null 2>&1 + popd > /dev/null 2>&1 - echo "---- Replacing variables in generated manifest" sed -i -E "${REPLACE_REGEX}" "${WORKING_DIR}/manifests/${OPERATOR_NAME}.clusterserviceversion.yaml" - echo "---- Generated bundle complete at ${WORKING_DIR}/manifests/${OPERATOR_NAME}.clusterserviceversion.yaml" } 
copy_extra_metadata() { @@ -44,8 +35,7 @@ copy_extra_metadata() { # purposes) does, and newer versions of opm (as used in both downstream and # upstream index image builds) also understands these files. Just copy them # into the bundle directory during building. - echo "-- Copy extra metadata in" - pushd "${REL}/../" + pushd "${REL}/../" > /dev/null 2>&1 cp -r ./deploy/olm-catalog/service-telemetry-operator/tests/ "${WORKING_DIR}" cp ./deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml "${WORKING_DIR}/metadata/" } @@ -58,11 +48,14 @@ build_bundle_instructions() { # generate templates -echo "## Begin bundle creation" +#echo "## Begin bundle creation" generate_version create_working_dir generate_dockerfile generate_bundle copy_extra_metadata -build_bundle_instructions -echo "## End Bundle creation" +#build_bundle_instructions +#echo "## End Bundle creation" + +JSON_OUTPUT='{"operator_bundle_image":"%s","operator_bundle_version":"%s","operator_image":"%s","bundle_channels":"%s","bundle_default_channel":"%s","operator_tag":"%s","working_dir":"%s"}' +printf "$JSON_OUTPUT" "$OPERATOR_BUNDLE_IMAGE" "$OPERATOR_BUNDLE_VERSION" "$OPERATOR_IMAGE" "$BUNDLE_CHANNELS" "$BUNDLE_DEFAULT_CHANNEL" "$OPERATOR_TAG" "$WORKING_DIR" diff --git a/build/stf-run-ci/README.md b/build/stf-run-ci/README.md index 7a45897fc..a389d2c03 100644 --- a/build/stf-run-ci/README.md +++ b/build/stf-run-ci/README.md @@ -1,70 +1,67 @@ -stf-run-ci -========== +# stf-run-ci Run the Service Telemetry Framework CI system. This role is intended to be called from a playbook running locally on a preconfigured test system. Primarily this means a running CodeReady Container system has been provided. 
-Requirements ------------- +## Requirements - CodeReady Containers - Ansible 2.9 (tested) - `oc` command line tool -Variables ---------- +## Variables Not all variables are listed here, but these are the most common ones you might choose to override: -| Parameter name | Values | Default | Description | -| ------------------------------ | ------------ | --------- | ------------------------------------ | -| `__deploy_stf` | {true,false} | true | Whether to deploy an instance of STF | -| `__local_build_enabled` | {true,false} | true | Whether to deploy STF from local built artifacts. Also see `working_branch`, `sg_branch`, `sgo_branch` | -| `__deploy_from_bundles_enabled` | {true,false} | false | Whether to deploy STF from OLM bundles (TODO: compat with `__local_build_enabled`) | -| `__service_telemetry_bundle_image_path` | | | Image path to Service Telemetry Operator bundle | -| `__smart_gateway_bundle_image_path` | | | Image path to Smart Gateway Operator bundle | -| `prometheus_webhook_snmp_branch` | | master | Which Prometheus Webhook SNMP git branch to checkout | -| `sgo_branch` | | master | Which Smart Gateway Operator git branch to checkout | -| `sg_core_branch` | | master | Which Smart Gateway Core git branch to checkout | -| `sg_bridge_branch` | | master | Which Smart Gateway Bridge git branch to checkout | -| `prometheus_webhook_snmp_branch` | | master | Which Prometheus webhook snmp branch to checkout | -| `sgo_repository` | | https://github.com/infrawatch/smart-gateway-operator | Which Smart Gateway Operator git repository to clone | -| `sg_core_repository` | | https://github.com/infrawatch/sg-core | Which Smart Gateway Core git repository to clone | -| `sg_bridge_repository` | | https://github.com/infrawatch/sg-bridge | Which Smart Gateway Bridge git repository to clone | -| `prometheus_webhook_snmp_repository` | | https://github.com/infrawatch/prometheus-webhook-snmp | Which Prometheus webhook snmp git repository to clone | -| `loki_operator_repository` 
| | https://github.com/viaq/loki-operator | Which Loki-operator git repository to clone | -| `__service_telemetry_events_certificates_endpoint_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 2160h | Lifetime of the ElasticSearch endpoint certificate (minimum duration is 1h) | -| `__service_telemetry_events_certificates_ca_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the ElasticSearch CA certificate (minimum duration is 1h) | -| `__service_telemetry_events_enabled` | {true,false} | true | Whether to enable events support in ServiceTelemetry | -| `__service_telemetry_high_availability_enabled` | {true,false} | false | Whether to enable high availability support in ServiceTelemetry | -| `__service_telemetry_metrics_enabled` | {true,false} | true | Whether to enable metrics support in ServiceTelemetry | -| `__service_telemetry_storage_ephemeral_enabled` | {true,false} | false | Whether to enable ephemeral storage support in ServiceTelemetry | -| `__service_telemetry_storage_persistent_storage_class` | | | Set a custom storageClass to override the default provided by OpenShift platform | -| `__service_telemetry_snmptraps_enabled` | {true,false} | true | Whether to enable snmptraps delivery via Alertmanager receiver (prometheus-webhook-snmp) | -| `__service_telemetry_snmptraps_community` | | `public` | Set the SNMP community to send traps to. Defaults to public | -| `__service_telemetry_snmptraps_target` | | `192.168.24.254` | Set the SNMP target to send traps to. Defaults to 192.168.24.254 | -| `__service_telemetry_snmptraps_retries` | | 5 | Set the SNMP retry count for traps. Defaults to 5 | -| `__service_telemetry_snmptraps_port` | | 162 | Set the SNMP target port for traps. Defaults to 162 | -| `__service_telemetry_snmptraps_timeout` | | 1 | Set the SNMP retry timeout (in seconds). 
Defaults to 1 | -| `__service_telemetry_alert_oid_label` | | oid | The alert label name to look for oid value. Default to oid. | -| `__service_telemetry_trap_oid_prefix` | | 1.3.6.1.4.1.50495.15 | The OID prefix for trap variable bindings. | -| `__service_telemetry_trap_default_oid` | | 1.3.6.1.4.1.50495.15.1.2.1 | The trap OID if none is found in the Prometheus alert labels. | -| `__service_telemetry_trap_default_severity` | | | The trap severity if none is found in the Prometheus alert labels. | -| `__service_telemetry_logs_enabled` | {true,false} | false | Whether to enable logs support in ServiceTelemetry | -| `__service_telemetry_observability_strategy` | | `use_hybrid` | Which observability strategy to use for deployment. Default is 'use_hybrid'. Also supported are 'use_redhat', 'use_community', and 'none' | -| `__service_telemetry_transports_certificates_endpoint_cert_duration`| [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 2160h | Lifetime of the QDR endpoint certificate (minimum duration is 1h) | -| `__service_telemetry_transports_certificates_ca_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the QDR CA certificate (minimum duration is 1h) | -| `__internal_registry_path` | | image-registry.openshift-image-registry.svc:5000 | Path to internal registry for image path | -| `__deploy_loki_enabled` | {true,false} | false | Whether to deploy loki-operator and other systems for logging development purposes | -| `__golang_image_path` | | quay.io/infrawatch/golang:1.16 | Golang image path for building the loki-operator image | -| `__loki_image_path` | | quay.io/infrawatch/loki:2.2.1 | Loki image path for Loki microservices | - - - -Example Playbook ----------------- +| Parameter name | Values | Default | Description | +| ------------------------------ | ------------ | --------- | ------------------------------------ | +| `__deploy_stf` | {true,false} | true | Whether to deploy an instance of STF | 
+| `__local_build_enabled` | {true,false} | true | Whether to deploy STF from local built artifacts. Also see `working_branch`, `sg_branch`, `sgo_branch` | +| `__deploy_from_bundles_enabled` | {true,false} | false | Whether to deploy STF from OLM bundles (TODO: compat with `__local_build_enabled`) | +| `__deploy_from_index_enabled` | {true,false} | false | Whether to deploy STF from locally built bundles and index image. | +| `__service_telemetry_bundle_image_path` | | | Image path to Service Telemetry Operator bundle | +| `__smart_gateway_bundle_image_path` | | | Image path to Smart Gateway Operator bundle | +| `prometheus_webhook_snmp_branch` | | master | Which Prometheus Webhook SNMP git branch to checkout | +| `sgo_branch` | | master | Which Smart Gateway Operator git branch to checkout | +| `sg_core_branch` | | master | Which Smart Gateway Core git branch to checkout | +| `sg_bridge_branch` | | master | Which Smart Gateway Bridge git branch to checkout | +| `prometheus_webhook_snmp_branch` | | master | Which Prometheus webhook snmp branch to checkout | +| `sgo_repository` | | https://github.com/infrawatch/smart-gateway-operator | Which Smart Gateway Operator git repository to clone | +| `sg_core_repository` | | https://github.com/infrawatch/sg-core | Which Smart Gateway Core git repository to clone | +| `sg_bridge_repository` | | https://github.com/infrawatch/sg-bridge | Which Smart Gateway Bridge git repository to clone | +| `prometheus_webhook_snmp_repository` | | https://github.com/infrawatch/prometheus-webhook-snmp | Which Prometheus webhook snmp git repository to clone | +| `loki_operator_repository` | | https://github.com/viaq/loki-operator | Which Loki-operator git repository to clone | +| `__service_telemetry_events_certificates_endpoint_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 2160h | Lifetime of the ElasticSearch endpoint certificate (minimum duration is 1h) | +| 
`__service_telemetry_events_certificates_ca_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the ElasticSearch CA certificate (minimum duration is 1h) | +| `__service_telemetry_events_enabled` | {true,false} | true | Whether to enable events support in ServiceTelemetry | +| `__service_telemetry_high_availability_enabled` | {true,false} | false | Whether to enable high availability support in ServiceTelemetry | +| `__service_telemetry_metrics_enabled` | {true,false} | true | Whether to enable metrics support in ServiceTelemetry | +| `__service_telemetry_storage_ephemeral_enabled` | {true,false} | false | Whether to enable ephemeral storage support in ServiceTelemetry | +| `__service_telemetry_storage_persistent_storage_class` | | | Set a custom storageClass to override the default provided by OpenShift platform | +| `__service_telemetry_snmptraps_enabled` | {true,false} | true | Whether to enable snmptraps delivery via Alertmanager receiver (prometheus-webhook-snmp) | +| `__service_telemetry_snmptraps_community` | | `public` | Set the SNMP community to send traps to. Defaults to public | +| `__service_telemetry_snmptraps_target` | | `192.168.24.254` | Set the SNMP target to send traps to. Defaults to 192.168.24.254 | +| `__service_telemetry_snmptraps_retries` | | 5 | Set the SNMP retry count for traps. Defaults to 5 | +| `__service_telemetry_snmptraps_port` | | 162 | Set the SNMP target port for traps. Defaults to 162 | +| `__service_telemetry_snmptraps_timeout` | | 1 | Set the SNMP retry timeout (in seconds). Defaults to 1 | +| `__service_telemetry_alert_oid_label` | | oid | The alert label name to look for oid value. Default to oid. | +| `__service_telemetry_trap_oid_prefix` | | 1.3.6.1.4.1.50495.15 | The OID prefix for trap variable bindings. | +| `__service_telemetry_trap_default_oid` | | 1.3.6.1.4.1.50495.15.1.2.1 | The trap OID if none is found in the Prometheus alert labels. 
| +| `__service_telemetry_trap_default_severity` | | | The trap severity if none is found in the Prometheus alert labels. | +| `__service_telemetry_logs_enabled` | {true,false} | false | Whether to enable logs support in ServiceTelemetry | +| `__service_telemetry_observability_strategy` | | `use_hybrid` | Which observability strategy to use for deployment. Default is 'use_hybrid'. Also supported are 'use_redhat', 'use_community', and 'none' | +| `__service_telemetry_transports_certificates_endpoint_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 2160h | Lifetime of the QDR endpoint certificate (minimum duration is 1h) | +| `__service_telemetry_transports_certificates_ca_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the QDR CA certificate (minimum duration is 1h) | +| `__internal_registry_path` | | image-registry.openshift-image-registry.svc:5000 | Path to internal registry for image path | +| `__deploy_loki_enabled` | {true,false} | false | Whether to deploy loki-operator and other systems for logging development purposes | +| `__golang_image_path` | | quay.io/infrawatch/golang:1.16 | Golang image path for building the loki-operator image | +| `__loki_image_path` | | quay.io/infrawatch/loki:2.2.1 | Loki image path for Loki microservices | + + + +# Example Playbook ```yaml --- @@ -77,32 +74,39 @@ Example Playbook name: stf-run-ci ``` -Usage ------ +# Usage You can deploy Service Telemetry Framework using this role in a few configuration methods: * local build artifacts from Git repository cloned locally +* local build artifacts, local bundle artifacts, and Subscription via OLM using locally built index image * standard deployment using Subscription and OLM * supporting components but no instance of Service Telemetry Operator +## Basic deployment + You can deploy using the sample `run-ci.yaml` from the _Example Playbook_ section: -``` +```sh ansible-playbook run-ci.yaml ``` +## 
Standard deployment with existing artifacts + If you want to do a standard deployment (existing remote artifacts) you can use the following command: -``` +```sh ansible-playbook --extra-vars __local_build_enabled=false run-ci.yaml ``` +## Deployment with pre-built bundles + You can deploy directly from pre-built bundles like this: -``` + +```sh ansible-playbook -e __local_build_enabled=false -e __deploy_from_bundles_enabled=true \ -e __service_telemetry_bundle_image_path=//stf-service-telemetry-operator-bundle: \ -e __smart_gateway_bundle_image_path=//stf-smart-gateway-operator-bundle: \ @@ -117,12 +121,18 @@ the registry already in place in the build directory, if required. If this is not required, add `--skip-tags bundle_registry_tls_ca`. If no login is required to your bundle image registry, add `--skip-tags bundle_registry_auth` -License ------- +## Deployment from local artifacts, bundles, and index + +You can perform a deployment using OLM and a Subscription from locally built artifacts, bundles, and index image like this: + +```sh +ansible-playbook -e __local_build_enabled=true -e __deploy_from_index_enabled=true run-ci.yaml +``` + +# License Apache v2.0 -Author Information ------------------ +# Author Information -Leif Madsen +Red Hat (CloudOps DFG) diff --git a/build/stf-run-ci/defaults/main.yml b/build/stf-run-ci/defaults/main.yml index 390f9cd0b..341dc07c4 100644 --- a/build/stf-run-ci/defaults/main.yml +++ b/build/stf-run-ci/defaults/main.yml @@ -7,6 +7,7 @@ list_of_stf_objects: __local_build_enabled: true __deploy_from_bundles_enabled: false +__deploy_from_index_enabled: false __deploy_stf: true __service_telemetry_events_certificates_endpoint_cert_duration: 70080h @@ -33,11 +34,17 @@ __internal_registry_path: image-registry.openshift-image-registry.svc:5000 __service_telemetry_bundle_image_path: __smart_gateway_bundle_image_path: +default_operator_registry_image_base: registry.redhat.io/openshift4/ose-operator-registry 
+default_operator_registry_image_tag: v4.12 + sgo_image_tag: latest sto_image_tag: latest sg_core_image_tag: latest sg_bridge_image_tag: latest prometheus_webhook_snmp_image_tag: latest +sgo_bundle_image_tag: latest +sto_bundle_image_tag: latest +stf_index_image_tag: latest new_operator_sdk_version: v1.11.0 namespace: service-telemetry pull_secret_registry: @@ -58,4 +65,4 @@ prometheus_webhook_snmp_repository: https://github.com/infrawatch/prometheus-web base_dir: '' -cert_manager_channel: stable-v1 \ No newline at end of file +cert_manager_channel: stable-v1 diff --git a/build/stf-run-ci/tasks/create_builds.yml b/build/stf-run-ci/tasks/create_builds.yml index 3dadb98b8..0282f3d56 100644 --- a/build/stf-run-ci/tasks/create_builds.yml +++ b/build/stf-run-ci/tasks/create_builds.yml @@ -1,47 +1,51 @@ --- -- name: Create BuildConfig and ImageStream - shell: oc new-build -n "{{ namespace }}" --name {{ artifact.name }} --dockerfile - < {{ artifact.working_build_dir }}/{{ artifact.dockerfile_path }} +- name: Get current BuildConfig for artifact to check if it exists + k8s_info: + api_version: build.openshift.io/v1 + kind: BuildConfig + namespace: "{{ namespace }}" + name: "{{ artifact.name }}" + register: build_config_lookup -- name: Kill first build since it will always fail (triggered on BuildConfig creation) - shell: sleep 10 ; oc delete build {{ artifact.name }}-1 -n "{{ namespace }}" +- name: Get current Builds for artifact to check if it exists + k8s_info: + api_version: build.openshift.io/v1 + kind: Build + namespace: "{{ namespace }}" + label_selectors: + - "build={{ artifact.name }}" + register: build_lookup -- name: Kick off build +- when: build_config_lookup.resources | length == 0 block: - - name: Start local image build - command: oc start-build {{ artifact.name }} -n "{{ namespace }}" --wait --from-dir "{{ artifact.working_build_dir }}" - register: build_name - always: - - name: Describe local image build (results) - command: oc describe build {{ 
artifact.name }} -n "{{ namespace }}" - register: build_describe + - name: Create BuildConfig and ImageStream + shell: oc new-build -n "{{ namespace }}" --name {{ artifact.name }} --dockerfile - < {{ artifact.working_build_dir }}/{{ artifact.dockerfile_path }} - - debug: - var: build_describe.stdout_lines + - name: Kill first build since it will always fail (triggered on BuildConfig creation) + shell: sleep 10 ; oc delete build {{ artifact.name }}-1 -n "{{ namespace }}" -- debug: - var: build_name +- name: Start local image build + command: oc start-build {{ artifact.name }} -n "{{ namespace }}" --wait --from-dir "{{ artifact.working_build_dir }}" + register: build_results + when: build_lookup.resources | length == 0 + +- name: Get latest build information for artifact + command: oc get build --selector build={{ artifact.name }} -n "{{ namespace }}" -ojsonpath='{.items[-1:]}' + register: build_describe_results -- name: Set current build name +- name: Set build_describe from json results set_fact: - this_build_name: "{{ build_name['stdout'].split(' ')[0].split('/')[1] }}" + build_describe: "{{ build_describe_results.stdout | from_json }}" - debug: - var: this_build_name - -- name: Get artifact path - k8s_info: - api_version: build.openshift.io/v1 - kind: Build - name: "{{ this_build_name }}" - namespace: "{{ namespace }}" - register: image_reference + var: build_describe - debug: - var: image_reference.resources[0].status.outputDockerImageReference + var: build_describe.status.outputDockerImageReference - name: Set unique image reference for this artifact set_fact: - "{{ artifact.image_reference_name }}": "{{ image_reference.resources[0].status.outputDockerImageReference }}" + "{{ artifact.image_reference_name }}": "{{ build_describe.status.outputDockerImageReference }}" - debug: var: "{{ artifact.image_reference_name }}" diff --git a/build/stf-run-ci/tasks/create_catalog.yml b/build/stf-run-ci/tasks/create_catalog.yml new file mode 100644 index 000000000..2134be017 
--- /dev/null +++ b/build/stf-run-ci/tasks/create_catalog.yml @@ -0,0 +1,145 @@ +--- +- name: Create service-telemetry-framework-index working directory + file: + path: working/service-telemetry-framework-index + state: directory + mode: '0755' + +- name: Create info variables from bundle generation output + set_fact: + sto_bundle_info: "{{ generate_bundle_sto.stdout }}" + sgo_bundle_info: "{{ generate_bundle_sgo.stdout }}" + +- name: Get the builder-dockercfg Secret name + command: oc get secret -n {{ namespace }} --field-selector='type==kubernetes.io/dockercfg' -ojsonpath='{.items[?(@.metadata.annotations.kubernetes\.io/service-account\.name=="builder")].metadata.name}' + register: secret_builder_dockercfg_name + +- name: Get contents of builder Secret + k8s_info: + api_version: v1 + kind: Secret + name: "{{ secret_builder_dockercfg_name.stdout }}" + namespace: "{{ namespace }}" + register: secret_builder_dockercfg_results + +- name: Get builder-dockercfg authentication contents + set_fact: + builder_dockercfg_auth_results: "{{ secret_builder_dockercfg_results.resources[0].data['.dockercfg'] | b64decode }}" + +- name: Set internal registry authentication + set_fact: + internal_registry: "{{ builder_dockercfg_auth_results['image-registry.openshift-image-registry.svc:5000'] | to_json }}" + +- when: query('kubernetes.core.k8s', api_version='v1', kind='Secret', resource_name='service-telemetry-framework-index-dockercfg', namespace=namespace) | length == 0 + block: + - name: Create config.json to import as Secret + template: + variable_start_string: "<<" + variable_end_string: ">>" + src: config-json.j2 + dest: working/service-telemetry-framework-index/config.json + + - name: Create a Secret for the dockercfg + command: oc create secret generic -n {{ namespace }} service-telemetry-framework-index-dockercfg --from-file=.dockerconfigjson=working/service-telemetry-framework-index/config.json --type=kubernetes.io/dockerconfigjson + +- name: Create ImageStream for 
ose-operator-registry + command: oc import-image -n {{ namespace }} ose-operator-registry:{{ default_operator_registry_image_tag }} --from={{ default_operator_registry_image_base }}:{{ default_operator_registry_image_tag }} --confirm + when: query('kubernetes.core.k8s', api_version='v1', kind='ImageStream', resource_name='ose-operator-registry', namespace=namespace) | length == 0 + +- name: Create ImageStream for service-telemetry-framework-index + command: oc create imagestream -n {{ namespace }} service-telemetry-framework-index + when: query('kubernetes.core.k8s', api_version='v1', kind='ImageStream', resource_name='service-telemetry-framework-index', namespace=namespace) | length == 0 + +- name: Create BuildConfig for service-telemetry-framework-index + k8s: + definition: + apiVersion: build.openshift.io/v1 + kind: BuildConfig + metadata: + annotations: + openshift.io/generated-by: stf-run-ci + labels: + build: service-telemetry-framework-index + name: service-telemetry-framework-index + namespace: "{{ namespace }}" + spec: + failedBuildsHistoryLimit: 5 + nodeSelector: null + output: + to: + kind: ImageStreamTag + name: service-telemetry-framework-index:latest + postCommit: {} + resources: {} + runPolicy: Serial + source: + dockerfile: | + # The base image is expected to contain + # /bin/opm (with a serve subcommand) and /bin/grpc_health_probe + FROM registry.redhat.io/openshift4/ose-operator-registry:v4.12 + + COPY --chmod=666 index.yaml /configs/ + + RUN mkdir /tmp/auth/ + # we need the contents of the mounted build volume from secret placed into config.json + RUN cp /opt/app-root/auth/.dockerconfigjson /tmp/auth/config.json + RUN DOCKER_CONFIG=/tmp/auth /bin/opm --skip-tls-verify render {{ sto_bundle_image_path }} {{ sgo_bundle_image_path }} --output=yaml >> /configs/index.yaml + + ENTRYPOINT ["/bin/opm"] + CMD ["serve", "/configs"] + # Set DC-specific label for the location of the DC root directory + # in the image + LABEL 
operators.operatorframework.io.index.configs.v1=/configs + type: Dockerfile + strategy: + dockerStrategy: + from: + kind: ImageStreamTag + name: ose-operator-registry:v4.12 + volumes: + - mounts: + - destinationPath: /opt/app-root/auth + name: pull-secret + source: + secret: + defaultMode: 420 + secretName: service-telemetry-framework-index-dockercfg + type: Secret + type: Docker + successfulBuildsHistoryLimit: 5 + +- name: Get builds of service-telemetry-framework-index + k8s_info: + api_version: build.openshift.io/v1 + kind: Build + namespace: "{{ namespace }}" + label_selectors: + - "build=service-telemetry-framework-index" + register: index_builds + +- when: index_builds.resources | length == 0 + block: + - name: Create index.yaml base for index image + template: + src: index-yaml.j2 + dest: working/service-telemetry-framework-index/index.yaml + + - name: Build service-telemetry-framework-index + command: oc start-build -n "{{ namespace }}" service-telemetry-framework-index --wait --from-dir working/service-telemetry-framework-index + +- name: Create CloudOps CatalogSource + k8s: + definition: + apiVersion: operators.coreos.com/v1alpha1 + kind: CatalogSource + metadata: + name: service-telemetry-framework-operators + namespace: "{{ namespace }}" + spec: + displayName: CloudOps Operators + image: "{{ stf_index_image_path }}" + publisher: CloudOps + sourceType: grpc + updateStrategy: + registryPoll: + interval: 1m diff --git a/build/stf-run-ci/tasks/main.yml b/build/stf-run-ci/tasks/main.yml index 3041d22ea..0f948c0d5 100644 --- a/build/stf-run-ci/tasks/main.yml +++ b/build/stf-run-ci/tasks/main.yml @@ -1,5 +1,7 @@ --- # tasks file for stf-run-ci + +# -- initial setup - name: Setup default values set_fact: branch: "{{ working_branch | default('master') }}" @@ -13,22 +15,45 @@ sg_bridge_image_path: "{{ __internal_registry_path }}/{{ namespace }}/sg-bridge:{{ sg_bridge_image_tag }}" prometheus_webhook_snmp_image_path: "{{ __internal_registry_path }}/{{ namespace 
}}/prometheus-webhook-snmp:{{ prometheus_webhook_snmp_image_tag }}" +- name: Set default image paths for bundle and index builds + set_fact: + sgo_bundle_image_path: "{{ __internal_registry_path }}/{{ namespace }}/smart-gateway-operator-bundle:{{ sgo_bundle_image_tag }}" + sto_bundle_image_path: "{{ __internal_registry_path }}/{{ namespace }}/service-telemetry-operator-bundle:{{ sto_bundle_image_tag }}" + stf_index_image_path: "{{ __internal_registry_path }}/{{ namespace }}/service-telemetry-framework-index:{{ stf_index_image_tag }}" + - name: Fail on mutually exclusive flags fail: msg: __deploy_from_bundles_enabled not currently supported with __local_build_enabled (but should be) when: __local_build_enabled | bool and __deploy_from_bundles_enabled | bool +- name: Fail when deploying from index image and local build disabled + fail: + msg: __deploy_from_index_enabled must also have __local_build_enabled + when: __deploy_from_index_enabled | bool and not __local_build_enabled | bool + +- name: Fail when deploying from index images and deployment from bundles also requested (mutually exclusive methods) + fail: + msg: __deploy_from_index_enabled can not be used with __deploy_from_bundles_enabled + when: __deploy_from_index_enabled | bool and __deploy_from_bundles_enabled | bool + - name: Get the list of nodes k8s_info: kind: Node register: node_info +- name: Get OCP version + shell: oc version -o yaml | grep openshiftVersion | awk '{print $2}' + register: ocp_ver + - name: Find out if we are using crc by looking at the node hostnames set_fact: is_crc: "{{ True if 'crc' in node_info.resources[0].metadata.labels[\"kubernetes.io/hostname\"] else False }}" +# -- prepare environment and cleanup - name: Clean up any existing global artifacts include_tasks: pre-clean.yml + tags: + - pre-clean - name: Setup supporting Operator subscriptions include_tasks: setup_base.yml @@ -41,10 +66,13 @@ when: base_dir | length == 0 - name: Get new operator sdk - when: 
__local_build_enabled | bool or __deploy_from_bundles_enabled | bool + when: __local_build_enabled | bool or __deploy_from_bundles_enabled | bool or __deploy_from_index_enabled | bool command: "{{ base_dir }}/get_new_operator_sdk.sh {{ new_operator_sdk_version }}" +# -- create artifacts - when: __local_build_enabled | bool + tags: + - create_builds block: - name: Setup supporting repositories include_tasks: clone_repos.yml @@ -76,7 +104,33 @@ tags: - deploy -- block: +- when: __deploy_from_index_enabled | bool + tags: + - create_bundles + block: + - name: Create base build list + set_fact: + bundle_build_list: + - { name: service-telemetry-operator-bundle, dockerfile_path: Dockerfile, image_reference_name: sto_bundle_image_path, working_build_dir: ./working/service-telemetry-operator-bundle } + - { name: smart-gateway-operator-bundle, dockerfile_path: Dockerfile, image_reference_name: sgo_bundle_image_path, working_build_dir: ./working/smart-gateway-operator-bundle } + + - debug: + var: bundle_build_list + + - name: Create bundle builds and artifacts + include_tasks: create_builds.yml + loop: "{{ bundle_build_list }}" + loop_control: + loop_var: artifact + tags: + - build + + - name: Create file-based catalog + include_tasks: create_catalog.yml + +# -- deploy +- when: not __local_build_enabled | bool + block: - name: Setup Service Telemetry Framework from supplied bundle URLs include_tasks: setup_stf_from_bundles.yml when: __deploy_from_bundles_enabled | bool @@ -85,12 +139,31 @@ include_tasks: setup_stf.yml when: not __deploy_from_bundles_enabled | bool - when: not __local_build_enabled | bool - +- when: __deploy_from_index_enabled | bool + name: Subscribe to locally built Service Telemetry Operator + k8s: + definition: + apiVersion: operators.coreos.com/v1alpha1 + kind: Subscription + metadata: + labels: + operators.coreos.com/service-telemetry-operator.service-telemetry: "" + name: service-telemetry-operator + namespace: "{{ namespace }}" + spec: + channel: 
unstable + installPlanApproval: Automatic + name: service-telemetry-operator + source: service-telemetry-framework-operators + sourceNamespace: "{{ namespace }}" + +# -- check if we're ready to instantiate - name: Pre-flight checks include_tasks: preflight_checks.yml -- block: +# -- create a ServiceTelemetry object to stand up the STF instance +- when: __deploy_stf | bool + block: - name: Deploy an instance of STF include_tasks: deploy_stf.yml @@ -103,5 +176,3 @@ - debug: var: validate_deployment.stdout_lines - - when: __deploy_stf | bool diff --git a/build/stf-run-ci/tasks/pre-clean.yml b/build/stf-run-ci/tasks/pre-clean.yml index af096f98b..5ed639a97 100644 --- a/build/stf-run-ci/tasks/pre-clean.yml +++ b/build/stf-run-ci/tasks/pre-clean.yml @@ -48,6 +48,47 @@ displayName: OperatorHub.io Operators publisher: OperatorHub.io +- name: Remove CloudOps CatalogSource if it is installed + k8s: + state: absent + definition: + apiVersion: operators.coreos.com/v1alpha1 + kind: CatalogSource + metadata: + name: service-telemetry-framework-operators + namespace: "{{ namespace }}" + spec: + displayName: CloudOps Operators + publisher: CloudOps + sourceType: grpc + +- name: Remove Service Telemetry Operator bundle build + k8s: + state: absent + api_version: build.openshift.io/v1 + kind: Build + namespace: "{{ namespace }}" + label_selectors: + - "build=service-telemetry-operator-bundle" + +- name: Remove Smart Gateway Operator bundle build + k8s: + state: absent + api_version: build.openshift.io/v1 + kind: Build + namespace: "{{ namespace }}" + label_selectors: + - "build=smart-gateway-operator-bundle" + +- name: Remove Service Telemetry Framework index build + k8s: + state: absent + api_version: build.openshift.io/v1 + kind: Build + namespace: "{{ namespace }}" + label_selectors: + - "build=service-telemetry-framework-index" + - name: Remove service-telemetry-operator-bundle CatalogSource (bundle deploy) k8s: state: absent diff --git a/build/stf-run-ci/tasks/setup_base.yml 
b/build/stf-run-ci/tasks/setup_base.yml index 3f5c790ee..4b8f41160 100644 --- a/build/stf-run-ci/tasks/setup_base.yml +++ b/build/stf-run-ci/tasks/setup_base.yml @@ -28,51 +28,80 @@ targetNamespaces: - "{{ namespace }}" -- block: - - name: Create openshift-cert-manager-operator namespace - k8s: - definition: - apiVersion: project.openshift.io/v1 - kind: Project - metadata: - name: openshift-cert-manager-operator - spec: - finalizers: - - kubernetes +- when: not __deploy_from_index_enabled | bool + block: + - name: Create openshift-cert-manager-operator namespace + k8s: + definition: + apiVersion: project.openshift.io/v1 + kind: Project + metadata: + name: openshift-cert-manager-operator + spec: + finalizers: + - kubernetes - - name: Create openshift-cert-manager-operator OperatorGroup - k8s: - definition: - apiVersion: operators.coreos.com/v1 - kind: OperatorGroup - metadata: - name: openshift-cert-manager-operator - namespace: openshift-cert-manager-operator - spec: {} + - name: Create openshift-cert-manager-operator OperatorGroup + k8s: + definition: + apiVersion: operators.coreos.com/v1 + kind: OperatorGroup + metadata: + name: openshift-cert-manager-operator + namespace: openshift-cert-manager-operator + spec: {} - - name: Get OCP version - shell: oc version -o yaml | grep openshiftVersion | awk '{print $2}' - register: ocp_ver + - name: Use tech-preview channel for cert_manager in older OCP versions + set_fact: + cert_manager_channel: tech-preview + when: ocp_ver.stdout is version('4.12', '<') - - name: Use tech-preview channel for cert_manager in older OCP versions - set_fact: - cert_manager_channel: tech-preview - when: ocp_ver.stdout is version('4.12', '<') + - name: Subscribe to Cert Manager for OpenShift Operator + k8s: + definition: + apiVersion: operators.coreos.com/v1alpha1 + kind: Subscription + metadata: + name: openshift-cert-manager-operator + namespace: openshift-cert-manager-operator + spec: + channel: "{{ cert_manager_channel }}" + 
installPlanApproval: Automatic + name: openshift-cert-manager-operator + source: redhat-operators + sourceNamespace: openshift-marketplace - - name: Subscribe to Cert Manager for OpenShift Operator - k8s: - definition: - apiVersion: operators.coreos.com/v1alpha1 - kind: Subscription - metadata: - name: openshift-cert-manager-operator - namespace: openshift-cert-manager-operator - spec: - channel: "{{ cert_manager_channel }}" - installPlanApproval: Automatic - name: openshift-cert-manager-operator - source: redhat-operators - sourceNamespace: openshift-marketplace + - name: Subscribe to AMQ Interconnect Operator + k8s: + definition: + apiVersion: operators.coreos.com/v1alpha1 + kind: Subscription + metadata: + name: amq7-interconnect-operator + namespace: "{{ namespace }}" + spec: + channel: 1.10.x + installPlanApproval: Automatic + name: amq7-interconnect-operator + source: redhat-operators + sourceNamespace: openshift-marketplace + + - name: Subscribe to Prometheus Operator + k8s: + definition: + apiVersion: operators.coreos.com/v1alpha1 + kind: Subscription + metadata: + name: prometheus + namespace: "{{ namespace }}" + spec: + channel: beta + installPlanApproval: Automatic + name: prometheus + source: community-operators + sourceNamespace: openshift-marketplace + when: + - __service_telemetry_observability_strategy == "use_community" - name: Subscribe to Elastic Cloud on Kubernetes Operator k8s: @@ -91,38 +120,6 @@ when: - __service_telemetry_observability_strategy in ['use_community', 'use_hybrid'] -- name: Subscribe to AMQ Interconnect Operator - k8s: - definition: - apiVersion: operators.coreos.com/v1alpha1 - kind: Subscription - metadata: - name: amq7-interconnect-operator - namespace: "{{ namespace }}" - spec: - channel: 1.10.x - installPlanApproval: Automatic - name: amq7-interconnect-operator - source: redhat-operators - sourceNamespace: openshift-marketplace - -- name: Subscribe to Prometheus Operator - k8s: - definition: - apiVersion: 
operators.coreos.com/v1alpha1 - kind: Subscription - metadata: - name: prometheus - namespace: "{{ namespace }}" - spec: - channel: beta - installPlanApproval: Automatic - name: prometheus - source: community-operators - sourceNamespace: openshift-marketplace - when: - - __service_telemetry_observability_strategy == "use_community" - - block: # Upstream Source + Sub from https://github.com/rhobs/observability-operator/tree/main/hack/olm - name: Create CatalogSource for Red Hat Observability Operator diff --git a/build/stf-run-ci/tasks/setup_stf_local_build.yml b/build/stf-run-ci/tasks/setup_stf_local_build.yml index a7c3c2578..32fe68c9d 100644 --- a/build/stf-run-ci/tasks/setup_stf_local_build.yml +++ b/build/stf-run-ci/tasks/setup_stf_local_build.yml @@ -1,5 +1,7 @@ --- -# NOTE: the split filter cuts the image path (quay.io:443/infrawatch/container_image:tag_name) on the colon. Field :-1 (everything but the final field) is the image path, field -1 (final field) is the image tag +# WARNING: generation of bundles is not idempotent from the point of being able +# to use the generate_bundle_ content for use in other places + # --- Smart Gateway Operator --- - name: Generate Smart Gateway Operator CSV shell: @@ -15,9 +17,9 @@ ./generate_bundle.sh register: generate_bundle_sgo -- name: Results of bundle generation +- name: Results of SGO bundle generation debug: - var: generate_bundle_sgo.stdout_lines + var: generate_bundle_sgo.stdout - name: Replace namespace in SGO role binding replace: @@ -25,22 +27,24 @@ regexp: 'placeholder' replace: '{{ namespace }}' -- name: Load Smart Gateway Operator RBAC - command: oc apply -f working/smart-gateway-operator/deploy/{{ item }} -n "{{ namespace }}" - loop: - - service_account.yaml - - role.yaml - - role_binding.yaml - - olm-catalog/smart-gateway-operator/manifests/smartgateway.infra.watch_smartgateways_crd.yaml - - name: Replace namespace in SGO CSV replace: path: "{{ base_dir 
}}/working/smart-gateway-operator-bundle/manifests/smart-gateway-operator.clusterserviceversion.yaml" regexp: 'placeholder' replace: '{{ namespace }}' -- name: Load Smart Gateway Operator CSV - shell: oc apply -f working/smart-gateway-operator-bundle/manifests/smart-gateway-operator.clusterserviceversion.yaml -n "{{ namespace }}" +- when: not __deploy_from_index_enabled | bool + block: + - name: Load Smart Gateway Operator RBAC + command: oc apply -f working/smart-gateway-operator/deploy/{{ item }} -n "{{ namespace }}" + loop: + - service_account.yaml + - role.yaml + - role_binding.yaml + - olm-catalog/smart-gateway-operator/manifests/smartgateway.infra.watch_smartgateways_crd.yaml + + - name: Load Smart Gateway Operator CSV + shell: oc apply -f working/smart-gateway-operator-bundle/manifests/smart-gateway-operator.clusterserviceversion.yaml -n "{{ namespace }}" # --- Service Telemetry Operator --- - name: Generate Service Telemetry Operator CSV @@ -53,6 +57,11 @@ OPERATOR_IMAGE={{ sto_image_path | parse_image | quote }} \ OPERATOR_TAG={{ sto_image_path | parse_tag | quote }} \ ./generate_bundle.sh + register: generate_bundle_sto + +- name: Results of STO bundle generation + debug: + var: generate_bundle_sto.stdout - name: Replace namespace in STO role binding replace: @@ -60,7 +69,14 @@ regexp: 'placeholder' replace: '{{ namespace }}' -- block: +- name: Replace namespace in STO CSV + replace: + path: "{{ base_dir }}/working/service-telemetry-operator-bundle/manifests/service-telemetry-operator.clusterserviceversion.yaml" + regexp: 'placeholder' + replace: '{{ namespace }}' + +- when: not __deploy_from_index_enabled | bool + block: - name: Load Service Telemetry Operator RBAC command: oc apply -f ../deploy/{{ item }} -n "{{ namespace }}" loop: @@ -69,15 +85,9 @@ - role_binding.yaml - olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml - - name: Revert local change to role_binding.yaml - shell: git checkout -- "{{ base_dir 
}}/../deploy/role_binding.yaml" - -- name: Replace namespace in STO CSV - replace: - path: "{{ base_dir }}/working/service-telemetry-operator-bundle/manifests/service-telemetry-operator.clusterserviceversion.yaml" - regexp: 'placeholder' - replace: '{{ namespace }}' - -- name: Load Service Telemetry Operator CSV - shell: oc apply -f working/service-telemetry-operator-bundle/manifests/service-telemetry-operator.clusterserviceversion.yaml -n "{{ namespace }}" + - name: Load Service Telemetry Operator CSV + shell: oc apply -f working/service-telemetry-operator-bundle/manifests/service-telemetry-operator.clusterserviceversion.yaml -n "{{ namespace }}" +# cleanup +- name: Revert local change to role_binding.yaml + shell: git checkout -- "{{ base_dir }}/../deploy/role_binding.yaml" diff --git a/build/stf-run-ci/templates/config-json.j2 b/build/stf-run-ci/templates/config-json.j2 new file mode 100644 index 000000000..136015e9b --- /dev/null +++ b/build/stf-run-ci/templates/config-json.j2 @@ -0,0 +1 @@ +{"auths":{"image-registry.openshift-image-registry.svc:5000":<< internal_registry >>}} diff --git a/build/stf-run-ci/templates/index-yaml.j2 b/build/stf-run-ci/templates/index-yaml.j2 new file mode 100644 index 000000000..54731b8f9 --- /dev/null +++ b/build/stf-run-ci/templates/index-yaml.j2 @@ -0,0 +1,20 @@ +--- +defaultChannel: {{ sto_bundle_info.bundle_default_channel }} +name: service-telemetry-operator +schema: olm.package +--- +schema: olm.channel +package: service-telemetry-operator +name: {{ sto_bundle_info.bundle_channels }} +entries: + - name: service-telemetry-operator.v{{ sto_bundle_info.operator_bundle_version }} +--- +defaultChannel: {{ sgo_bundle_info.bundle_default_channel }} +name: smart-gateway-operator +schema: olm.package +--- +schema: olm.channel +package: smart-gateway-operator +name: {{ sgo_bundle_info.bundle_channels }} +entries: + - name: smart-gateway-operator.v{{ sgo_bundle_info.operator_bundle_version }} diff --git 
a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml index be777dfec..33f281230 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml @@ -218,12 +218,6 @@ spec: name: servicemonitors.monitoring.coreos.com version: v1 version: v1beta1 - required: - - description: Creation of Smart Gateways - displayName: Smart Gateway - kind: SmartGateway - name: smartgateways.smartgateway.infra.watch - version: v2 description: Service Telemetry Operator for monitoring clouds displayName: Service Telemetry Operator icon: diff --git a/deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml b/deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml index 8edfa0da9..2905d1189 100644 --- a/deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml @@ -1,3 +1,41 @@ properties: - type: olm.maxOpenShiftVersion value: "4.12" + - type: olm.constraint + value: + failureMessage: Require Smart Gateway for Service Telemetry Framework + all: + constraints: + - failureMessage: Package smart-gateway-operator is needed for Service Telemetry Framework + package: + packageName: smart-gateway-operator + versionRange: '>=5.0.0' + - type: olm.constraint + value: + failureMessage: Require data transport for Service Telemetry Framework + all: + constraints: + - failureMessage: Package amq7-interconnect-operator is needed for data transport with STF + package: + packageName: amq7-interconnect-operator + versionRange: '>=1.10.0' + - type: olm.constraint + value: + failureMessage: Require certificate management for Service Telemetry Framework + 
all: + constraints: + - failureMessage: Package openshift-cert-manager-operator is needed for AMQ Interconnect setup + package: + packageName: openshift-cert-manager-operator + versionRange: '>=1.10.0' + - type: olm.constraint + value: + failureMessage: Require Prometheus backend for data storage of metrics for Service Telemetry Framework + any: + constraints: + - package: + packageName: prometheus + versionRange: '>=0.56.0' + - package: + packageName: observability-operator + versionRange: '>=0.0.1' From 9eacaf4000b9bf20d21f87ebe48e3ba7613eb2ea Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Tue, 13 Jun 2023 12:07:35 -0400 Subject: [PATCH 32/95] Use the proper parameter for SNMP timeout (#435) The SNMP_TIMEOUT environment variable in the manifest_snmp_traps.j2 template for Service Telemetry Operator uses the wrong parameter, passing port instead of timeout value to the template. This change updates the template to use the appropriate parameter as input. Closes: rhbz#2213016 --- roles/servicetelemetry/templates/manifest_snmp_traps.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/roles/servicetelemetry/templates/manifest_snmp_traps.j2 b/roles/servicetelemetry/templates/manifest_snmp_traps.j2 index b4a48445b..cee12c01b 100644 --- a/roles/servicetelemetry/templates/manifest_snmp_traps.j2 +++ b/roles/servicetelemetry/templates/manifest_snmp_traps.j2 @@ -28,7 +28,7 @@ spec: - name: SNMP_PORT value: "{{ servicetelemetry_vars.alerting.alertmanager.receivers.snmp_traps.port }}" - name: SNMP_TIMEOUT - value: "{{ servicetelemetry_vars.alerting.alertmanager.receivers.snmp_traps.port }}" + value: "{{ servicetelemetry_vars.alerting.alertmanager.receivers.snmp_traps.timeout }}" - name: ALERT_OID_LABEL value: "{{ servicetelemetry_vars.alerting.alertmanager.receivers.snmp_traps.alert_oid_label }}" - name: TRAP_OID_PREFIX From 5678f649469a5ba79b9faa829f9a03c47573fcbc Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Fri, 7 Jul 2023 10:41:32 -0400 Subject: 
[PATCH 33/95] Support External Elasticsearch (#439) * Support External Elasticsearch * Deprecate existing STF-deployed ES * New CRD properties for external ES * Adjust SG and Grafana to use new vars * Fix servicetelemetry_vars in grafana ds * Fix grafana and event SG conditions * Fixed ansible snake_case vs camelCase and s/ElasticS/Elastics/ * Correct apiVersion * Swap [] for () in CRD * regenerated olm-catalog content * Set default TLS server name from URL * Fix observability_strategy: none * More fixes to observability_strategy: none * Move new vars into a "forwarding" section and remove the ext_ * Updates to CRD/CR for variable refactor * Move tls_server_name logic into a task for clarity * Fixing lint * Update deprecation comments * Update deprecation comments in the OLM manifests * Make hostUrl defaults match legacy for easy migration --- .../infra.watch_servicetelemetrys_crd.yaml | 46 +++++++++---- ...fra.watch_v1beta1_servicetelemetry_cr.yaml | 7 ++ .../infra.watch_servicetelemetrys_crd.yaml | 68 ++++++++++++++----- ...emetry-operator.clusterserviceversion.yaml | 8 +++ roles/servicetelemetry/defaults/main.yml | 11 +++ .../tasks/base_smartgateway.yml | 5 +- .../tasks/component_clouds.yml | 59 ++++++++++++---- .../tasks/component_grafana.yml | 14 ---- roles/servicetelemetry/tasks/main.yml | 24 +++---- .../templates/manifest_grafana_ds.j2 | 16 ++--- .../templates/manifest_smartgateway_events.j2 | 25 +++---- .../vars/dummy_user_certs.yml | 56 +++++++++++++++ 12 files changed, 252 insertions(+), 87 deletions(-) create mode 100644 roles/servicetelemetry/vars/dummy_user_certs.yml diff --git a/deploy/crds/infra.watch_servicetelemetrys_crd.yaml b/deploy/crds/infra.watch_servicetelemetrys_crd.yaml index 5b4a2f9a0..5821808aa 100644 --- a/deploy/crds/infra.watch_servicetelemetrys_crd.yaml +++ b/deploy/crds/infra.watch_servicetelemetrys_crd.yaml @@ -167,49 +167,71 @@ spec: description: Events related backend configuration. 
properties: elasticsearch: - description: Events storage backend ElasticSearch + description: Events storage backend Elasticsearch properties: enabled: - description: Enable ElasticSearch as a storage backend for events + description: Enable Elasticsearch as a storage backend for events type: boolean + forwarding: + description: Configuration for where to forward events + type: object + properties: + hostUrl: + description: URL of Elasticsearch HTTP(S) endpoint + type: string + tlsServerName: + description: (if required) Server Name expected to match the certificate presented by the endpoint + type: string + tlsSecretName: + description: (if required) Name of the secret that stores the CA cert and client cert/key + type: string + userSecretName: + description: (if required) Name of the secret that stores the Basic Auth credentials + type: string + useBasicAuth: + description: Whether to provide HTTP Basic Auth headers + type: boolean + useTls: + description: Whether to enable TLS + type: boolean version: - description: Version of ElasticSearch to deploy. Elasticsearch licensing has changed as of version 7.11. See https://www.elastic.co/pricing/faq/licensing for details. + description: (DEPRECATED - Use forwarding params after STF 1.5.3) Version of Elasticsearch to deploy. Elasticsearch licensing has changed as of version 7.11. See https://www.elastic.co/pricing/faq/licensing for details. type: string nodeCount: - description: Elasticsearch node count + description: (DEPRECATED - Use forwarding params after STF 1.5.3) Elasticsearch node count type: string storage: - description: Events storage configuration for ElasticSearch + description: (DEPRECATED - Use forwarding params after STF 1.5.3) Events storage configuration for Elasticsearch properties: strategy: - description: Storage strategy. One of 'ephemeral' or 'persistent'. Persistent storage must be made available by the platform. 
+ description: (DEPRECATED - Use forwarding params after STF 1.5.3) Storage strategy. One of 'ephemeral' or 'persistent'. Persistent storage must be made available by the platform. type: string enum: - ephemeral - persistent persistent: - description: Persistent storage configuration for ElasticSearch + description: (DEPRECATED - Use forwarding params after STF 1.5.3) Persistent storage configuration for Elasticsearch properties: storageClass: - description: Storage class name used for ElasticSearch PVC + description: (DEPRECATED - Use forwarding params after STF 1.5.3) Storage class name used for Elasticsearch PVC type: string storageSelector: - description: Storage selector definition for ElasticSearch + description: (DEPRECATED - Use forwarding params after STF 1.5.3) Storage selector definition for Elasticsearch type: string pvcStorageRequest: - description: How much storage space to request in the PVC + description: (DEPRECATED - Use forwarding params after STF 1.5.3) How much storage space to request in the PVC type: string type: object type: object certificates: properties: endpointCertDuration: - description: The requested 'duration' (i.e. lifetime) of the ElasticSearch endpoint Certificate. + description: (DEPRECATED - Use forwarding params after STF 1.5.3) The requested 'duration' (i.e. lifetime) of the Elasticsearch endpoint Certificate. Minimum accepted duration is 1 hour. Value must be in units accepted by Go time.ParseDuration https://golang.org/pkg/time/#ParseDuration pattern: ^((([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ type: string caCertDuration: - description: The requested 'duration' (i.e. lifetime) of the ElasticSearch CA Certificate. + description: (DEPRECATED - Use forwarding params after STF 1.5.3) The requested 'duration' (i.e. lifetime) of the Elasticsearch CA Certificate. Minimum accepted duration is 1 hour. 
Value must be in units accepted by Go time.ParseDuration https://golang.org/pkg/time/#ParseDuration pattern: ^((([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ type: string diff --git a/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml b/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml index 848e78190..683019f2d 100644 --- a/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml +++ b/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml @@ -36,6 +36,13 @@ spec: events: elasticsearch: enabled: false + forwarding: + hostUrl: https://elasticsearch-es-http:9200 + tlsServerName: "" + tlsSecretName: elasticsearch-es-cert + userSecretName: elasticsearch-es-elastic-user + useBasicAuth: true + useTls: true version: 7.16.1 storage: strategy: persistent diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml index 982b1710c..1841298e1 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml @@ -122,53 +122,88 @@ spec: description: Events related backend configuration. properties: elasticsearch: - description: Events storage backend ElasticSearch + description: Events storage backend Elasticsearch properties: certificates: properties: caCertDuration: - description: The requested 'duration' (i.e. lifetime) - of the ElasticSearch CA Certificate. Minimum accepted + description: (DEPRECATED - Use forwarding params after + STF 1.5.3) The requested 'duration' (i.e. lifetime) + of the Elasticsearch CA Certificate. Minimum accepted duration is 1 hour. 
Value must be in units accepted by Go time.ParseDuration https://golang.org/pkg/time/#ParseDuration pattern: ^((([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ type: string endpointCertDuration: - description: The requested 'duration' (i.e. lifetime) - of the ElasticSearch endpoint Certificate. Minimum + description: (DEPRECATED - Use forwarding params after + STF 1.5.3) The requested 'duration' (i.e. lifetime) + of the Elasticsearch endpoint Certificate. Minimum accepted duration is 1 hour. Value must be in units accepted by Go time.ParseDuration https://golang.org/pkg/time/#ParseDuration pattern: ^((([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ type: string type: object enabled: - description: Enable ElasticSearch as a storage backend + description: Enable Elasticsearch as a storage backend for events type: boolean + forwarding: + description: Configuration for where to forward events + properties: + hostUrl: + description: URL of Elasticsearch HTTP(S) endpoint + type: string + tlsSecretName: + description: (if required) Name of the secret that + stores the CA cert and client cert/key + type: string + tlsServerName: + description: (if required) Server Name expected to + match the certificate presented by the endpoint + type: string + useBasicAuth: + description: Whether to provide HTTP Basic Auth headers + type: boolean + useTls: + description: Whether to enable TLS + type: boolean + userSecretName: + description: (if required) Name of the secret that + stores the Basic Auth credentials + type: string + type: object nodeCount: - description: Elasticsearch node count + description: (DEPRECATED - Use forwarding params after + STF 1.5.3) Elasticsearch node count type: string storage: - description: Events storage configuration for ElasticSearch + description: (DEPRECATED - Use forwarding params after + STF 1.5.3) Events storage configuration for Elasticsearch properties: persistent: - description: Persistent storage configuration for - ElasticSearch + 
description: (DEPRECATED - Use forwarding params after + STF 1.5.3) Persistent storage configuration for + Elasticsearch properties: pvcStorageRequest: - description: How much storage space to request + description: (DEPRECATED - Use forwarding params + after STF 1.5.3) How much storage space to request in the PVC type: string storageClass: - description: Storage class name used for ElasticSearch - PVC + description: (DEPRECATED - Use forwarding params + after STF 1.5.3) Storage class name used for + Elasticsearch PVC type: string storageSelector: - description: Storage selector definition for ElasticSearch + description: (DEPRECATED - Use forwarding params + after STF 1.5.3) Storage selector definition + for Elasticsearch type: string type: object strategy: - description: Storage strategy. One of 'ephemeral' + description: (DEPRECATED - Use forwarding params after + STF 1.5.3) Storage strategy. One of 'ephemeral' or 'persistent'. Persistent storage must be made available by the platform. enum: @@ -177,7 +212,8 @@ spec: type: string type: object version: - description: Version of ElasticSearch to deploy. Elasticsearch + description: (DEPRECATED - Use forwarding params after + STF 1.5.3) Version of Elasticsearch to deploy. Elasticsearch licensing has changed as of version 7.11. See https://www.elastic.co/pricing/faq/licensing for details. 
type: string diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml index 33f281230..b06ea07a2 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml @@ -44,6 +44,14 @@ metadata: "endpointCertDuration": "70080h" }, "enabled": false, + "forwarding": { + "hostUrl": "https://elasticsearch-es-http:9200", + "tlsSecretName": "elasticsearch-es-cert", + "tlsServerName": "", + "useBasicAuth": true, + "useTls": true, + "userSecretName": "elasticsearch-es-elastic-user" + }, "storage": { "persistent": { "pvcStorageRequest": "20Gi" diff --git a/roles/servicetelemetry/defaults/main.yml b/roles/servicetelemetry/defaults/main.yml index 981273821..4f4e1ac54 100644 --- a/roles/servicetelemetry/defaults/main.yml +++ b/roles/servicetelemetry/defaults/main.yml @@ -5,6 +5,10 @@ clouds_remove_on_missing: false observability_strategy: use_redhat +# These get auto-discovered, but the code is simpler if they are never undefined +prometheus_operator_api_string: "" +prometheus_operator_label: "" + servicetelemetry_defaults: high_availability: enabled: false @@ -46,6 +50,13 @@ servicetelemetry_defaults: events: elasticsearch: enabled: false + forwarding: + host_url: https://elasticsearch-es-http:9200 + tls_server_name: "" + tls_secret_name: elasticsearch-es-cert + user_secret_name: elasticsearch-es-elastic-user + use_basic_auth: true + use_tls: true version: 7.16.1 node_count: 1 storage: diff --git a/roles/servicetelemetry/tasks/base_smartgateway.yml b/roles/servicetelemetry/tasks/base_smartgateway.yml index 95c46c6d8..4d0cfdafd 100644 --- a/roles/servicetelemetry/tasks/base_smartgateway.yml +++ 
b/roles/servicetelemetry/tasks/base_smartgateway.yml @@ -8,4 +8,7 @@ - name: Deploy SG-specific ServiceMonitor for metrics SGs include_tasks: component_servicemonitor.yml - when: data_type == 'metrics' + when: + - data_type == 'metrics' + - has_monitoring_api | bool + - observability_strategy != 'none' diff --git a/roles/servicetelemetry/tasks/component_clouds.yml b/roles/servicetelemetry/tasks/component_clouds.yml index f1e9a4e5a..52c00f659 100644 --- a/roles/servicetelemetry/tasks/component_clouds.yml +++ b/roles/servicetelemetry/tasks/component_clouds.yml @@ -19,25 +19,68 @@ - this_cloud.metrics.collectors is iterable - name: Events Smart Gateway deployment + when: servicetelemetry_vars.backends.events.elasticsearch.enabled block: - - name: Lookup ElasticSearch BasicAuth + - name: Lookup Elasticsearch BasicAuth k8s_info: api_version: v1 kind: Secret namespace: '{{ ansible_operator_meta.namespace }}' - name: 'elasticsearch-es-elastic-user' + name: '{{ servicetelemetry_vars.backends.events.elasticsearch.forwarding.user_secret_name }}' register: elasticsearch_es_elastic_user - - name: Filter out ElasticSearch password for BasicAuth + - name: Set Elasticsearch user for BasicAuth set_fact: - elastic_pass: "{{ elasticsearch_es_elastic_user | json_query('resources[0].data.elastic') | b64decode }}" + elastic_user: "{{ elasticsearch_es_elastic_user.resources[0].data | dict2items | map(attribute='key') | list | first }}" + + - name: Set Elasticsearch password for BasicAuth + set_fact: + elastic_pass: "{{ elasticsearch_es_elastic_user.resources[0].data[elastic_user] | b64decode }}" no_log: true + - name: Set elastic_tls_server_name from forwarding config if set + set_fact: + elastic_tls_server_name: "{{ servicetelemetry_vars.backends.events.elasticsearch.forwarding.tls_server_name }}" + when: servicetelemetry_vars.backends.events.elasticsearch.forwarding.tls_server_name | length > 0 + + # This sets the server name based on the host part of the URL between // and : 
(https://elasticsearch-host:9200) + - name: Set elastic_tls_server_name by parsing the host_url if it's not set in the config + set_fact: + elastic_tls_server_name: "{{ servicetelemetry_vars.backends.events.elasticsearch.forwarding.host_url.rpartition('//')[-1].partition(':')[0] }}" + when: servicetelemetry_vars.backends.events.elasticsearch.forwarding.tls_server_name | length == 0 + + - name: Get the Elasticsearch TLS materials secret + k8s_info: + api_version: v1 + kind: Secret + namespace: '{{ ansible_operator_meta.namespace }}' + name: '{{ servicetelemetry_vars.backends.events.elasticsearch.forwarding.tls_secret_name }}' + register: es_certs + + - when: es_certs[0].data[user.key] is not defined or es_certs[0].data[user.crt] is not defined + block: + - name: Load dummy certs + include_vars: + file: dummy_user_certs.yml + + - name: Augment the secret with dummy TLS cert/key if no TLS user auth material provided + k8s: + definition: + apiVersion: v1 + kind: Secret + metadata: + name: '{{ servicetelemetry_vars.backends.events.elasticsearch.forwarding.tls_secret_name }}' + namespace: '{{ ansible_operator_meta.namespace }}' + stringData: + user.crt: '{{ elastic_user_cert_dummy }}' + user.key: '{{ elastic_user_key_dummy }}' + - name: Deploy Events Smart Gateway instance for each collector vars: data_type: 'events' manifest: './manifest_smartgateway_events.j2' this_smartgateway: "{{ ansible_operator_meta.name }}-{{ this_cloud.name }}-{{ this_collector.collector_type[:4] }}-event" + elasticsearch: "{{ servicetelemetry_vars.backends.events.elasticsearch | from_yaml }}" include_tasks: base_smartgateway.yml loop: "{{ this_cloud.events.collectors }}" loop_control: @@ -47,14 +90,6 @@ - this_cloud.events is defined - this_cloud.events.collectors is defined - this_cloud.events is iterable - # TODO: it should be possible to deploy the eventing SGs when ElasticSearch - # is not available, but currently the template for smartgateway_events - # expects to have information about a 
local ES instance on cluster. - # https://github.com/infrawatch/service-telemetry-operator/issues/274 - when: - - has_elasticsearch_api | bool - - servicetelemetry_vars.backends.events.elasticsearch.enabled - - observability_strategy in ['use_community', 'use_hybrid'] - name: Deploy Logs Smart Gateway instance vars: diff --git a/roles/servicetelemetry/tasks/component_grafana.yml b/roles/servicetelemetry/tasks/component_grafana.yml index 25e1e981f..be90e6517 100644 --- a/roles/servicetelemetry/tasks/component_grafana.yml +++ b/roles/servicetelemetry/tasks/component_grafana.yml @@ -61,20 +61,6 @@ set_fact: prom_basicauth_passwd: '{{ prometheus_secret.resources[0].data.password | b64decode }}' - - when: servicetelemetry_vars.backends.events.elasticsearch.enabled - block: - - name: Retrieve elastic search secret - k8s_info: - api_version: v1 - kind: Secret - name: elasticsearch-es-elastic-user - namespace: '{{ ansible_operator_meta.namespace }}' - register: es_secret - - - name: Decode elasticsearch password - set_fact: - elasticsearch_pass: '{{ es_secret.resources[0].data.elastic | b64decode }}' - # Lookup existing datasources - name: Remove legacy datasources k8s: diff --git a/roles/servicetelemetry/tasks/main.yml b/roles/servicetelemetry/tasks/main.yml index 937fe7828..11dfc1629 100644 --- a/roles/servicetelemetry/tasks/main.yml +++ b/roles/servicetelemetry/tasks/main.yml @@ -40,7 +40,7 @@ prometheus_operator_label: 'app.kubernetes.io/managed-by: observability-operator' when: observability_strategy in ['use_redhat', 'use_hybrid'] -- name: Check if we have monitoring.coreos.com API +- name: Check if we have the requested monitoring API set_fact: has_monitoring_api: "{{ True if (prometheus_operator_api_string | dirname) in api_groups else False }}" @@ -83,6 +83,17 @@ - has_loki_api | bool - observability_strategy in ['use_community', 'use_hybrid'] +# --> clouds +- name: Get data about clouds + debug: + var: servicetelemetry_vars.clouds + +- name: Loop through cloud 
instances to setup transport receivers + include_tasks: component_clouds.yml + loop: "{{ servicetelemetry_vars.clouds }}" + loop_control: + loop_var: this_cloud + # --> graphing - name: Check if we have integreatly.org API set_fact: @@ -100,17 +111,6 @@ - has_integreatly_api | bool - observability_strategy in ['use_community', 'use_hybrid'] -# --> clouds -- name: Get data about clouds - debug: - var: servicetelemetry_vars.clouds - -- name: Loop through cloud instances to setup transport receivers - include_tasks: component_clouds.yml - loop: "{{ servicetelemetry_vars.clouds }}" - loop_control: - loop_var: this_cloud - # Post deployment tasks - name: Post-setup include_tasks: post.yml diff --git a/roles/servicetelemetry/templates/manifest_grafana_ds.j2 b/roles/servicetelemetry/templates/manifest_grafana_ds.j2 index 69910918e..7ae2b392a 100644 --- a/roles/servicetelemetry/templates/manifest_grafana_ds.j2 +++ b/roles/servicetelemetry/templates/manifest_grafana_ds.j2 @@ -29,11 +29,11 @@ spec: access: proxy editable: true isDefault: false - url: 'https://elasticsearch-es-http:9200' + url: {{ servicetelemetry_vars.backends.events.elasticsearch.forwarding.host_url }} type: elasticsearch - basicAuth: true - basicAuthUser: elastic - basicAuthPassword: {{ elasticsearch_pass }} + basicAuth: {{ servicetelemetry_vars.backends.events.elasticsearch.forwarding.use_basic_auth }} + basicAuthUser: {{ elastic_user }} + basicAuthPassword: {{ elastic_pass }} database: collectd_* jsonData: tlsSkipVerify: true @@ -44,11 +44,11 @@ spec: access: proxy editable: true isDefault: false - url: 'https://elasticsearch-es-http:9200' + url: {{ servicetelemetry_vars.backends.events.elasticsearch.forwarding.host_url }} type: elasticsearch - basicAuth: true - basicAuthUser: elastic - basicAuthPassword: {{ elasticsearch_pass }} + basicAuth: {{ servicetelemetry_vars.backends.events.elasticsearch.forwarding.use_basic_auth }} + basicAuthUser: {{ elastic_user }} + basicAuthPassword: {{ elastic_pass }} 
database: ceilometer_* jsonData: tlsSkipVerify: true diff --git a/roles/servicetelemetry/templates/manifest_smartgateway_events.j2 b/roles/servicetelemetry/templates/manifest_smartgateway_events.j2 index 884b0ae0f..3a80ad92d 100644 --- a/roles/servicetelemetry/templates/manifest_smartgateway_events.j2 +++ b/roles/servicetelemetry/templates/manifest_smartgateway_events.j2 @@ -1,27 +1,28 @@ apiVersion: smartgateway.infra.watch/v2 kind: SmartGateway metadata: - name: '{{ this_smartgateway }}' - namespace: '{{ ansible_operator_meta.namespace }}' + name: {{ this_smartgateway }} + namespace: {{ ansible_operator_meta.namespace }} spec: {% if this_collector.debug_enabled is defined and this_collector.debug_enabled %} - logLevel: "debug" + logLevel: debug {% else %} - logLevel: "info" + logLevel: info {% endif %} handleErrors: true size: {{ smartgateway_deployment_size }} applications: - config: | - hostURL: https://elasticsearch-es-http.{{ ansible_operator_meta.namespace }}.svc.cluster.local:9200 - useTLS: true - tlsClientCert: /config/certs/tls.crt - tlsClientKey: /config/certs/tls.key + hostURL: {{ elasticsearch.forwarding.host_url }} + useTLS: {{ elasticsearch.forwarding.use_tls }} + tlsClientCert: /config/certs/user.crt + tlsClientKey: /config/certs/user.key tlsCaCert: /config/certs/ca.crt - tlsServerName: 'elasticsearch-es-http.{{ ansible_operator_meta.namespace }}.svc.cluster.local' - user: '{{ elastic_user | default('elastic') }}' - password: '{{ elastic_pass | default('') }}' - useBasicAuth: true + tlsServerName: {{ elastic_tls_server_name }} + tlsSecretName: {{ elasticsearch.forwarding.tls_secret_name }} + user: {{ elastic_user | default('elastic') }} + password: {{ elastic_pass | default('') }} + useBasicAuth: {{ elasticsearch.forwarding.use_basic_auth }} name: elasticsearch bridge: amqpUrl: amqp://{{ ansible_operator_meta.name }}-interconnect.{{ ansible_operator_meta.namespace }}.svc.cluster.local:5673/{{ this_collector.subscription_address }} diff --git 
a/roles/servicetelemetry/vars/dummy_user_certs.yml b/roles/servicetelemetry/vars/dummy_user_certs.yml new file mode 100644 index 000000000..1ac2c152f --- /dev/null +++ b/roles/servicetelemetry/vars/dummy_user_certs.yml @@ -0,0 +1,56 @@ +# These are required because sg-core always expects there to be a user cert + key, whether we need it for auth or not +# CN = STF DUMMY USER CERT - DO NOT USE +elastic_user_cert_dummy: | + -----BEGIN CERTIFICATE----- + MIIEAzCCAuugAwIBAgIUVwi6wEIGgmyQfZ8s1+oqaf+yTpcwDQYJKoZIhvcNAQEL + BQAwgZAxCzAJBgNVBAYTAlVTMRcwFQYDVQQIDA5Ob3J0aCBDYXJvbGluYTEQMA4G + A1UEBwwHUmFsZWlnaDEUMBIGA1UECgwLUmVkIEhhdCBJbmMxFTATBgNVBAsMDE9T + UCBDbG91ZG9wczEpMCcGA1UEAwwgU1RGIERVTU1ZIFVTRVIgQ0VSVCAtIERPIE5P + VCBVU0UwHhcNMjMwNjEyMTgxODQ1WhcNMjMwNjEzMTgxODQ1WjCBkDELMAkGA1UE + BhMCVVMxFzAVBgNVBAgMDk5vcnRoIENhcm9saW5hMRAwDgYDVQQHDAdSYWxlaWdo + MRQwEgYDVQQKDAtSZWQgSGF0IEluYzEVMBMGA1UECwwMT1NQIENsb3Vkb3BzMSkw + JwYDVQQDDCBTVEYgRFVNTVkgVVNFUiBDRVJUIC0gRE8gTk9UIFVTRTCCASIwDQYJ + KoZIhvcNAQEBBQADggEPADCCAQoCggEBANQU/9/BEJbuX2xJUozSbUvG7qlk6yEi + KcFjkUwnXT+131ho+UWUn29yuqXI60E+8trWsL3uFlMbGh9t2VRfbfNNZiqon197 + CfzqS596AP8HtTZZx0Qy4sZrPRs8ffR/3wMjp8kMj+2jPpMq0zngJ1efHK7Z6GSR + IveXbCCfPQU4tvT3aQ5JQkIWvIo7kuS/u9K6LvOspYP04YNLUZdMCJDNE8hSpEkv + KfG7ZL2cfWF1nsX5+qyU5aIrUS7RYd/HGMKvpA0/Lvzl5FBMZ0BCF00LmY1tjUzK + DhHR62g/IkRaq8rrjdE+H2isVgSAIPAvnC039ePE4OOsoqO+aYYWqEsCAwEAAaNT + MFEwHQYDVR0OBBYEFMKfKoCQcbkb9BBDxXAQjYLSUWtoMB8GA1UdIwQYMBaAFMKf + KoCQcbkb9BBDxXAQjYLSUWtoMA8GA1UdEwEB/wQFMAMBAf8wDQYJKoZIhvcNAQEL + BQADggEBAI9q3AhqodLtsbET7yoZ2vAj8fzJyu5FXyyEf9wVgsyoJ56H77s2yp+b + iixc/MI9jsLX9wXa8monxdbHu01rlB8c9OwbcAyAhlnBWYcPqVFz4bjqNw5sH2Jg + vqIaGncn62IZv3mvN1VpyKZp2MbZGJdbgU8X3h1C6DCLf4fToFlqiiZ/XVvbk824 + j/OZ9l6Eu8VVekIQXVp2m9PPndOuEsBIMBkiB/xf32RTiOYWSG5mp70fxD7n2of/ + yb7hY+fL/wlucqS4ryT+2307ouEcTmpDSjHwKZRUYUDBZ4TmxCx5LlkuTO9MRnRy + 9hCGFF1rI+K33F952hxjkNaSSZvt3lQ= + -----END CERTIFICATE----- + +elastic_user_key_dummy: | + -----BEGIN RSA PRIVATE KEY----- + 
MIIEpAIBAAKCAQEA1BT/38EQlu5fbElSjNJtS8buqWTrISIpwWORTCddP7XfWGj5 + RZSfb3K6pcjrQT7y2tawve4WUxsaH23ZVF9t801mKqifX3sJ/OpLn3oA/we1NlnH + RDLixms9Gzx99H/fAyOnyQyP7aM+kyrTOeAnV58crtnoZJEi95dsIJ89BTi29Pdp + DklCQha8ijuS5L+70rou86ylg/Thg0tRl0wIkM0TyFKkSS8p8btkvZx9YXWexfn6 + rJTloitRLtFh38cYwq+kDT8u/OXkUExnQEIXTQuZjW2NTMoOEdHraD8iRFqryuuN + 0T4faKxWBIAg8C+cLTf148Tg46yio75phhaoSwIDAQABAoIBABXMUsBcx6e7uHMY + 1jNDLZisSbt/c+tj54bJBRYetabmup4LrBNKw1hhIm4HyKZcIfn8Nw5OelzwXC7+ + y2ewp0xqmCWqTzcxHkWwjzVFBPUxhZ6ge6q20Dg0rYMvJIMM4Y8hCw3PDLwQG05l + CHDaaTDIWdpe61Pq1v07wxFXTJ5MlgjoIfDN3xCFhHOEpbNCl6yVie4irjmxItS9 + Xp1/tdqtq8xSAAo9wWGb9SjsOn/C/AMtxerdHFjv8QErrA/ta/5qXa3KdEnElHqc + 2HkGt5w5FcRXCwrUW1MwnBzwbK5kEZth3D/i41y/F4vjwYwPfHRh3AeOpDpul0XW + qH+8qQECgYEA9iiUvbepX4mnj7CIQGKlDRCvvhdCUBnIAgA9/L8WWZIDvAEl1Rka + avAIvLMCTzoAO+TframNef0dNJWOAo/WQ/ViiLaqg7gbGE2DPjLitk0XaeKl+XAv + ip0K1Qouzxv2FFJR4h9iDCWjRIeClKIhE1sEMyJk45qyR4bMx0jQZJMCgYEA3I+l + wOO0kLD2lk/t9JBiBSLUrr6/mkPkCT7wn9U7owwHuoPDJHYX8+7y2u8vow6fgQyD + Jvud8wQOV4owBOBNafBT8a3Vp3W1lLTm1r0jJ7qbVNuMAnXcj1S0Q3VNX/jvO6wn + q6Ddxqh9p9+tYSNzwnD5XqxLeZiHXWCE2fB1+GkCgYEAillwj9iD52BUvtu3GIjY + vykbvTkRWjfDQ+yi6kTz6M+6LZZvjv+W63eRUY1CxQiSTRdr6A0dqOxr17wenq38 + /SETikcwOuvkvpoCI5kx9sgJWse6BSHadouhJO+eM2VBv1YtE2wUDUOyKbgH2kXt + VRWYnKy+C3ZMsQrAWVlBVuUCgYBI6LNCMANgUR8yUPm3/oJocDseCLANrqOS6ttf + +nzcSP3FCglX5DHG0RY2iRqWLB9N6XTxTfvIeW7EQUneUsdEXc1h9rTJxn9fyO0F + zz/vwh/WzTxbE9r1BmsQYZZSQ1fRwfbbJTIqmUfwVmBZ2/5IKFBGm23XpDQbCezg + njxhAQKBgQD0lOpKtL8qz9gmqtkhDRe+EPHSX8rfirqqRrPUiwK7kAJeW2vtU8aa + hFT7lEDjb7ERyZfybIkTVVBipKx2yse9nE+1dPGIgZop3E1guDuF9aOAzIUd/+/s + CI7s/lIBZsPD3PyxXXRtsvN7iUv5tLvNFhfomB7miTYHE+MC5QHJVQ== + -----END RSA PRIVATE KEY----- \ No newline at end of file From 10bf140d4dfc58315019899ecb2f65e22dab6089 Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Wed, 12 Jul 2023 15:49:55 -0400 Subject: [PATCH 34/95] Fix bug with supplied user.crt/key not persisting (#442) * Fix bug with supplied user.crt/key not persisting * Update 
component_clouds.yml --- roles/servicetelemetry/tasks/component_clouds.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/roles/servicetelemetry/tasks/component_clouds.yml b/roles/servicetelemetry/tasks/component_clouds.yml index 52c00f659..2dd352c38 100644 --- a/roles/servicetelemetry/tasks/component_clouds.yml +++ b/roles/servicetelemetry/tasks/component_clouds.yml @@ -57,7 +57,7 @@ name: '{{ servicetelemetry_vars.backends.events.elasticsearch.forwarding.tls_secret_name }}' register: es_certs - - when: es_certs[0].data[user.key] is not defined or es_certs[0].data[user.crt] is not defined + - when: es_certs.resources[0].data["user.key"] is not defined or es_certs.resources[0].data["user.crt"] is not defined block: - name: Load dummy certs include_vars: From 483f898e9bed12f7d169834f4d0b01d934fe7ff7 Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Thu, 13 Jul 2023 15:00:55 -0400 Subject: [PATCH 35/95] Test with "external" elasticsearch (#441) * Test with "external" elasticsearch * Strip all ES config except "enable" (for forwarding) * Create ECK subscription no matter the observability_strategy * Deploy ES from CI for events testing (Code copied/trimmed from STO) * Default to use_redhat for CI * Fixes from testing * ephemeral volume * wait for CRD to establish * Adjust smoketest to always test events * Test events to external ES in any observability_strategy mode * We no longer need the smoketest to know the observability_strategy at all * Update build/stf-run-ci/tasks/setup_elasticsearch.yml Co-authored-by: Leif Madsen * Apply spelling/caps suggestions from code review --------- Co-authored-by: Leif Madsen --- build/stf-run-ci/defaults/main.yml | 4 +- build/stf-run-ci/tasks/deploy_stf.yml | 9 ---- build/stf-run-ci/tasks/main.yml | 5 +- build/stf-run-ci/tasks/pre-clean.yml | 12 +++++ build/stf-run-ci/tasks/setup_base.yml | 12 ++++- .../stf-run-ci/tasks/setup_elasticsearch.yml | 32 ++++++++++++ .../templates/manifest_elasticsearch.j2 | 52 
+++++++++++++++++++ .../tasks/component_elasticsearch.yml | 7 +++ tests/smoketest/smoketest.sh | 12 ++--- .../smoketest_collectd_entrypoint.sh | 37 ++++++------- 10 files changed, 141 insertions(+), 41 deletions(-) create mode 100644 build/stf-run-ci/tasks/setup_elasticsearch.yml create mode 100644 build/stf-run-ci/templates/manifest_elasticsearch.j2 diff --git a/build/stf-run-ci/defaults/main.yml b/build/stf-run-ci/defaults/main.yml index 341dc07c4..af7fcd499 100644 --- a/build/stf-run-ci/defaults/main.yml +++ b/build/stf-run-ci/defaults/main.yml @@ -27,7 +27,7 @@ __service_telemetry_snmptraps_trap_oid_prefix: "1.3.6.1.4.1.50495.15" __service_telemetry_snmptraps_trap_default_oid: "1.3.6.1.4.1.50495.15.1.2.1" __service_telemetry_snmptraps_trap_default_severity: "" __service_telemetry_logs_enabled: false -__service_telemetry_observability_strategy: use_hybrid +__service_telemetry_observability_strategy: use_redhat __service_telemetry_transports_certificates_endpoint_cert_duration: 70080h __service_telemetry_transports_certificates_ca_cert_duration: 70080h __internal_registry_path: image-registry.openshift-image-registry.svc:5000 @@ -37,6 +37,8 @@ __smart_gateway_bundle_image_path: default_operator_registry_image_base: registry.redhat.io/openshift4/ose-operator-registry default_operator_registry_image_tag: v4.12 +elasticsearch_version: 7.16.1 + sgo_image_tag: latest sto_image_tag: latest sg_core_image_tag: latest diff --git a/build/stf-run-ci/tasks/deploy_stf.yml b/build/stf-run-ci/tasks/deploy_stf.yml index c7a1219d0..2fddbcbc4 100644 --- a/build/stf-run-ci/tasks/deploy_stf.yml +++ b/build/stf-run-ci/tasks/deploy_stf.yml @@ -34,15 +34,6 @@ events: elasticsearch: enabled: {{ __service_telemetry_events_enabled }} - storage: - strategy: {{ "ephemeral" if __service_telemetry_storage_ephemeral_enabled else "persistent" }} - {% if __service_telemetry_storage_persistent_storage_class is defined %} - persistent: - storageClass: {{ 
__service_telemetry_storage_persistent_storage_class }} - {% endif %} - certificates: - endpointCertDuration: {{ __service_telemetry_events_certificates_endpoint_cert_duration }} - caCertDuration: {{ __service_telemetry_events_certificates_ca_cert_duration }} metrics: prometheus: enabled: {{ __service_telemetry_metrics_enabled }} diff --git a/build/stf-run-ci/tasks/main.yml b/build/stf-run-ci/tasks/main.yml index 0f948c0d5..f62739541 100644 --- a/build/stf-run-ci/tasks/main.yml +++ b/build/stf-run-ci/tasks/main.yml @@ -60,6 +60,9 @@ tags: - deploy +- name: Deploy ES for events testing + include_tasks: setup_elasticsearch.yml + - name: Set default base dir if not provided set_fact: base_dir: "{{ playbook_dir }}" @@ -89,7 +92,7 @@ - { name: prometheus-webhook-snmp, dockerfile_path: Dockerfile, image_reference_name: prometheus_webhook_snmp_image_path, working_build_dir: ./working/prometheus-webhook-snmp } - debug: - var: build_list + var: build_list - name: Create builds and artifacts include_tasks: create_builds.yml diff --git a/build/stf-run-ci/tasks/pre-clean.yml b/build/stf-run-ci/tasks/pre-clean.yml index 5ed639a97..24e35d0f6 100644 --- a/build/stf-run-ci/tasks/pre-clean.yml +++ b/build/stf-run-ci/tasks/pre-clean.yml @@ -119,3 +119,15 @@ kind: Project metadata: name: openshift-cert-manager-operator + +- name: Remove Elasticsearch + ignore_errors: True + k8s: + state: absent + wait: yes + definition: + apiVersion: elasticsearch.k8s.elastic.co/v1 + kind: Elasticsearch + metadata: + name: elasticsearch + namespace: "{{ namespace }}" \ No newline at end of file diff --git a/build/stf-run-ci/tasks/setup_base.yml b/build/stf-run-ci/tasks/setup_base.yml index 4b8f41160..01a8312a2 100644 --- a/build/stf-run-ci/tasks/setup_base.yml +++ b/build/stf-run-ci/tasks/setup_base.yml @@ -117,8 +117,16 @@ name: elasticsearch-eck-operator-certified source: certified-operators sourceNamespace: openshift-marketplace - when: - - __service_telemetry_observability_strategy in 
['use_community', 'use_hybrid'] + +- name: Wait for Elasticsearch CRD to appear + k8s_info: + api_version: apiextensions.k8s.io/v1 + kind: CustomResourceDefinition + name: elasticsearches.elasticsearch.k8s.elastic.co + register: eckCRD + until: eckCRD.resources[0] is defined + retries: 5 + delay: 30 - block: # Upstream Source + Sub from https://github.com/rhobs/observability-operator/tree/main/hack/olm diff --git a/build/stf-run-ci/tasks/setup_elasticsearch.yml b/build/stf-run-ci/tasks/setup_elasticsearch.yml new file mode 100644 index 000000000..20638fd51 --- /dev/null +++ b/build/stf-run-ci/tasks/setup_elasticsearch.yml @@ -0,0 +1,32 @@ +- name: Set default ElasticSearch manifest + set_fact: + elasticsearch_manifest: "{{ lookup('template', './manifest_elasticsearch.j2') | from_yaml }}" + when: elasticsearch_manifest is not defined + +- name: Create an instance of Elasticsearch + k8s: + state: present + definition: + '{{ elasticsearch_manifest }}' + +- name: Look up the newly generated ES Certs + k8s_info: + api_version: v1 + kind: Secret + name: elasticsearch-es-http-certs-public + namespace: '{{ namespace }}' + register: elasticsearch_certs + until: elasticsearch_certs.resources[0].data["ca.crt"] is defined + retries: 5 + delay: 30 + +- name: Copy the ES CA cert to our TLS secret + k8s: + definition: + apiVersion: v1 + kind: Secret + metadata: + name: elasticsearch-es-cert + namespace: '{{ namespace }}' + data: + ca.crt: '{{ elasticsearch_certs.resources[0].data["ca.crt"] }}' \ No newline at end of file diff --git a/build/stf-run-ci/templates/manifest_elasticsearch.j2 b/build/stf-run-ci/templates/manifest_elasticsearch.j2 new file mode 100644 index 000000000..e2e50c6f4 --- /dev/null +++ b/build/stf-run-ci/templates/manifest_elasticsearch.j2 @@ -0,0 +1,52 @@ +apiVersion: elasticsearch.k8s.elastic.co/v1 +kind: Elasticsearch +metadata: + name: elasticsearch + namespace: {{ namespace }} +spec: + auth: {} + http: + service: + metadata: {} + spec: {} + tls: + 
certificate: {} + monitoring: + logs: {} + metrics: {} + nodeSets: + - count: 1 + name: default + config: + node.roles: + - master + - data + - ingest + node.store.allow_mmap: true + podTemplate: + metadata: + labels: + tuned.openshift.io/elasticsearch: elasticsearch + spec: + containers: + - name: elasticsearch + resources: + limits: + cpu: "2" + memory: 4Gi + requests: + cpu: "1" + memory: 4Gi + volumes: + - emptyDir: {} + name: elasticsearch-data + transport: + service: + metadata: {} + spec: {} + tls: + certificate: {} + certificateAuthorities: {} + updateStrategy: + changeBudget: {} + version: {{ elasticsearch_version }} \ No newline at end of file diff --git a/roles/servicetelemetry/tasks/component_elasticsearch.yml b/roles/servicetelemetry/tasks/component_elasticsearch.yml index 87b58be02..0a34b64c0 100644 --- a/roles/servicetelemetry/tasks/component_elasticsearch.yml +++ b/roles/servicetelemetry/tasks/component_elasticsearch.yml @@ -1,3 +1,10 @@ +# DEPRECATED +# +# This code in the servicetelemetry role is deprecated as of STF 1.5.3, after +# which only forwarding to an external elasticsearch is supported. +# +# The code lives on in the stf-run-ci role for CI testing of the forwarding +# feature. - name: Lookup template debug: msg: "{{ lookup('template', './manifest_elasticsearch.j2') | from_yaml }}" diff --git a/tests/smoketest/smoketest.sh b/tests/smoketest/smoketest.sh index 2ec35d46f..8a801c004 100755 --- a/tests/smoketest/smoketest.sh +++ b/tests/smoketest/smoketest.sh @@ -29,8 +29,6 @@ fi CLEANUP=${CLEANUP:-true} -OBSERVABILITY_STRATEGY="${OBSERVABILITY_STRATEGY:-use_redhat}" - for ((i=1; i<=NUMCLOUDS; i++)); do NAME="smoke${i}" CLOUDNAMES+=("${NAME}") @@ -66,7 +64,7 @@ oc create configmap stf-smoketest-ceilometer-entrypoint-script --from-file "${RE echo "*** [INFO] Creating smoketest jobs..." 
oc delete job -l app=stf-smoketest for NAME in "${CLOUDNAMES[@]}"; do - oc create -f <(sed -e "s/<>/${NAME}/;s/<>/${ELASTICSEARCH_AUTH_PASS}/;s/<>/${PROMETHEUS_AUTH_PASS}/;s/<>/${OBSERVABILITY_STRATEGY}/" ${REL}/smoketest_job.yaml.template) + oc create -f <(sed -e "s/<>/${NAME}/;s/<>/${ELASTICSEARCH_AUTH_PASS}/;s/<>/${PROMETHEUS_AUTH_PASS}/" ${REL}/smoketest_job.yaml.template) done echo "*** [INFO] Triggering an alertmanager notification..." @@ -154,11 +152,9 @@ echo "*** [INFO] Logs from prometheus..." oc logs "$(oc get pod -l prometheus=default -o jsonpath='{.items[0].metadata.name}')" -c prometheus echo -if [ "$OBSERVABILITY_STRATEGY" != "use_redhat" ]; then - echo "*** [INFO] Logs from elasticsearch..." - oc logs "$(oc get pod -l common.k8s.elastic.co/type=elasticsearch -o jsonpath='{.items[0].metadata.name}')" - echo -fi +echo "*** [INFO] Logs from elasticsearch..." +oc logs "$(oc get pod -l common.k8s.elastic.co/type=elasticsearch -o jsonpath='{.items[0].metadata.name}')" +echo echo "*** [INFO] Logs from snmp webhook..." oc logs "$(oc get pod -l app=default-snmp-webhook -o jsonpath='{.items[0].metadata.name}')" diff --git a/tests/smoketest/smoketest_collectd_entrypoint.sh b/tests/smoketest/smoketest_collectd_entrypoint.sh index 00fe92145..a8ce1103f 100755 --- a/tests/smoketest/smoketest_collectd_entrypoint.sh +++ b/tests/smoketest/smoketest_collectd_entrypoint.sh @@ -62,31 +62,28 @@ grep -E '"result":\[{"metric":{"__name__":"sensubility_container_health_status", metrics_result=$((metrics_result || $?)) echo; echo -if [ "$OBSERVABILITY_STRATEGY" != "use_redhat" ]; then - echo "*** [INFO] Get documents for this test from ElasticSearch..." 
- DOCUMENT_HITS=$(curl -sk -u "elastic:${ELASTICSEARCH_AUTH_PASS}" -X GET "${ELASTICSEARCH}/_search" -H 'Content-Type: application/json' -d'{ - "query": { - "bool": { - "filter": [ - { "term" : { "labels.instance" : { "value" : "'${CLOUDNAME}'", "boost" : 1.0 } } }, - { "range" : { "generated" : { "gte" : "now-1m", "lt" : "now" } } } - ] - } +echo "*** [INFO] Get documents for this test from Elasticsearch..." +DOCUMENT_HITS=$(curl -sk -u "elastic:${ELASTICSEARCH_AUTH_PASS}" -X GET "${ELASTICSEARCH}/_search" -H 'Content-Type: application/json' -d'{ + "query": { + "bool": { + "filter": [ + { "term" : { "labels.instance" : { "value" : "'${CLOUDNAME}'", "boost" : 1.0 } } }, + { "range" : { "generated" : { "gte" : "now-1m", "lt" : "now" } } } + ] } - }' | python3 -c "import sys, json; parsed = json.load(sys.stdin); print(parsed['hits']['total']['value'])") + } +}' | python3 -c "import sys, json; parsed = json.load(sys.stdin); print(parsed['hits']['total']['value'])") - echo "*** [INFO] Found ${DOCUMENT_HITS} documents" - echo; echo +echo "*** [INFO] Found ${DOCUMENT_HITS} documents" +echo; echo - # check if we got documents back for this test - events_result=1 - if [ "$DOCUMENT_HITS" -gt "0" ]; then - events_result=0 - fi -else - events_result=0 +# check if we got documents back for this test +events_result=1 +if [ "$DOCUMENT_HITS" -gt "0" ]; then + events_result=0 fi + echo "[INFO] Verification exit codes (0 is passing, non-zero is a failure): events=${events_result} metrics=${metrics_result}" echo; echo From f88fa1904a65269c43d90efc9b691ce68dc2d481 Mon Sep 17 00:00:00 2001 From: Victoria Martinez de la Cruz Date: Tue, 25 Jul 2023 17:56:57 +0200 Subject: [PATCH 36/95] Revert cert-manager channel to "tech-preview" (#444) We recently changed the channel being used to retrieve cert-manager from "tech-preview" to "stable-v1" Since we only support tech-preview for up to STF 1.5.2, we should stick to test using that version --- build/stf-run-ci/defaults/main.yml | 2 -- 
build/stf-run-ci/tasks/setup_base.yml | 7 +------ 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/build/stf-run-ci/defaults/main.yml b/build/stf-run-ci/defaults/main.yml index af7fcd499..af263f7b8 100644 --- a/build/stf-run-ci/defaults/main.yml +++ b/build/stf-run-ci/defaults/main.yml @@ -66,5 +66,3 @@ sg_bridge_repository: https://github.com/infrawatch/sg-bridge prometheus_webhook_snmp_repository: https://github.com/infrawatch/prometheus-webhook-snmp base_dir: '' - -cert_manager_channel: stable-v1 diff --git a/build/stf-run-ci/tasks/setup_base.yml b/build/stf-run-ci/tasks/setup_base.yml index 01a8312a2..a90071b5b 100644 --- a/build/stf-run-ci/tasks/setup_base.yml +++ b/build/stf-run-ci/tasks/setup_base.yml @@ -51,11 +51,6 @@ namespace: openshift-cert-manager-operator spec: {} - - name: Use tech-preview channel for cert_manager in older OCP versions - set_fact: - cert_manager_channel: tech-preview - when: ocp_ver.stdout is version('4.12', '<') - - name: Subscribe to Cert Manager for OpenShift Operator k8s: definition: @@ -65,7 +60,7 @@ name: openshift-cert-manager-operator namespace: openshift-cert-manager-operator spec: - channel: "{{ cert_manager_channel }}" + channel: "tech-preview" installPlanApproval: Automatic name: openshift-cert-manager-operator source: redhat-operators From 0dfe245f6c3b60033dbba70c4f24c4b44a5578f5 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Fri, 25 Aug 2023 14:37:09 -0400 Subject: [PATCH 37/95] Fix infinite reconcile loop (#445) * Fix infinite reconcile loop Fix a potential infinite reconcile loop when deploying Grafana due to updated hashing algorithm which changes the hash contents each time the contents is hashed. Adds a fix which results in skipping the update of the default-grafana-htpasswd Secret object to avoid the situation. Also makes a small add to the when clause list for deploying Elasticsearch so that only when it is enabled are the tasks included. 
Closes: STF-1499 Signed-off-by: Leif Madsen * Logic change so Secret updates on parameter change Update the htpasswd secret when the ServiceTelemetry CRD is updated for a basicAuth password change. Will work with existing deployments, adding the annotation to an existing secret to get everything aligned. Signed-off-by: Leif Madsen * Syntax and lint fix * Remove TODO for creating dashboards Remove TODO for creating dashboards as management of dashboards is outside the scope of STF. * Update grafana htpasswd management logic Update the logic for grafana htpasswd management to no longer use sha256 hashes and instead use bcrypt. Update the logic so that we create the initial contents of the secret before writing it to the first secret so that we can avoid a second Secret k8s_info call. Create the secret if it isn't already created. If it is created, then read the existing salt and generate a hash with it. Update the secret and the annotation for grafana deployment. If no changes are found, then the secret and the annotation will not be updated, resulting in no grafana restart. NOTE: this commit has debugging logic in it to show how it works. Following commit removes it and won't be available in future squashed history. See original pull-request. 
* Remove debug logic * Adjust Ansible play names --------- Signed-off-by: Leif Madsen --- .../tasks/component_grafana.yml | 115 ++++++++++++------ roles/servicetelemetry/tasks/main.yml | 13 +- .../templates/manifest_grafana.j2 | 2 +- .../vars/dummy_user_certs.yml | 2 +- 4 files changed, 81 insertions(+), 51 deletions(-) diff --git a/roles/servicetelemetry/tasks/component_grafana.yml b/roles/servicetelemetry/tasks/component_grafana.yml index be90e6517..068507610 100644 --- a/roles/servicetelemetry/tasks/component_grafana.yml +++ b/roles/servicetelemetry/tasks/component_grafana.yml @@ -7,8 +7,43 @@ kind: Route name: 'grafana-route' -- name: Create htpasswd secret for grafana admin +- name: Check for existing grafana htpasswd secret no_log: true + k8s_info: + api_version: v1 + kind: Secret + namespace: '{{ ansible_operator_meta.namespace }}' + name: '{{ ansible_operator_meta.name }}-grafana-htpasswd' + register: grafana_htpasswd_secret + +- block: + - name: Parse current Grafana htpasswd salt from secret + no_log: true + set_fact: + grafana_htpasswd_salt: "{{ ((grafana_htpasswd_secret.resources[0].data.auth | b64decode).split('$')[-1])[0:22] }}" + rescue: + - name: Generate initial Grafana htpasswd bcrypt string from grafana.admin_password + no_log: true + set_fact: + init_grafana_htpasswd_bcrypt_string: "{{ (servicetelemetry_vars.graphing.grafana.admin_password | password_hash('bcrypt') | replace('$2b$','$2y$', 1)) }}" + + - name: Read newly generated Grafana htpasswd salt + no_log: true + set_fact: + grafana_htpasswd_salt: "{{ (init_grafana_htpasswd_bcrypt_string.split('$')[-1])[0:22] }}" + always: + - name: Generate Grafana htpasswd bcrypt string from grafana.adminPassword using salt + no_log: true + set_fact: + grafana_htpasswd_bcrypt_string: "{{ (servicetelemetry_vars.graphing.grafana.admin_password | password_hash('bcrypt', grafana_htpasswd_salt) | replace('$2b$','$2y$', 1)) }}" + + - name: Generate Grafana auth string from grafana.adminUser and 
grafana_htpasswd_bcrypt_string + no_log: true + set_fact: + grafana_htpasswd_auth_string: "{{ servicetelemetry_vars.graphing.grafana.admin_user }}:{{ grafana_htpasswd_bcrypt_string }}" + +- name: Create or patch htpasswd secret for grafana admin + no_log: false k8s: definition: api_version: v1 @@ -18,7 +53,7 @@ namespace: '{{ ansible_operator_meta.namespace }}' type: Opaque stringData: - auth: '{{ servicetelemetry_vars.graphing.grafana.admin_user }}:{{ servicetelemetry_vars.graphing.grafana.admin_password | password_hash("bcrypt") | replace("$2b$","$2y$", 1) }}' + auth: '{{ grafana_htpasswd_auth_string }}' - name: Lookup template debug: @@ -34,49 +69,49 @@ state: '{{ "present" if servicetelemetry_vars.graphing.enabled else "absent" }}' definition: '{{ grafana_manifest }}' - when: servicetelemetry_vars.graphing.enabled - when: servicetelemetry_vars.graphing.enabled block: - - when: servicetelemetry_vars.backends.metrics.prometheus.enabled - block: - - name: Retrieve configmap for OAUTH CA certs - k8s_info: - api_version: v1 - kind: ConfigMap - name: serving-certs-ca-bundle - namespace: '{{ ansible_operator_meta.namespace }}' - register: serving_certs_ca + - when: servicetelemetry_vars.backends.metrics.prometheus.enabled + block: + - name: Retrieve configmap for OAUTH CA certs + k8s_info: + api_version: v1 + kind: ConfigMap + name: serving-certs-ca-bundle + namespace: '{{ ansible_operator_meta.namespace }}' + register: serving_certs_ca - - name: Retrieve prometheus secret - k8s_info: - api_version: v1 - kind: Secret - namespace: '{{ ansible_operator_meta.namespace }}' - name: '{{ ansible_operator_meta.name }}-prometheus-htpasswd' - register: prometheus_secret + - name: Retrieve prometheus secret + k8s_info: + api_version: v1 + kind: Secret + namespace: '{{ ansible_operator_meta.namespace }}' + name: '{{ ansible_operator_meta.name }}-prometheus-htpasswd' + register: prometheus_secret - - name: Decode prometheus password - no_log: true - set_fact: - 
prom_basicauth_passwd: '{{ prometheus_secret.resources[0].data.password | b64decode }}' + - name: Decode prometheus password + no_log: true + set_fact: + prom_basicauth_passwd: '{{ prometheus_secret.resources[0].data.password | b64decode }}' - # Lookup existing datasources - - name: Remove legacy datasources - k8s: - api_version: integreatly.org/v1alpha1 - name: '{{ ansible_operator_meta.name }}-ds-prometheus' - kind: GrafanaDataSource - namespace: '{{ ansible_operator_meta.namespace }}' - state: absent + # Lookup existing datasources + - name: Remove legacy datasources + k8s: + api_version: integreatly.org/v1alpha1 + name: '{{ ansible_operator_meta.name }}-ds-prometheus' + kind: GrafanaDataSource + namespace: '{{ ansible_operator_meta.namespace }}' + state: absent - - name: Set datasources - set_fact: - ds_manifest: "{{ lookup('template', './manifest_grafana_ds.j2') | from_yaml }}" - when: ds_manifest is not defined + # NOTE: this can fail if you enable grafana without prometheus due to missing resources referenced in the template + - name: Set datasources + set_fact: + ds_manifest: "{{ lookup('template', './manifest_grafana_ds.j2') | from_yaml }}" + when: ds_manifest is not defined - - name: Create the datasources - k8s: - state: '{{ "present" if servicetelemetry_vars.graphing.enabled else "absent" }}' - definition: - '{{ ds_manifest }}' + - name: Create the datasources + k8s: + state: '{{ "present" if servicetelemetry_vars.graphing.enabled else "absent" }}' + definition: + '{{ ds_manifest }}' diff --git a/roles/servicetelemetry/tasks/main.yml b/roles/servicetelemetry/tasks/main.yml index 11dfc1629..ce615f25b 100644 --- a/roles/servicetelemetry/tasks/main.yml +++ b/roles/servicetelemetry/tasks/main.yml @@ -71,6 +71,7 @@ - has_elasticsearch_api | bool - has_certmanager_api | bool - observability_strategy in ['use_community', 'use_hybrid'] + - servicetelemetry_vars.backends.events.elasticsearch.enabled | bool # --> backends.logs - name: Check if we have loki API @@ 
-99,17 +100,11 @@ set_fact: has_integreatly_api: "{{ True if 'integreatly.org' in api_groups else False }}" -- name: Deploy graphing - block: - - name: Create Grafana instance - include_tasks: component_grafana.yml - -# TODO -# - name: Create dashboards -# include_tasks: component_dashboards.yml - when: +- when: - has_integreatly_api | bool - observability_strategy in ['use_community', 'use_hybrid'] + name: Start graphing component plays + include_tasks: component_grafana.yml # Post deployment tasks - name: Post-setup diff --git a/roles/servicetelemetry/templates/manifest_grafana.j2 b/roles/servicetelemetry/templates/manifest_grafana.j2 index d2b26eb34..792f7065c 100644 --- a/roles/servicetelemetry/templates/manifest_grafana.j2 +++ b/roles/servicetelemetry/templates/manifest_grafana.j2 @@ -9,7 +9,7 @@ spec: serviceaccounts.openshift.io/oauth-redirectreference.primary: '{{ grafana_oauth_redir_ref | to_json }}' deployment: annotations: - hash-of-creds-to-force-restart-if-changed: {{ (servicetelemetry_vars.graphing.grafana.admin_user + servicetelemetry_vars.graphing.grafana.admin_password) | password_hash('sha256', (session_secret | b64encode)[:16] ) }} + hash-of-creds-to-force-restart-if-changed: {{ grafana_htpasswd_auth_string | b64encode }} baseImage: {{ servicetelemetry_vars.graphing.grafana.base_image }} ingress: enabled: {{ servicetelemetry_vars.graphing.grafana.ingress_enabled }} diff --git a/roles/servicetelemetry/vars/dummy_user_certs.yml b/roles/servicetelemetry/vars/dummy_user_certs.yml index 1ac2c152f..e352309d1 100644 --- a/roles/servicetelemetry/vars/dummy_user_certs.yml +++ b/roles/servicetelemetry/vars/dummy_user_certs.yml @@ -53,4 +53,4 @@ elastic_user_key_dummy: | njxhAQKBgQD0lOpKtL8qz9gmqtkhDRe+EPHSX8rfirqqRrPUiwK7kAJeW2vtU8aa hFT7lEDjb7ERyZfybIkTVVBipKx2yse9nE+1dPGIgZop3E1guDuF9aOAzIUd/+/s CI7s/lIBZsPD3PyxXXRtsvN7iUv5tLvNFhfomB7miTYHE+MC5QHJVQ== - -----END RSA PRIVATE KEY----- \ No newline at end of file + -----END RSA PRIVATE KEY----- From 
6bf4784f42e60e654087eb164549ba9bc0de36b0 Mon Sep 17 00:00:00 2001 From: Victoria Martinez de la Cruz Date: Mon, 4 Sep 2023 12:58:48 +0200 Subject: [PATCH 38/95] Add checks for not set sto and sgo bundle paths (#447) * Add checks for not set sto and sgo bundle paths This change adds a check for whether sto and sgo bundle paths are defined and properly inform the user if they are undefined. Also sets the nightly builds available in Quay as defaults. --- build/stf-run-ci/README.md | 4 ++-- build/stf-run-ci/defaults/main.yml | 4 ++-- build/stf-run-ci/tasks/setup_stf_from_bundles.yml | 8 ++++++++ 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/build/stf-run-ci/README.md b/build/stf-run-ci/README.md index a389d2c03..6b908d4f2 100644 --- a/build/stf-run-ci/README.md +++ b/build/stf-run-ci/README.md @@ -21,8 +21,8 @@ choose to override: | `__local_build_enabled` | {true,false} | true | Whether to deploy STF from local built artifacts. Also see `working_branch`, `sg_branch`, `sgo_branch` | | `__deploy_from_bundles_enabled` | {true,false} | false | Whether to deploy STF from OLM bundles (TODO: compat with `__local_build_enabled`) | | `__deploy_from_index_enabled` | {true,false} | false | Whether to deploy STF from locally built bundles and index image. 
| -| `__service_telemetry_bundle_image_path` | | | Image path to Service Telemetry Operator bundle | -| `__smart_gateway_bundle_image_path` | | | Image path to Smart Gateway Operator bundle | +| `__service_telemetry_bundle_image_path` | | `quay.io/infrawatch-operators/service-telemetry-operator-bundle:nightly-head` | Image path to Service Telemetry Operator bundle | +| `__smart_gateway_bundle_image_path` | | `quay.io/infrawatch-operators/smart-gateway-operator-bundle:nightly-head` | Image path to Smart Gateway Operator bundle | | `prometheus_webhook_snmp_branch` | | master | Which Prometheus Webhook SNMP git branch to checkout | | `sgo_branch` | | master | Which Smart Gateway Operator git branch to checkout | | `sg_core_branch` | | master | Which Smart Gateway Core git branch to checkout | diff --git a/build/stf-run-ci/defaults/main.yml b/build/stf-run-ci/defaults/main.yml index af263f7b8..7b51439b1 100644 --- a/build/stf-run-ci/defaults/main.yml +++ b/build/stf-run-ci/defaults/main.yml @@ -31,8 +31,8 @@ __service_telemetry_observability_strategy: use_redhat __service_telemetry_transports_certificates_endpoint_cert_duration: 70080h __service_telemetry_transports_certificates_ca_cert_duration: 70080h __internal_registry_path: image-registry.openshift-image-registry.svc:5000 -__service_telemetry_bundle_image_path: -__smart_gateway_bundle_image_path: +__service_telemetry_bundle_image_path: "quay.io/infrawatch-operators/service-telemetry-operator-bundle:nightly-head" +__smart_gateway_bundle_image_path: "quay.io/infrawatch-operators/smart-gateway-operator-bundle:nightly-head" default_operator_registry_image_base: registry.redhat.io/openshift4/ose-operator-registry default_operator_registry_image_tag: v4.12 diff --git a/build/stf-run-ci/tasks/setup_stf_from_bundles.yml b/build/stf-run-ci/tasks/setup_stf_from_bundles.yml index fd5423dd3..2e20ab726 100644 --- a/build/stf-run-ci/tasks/setup_stf_from_bundles.yml +++ b/build/stf-run-ci/tasks/setup_stf_from_bundles.yml @@ 
-80,6 +80,14 @@ tags: - bundle_registry_tls_ca +- name: "Ensure that the bundle paths are set." + ansible.builtin.assert: + that: + - '__smart_gateway_bundle_image_path is defined and __smart_gateway_bundle_image_path != None' + - '__service_telemetry_bundle_image_path is defined and __service_telemetry_bundle_image_path != None' + fail_msg: "Bundle path(s) not set. __smart_gateway_bundle_image_path is '{{ __smart_gateway_bundle_image_path }}' and __service_telemetry_bundle_image_path is '{{ __service_telemetry_bundle_image_path }}'. Both values need to be set." + success_msg: "Bundle paths are defined and not None" + - name: Deploy SGO via OLM bundle shell: cmd: "{{ base_dir }}/working/operator-sdk run bundle {{__smart_gateway_bundle_image_path}} {% if pull_secret is defined %}--pull-secret-name=pull-secret --ca-secret-name=registry-tls-ca{% endif %} --namespace={{ namespace }} --timeout 600s" From 3549d5b31ffbaff5aef1e8152911edd4225dc8bc Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Tue, 5 Sep 2023 14:28:14 -0400 Subject: [PATCH 39/95] Bump compatibility from 4.10-4.12 to 4.10-4.14 (#448) --- .jenkins/README.md | 2 +- .jenkins/agent/Dockerfile | 2 +- build/stf-run-ci/defaults/main.yml | 2 +- build/stf-run-ci/tasks/create_catalog.yml | 4 ++-- deploy/olm-catalog/service-telemetry-operator/Dockerfile.in | 2 +- .../service-telemetry-operator/metadata/properties.yaml | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.jenkins/README.md b/.jenkins/README.md index 2085f21d3..f39f1b771 100644 --- a/.jenkins/README.md +++ b/.jenkins/README.md @@ -40,7 +40,7 @@ export SMEE_CHANNEL= #(just the slug, not the whole URL) export GH_ORG= export JENKINS_URL=$(oc get route jenkins -ojsonpath='{.spec.host}') # This is for labelling the status that is returned to github -export OCP_VERSION= # e.g. 4.12 +export OCP_VERSION= # e.g. 
4.14 for f in deploy/*; do envsubst < "${f}" | oc apply -f - diff --git a/.jenkins/agent/Dockerfile b/.jenkins/agent/Dockerfile index c41fb9c69..ab7f97cb3 100644 --- a/.jenkins/agent/Dockerfile +++ b/.jenkins/agent/Dockerfile @@ -1,7 +1,7 @@ FROM quay.io/openshift/origin-jenkins-agent-base:latest # pass --build-arg OC_CLIENT_VERSION= to build stage to change client version -ARG OC_CLIENT_VERSION="4.12" +ARG OC_CLIENT_VERSION="4.13" RUN curl -LO "https://github.com/operator-framework/operator-sdk/releases/download/v0.19.4/operator-sdk-v0.19.4-x86_64-linux-gnu" && \ chmod +x operator-sdk-v0.19.4-x86_64-linux-gnu && mv operator-sdk-v0.19.4-x86_64-linux-gnu /usr/local/bin/operator-sdk diff --git a/build/stf-run-ci/defaults/main.yml b/build/stf-run-ci/defaults/main.yml index 7b51439b1..502eb8194 100644 --- a/build/stf-run-ci/defaults/main.yml +++ b/build/stf-run-ci/defaults/main.yml @@ -35,7 +35,7 @@ __service_telemetry_bundle_image_path: "quay.io/infrawatch-operators/service-tel __smart_gateway_bundle_image_path: "quay.io/infrawatch-operators/smart-gateway-operator-bundle:nightly-head" default_operator_registry_image_base: registry.redhat.io/openshift4/ose-operator-registry -default_operator_registry_image_tag: v4.12 +default_operator_registry_image_tag: v4.13 elasticsearch_version: 7.16.1 diff --git a/build/stf-run-ci/tasks/create_catalog.yml b/build/stf-run-ci/tasks/create_catalog.yml index 2134be017..3ad667c4e 100644 --- a/build/stf-run-ci/tasks/create_catalog.yml +++ b/build/stf-run-ci/tasks/create_catalog.yml @@ -76,7 +76,7 @@ dockerfile: | # The base image is expected to contain # /bin/opm (with a serve subcommand) and /bin/grpc_health_probe - FROM registry.redhat.io/openshift4/ose-operator-registry:v4.12 + FROM registry.redhat.io/openshift4/ose-operator-registry:v4.13 COPY --chmod=666 index.yaml /configs/ @@ -95,7 +95,7 @@ dockerStrategy: from: kind: ImageStreamTag - name: ose-operator-registry:v4.12 + name: ose-operator-registry:v4.13 volumes: - mounts: - 
destinationPath: /opt/app-root/auth diff --git a/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in b/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in index 182dbf160..c3e7aa29e 100644 --- a/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in +++ b/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in @@ -13,7 +13,7 @@ LABEL operators.operatorframework.io.metrics.mediatype.v1=metrics+v1 LABEL operators.operatorframework.io.metrics.builder=operator-sdk-v0.19.4 LABEL operators.operatorframework.io.metrics.project_layout=ansible LABEL com.redhat.delivery.operator.bundle=true -LABEL com.redhat.openshift.versions="v4.10-v4.12" +LABEL com.redhat.openshift.versions="v4.10-v4.14" LABEL com.redhat.delivery.backport=false LABEL com.redhat.component="service-telemetry-operator-bundle-container" \ diff --git a/deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml b/deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml index 2905d1189..63f6bd1cd 100644 --- a/deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml @@ -1,6 +1,6 @@ properties: - type: olm.maxOpenShiftVersion - value: "4.12" + value: "4.14" - type: olm.constraint value: failureMessage: Require Smart Gateway for Service Telemetry Framework From cd216c0c54e7d4eb0f2974eb87181f31b932a44c Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Thu, 7 Sep 2023 14:04:28 -0400 Subject: [PATCH 40/95] Allow Grafana management in all observability modes (#449) Allow management of Grafana in all observability modes. Grafana continues to be installed from community operators, but can continue to be used even when deploying STF in the default observability strategy of use_redhat. 
Closes STF-1500 Signed-off-by: Leif Madsen --- roles/servicetelemetry/tasks/main.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/roles/servicetelemetry/tasks/main.yml b/roles/servicetelemetry/tasks/main.yml index ce615f25b..20991f50e 100644 --- a/roles/servicetelemetry/tasks/main.yml +++ b/roles/servicetelemetry/tasks/main.yml @@ -102,7 +102,6 @@ - when: - has_integreatly_api | bool - - observability_strategy in ['use_community', 'use_hybrid'] name: Start graphing component plays include_tasks: component_grafana.yml From 266fc3e2a535df95133074f2bae88e515b33fe99 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Mon, 11 Sep 2023 13:08:10 -0400 Subject: [PATCH 41/95] Add tests/infrared/17.1 (#440) * Add tests/infrared/17.1 Add a new tests/infrared/17.1/ configuration. Add extra hosts file entry configuration to make it easier to point at an existing OpenShift deployment (primarily used with stf-verify-containers). * Sync enable-stf.yaml template to documentation * Add qdr::router_id configuration Update template to match documentation for qdr::router_id values in order to avoid rhbz#2208020 * Additional templating and add extra hosts Some additional templating and removing ceph from the deployment framework since STF isn't monitoring ceph going forward. Update templating so that extra hosts for QDR can be passed at deployment time to provide entries in /etc/hosts on all the nodes. 
--- tests/infrared/17.1/.gitignore | 3 + tests/infrared/17.1/enable-stf.yaml.template | 81 +++++++ tests/infrared/17.1/extra-hosts.yaml.template | 9 + .../17.1/gnocchi-connectors.yaml.template | 24 ++ tests/infrared/17.1/infrared-openstack.sh | 223 ++++++++++++++++++ tests/infrared/17.1/outputs/.KEEPIT | 0 .../17.1/stf-connectors.yaml.template | 67 ++++++ 7 files changed, 407 insertions(+) create mode 100644 tests/infrared/17.1/.gitignore create mode 100644 tests/infrared/17.1/enable-stf.yaml.template create mode 100644 tests/infrared/17.1/extra-hosts.yaml.template create mode 100644 tests/infrared/17.1/gnocchi-connectors.yaml.template create mode 100755 tests/infrared/17.1/infrared-openstack.sh create mode 100644 tests/infrared/17.1/outputs/.KEEPIT create mode 100644 tests/infrared/17.1/stf-connectors.yaml.template diff --git a/tests/infrared/17.1/.gitignore b/tests/infrared/17.1/.gitignore new file mode 100644 index 000000000..7c466baa0 --- /dev/null +++ b/tests/infrared/17.1/.gitignore @@ -0,0 +1,3 @@ +outputs/** +!outputs/.KEEPIT + diff --git a/tests/infrared/17.1/enable-stf.yaml.template b/tests/infrared/17.1/enable-stf.yaml.template new file mode 100644 index 000000000..a1037f213 --- /dev/null +++ b/tests/infrared/17.1/enable-stf.yaml.template @@ -0,0 +1,81 @@ +--- +tripleo_heat_templates: + [] + +custom_templates: + # matches the documentation for enable-stf.yaml in stable-1.3 documentation + parameter_defaults: + # only send to STF, not other publishers + EventPipelinePublishers: [] + PipelinePublishers: [] + + # manage the polling and pipeline configuration files for Ceilometer agents + ManagePolling: true + ManagePipeline: true + + # enable Ceilometer metrics and events + CeilometerQdrPublishMetrics: true + CeilometerQdrPublishEvents: true + + # enable collection of API status + CollectdEnableSensubility: true + CollectdSensubilityTransport: amqp1 + + # enable collection of containerized service metrics + CollectdEnableLibpodstats: true + + # set collectd 
overrides for higher telemetry resolution and extra plugins + # to load + CollectdConnectionType: amqp1 + CollectdAmqpInterval: 5 + CollectdDefaultPollingInterval: 5 + CollectdExtraPlugins: + - vmem + + # set standard prefixes for where metrics and events are published to QDR + MetricsQdrAddresses: + - prefix: 'collectd' + distribution: multicast + - prefix: 'anycast/ceilometer' + distribution: multicast + + ExtraConfig: + ceilometer::agent::polling::polling_interval: 30 + ceilometer::agent::polling::polling_meters: + - cpu + - disk.* + - ip.* + - image.* + - memory + - memory.* + - network.services.vpn.* + - network.services.firewall.* + - perf.* + - port + - port.* + - switch + - switch.* + - storage.* + - volume.* + + # to avoid filling the memory buffers if disconnected from the message bus + # note: this may need an adjustment if there are many metrics to be sent. + collectd::plugin::amqp1::send_queue_limit: 5000 + + # receive extra information about virtual memory + collectd::plugin::vmem::verbose: true + + # provide name and uuid in addition to hostname for better correlation + # to ceilometer data + collectd::plugin::virt::hostname_format: "name uuid hostname" + + # provide the human-friendly name of the virtual instance + collectd::plugin::virt::plugin_instance_format: metadata + + # set memcached collectd plugin to report its metrics by hostname + # rather than host IP, ensuring metrics in the dashboard remain uniform + collectd::plugin::memcached::instances: + local: + host: "%{hiera('fqdn_canonical')}" + port: 11211 + diff --git a/tests/infrared/17.1/extra-hosts.yaml.template b/tests/infrared/17.1/extra-hosts.yaml.template new file mode 100644 index 000000000..3129c35ac --- /dev/null +++ b/tests/infrared/17.1/extra-hosts.yaml.template @@ -0,0 +1,9 @@ +--- +tripleo_heat_templates: + [] + +custom_templates: + parameter_defaults: + ExtraHostFileEntries: + - '<>' + diff --git a/tests/infrared/17.1/gnocchi-connectors.yaml.template 
b/tests/infrared/17.1/gnocchi-connectors.yaml.template new file mode 100644 index 000000000..1a5b729a2 --- /dev/null +++ b/tests/infrared/17.1/gnocchi-connectors.yaml.template @@ -0,0 +1,24 @@ +--- +tripleo_heat_templates: + [] + +custom_templates: + resource_registry: + OS::TripleO::Services::GnocchiApi: /usr/share/openstack-tripleo-heat-templates/deployment/gnocchi/gnocchi-api-container-puppet.yaml + OS::TripleO::Services::GnocchiMetricd: /usr/share/openstack-tripleo-heat-templates/deployment/gnocchi/gnocchi-metricd-container-puppet.yaml + OS::TripleO::Services::GnocchiStatsd: /usr/share/openstack-tripleo-heat-templates/deployment/gnocchi/gnocchi-statsd-container-puppet.yaml + OS::TripleO::Services::AodhApi: /usr/share/openstack-tripleo-heat-templates/deployment/aodh/aodh-api-container-puppet.yaml + OS::TripleO::Services::AodhEvaluator: /usr/share/openstack-tripleo-heat-templates/deployment/aodh/aodh-evaluator-container-puppet.yaml + OS::TripleO::Services::AodhNotifier: /usr/share/openstack-tripleo-heat-templates/deployment/aodh/aodh-notifier-container-puppet.yaml + OS::TripleO::Services::AodhListener: /usr/share/openstack-tripleo-heat-templates/deployment/aodh/aodh-listener-container-puppet.yaml + + parameter_defaults: + CeilometerEnableGnocchi: true + CeilometerEnablePanko: false + GnocchiArchivePolicy: 'high' + GnocchiBackend: 'rbd' + GnocchiRbdPoolName: 'metrics' + + EventPipelinePublishers: ['gnocchi://?filter_project=service'] + PipelinePublishers: ['gnocchi://?filter_project=service'] + diff --git a/tests/infrared/17.1/infrared-openstack.sh b/tests/infrared/17.1/infrared-openstack.sh new file mode 100755 index 000000000..cf478fddd --- /dev/null +++ b/tests/infrared/17.1/infrared-openstack.sh @@ -0,0 +1,223 @@ +#!/usr/bin/env bash +set -e + +# Usage: +# VIRTHOST=my.big.hypervisor.net +# ./infrared-openstack.sh +VIRTHOST=${VIRTHOST:-localhost} +AMQP_HOST=${AMQP_HOST:-stf-default-interconnect-5671-service-telemetry.apps-crc.testing} 
+AMQP_PORT=${AMQP_PORT:-443} +SSH_KEY="${SSH_KEY:-${HOME}/.ssh/id_rsa}" +NTP_SERVER="${NTP_SERVER:-clock.redhat.com,10.5.27.10,10.11.160.238}" +CLOUD_NAME="${CLOUD_NAME:-cloud1}" +OCP_ROUTE_IP=${OCP_ROUTE_IP:-} + +VM_IMAGE_URL_PATH="${VM_IMAGE_URL_PATH:-http://download.devel.redhat.com/rhel-9/rel-eng/RHEL-9/latest-RHEL-9.2/compose/BaseOS/x86_64/images/}" +# Recommend these default to tested immutable dentifiers where possible, pass "latest" style ids via environment if you want them +VM_IMAGE="${VM_IMAGE:-rhel-guest-image-9.2-20230414.17.x86_64.qcow2}" +VM_IMAGE_LOCATION="${VM_IMAGE_URL_PATH}/${VM_IMAGE}" + +OSP_BUILD="${OSP_BUILD:-passed_phase2}" +OSP_VERSION="${OSP_VERSION:-17.1}" +OSP_TOPOLOGY="${OSP_TOPOLOGY:-undercloud:1,controller:3,compute:2,ceph:0}" +OSP_MIRROR="${OSP_MIRROR:-rdu2}" +LIBVIRT_DISKPOOL="${LIBVIRT_DISKPOOL:-/var/lib/libvirt/images}" +STF_ENVIRONMENT_TEMPLATE="${STF_ENVIRONMENT_TEMPLATE:-stf-connectors.yaml.template}" +GNOCCHI_ENVIRONMENT_TEMPLATE="${GNOCCHI_ENVIRONMENT_TEMPLATE:-gnocchi-connectors.yaml.template}" +ENABLE_STF_ENVIRONMENT_TEMPLATE="${ENABLE_STF_ENVIRONMENT_TEMPLATE:-enable-stf.yaml.template}" +EXTRA_HOST_FILE_TEMPLATE="${EXTRA_HOST_FILE_TEMPLATE:-extra-hosts.yaml.template}" +OVERCLOUD_DOMAIN="${OVERCLOUD_DOMAIN:-`hostname -s`}" + +UNDERCLOUD_CPU="${UNDERCLOUD_CPU:-4}" +UNDERCLOUD_MEMORY="${UNDERCLOUD_MEMORY:-16384}" +CONTROLLER_CPU="${CONTROLLER_CPU:-2}" +CONTROLLER_MEMORY="${CONTROLLER_MEMORY:-12228}" +COMPUTE_CPU="${COMPUTE_CPU:-4}" +COMPUTE_MEMORY="${COMPUTE_MEMORY:-12228}" +CEPH_CPU="${CEPH_CPU:-2}" +CEPH_MEMORY="${CEPH_MEMORY:-4096}" + +TEMPEST_ONLY="${TEMPEST_ONLY:-false}" +RUN_WORKLOAD="${RUN_WORKLOAD:-false}" +CA_CERT_FILE_CONTENT="${CA_CERT_FILE_CONTENT:-}" +ENABLE_STF_CONNECTORS="${ENABLE_STF_CONNECTORS:-true}" +ENABLE_GNOCCHI_CONNECTORS="${ENABLE_GNOCCHI_CONNECTORS:-true}" + +ir_run_cleanup() { + infrared virsh \ + -vv \ + -o outputs/cleanup.yml \ + --disk-pool "${LIBVIRT_DISKPOOL}" \ + --host-address "${VIRTHOST}" \ 
+ --host-key "${SSH_KEY}" \ + --cleanup yes + + echo "*** If you just want to clean up the environment now is your chance to Ctrl+C ***" + sleep 10 +} + +ir_run_provision() { + infrared virsh \ + -vvv \ + -o outputs/provision.yml \ + --disk-pool "${LIBVIRT_DISKPOOL}" \ + --topology-nodes "${OSP_TOPOLOGY}" \ + --host-address "${VIRTHOST}" \ + --host-key "${SSH_KEY}" \ + --image-url "${VM_IMAGE_LOCATION}" \ + --host-memory-overcommit True \ + --topology-network 3_nets \ + -e override.undercloud.cpu="${UNDERCLOUD_CPU}" \ + -e override.undercloud.memory="${UNDERCLOUD_MEMORY}" \ + -e override.controller.cpu="${CONTROLLER_CPU}" \ + -e override.controller.memory="${CONTROLLER_MEMORY}" \ + -e override.compute.cpu="${COMPUTE_CPU}" \ + -e override.compute.memory="${COMPUTE_MEMORY}" \ + -e override.ceph.cpu="${CEPH_CPU}" \ + -e override.ceph.memory="${CEPH_MEMORY}" \ + --serial-files True \ + --bootmode uefi +} + +ir_create_undercloud() { + infrared tripleo-undercloud \ + -vv \ + -o outputs/undercloud-install.yml \ + --mirror "${OSP_MIRROR}" \ + --version "${OSP_VERSION}" \ + --splitstack no \ + --shade-host undercloud-0 \ + --ssl yes \ + --build "${OSP_BUILD}" \ + --images-task rpm \ + --images-update no \ + --tls-ca https://password.corp.redhat.com/RH-IT-Root-CA.crt \ + --overcloud-domain "${OVERCLOUD_DOMAIN}" \ + --config-options DEFAULT.undercloud_timezone=UTC +} + +stf_create_config() { + sed -r "s/<>/${AMQP_HOST}/;s/<>/${AMQP_PORT}/;s/<>/${CLOUD_NAME}/;s%<>%${CA_CERT_FILE_CONTENT//$'\n'/<@@@>}%;s/<@@@>/\n /g" ${STF_ENVIRONMENT_TEMPLATE} > outputs/stf-connectors.yaml +} + +gnocchi_create_config() { + cat ${GNOCCHI_ENVIRONMENT_TEMPLATE} > outputs/gnocchi-connectors.yaml +} + +enable_stf_create_config() { + cat ${ENABLE_STF_ENVIRONMENT_TEMPLATE} > outputs/enable-stf.yaml +} + +enable_extra_host_file_create_config() { + sed -r "s/<>/${OCP_ROUTE_IP} ${AMQP_HOST}/g" ${EXTRA_HOST_FILE_TEMPLATE} > outputs/extra-hosts.yaml +} + +ir_create_overcloud() { + infrared 
tripleo-overcloud \ + -vv \ + -o outputs/overcloud-install.yml \ + --version "${OSP_VERSION}" \ + --deployment-files virt \ + --overcloud-debug yes \ + --network-backend geneve \ + --network-protocol ipv4 \ + --network-bgpvpn no \ + --network-dvr no \ + --network-l2gw no \ + --storage-backend lvm \ + --overcloud-ssl no \ + --introspect yes \ + --tagging yes \ + --deploy yes \ + --overcloud-templates ceilometer-write-qdr-edge-only,outputs/enable-stf.yaml,outputs/stf-connectors.yaml,outputs/gnocchi-connectors.yaml,outputs/extra-hosts.yaml \ + --overcloud-domain "${OVERCLOUD_DOMAIN}" \ + --containers yes \ + --vbmc-force False \ + --vbmc-host undercloud \ + --config-heat ComputeParameters.NeutronBridgeMappings='tenant:br-isolated' \ + --extra-vars osp_version="${OSP_VERSION}" +} + +ir_run_tempest() { + infrared tempest \ + -vv \ + -o outputs/test.yml \ + --openstack-installer tripleo \ + --openstack-version "${OSP_VERSION}" \ + --tests smoke \ + --setup rpm \ + --revision=HEAD \ + --image http://download.cirros-cloud.net/0.4.0/cirros-0.4.0-x86_64-disk.img +} + +ir_expose_ui() { + infrared cloud-config --external-dhcp True \ + --external-shared True \ + --deployment-files virt \ + --tasks create_external_network,forward_overcloud_dashboard +} + +ir_run_workload() { + infrared cloud-config --deployment-files virt --tasks launch_workload +} + + +if [ -z "${CA_CERT_FILE_CONTENT}" ]; then + echo "CA_CERT_FILE_CONTENT must be set and passed to the deployment, or QDR will fail to connect." 
+ exit 1 +fi + +time if ${TEMPEST_ONLY}; then + echo "-- Running tempest tests" + ir_run_tempest +else + echo "-- full cloud deployment" + echo ">> Cloud name: ${CLOUD_NAME}" + echo ">> Overcloud domain: ${OVERCLOUD_DOMAIN}" + echo ">> STF enabled: ${ENABLE_STF_CONNECTORS}" + echo ">> Gnocchi enabled: ${ENABLE_GNOCCHI_CONNECTORS}" + echo ">> OSP version: ${OSP_VERSION}" + echo ">> OSP build: ${OSP_BUILD}" + echo ">> OSP topology: ${OSP_TOPOLOGY}" + + ir_run_cleanup + if ${ENABLE_STF_CONNECTORS}; then + stf_create_config + enable_stf_create_config + if [ -z "${OCP_ROUTE_IP}" ]; then + touch outputs/extra-hosts.yaml + truncate --size 0 outputs/extra-hosts.yaml + else + enable_extra_host_file_create_config + fi + else + touch outputs/stf-connectors.yaml + truncate --size 0 outputs/stf-connectors.yaml + touch outputs/enable-stf.yaml + truncate --size 0 outputs/enable-stf.yaml + touch outputs/extra-hosts.yaml + truncate --size 0 outputs/extra-hosts.yaml + fi + if ${ENABLE_GNOCCHI_CONNECTORS}; then + gnocchi_create_config + else + touch outputs/gnocchi-connectors.yaml + truncate --size 0 outputs/gnocchi-connectors.yaml + fi + ir_run_provision + ir_create_undercloud + ir_create_overcloud + ir_expose_ui + if ${RUN_WORKLOAD}; then + ir_run_workload + fi + + echo "-- deployment completed" + echo ">> Cloud name: ${CLOUD_NAME}" + echo ">> Overcloud domain: ${OVERCLOUD_DOMAIN}" + echo ">> STF enabled: ${ENABLE_STF_CONNECTORS}" + echo ">> Gnocchi enabled: ${ENABLE_GNOCCHI_CONNECTORS}" + echo ">> OSP version: ${OSP_VERSION}" + echo ">> OSP build: ${OSP_BUILD}" + echo ">> OSP topology: ${OSP_TOPOLOGY}" +fi diff --git a/tests/infrared/17.1/outputs/.KEEPIT b/tests/infrared/17.1/outputs/.KEEPIT new file mode 100644 index 000000000..e69de29bb diff --git a/tests/infrared/17.1/stf-connectors.yaml.template b/tests/infrared/17.1/stf-connectors.yaml.template new file mode 100644 index 000000000..30e119b3c --- /dev/null +++ b/tests/infrared/17.1/stf-connectors.yaml.template @@ -0,0 +1,67 @@ 
+--- +tripleo_heat_templates: + [] + +custom_templates: + # don't load collectd-write-qdr.yaml when using multi-cloud and instead load collectd service directly + resource_registry: + OS::TripleO::Services::Collectd: /usr/share/openstack-tripleo-heat-templates/deployment/metrics/collectd-container-puppet.yaml + + # set parameter defaults to match stable-1.3 documentation + parameter_defaults: + ExtraConfig: + qdr::router_id: "%{::hostname}.<>" + + MetricsQdrConnectors: + - host: <> + port: <> + role: edge + verifyHostname: false + sslProfile: sslProfile + + MetricsQdrSSLProfiles: + - name: sslProfile + caCertFileContent: | + <> + + CeilometerQdrEventsConfig: + driver: amqp + topic: <>-event + + CeilometerQdrMetricsConfig: + driver: amqp + topic: <>-metering + + CollectdAmqpInstances: + <>-notify: + format: JSON + notify: true + presettle: false + <>-telemetry: + format: JSON + presettle: false + + CollectdSensubilityResultsChannel: sensubility/<>-telemetry + + # --- below here, extended configuration for environment beyond what is documented in stable-1.3 + CollectdSensubilityLogLevel: DEBUG + CephStorageExtraConfig: + tripleo::profile::base::metrics::collectd::amqp_host: "%{hiera('storage')}" + tripleo::profile::base::metrics::qdr::listener_addr: "%{hiera('storage')}" + + collectd::plugin::ceph::daemons: + - ceph-osd.0 + - ceph-osd.1 + - ceph-osd.2 + - ceph-osd.3 + - ceph-osd.4 + - ceph-osd.5 + - ceph-osd.6 + - ceph-osd.7 + - ceph-osd.8 + - ceph-osd.9 + - ceph-osd.10 + - ceph-osd.11 + - ceph-osd.12 + - ceph-osd.13 + - ceph-osd.14 From 627db785aa1c5746dad6a7274106511f38694b3b Mon Sep 17 00:00:00 2001 From: enothen Date: Mon, 11 Sep 2023 20:27:13 +0200 Subject: [PATCH 42/95] Corrected Hugepages alert summary to match severity (#451) Co-authored-by: Eric Nothen Co-authored-by: Leif Madsen --- deploy/alerts/alerts.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/alerts/alerts.yaml b/deploy/alerts/alerts.yaml index 229c5c385..15ef94629 
100644 --- a/deploy/alerts/alerts.yaml +++ b/deploy/alerts/alerts.yaml @@ -182,7 +182,7 @@ spec: labels: severity: critical annotations: - summary: Hugepages (warning) + summary: Hugepages (critical) expr: >- sum without (type_instance) (collectd_hugepages_vmpage_number{type_instance="free"})/ sum without (type_instance) (collectd_hugepages_vmpage_number) < 0.1 for: 10m From 9a6142d54f57cbc5636fdf8a705902eecd8a6199 Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Wed, 13 Sep 2023 17:49:27 +0100 Subject: [PATCH 43/95] [stf-run-ci] Update vars in README (#459) * [stf-run-ci] Update vars in README Update default values Remove the vars that are no longer used * Update build/stf-run-ci/README.md Co-authored-by: Chris Sibbitt --------- Co-authored-by: Chris Sibbitt --- build/stf-run-ci/README.md | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/build/stf-run-ci/README.md b/build/stf-run-ci/README.md index 6b908d4f2..f0c47db15 100644 --- a/build/stf-run-ci/README.md +++ b/build/stf-run-ci/README.md @@ -32,8 +32,7 @@ choose to override: | `sg_core_repository` | | https://github.com/infrawatch/sg-core | Which Smart Gateway Core git repository to clone | | `sg_bridge_repository` | | https://github.com/infrawatch/sg-bridge | Which Smart Gateway Bridge git repository to clone | | `prometheus_webhook_snmp_repository` | | https://github.com/infrawatch/prometheus-webhook-snmp | Which Prometheus webhook snmp git repository to clone | -| `loki_operator_repository` | | https://github.com/viaq/loki-operator | Which Loki-operator git repository to clone | -| `__service_telemetry_events_certificates_endpoint_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 2160h | Lifetime of the ElasticSearch endpoint certificate (minimum duration is 1h) | +| `__service_telemetry_events_certificates_endpoint_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the ElasticSearch endpoint certificate 
(minimum duration is 1h) | | `__service_telemetry_events_certificates_ca_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the ElasticSearch CA certificate (minimum duration is 1h) | | `__service_telemetry_events_enabled` | {true,false} | true | Whether to enable events support in ServiceTelemetry | | `__service_telemetry_high_availability_enabled` | {true,false} | false | Whether to enable high availability support in ServiceTelemetry | @@ -51,14 +50,10 @@ choose to override: | `__service_telemetry_trap_default_oid` | | 1.3.6.1.4.1.50495.15.1.2.1 | The trap OID if none is found in the Prometheus alert labels. | | `__service_telemetry_trap_default_severity` | | | The trap severity if none is found in the Prometheus alert labels. | | `__service_telemetry_logs_enabled` | {true,false} | false | Whether to enable logs support in ServiceTelemetry | -| `__service_telemetry_observability_strategy` | | `use_hybrid` | Which observability strategy to use for deployment. Default is 'use_hybrid'. Also supported are 'use_redhat', 'use_community', and 'none' | -| `__service_telemetry_transports_certificates_endpoint_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 2160h | Lifetime of the QDR endpoint certificate (minimum duration is 1h) | +| `__service_telemetry_observability_strategy` | | `use_redhat` | Which observability strategy to use for deployment. Default is 'use_redhat'. 
Also supported are 'use_hybrid', 'use_community', and 'none' | +| `__service_telemetry_transports_certificates_endpoint_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the QDR endpoint certificate (minimum duration is 1h) | | `__service_telemetry_transports_certificates_ca_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the QDR CA certificate (minimum duration is 1h) | | `__internal_registry_path` | | image-registry.openshift-image-registry.svc:5000 | Path to internal registry for image path | -| `__deploy_loki_enabled` | {true,false} | false | Whether to deploy loki-operator and other systems for logging development purposes | -| `__golang_image_path` | | quay.io/infrawatch/golang:1.16 | Golang image path for building the loki-operator image | -| `__loki_image_path` | | quay.io/infrawatch/loki:2.2.1 | Loki image path for Loki microservices | - # Example Playbook From 29460aad7ad9d2726e5a67c9c852999fdf2490e8 Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Wed, 13 Sep 2023 18:03:13 +0100 Subject: [PATCH 44/95] Update Dockerfile to create epel repo config when NO_PROXY is undefined (#458) --- build/Dockerfile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/build/Dockerfile b/build/Dockerfile index c96f01ccf..420d86a8b 100644 --- a/build/Dockerfile +++ b/build/Dockerfile @@ -2,8 +2,9 @@ FROM quay.io/openshift/origin-ansible-operator:4.10 USER 0 # Upstream CI builds need the additional EPEL sources for python3-passlib and python3-bcrypt but have no working repos to install epel-release -# NO_PROXY is undefined in upsream CI builds, but defined (usually blank) during openshift builds (a possibly brittle hack) -RUN bash -c -- 'if [ "${NO_PROXY-__ZZZZZ}" == "__ZZZZZ" ]; then echo "Applying upstream EPEL hacks" && echo -e "-----BEGIN PGP PUBLIC KEY 
BLOCK-----\n\nmQINBFz3zvsBEADJOIIWllGudxnpvJnkxQz2CtoWI7godVnoclrdl83kVjqSQp+2\ndgxuG5mUiADUfYHaRQzxKw8efuQnwxzU9kZ70ngCxtmbQWGmUmfSThiapOz00018\n+eo5MFabd2vdiGo1y+51m2sRDpN8qdCaqXko65cyMuLXrojJHIuvRA/x7iqOrRfy\na8x3OxC4PEgl5pgDnP8pVK0lLYncDEQCN76D9ubhZQWhISF/zJI+e806V71hzfyL\n/Mt3mQm/li+lRKU25Usk9dWaf4NH/wZHMIPAkVJ4uD4H/uS49wqWnyiTYGT7hUbi\necF7crhLCmlRzvJR8mkRP6/4T/F3tNDPWZeDNEDVFUkTFHNU6/h2+O398MNY/fOh\nyKaNK3nnE0g6QJ1dOH31lXHARlpFOtWt3VmZU0JnWLeYdvap4Eff9qTWZJhI7Cq0\nWm8DgLUpXgNlkmquvE7P2W5EAr2E5AqKQoDbfw/GiWdRvHWKeNGMRLnGI3QuoX3U\npAlXD7v13VdZxNydvpeypbf/AfRyrHRKhkUj3cU1pYkM3DNZE77C5JUe6/0nxbt4\nETUZBTgLgYJGP8c7PbkVnO6I/KgL1jw+7MW6Az8Ox+RXZLyGMVmbW/TMc8haJfKL\nMoUo3TVk8nPiUhoOC0/kI7j9ilFrBxBU5dUtF4ITAWc8xnG6jJs/IsvRpQARAQAB\ntChGZWRvcmEgRVBFTCAoOCkgPGVwZWxAZmVkb3JhcHJvamVjdC5vcmc+iQI4BBMB\nAgAiBQJc9877AhsPBgsJCAcDAgYVCAIJCgsEFgIDAQIeAQIXgAAKCRAh6kWrL4bW\noWagD/4xnLWws34GByVDQkjprk0fX7Iyhpm/U7BsIHKspHLL+Y46vAAGY/9vMvdE\n0fcr9Ek2Zp7zE1RWmSCzzzUgTG6BFoTG1H4Fho/7Z8BXK/jybowXSZfqXnTOfhSF\nalwDdwlSJvfYNV9MbyvbxN8qZRU1z7PEWZrIzFDDToFRk0R71zHpnPTNIJ5/YXTw\nNqU9OxII8hMQj4ufF11040AJQZ7br3rzerlyBOB+Jd1zSPVrAPpeMyJppWFHSDAI\nWK6x+am13VIInXtqB/Cz4GBHLFK5d2/IYspVw47Solj8jiFEtnAq6+1Aq5WH3iB4\nbE2e6z00DSF93frwOyWN7WmPIoc2QsNRJhgfJC+isGQAwwq8xAbHEBeuyMG8GZjz\nxohg0H4bOSEujVLTjH1xbAG4DnhWO/1VXLX+LXELycO8ZQTcjj/4AQKuo4wvMPrv\n9A169oETG+VwQlNd74VBPGCvhnzwGXNbTK/KH1+WRH0YSb+41flB3NKhMSU6dGI0\nSGtIxDSHhVVNmx2/6XiT9U/znrZsG5Kw8nIbbFz+9MGUUWgJMsd1Zl9R8gz7V9fp\nn7L7y5LhJ8HOCMsY/Z7/7HUs+t/A1MI4g7Q5g5UuSZdgi0zxukiWuCkLeAiAP4y7\nzKK4OjJ644NDcWCHa36znwVmkz3ixL8Q0auR15Oqq2BjR/fyog==\n=84m8\n-----END PGP PUBLIC KEY BLOCK-----" > /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-8 && echo -e "[epel]\nname=Extra Packages for Enterprise Linux 8 - \$basearch\nmetalink=https://mirrors.fedoraproject.org/metalink?repo=epel-8&arch=\$basearch&infra=\$infra&content=\$contentdir\nenabled=1\ngpgcheck=1\ngpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-8" > /etc/yum.repos.d/epel.repo; fi' +# NO_PROXY is undefined in 
upstream CI builds, but defined (usually blank) during openshift builds (a possibly brittle hack) +RUN bash -c -- 'if [ "${NO_PROXY:-__ZZZZZ}" == "__ZZZZZ" ]; then echo "Applying upstream EPEL hacks" && echo -e "-----BEGIN PGP PUBLIC KEY BLOCK-----\n\nmQINBFz3zvsBEADJOIIWllGudxnpvJnkxQz2CtoWI7godVnoclrdl83kVjqSQp+2\ndgxuG5mUiADUfYHaRQzxKw8efuQnwxzU9kZ70ngCxtmbQWGmUmfSThiapOz00018\n+eo5MFabd2vdiGo1y+51m2sRDpN8qdCaqXko65cyMuLXrojJHIuvRA/x7iqOrRfy\na8x3OxC4PEgl5pgDnP8pVK0lLYncDEQCN76D9ubhZQWhISF/zJI+e806V71hzfyL\n/Mt3mQm/li+lRKU25Usk9dWaf4NH/wZHMIPAkVJ4uD4H/uS49wqWnyiTYGT7hUbi\necF7crhLCmlRzvJR8mkRP6/4T/F3tNDPWZeDNEDVFUkTFHNU6/h2+O398MNY/fOh\nyKaNK3nnE0g6QJ1dOH31lXHARlpFOtWt3VmZU0JnWLeYdvap4Eff9qTWZJhI7Cq0\nWm8DgLUpXgNlkmquvE7P2W5EAr2E5AqKQoDbfw/GiWdRvHWKeNGMRLnGI3QuoX3U\npAlXD7v13VdZxNydvpeypbf/AfRyrHRKhkUj3cU1pYkM3DNZE77C5JUe6/0nxbt4\nETUZBTgLgYJGP8c7PbkVnO6I/KgL1jw+7MW6Az8Ox+RXZLyGMVmbW/TMc8haJfKL\nMoUo3TVk8nPiUhoOC0/kI7j9ilFrBxBU5dUtF4ITAWc8xnG6jJs/IsvRpQARAQAB\ntChGZWRvcmEgRVBFTCAoOCkgPGVwZWxAZmVkb3JhcHJvamVjdC5vcmc+iQI4BBMB\nAgAiBQJc9877AhsPBgsJCAcDAgYVCAIJCgsEFgIDAQIeAQIXgAAKCRAh6kWrL4bW\noWagD/4xnLWws34GByVDQkjprk0fX7Iyhpm/U7BsIHKspHLL+Y46vAAGY/9vMvdE\n0fcr9Ek2Zp7zE1RWmSCzzzUgTG6BFoTG1H4Fho/7Z8BXK/jybowXSZfqXnTOfhSF\nalwDdwlSJvfYNV9MbyvbxN8qZRU1z7PEWZrIzFDDToFRk0R71zHpnPTNIJ5/YXTw\nNqU9OxII8hMQj4ufF11040AJQZ7br3rzerlyBOB+Jd1zSPVrAPpeMyJppWFHSDAI\nWK6x+am13VIInXtqB/Cz4GBHLFK5d2/IYspVw47Solj8jiFEtnAq6+1Aq5WH3iB4\nbE2e6z00DSF93frwOyWN7WmPIoc2QsNRJhgfJC+isGQAwwq8xAbHEBeuyMG8GZjz\nxohg0H4bOSEujVLTjH1xbAG4DnhWO/1VXLX+LXELycO8ZQTcjj/4AQKuo4wvMPrv\n9A169oETG+VwQlNd74VBPGCvhnzwGXNbTK/KH1+WRH0YSb+41flB3NKhMSU6dGI0\nSGtIxDSHhVVNmx2/6XiT9U/znrZsG5Kw8nIbbFz+9MGUUWgJMsd1Zl9R8gz7V9fp\nn7L7y5LhJ8HOCMsY/Z7/7HUs+t/A1MI4g7Q5g5UuSZdgi0zxukiWuCkLeAiAP4y7\nzKK4OjJ644NDcWCHa36znwVmkz3ixL8Q0auR15Oqq2BjR/fyog==\n=84m8\n-----END PGP PUBLIC KEY BLOCK-----" > /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-8 && echo -e "[epel]\nname=Extra Packages for Enterprise Linux 8 - 
\$basearch\nmetalink=https://mirrors.fedoraproject.org/metalink?repo=epel-8&arch=\$basearch&infra=\$infra&content=\$contentdir\nenabled=1\ngpgcheck=1\ngpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-8" > /etc/yum.repos.d/epel.repo; fi' + # Required for oauth-proxy RUN dnf install -y python3-passlib python3-bcrypt USER 1001 From 1d9acc7c0c4db049939a4a7330a4b00f5abed1b0 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Wed, 13 Sep 2023 15:41:14 -0400 Subject: [PATCH 45/95] Add basic README for OSP 17.1 test deployment (#456) --- tests/infrared/17.1/README.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 tests/infrared/17.1/README.md diff --git a/tests/infrared/17.1/README.md b/tests/infrared/17.1/README.md new file mode 100644 index 000000000..576e91c94 --- /dev/null +++ b/tests/infrared/17.1/README.md @@ -0,0 +1,16 @@ +# Deployments + +## Basic deployment + +```bash +CA_CERT_FILE_CONTENT="$(oc get secret/default-interconnect-selfsigned -o jsonpath='{.data.ca\.crt}' | base64 -d)" \ +OCP_ROUTE_IP="10.0.100.50" \ +AMQP_HOST="default-interconnect-5671-service-telemetry.apps.stf15.localhost" \ +ENABLE_STF_CONNECTORS=true \ +ENABLE_GNOCCHI_CONNECTORS=false \ +CONTROLLER_MEMORY="24000" \ +COMPUTE_CPU="6" \ +COMPUTE_MEMORY="24000" \ +LIBVIRT_DISKPOOL="/home/libvirt/images" \ +./infrared-openstack.sh +``` From 0d1c4da7f2930bd24bc056ab30ff2003ac7281fe Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Wed, 13 Sep 2023 17:49:24 -0400 Subject: [PATCH 46/95] Use fully qualified apiVersion on Routes (#453) --- roles/servicetelemetry/templates/manifest_alertmanager_route.j2 | 2 +- roles/servicetelemetry/templates/manifest_prometheus_route.j2 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/roles/servicetelemetry/templates/manifest_alertmanager_route.j2 b/roles/servicetelemetry/templates/manifest_alertmanager_route.j2 index 5e00bee57..b25b7811a 100644 --- a/roles/servicetelemetry/templates/manifest_alertmanager_route.j2 +++ 
b/roles/servicetelemetry/templates/manifest_alertmanager_route.j2 @@ -1,4 +1,4 @@ -apiVersion: v1 +apiVersion: route.openshift.io/v1 kind: Route metadata: name: '{{ ansible_operator_meta.name }}-alertmanager-proxy' diff --git a/roles/servicetelemetry/templates/manifest_prometheus_route.j2 b/roles/servicetelemetry/templates/manifest_prometheus_route.j2 index af1c024ad..85611cb80 100644 --- a/roles/servicetelemetry/templates/manifest_prometheus_route.j2 +++ b/roles/servicetelemetry/templates/manifest_prometheus_route.j2 @@ -1,4 +1,4 @@ -apiVersion: v1 +apiVersion: route.openshift.io/v1 kind: Route metadata: name: '{{ ansible_operator_meta.name }}-prometheus-proxy' From f490a9f34acd673dd4d2eb9c0dd441205c71ab29 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Wed, 13 Sep 2023 17:53:09 -0400 Subject: [PATCH 47/95] Clean up dependency installation in stf-run-ci (#455) * Clean up dependency installation in stf-run-ci Clean up the dependency installation when not using index images (which installs dependencies via dependencies.yaml). Adjust installation of cert-manager for versions of OCP 4.10 vs 4.12 and later. * Move ObO CatalogSource removal to pre-clean stage --- build/stf-run-ci/tasks/pre-clean.yml | 15 ++++- build/stf-run-ci/tasks/setup_base.yml | 90 +++++++++++++-------------- 2 files changed, 59 insertions(+), 46 deletions(-) diff --git a/build/stf-run-ci/tasks/pre-clean.yml b/build/stf-run-ci/tasks/pre-clean.yml index 24e35d0f6..ed11aec39 100644 --- a/build/stf-run-ci/tasks/pre-clean.yml +++ b/build/stf-run-ci/tasks/pre-clean.yml @@ -48,6 +48,19 @@ displayName: OperatorHub.io Operators publisher: OperatorHub.io +# Upstream Source + Sub from https://github.com/rhobs/observability-operator/tree/main/hack/olm +# Moved to using Community Operators Catalog, so no longer require upstream CatalogSource. Eventually move to Red Hat Operators CatalogSource. 
+- name: Remove Red Hat Observability Operator CatalogSource if it is installed + k8s: + state: absent + definition: + apiVersion: operators.coreos.com/v1alpha1 + kind: CatalogSource + metadata: + annotations: + name: observability-operator + namespace: openshift-marketplace + - name: Remove CloudOps CatalogSource if it is installed k8s: state: absent @@ -130,4 +143,4 @@ kind: Elasticsearch metadata: name: elasticsearch - namespace: "{{ namespace }}" \ No newline at end of file + namespace: "{{ namespace }}" diff --git a/build/stf-run-ci/tasks/setup_base.yml b/build/stf-run-ci/tasks/setup_base.yml index a90071b5b..28ff5e48c 100644 --- a/build/stf-run-ci/tasks/setup_base.yml +++ b/build/stf-run-ci/tasks/setup_base.yml @@ -13,10 +13,10 @@ name: certified-operators - disabled: false name: redhat-operators - - disabled: "{{ false if __service_telemetry_observability_strategy in ['use_community', 'use_hybrid'] else true }}" + - disabled: false name: community-operators -- name: Create OperatorGroup +- name: Create OperatorGroup for service-telemetry k8s: definition: apiVersion: operators.coreos.com/v1 @@ -28,7 +28,8 @@ targetNamespaces: - "{{ namespace }}" -- when: not __deploy_from_index_enabled | bool +# deploy cert-manager from tech-preview when using versions of OCP < 4.12 +- when: not __deploy_from_index_enabled | bool and ocp_ver.stdout is version ('4.12', '<') block: - name: Create openshift-cert-manager-operator namespace k8s: @@ -66,6 +67,28 @@ source: redhat-operators sourceNamespace: openshift-marketplace +# deploy cert-manager from stable-v1 in 4.12 and later using namespace scoped operator +- when: not __deploy_from_index_enabled | bool and ocp_ver.stdout is version ('4.12', '>=') + block: + - name: Subscribe to Cert Manager for OpenShift Operator + k8s: + definition: + apiVersion: operators.coreos.com/v1alpha1 + kind: Subscription + metadata: + labels: + operators.coreos.com/openshift-cert-manager-operator.service-telemetry: "" + name: 
openshift-cert-manager-operator-stable-v1-redhat-operators-openshift-marketplace + namespace: service-telemetry + spec: + channel: stable-v1 + installPlanApproval: Automatic + name: openshift-cert-manager-operator + source: redhat-operators + sourceNamespace: openshift-marketplace + +- when: not __deploy_from_index_enabled | bool + block: - name: Subscribe to AMQ Interconnect Operator k8s: definition: @@ -98,6 +121,25 @@ when: - __service_telemetry_observability_strategy == "use_community" + - name: Subscribe to Red Hat Obervability Operator + k8s: + definition: + apiVersion: operators.coreos.com/v1alpha1 + kind: Subscription + metadata: + labels: + operators.coreos.com/observability-operator.openshift-operators: "" + name: observability-operator + namespace: openshift-operators + spec: + channel: stable + installPlanApproval: Automatic + name: observability-operator + source: community-operators + sourceNamespace: openshift-marketplace + when: + - __service_telemetry_observability_strategy in ['use_redhat', 'use_hybrid'] + - name: Subscribe to Elastic Cloud on Kubernetes Operator k8s: definition: @@ -122,45 +164,3 @@ until: eckCRD.resources[0] is defined retries: 5 delay: 30 - -- block: - # Upstream Source + Sub from https://github.com/rhobs/observability-operator/tree/main/hack/olm - - name: Create CatalogSource for Red Hat Observability Operator - k8s: - definition: - apiVersion: operators.coreos.com/v1alpha1 - kind: CatalogSource - metadata: - annotations: - name: observability-operator - namespace: openshift-marketplace - spec: - displayName: Observability Operator - Test - icon: - base64data: "" - mediatype: "" - image: quay.io/rhobs/observability-operator-catalog:latest - publisher: Sunil Thaha - sourceType: grpc - updateStrategy: - registryPoll: - interval: 10m0s - - - name: Subscribe to Red Hat Obervability Operator - k8s: - definition: - apiVersion: operators.coreos.com/v1alpha1 - kind: Subscription - metadata: - labels: - 
operators.coreos.com/observability-operator.openshift-operators: "" - name: observability-operator - namespace: openshift-operators - spec: - channel: development - installPlanApproval: Automatic - name: observability-operator - source: observability-operator - sourceNamespace: openshift-marketplace - when: - - __service_telemetry_observability_strategy in ['use_redhat', 'use_hybrid'] From 0d0dbecd2ba6fbc47873ecc3eb4322990c0d6cdd Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Thu, 14 Sep 2023 12:01:47 -0400 Subject: [PATCH 48/95] Fix static namespace reference (#464) Fix a static namespace reference introduced in #455 --- build/stf-run-ci/tasks/setup_base.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/stf-run-ci/tasks/setup_base.yml b/build/stf-run-ci/tasks/setup_base.yml index 28ff5e48c..8c6f44104 100644 --- a/build/stf-run-ci/tasks/setup_base.yml +++ b/build/stf-run-ci/tasks/setup_base.yml @@ -79,7 +79,7 @@ labels: operators.coreos.com/openshift-cert-manager-operator.service-telemetry: "" name: openshift-cert-manager-operator-stable-v1-redhat-operators-openshift-marketplace - namespace: service-telemetry + namespace: "{{ namespace }}" spec: channel: stable-v1 installPlanApproval: Automatic From e04c521d2b0f8c2402d4720210d3453c33ecaf0b Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Thu, 14 Sep 2023 17:35:19 +0100 Subject: [PATCH 49/95] [ansible-lint] Use fqcn for modules (#457) * [ansible-lint] Use fqcn for modules * command -> ansible.builtin.command * debug -> ansible.builtin.debug * file -> ansible.builtin.file * k8s -> kubernetes.core.k8s * k8s_info -> kubernetes.core.k8s_info * set_fact -> ansible.builtin.set_fact * shell -> ansible.builtin.shell * template -> ansible.builtin.template https://github.com/infrawatch/service-telemetry-operator/pull/455 * Add lint testing for stf-run-ci fqcn (#462) --------- Co-authored-by: Leif Madsen --- .github/workflows/main.yml | 23 ++++++- build/stf-run-ci/.ansible-lint | 52 ++++++++++++++++ 
build/stf-run-ci/meta/main.yml | 35 +++-------- build/stf-run-ci/tasks/clone_repos.yml | 18 +++--- build/stf-run-ci/tasks/create_builds.yml | 23 +++---- build/stf-run-ci/tasks/create_catalog.yml | 30 +++++----- build/stf-run-ci/tasks/deploy_stf.yml | 8 +-- build/stf-run-ci/tasks/main.yml | 60 +++++++++---------- build/stf-run-ci/tasks/pre-clean.yml | 32 +++++----- build/stf-run-ci/tasks/preflight_checks.yml | 2 +- build/stf-run-ci/tasks/setup_base.yml | 22 +++---- .../stf-run-ci/tasks/setup_elasticsearch.yml | 8 +-- build/stf-run-ci/tasks/setup_stf.yml | 12 ++-- .../tasks/setup_stf_from_bundles.yml | 18 +++--- .../tasks/setup_stf_local_build.yml | 31 ++++++---- 15 files changed, 215 insertions(+), 159 deletions(-) create mode 100644 build/stf-run-ci/.ansible-lint diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 5cbce3e9e..aef3d0796 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -2,8 +2,8 @@ name: CI on: push jobs: - linting: - name: Linting + operator-linting: + name: Operator Linting runs-on: ubuntu-20.04 steps: - name: Checkout code @@ -16,11 +16,28 @@ jobs: run: ansible-galaxy collection install operator_sdk.util - name: Install ansible-lint - run: pip install 'ansible-lint < 6.0.0' + run: python -m pip install 'ansible-lint < 6.0.0' - name: Lint Ansible roles/servicetelemetry/ directory run: ${HOME}/.local/bin/ansible-lint roles/servicetelemetry + stf-run-ci-linting: + name: stf-run-ci Linting + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Install Ansible + run: python -m pip install 'ansible' + + - name: Install ansible-lint + run: python -m pip install 'ansible-lint' + + - name: Lint Ansible build/stf-run-ci directory + run: ${HOME}/.local/bin/ansible-lint . 
+ working-directory: ./build/stf-run-ci + build-operator-check: name: Build Operator check runs-on: ubuntu-20.04 diff --git a/build/stf-run-ci/.ansible-lint b/build/stf-run-ci/.ansible-lint new file mode 100644 index 000000000..ae660af6f --- /dev/null +++ b/build/stf-run-ci/.ansible-lint @@ -0,0 +1,52 @@ +--- +profile: null +skip_list: + - args + - avoid-implicit + - command-instead-of-module + - command-instead-of-shell + - complexity + - deprecated-bare-vars + - deprecated-local-action + - deprecated-module + - empty-string-compare + - galaxy + - ignore-errors + - inline-env-var + - internal-error + - jinja + - key-order + - latest + - literal-compare + - loop-var-prefix + - meta-incorrect + - meta-no-tags + - meta-runtime + - meta-video-links + - name + - no-changed-when + - no-free-form + - no-handler + - no-jinja-when + - no-log-password + - no-prompting + - no-relative-paths + - no-same-owner + - no-tabs + - only-builtins + - package-latest + - parser-error + - partial-become + - playbook-extension + - risky-file-permissions + - risky-octal + - risky-shell-pipe + - role-name + - run-once + - sanity + - schema + - var-naming + - warning + - yaml + +# vimrc: ft=yaml diff --git a/build/stf-run-ci/meta/main.yml b/build/stf-run-ci/meta/main.yml index 227ad9c34..e79928dd5 100644 --- a/build/stf-run-ci/meta/main.yml +++ b/build/stf-run-ci/meta/main.yml @@ -1,7 +1,10 @@ galaxy_info: - author: your name - description: your role description - company: your company (optional) + role_name: stf_run_ci # if absent directory name hosting role is used instead + namespace: infrawatch + + author: InfraWatch + description: Helper CI role for Service Telemetry Framework + company: Red Hat # If the issue tracker for your role is not on github, uncomment the # next line and provide a value @@ -14,30 +17,9 @@ galaxy_info: # - GPL-3.0-only # - Apache-2.0 # - CC-BY-4.0 - license: license (GPL-2.0-or-later, MIT, etc) - - min_ansible_version: 2.9 - - # If this a Container Enabled role, 
provide the minimum Ansible Container version. - # min_ansible_container_version: + license: Apache-2.0 - # - # Provide a list of supported platforms, and for each platform a list of versions. - # If you don't wish to enumerate all versions for a particular platform, use 'all'. - # To view available platforms and versions (or releases), visit: - # https://galaxy.ansible.com/api/v1/platforms/ - # - # platforms: - # - name: Fedora - # versions: - # - all - # - 25 - # - name: SomePlatform - # versions: - # - all - # - 1.0 - # - 7 - # - 99.99 + min_ansible_version: '2.14' galaxy_tags: [] # List tags for your role here, one per line. A tag is a keyword that describes @@ -50,4 +32,3 @@ galaxy_info: dependencies: [] # List your role dependencies here, one per line. Be sure to remove the '[]' above, # if you add dependencies to this list. - \ No newline at end of file diff --git a/build/stf-run-ci/tasks/clone_repos.yml b/build/stf-run-ci/tasks/clone_repos.yml index 11e5d84d6..f7b5dab27 100644 --- a/build/stf-run-ci/tasks/clone_repos.yml +++ b/build/stf-run-ci/tasks/clone_repos.yml @@ -6,14 +6,14 @@ - name: Get Smart Gateway Operator block: - name: Try cloning same-named branch or override branch from SGO repository - git: + ansible.builtin.git: repo: "{{ sgo_repository }}" dest: working/smart-gateway-operator version: "{{ sgo_branch | default(branch, true) }}" - force: yes + force: true rescue: - name: "Get {{ version_branches.sgo }} upstream branch because specified branch or repository doesn't exist" - git: + ansible.builtin.git: repo: https://github.com/infrawatch/smart-gateway-operator dest: working/smart-gateway-operator version: "{{ version_branches.sgo }}" @@ -21,13 +21,13 @@ - name: Get sg-core block: - name: Try cloning same-named branch or override branch from sg-core repository - git: + ansible.builtin.git: repo: "{{ sg_core_repository }}" dest: working/sg-core version: "{{ sg_core_branch | default(branch, true) }}" rescue: - name: "Get {{ 
version_branches.sg_core }} upstream branch because specified branch or repository doesn't exist" - git: + ansible.builtin.git: repo: https://github.com/infrawatch/sg-core dest: working/sg-core version: "{{ version_branches.sg_core }}" @@ -35,13 +35,13 @@ - name: Get sg-bridge block: - name: Try cloning same-named branch or override branch from sg-bridge repository - git: + ansible.builtin.git: repo: "{{ sg_bridge_repository }}" dest: working/sg-bridge version: "{{ sg_bridge_branch | default(branch, true) }}" rescue: - name: "Get {{ version_branches.sg_bridge }} upstream branch because specified branch or repository doesn't exist" - git: + ansible.builtin.git: repo: https://github.com/infrawatch/sg-bridge dest: working/sg-bridge version: "{{ version_branches.sg_bridge }}" @@ -49,13 +49,13 @@ - name: Get prometheus-webhook-snmp block: - name: Try cloning same-named branch or override branch from prometheus-webhook-snmp repository - git: + ansible.builtin.git: repo: "{{ prometheus_webhook_snmp_repository }}" dest: working/prometheus-webhook-snmp version: "{{ prometheus_webhook_snmp_branch | default(branch, true) }}" rescue: - name: "Get {{ version_branches.prometheus_webhook_snmp }} upstream branch because specified branch or repository doesn't exist" - git: + ansible.builtin.git: repo: https://github.com/infrawatch/prometheus-webhook-snmp dest: working/prometheus-webhook-snmp version: "{{ version_branches.prometheus_webhook_snmp }}" diff --git a/build/stf-run-ci/tasks/create_builds.yml b/build/stf-run-ci/tasks/create_builds.yml index 0282f3d56..132f64e55 100644 --- a/build/stf-run-ci/tasks/create_builds.yml +++ b/build/stf-run-ci/tasks/create_builds.yml @@ -1,6 +1,6 @@ --- - name: Get current BuildConfig for artifact to check if it exists - k8s_info: + kubernetes.core.k8s_info: api_version: build.openshift.io/v1 kind: BuildConfig namespace: "{{ namespace }}" @@ -8,7 +8,7 @@ register: build_config_lookup - name: Get current Builds for artifact to check if it exists - 
k8s_info: + kubernetes.core.k8s_info: api_version: build.openshift.io/v1 kind: Build namespace: "{{ namespace }}" @@ -19,33 +19,34 @@ - when: build_config_lookup.resources | length == 0 block: - name: Create BuildConfig and ImageStream - shell: oc new-build -n "{{ namespace }}" --name {{ artifact.name }} --dockerfile - < {{ artifact.working_build_dir }}/{{ artifact.dockerfile_path }} + ansible.builtin.shell: oc new-build -n "{{ namespace }}" --name {{ artifact.name }} --dockerfile - < {{ artifact.working_build_dir }}/{{ artifact.dockerfile_path }} - name: Kill first build since it will always fail (triggered on BuildConfig creation) - shell: sleep 10 ; oc delete build {{ artifact.name }}-1 -n "{{ namespace }}" + ansible.builtin.shell: sleep 10 ; oc delete build {{ artifact.name }}-1 -n "{{ namespace }}" - name: Start local image build - command: oc start-build {{ artifact.name }} -n "{{ namespace }}" --wait --from-dir "{{ artifact.working_build_dir }}" + ansible.builtin.command: oc start-build {{ artifact.name }} -n "{{ namespace }}" --wait --from-dir "{{ artifact.working_build_dir }}" register: build_results when: build_lookup.resources | length == 0 - name: Get latest build information for artifact - command: oc get build --selector build={{ artifact.name }} -n "{{ namespace }}" -ojsonpath='{.items[-1:]}' + ansible.builtin.command: oc get build --selector build={{ artifact.name }} -n "{{ namespace }}" -ojsonpath='{.items[-1:]}' register: build_describe_results - name: Set build_describe from json results - set_fact: + ansible.builtin.set_fact: build_describe: "{{ build_describe_results.stdout | from_json }}" -- debug: +- ansible.builtin.debug: var: build_describe -- debug: +- ansible.builtin.debug: var: build_describe.status.outputDockerImageReference - name: Set unique image reference for this artifact - set_fact: + ansible.builtin.set_fact: "{{ artifact.image_reference_name }}": "{{ build_describe.status.outputDockerImageReference }}" -- debug: +- name: Show 
the image reference name for the build + ansible.builtin.debug: var: "{{ artifact.image_reference_name }}" diff --git a/build/stf-run-ci/tasks/create_catalog.yml b/build/stf-run-ci/tasks/create_catalog.yml index 3ad667c4e..d57825760 100644 --- a/build/stf-run-ci/tasks/create_catalog.yml +++ b/build/stf-run-ci/tasks/create_catalog.yml @@ -1,21 +1,21 @@ --- - name: Create service-telemetry-framework-index working directory - file: + ansible.builtin.file: path: working/service-telemetry-framework-index state: directory mode: '0755' - name: Create info variables from bundle generation output - set_fact: + ansible.builtin.set_fact: sto_bundle_info: "{{ generate_bundle_sto.stdout }}" sgo_bundle_info: "{{ generate_bundle_sgo.stdout }}" - name: Get the builder-dockercfg Secret name - command: oc get secret -n {{ namespace }} --field-selector='type==kubernetes.io/dockercfg' -ojsonpath='{.items[?(@.metadata.annotations.kubernetes\.io/service-account\.name=="builder")].metadata.name}' + ansible.builtin.command: oc get secret -n {{ namespace }} --field-selector='type==kubernetes.io/dockercfg' -ojsonpath='{.items[?(@.metadata.annotations.kubernetes\.io/service-account\.name=="builder")].metadata.name}' register: secret_builder_dockercfg_name - name: Get contents of builder Secret - k8s_info: + kubernetes.core.k8s_info: api_version: v1 kind: Secret name: "{{ secret_builder_dockercfg_name.stdout }}" @@ -23,35 +23,35 @@ register: secret_builder_dockercfg_results - name: Get builder-dockercfg authentication contents - set_fact: + ansible.builtin.set_fact: builder_dockercfg_auth_results: "{{ secret_builder_dockercfg_results.resources[0].data['.dockercfg'] | b64decode }}" - name: Set internal registry authentication - set_fact: + ansible.builtin.set_fact: internal_registry: "{{ builder_dockercfg_auth_results['image-registry.openshift-image-registry.svc:5000'] | to_json }}" - when: query('kubernetes.core.k8s', api_version='v1', kind='Secret', 
resource_name='service-telemetry-framework-index-dockercfg', namespace=namespace) | length == 0 block: - name: Create config.json to import as Secret - template: + ansible.builtin.template: variable_start_string: "<<" variable_end_string: ">>" src: config-json.j2 dest: working/service-telemetry-framework-index/config.json - name: Create a Secret for the dockercfg - command: oc create secret generic -n {{ namespace }} service-telemetry-framework-index-dockercfg --from-file=.dockerconfigjson=working/service-telemetry-framework-index/config.json --type=kubernetes.io/dockerconfigjson + ansible.builtin.command: oc create secret generic -n {{ namespace }} service-telemetry-framework-index-dockercfg --from-file=.dockerconfigjson=working/service-telemetry-framework-index/config.json --type=kubernetes.io/dockerconfigjson - name: Create ImageStream for ose-operator-registry - command: oc import-image -n {{ namespace }} ose-operator-registry:{{ default_operator_registry_image_tag }} --from={{ default_operator_registry_image_base }}:{{ default_operator_registry_image_tag }} --confirm + ansible.builtin.command: oc import-image -n {{ namespace }} ose-operator-registry:{{ default_operator_registry_image_tag }} --from={{ default_operator_registry_image_base }}:{{ default_operator_registry_image_tag }} --confirm when: query('kubernetes.core.k8s', api_version='v1', kind='ImageStream', resource_name='ose-operator-registry', namespace=namespace) | length == 0 - name: Create ImageStream for service-telemetry-framework-index - command: oc create imagestream -n {{ namespace }} service-telemetry-framework-index + ansible.builtin.command: oc create imagestream -n {{ namespace }} service-telemetry-framework-index when: query('kubernetes.core.k8s', api_version='v1', kind='ImageStream', resource_name='service-telemetry-framework-index', namespace=namespace) | length == 0 - name: Create BuildConfig for service-telemetry-framework-index - k8s: + kubernetes.core.k8s: definition: apiVersion: 
build.openshift.io/v1 kind: BuildConfig @@ -109,7 +109,7 @@ successfulBuildsHistoryLimit: 5 - name: Get builds of service-telemetry-framework-index - k8s_info: + kubernetes.core.k8s_info: api_version: build.openshift.io/v1 kind: Build namespace: "{{ namespace }}" @@ -120,15 +120,15 @@ - when: index_builds.resources | length == 0 block: - name: Create index.yaml base for index image - template: + ansible.builtin.template: src: index-yaml.j2 dest: working/service-telemetry-framework-index/index.yaml - name: Build service-telemetry-framework-index - command: oc start-build -n "{{ namespace }}" service-telemetry-framework-index --wait --from-dir working/service-telemetry-framework-index + ansible.builtin.command: oc start-build -n "{{ namespace }}" service-telemetry-framework-index --wait --from-dir working/service-telemetry-framework-index - name: Create CloudOps CatalogSource - k8s: + kubernetes.core.k8s: definition: apiVersion: operators.coreos.com/v1alpha1 kind: CatalogSource diff --git a/build/stf-run-ci/tasks/deploy_stf.yml b/build/stf-run-ci/tasks/deploy_stf.yml index 2fddbcbc4..da8a9781a 100644 --- a/build/stf-run-ci/tasks/deploy_stf.yml +++ b/build/stf-run-ci/tasks/deploy_stf.yml @@ -1,7 +1,7 @@ # NOTE: be aware that if the API version changes for the ServiceTelemetry # object that it'll need to be updated here - name: Create default ServiceTelemetry manifest with a observabilityStrategy other than none - set_fact: + ansible.builtin.set_fact: service_telemetry_manifest: | apiVersion: infra.watch/v1beta1 kind: ServiceTelemetry @@ -66,7 +66,7 @@ - __service_telemetry_observability_strategy != "none" - name: Create default ServiceTelemetry manifest with observabilityStrategy none - set_fact: + ansible.builtin.set_fact: service_telemetry_manifest: | apiVersion: infra.watch/v1beta1 kind: ServiceTelemetry @@ -80,10 +80,10 @@ - __service_telemetry_observability_strategy == "none" - name: Show ServiceTelemetry manifest - debug: + ansible.builtin.debug: var: 
service_telemetry_manifest | from_yaml - name: Create ServiceTelemetry instance - k8s: + kubernetes.core.k8s: definition: '{{ service_telemetry_manifest }}' diff --git a/build/stf-run-ci/tasks/main.yml b/build/stf-run-ci/tasks/main.yml index f62739541..a0ee7b1fa 100644 --- a/build/stf-run-ci/tasks/main.yml +++ b/build/stf-run-ci/tasks/main.yml @@ -3,12 +3,12 @@ # -- initial setup - name: Setup default values - set_fact: + ansible.builtin.set_fact: branch: "{{ working_branch | default('master') }}" namespace: "{{ working_namespace | default('service-telemetry') }}" - name: Set default image paths for local builds - set_fact: + ansible.builtin.set_fact: sgo_image_path: "{{ __internal_registry_path }}/{{ namespace }}/smart-gateway-operator:{{ sgo_image_tag }}" sto_image_path: "{{ __internal_registry_path }}/{{ namespace }}/service-telemetry-operator:{{ sto_image_tag }}" sg_core_image_path: "{{ __internal_registry_path }}/{{ namespace }}/sg-core:{{ sg_core_image_tag }}" @@ -16,61 +16,61 @@ prometheus_webhook_snmp_image_path: "{{ __internal_registry_path }}/{{ namespace }}/prometheus-webhook-snmp:{{ prometheus_webhook_snmp_image_tag }}" - name: Set default image paths for bundle and index builds - set_fact: + ansible.builtin.set_fact: sgo_bundle_image_path: "{{ __internal_registry_path }}/{{ namespace }}/smart-gateway-operator-bundle:{{ sgo_bundle_image_tag }}" sto_bundle_image_path: "{{ __internal_registry_path }}/{{ namespace }}/service-telemetry-operator-bundle:{{ sto_bundle_image_tag }}" stf_index_image_path: "{{ __internal_registry_path }}/{{ namespace }}/service-telemetry-framework-index:{{ stf_index_image_tag }}" - name: Fail on mutually exclusive flags - fail: + ansible.builtin.fail: msg: __deploy_from_bundles_enabled not currently supported with __local_build_enabled (but should be) when: __local_build_enabled | bool and __deploy_from_bundles_enabled | bool - name: Fail when deploying from index image and local build disabled - fail: + ansible.builtin.fail: 
msg: __deploy_from_index_enabled must also have __local_build_enabled when: __deploy_from_index_enabled | bool and not __local_build_enabled | bool - name: Fail when deploying from index images and deployment from bundles also requested (mutually exclusive methods) - fail: + ansible.builtin.fail: msg: __deploy_from_index_enabled can not be used with __deploy_from_bundles_enabled when: __deploy_from_index_enabled | bool and __deploy_from_bundles_enabled | bool - name: Get the list of nodes - k8s_info: + kubernetes.core.k8s_info: kind: Node register: node_info - name: Get OCP version - shell: oc version -o yaml | grep openshiftVersion | awk '{print $2}' + ansible.builtin.shell: oc version -o yaml | grep openshiftVersion | awk '{print $2}' register: ocp_ver - name: Find out if we are using crc by looking at the node hostnames - set_fact: + ansible.builtin.set_fact: is_crc: "{{ True if 'crc' in node_info.resources[0].metadata.labels[\"kubernetes.io/hostname\"] else False }}" # -- prepare environment and cleanup - name: Clean up any existing global artifacts - include_tasks: pre-clean.yml + ansible.builtin.include_tasks: pre-clean.yml tags: - pre-clean - name: Setup supporting Operator subscriptions - include_tasks: setup_base.yml + ansible.builtin.include_tasks: setup_base.yml tags: - deploy - name: Deploy ES for events testing - include_tasks: setup_elasticsearch.yml + ansible.builtin.include_tasks: setup_elasticsearch.yml - name: Set default base dir if not provided - set_fact: + ansible.builtin.set_fact: base_dir: "{{ playbook_dir }}" when: base_dir | length == 0 - name: Get new operator sdk when: __local_build_enabled | bool or __deploy_from_bundles_enabled | bool or __deploy_from_index_enabled | bool - command: "{{ base_dir }}/get_new_operator_sdk.sh {{ new_operator_sdk_version }}" + ansible.builtin.command: "{{ base_dir }}/get_new_operator_sdk.sh {{ new_operator_sdk_version }}" # -- create artifacts - when: __local_build_enabled | bool @@ -78,12 +78,12 @@ - 
create_builds block: - name: Setup supporting repositories - include_tasks: clone_repos.yml + ansible.builtin.include_tasks: clone_repos.yml tags: - clone - name: Create base build list - set_fact: + ansible.builtin.set_fact: build_list: - { name: service-telemetry-operator, dockerfile_path: build/Dockerfile, image_reference_name: sto_image_path, working_build_dir: ../ } - { name: smart-gateway-operator, dockerfile_path: build/Dockerfile, image_reference_name: sgo_image_path, working_build_dir: ./working/smart-gateway-operator } @@ -91,11 +91,11 @@ - { name: sg-bridge, dockerfile_path: build/Dockerfile, image_reference_name: sg_bridge_image_path, working_build_dir: ./working/sg-bridge } - { name: prometheus-webhook-snmp, dockerfile_path: Dockerfile, image_reference_name: prometheus_webhook_snmp_image_path, working_build_dir: ./working/prometheus-webhook-snmp } - - debug: + - ansible.builtin.debug: var: build_list - name: Create builds and artifacts - include_tasks: create_builds.yml + ansible.builtin.include_tasks: create_builds.yml loop: "{{ build_list }}" loop_control: loop_var: artifact @@ -103,7 +103,7 @@ - build - name: Setup STF using local artifacts - include_tasks: setup_stf_local_build.yml + ansible.builtin.include_tasks: setup_stf_local_build.yml tags: - deploy @@ -112,16 +112,16 @@ - create_bundles block: - name: Create base build list - set_fact: + ansible.builtin.set_fact: bundle_build_list: - { name: service-telemetry-operator-bundle, dockerfile_path: Dockerfile, image_reference_name: sto_bundle_image_path, working_build_dir: ./working/service-telemetry-operator-bundle } - { name: smart-gateway-operator-bundle, dockerfile_path: Dockerfile, image_reference_name: sgo_bundle_image_path, working_build_dir: ./working/smart-gateway-operator-bundle } - - debug: + - ansible.builtin.debug: var: bundle_build_list - name: Create bundle builds and artifacts - include_tasks: create_builds.yml + ansible.builtin.include_tasks: create_builds.yml loop: "{{ 
bundle_build_list }}" loop_control: loop_var: artifact @@ -129,22 +129,22 @@ - build - name: Create file-based catalog - include_tasks: create_catalog.yml + ansible.builtin.include_tasks: create_catalog.yml # -- deploy - when: not __local_build_enabled | bool block: - name: Setup Service Telemetry Framework from supplied bundle URLs - include_tasks: setup_stf_from_bundles.yml + ansible.builtin.include_tasks: setup_stf_from_bundles.yml when: __deploy_from_bundles_enabled | bool - name: Setup Service Telemetry Framework from application registry - include_tasks: setup_stf.yml + ansible.builtin.include_tasks: setup_stf.yml when: not __deploy_from_bundles_enabled | bool - when: __deploy_from_index_enabled | bool name: Subscribe to locally built Service Telemetry Operator - k8s: + kubernetes.core.k8s: definition: apiVersion: operators.coreos.com/v1alpha1 kind: Subscription @@ -162,20 +162,20 @@ # -- check if we're ready to instantiate - name: Pre-flight checks - include_tasks: preflight_checks.yml + ansible.builtin.include_tasks: preflight_checks.yml # -- create a ServiceTelemetry object to stand up the STF instance - when: __deploy_stf | bool block: - name: Deploy an instance of STF - include_tasks: deploy_stf.yml + ansible.builtin.include_tasks: deploy_stf.yml - name: Validate system is operational - shell: | + ansible.builtin.shell: | OCP_PROJECT="{{ namespace }}" VALIDATION_SCOPE="{{ __service_telemetry_observability_strategy }}" "{{ base_dir }}/validate_deployment.sh" args: executable: /bin/bash register: validate_deployment - - debug: + - ansible.builtin.debug: var: validate_deployment.stdout_lines diff --git a/build/stf-run-ci/tasks/pre-clean.yml b/build/stf-run-ci/tasks/pre-clean.yml index ed11aec39..8e6df8bef 100644 --- a/build/stf-run-ci/tasks/pre-clean.yml +++ b/build/stf-run-ci/tasks/pre-clean.yml @@ -1,6 +1,6 @@ # NOTE: This cleanup step prevents parallel CI jobs - name: Clear out existing CRDs so we don't conflict or fail merge - k8s: + 
kubernetes.core.k8s: state: absent api_version: apiextensions.k8s.io/v1 kind: CustomResourceDefinition @@ -14,7 +14,7 @@ # The clusterroles and clusterrolebindings are global objects that can be left # behind by failed bundle installs - name: Remove all clusterrolebindings owned by OLM for this namespace - k8s: + kubernetes.core.k8s: state: absent api_version: rbac.authorization.k8s.io/v1 kind: clusterrolebindings @@ -22,7 +22,7 @@ - "olm.owner.namespace = {{ namespace }}" - name: Remove all clusterroles owned by OLM for this namespace - k8s: + kubernetes.core.k8s: state: absent api_version: rbac.authorization.k8s.io/v1 kind: clusterroles @@ -34,7 +34,7 @@ # been enabled. This avoids installing an additional CatalogSource which is no # longer required. - name: Remove OperatorHub.io CatalogSource if it installed - k8s: + kubernetes.core.k8s: state: absent definition: apiVersion: operators.coreos.com/v1alpha1 @@ -51,7 +51,7 @@ # Upstream Source + Sub from https://github.com/rhobs/observability-operator/tree/main/hack/olm # Moved to using Community Operators Catalog, so no longer require upstream CatalogSource. Eventually move to Red Hat Operators CatalogSource. 
- name: Remove Red Hat Observability Operator CatalogSource if it is installed - k8s: + kubernetes.core.k8s: state: absent definition: apiVersion: operators.coreos.com/v1alpha1 @@ -62,7 +62,7 @@ namespace: openshift-marketplace - name: Remove CloudOps CatalogSource if it is installed - k8s: + kubernetes.core.k8s: state: absent definition: apiVersion: operators.coreos.com/v1alpha1 @@ -76,7 +76,7 @@ sourceType: grpc - name: Remove Service Telemetry Operator bundle build - k8s: + kubernetes.core.k8s: state: absent api_version: build.openshift.io/v1 kind: Build @@ -85,7 +85,7 @@ - "build=service-telemetry-operator-bundle" - name: Remove Smart Gateway Operator bundle build - k8s: + kubernetes.core.k8s: state: absent api_version: build.openshift.io/v1 kind: Build @@ -94,7 +94,7 @@ - "build=smart-gateway-operator-bundle" - name: Remove Service Telemetry Framework index build - k8s: + kubernetes.core.k8s: state: absent api_version: build.openshift.io/v1 kind: Build @@ -103,7 +103,7 @@ - "build=service-telemetry-framework-index" - name: Remove service-telemetry-operator-bundle CatalogSource (bundle deploy) - k8s: + kubernetes.core.k8s: state: absent definition: apiVersion: operators.coreos.com/v1alpha1 @@ -113,7 +113,7 @@ namespace: "{{ namespace }}" - name: Remove smart-gateway-operator-bundle CatalogSource (bundle deploy) - k8s: + kubernetes.core.k8s: state: absent definition: apiVersion: operators.coreos.com/v1alpha1 @@ -124,9 +124,9 @@ # Remove the cert manager since we install it as part of the CI/documented pre-install process - name: Remove openshift-cert-manager-operator namespace - k8s: + kubernetes.core.k8s: state: absent - wait: yes + wait: true definition: apiVersion: project.openshift.io/v1 kind: Project @@ -134,10 +134,10 @@ name: openshift-cert-manager-operator - name: Remove Elasticsearch - ignore_errors: True - k8s: + ignore_errors: true + kubernetes.core.k8s: state: absent - wait: yes + wait: true definition: apiVersion: elasticsearch.k8s.elastic.co/v1 
kind: Elasticsearch diff --git a/build/stf-run-ci/tasks/preflight_checks.yml b/build/stf-run-ci/tasks/preflight_checks.yml index 9b9036de6..2664ea715 100644 --- a/build/stf-run-ci/tasks/preflight_checks.yml +++ b/build/stf-run-ci/tasks/preflight_checks.yml @@ -1,4 +1,4 @@ --- - name: Wait for Service Telemetry Operator to be Succeeded - shell: | + ansible.builtin.shell: | while ! oc get csv -n "{{ namespace }}" | grep service-telemetry-operator | grep Succeeded; do echo "waiting for Service Telemetry Operator..."; sleep 3; done diff --git a/build/stf-run-ci/tasks/setup_base.yml b/build/stf-run-ci/tasks/setup_base.yml index 8c6f44104..b25826f12 100644 --- a/build/stf-run-ci/tasks/setup_base.yml +++ b/build/stf-run-ci/tasks/setup_base.yml @@ -1,6 +1,6 @@ --- - name: Setup OperatorHub dependencies - k8s: + kubernetes.core.k8s: definition: apiVersion: config.openshift.io/v1 kind: OperatorHub @@ -17,7 +17,7 @@ name: community-operators - name: Create OperatorGroup for service-telemetry - k8s: + kubernetes.core.k8s: definition: apiVersion: operators.coreos.com/v1 kind: OperatorGroup @@ -32,7 +32,7 @@ - when: not __deploy_from_index_enabled | bool and ocp_ver.stdout is version ('4.12', '<') block: - name: Create openshift-cert-manager-operator namespace - k8s: + kubernetes.core.k8s: definition: apiVersion: project.openshift.io/v1 kind: Project @@ -43,7 +43,7 @@ - kubernetes - name: Create openshift-cert-manager-operator OperatorGroup - k8s: + kubernetes.core.k8s: definition: apiVersion: operators.coreos.com/v1 kind: OperatorGroup @@ -53,7 +53,7 @@ spec: {} - name: Subscribe to Cert Manager for OpenShift Operator - k8s: + kubernetes.core.k8s: definition: apiVersion: operators.coreos.com/v1alpha1 kind: Subscription @@ -71,7 +71,7 @@ - when: not __deploy_from_index_enabled | bool and ocp_ver.stdout is version ('4.12', '>=') block: - name: Subscribe to Cert Manager for OpenShift Operator - k8s: + kubernetes.core.k8s: definition: apiVersion: operators.coreos.com/v1alpha1 kind: 
Subscription @@ -90,7 +90,7 @@ - when: not __deploy_from_index_enabled | bool block: - name: Subscribe to AMQ Interconnect Operator - k8s: + kubernetes.core.k8s: definition: apiVersion: operators.coreos.com/v1alpha1 kind: Subscription @@ -105,7 +105,7 @@ sourceNamespace: openshift-marketplace - name: Subscribe to Prometheus Operator - k8s: + kubernetes.core.k8s: definition: apiVersion: operators.coreos.com/v1alpha1 kind: Subscription @@ -122,7 +122,7 @@ - __service_telemetry_observability_strategy == "use_community" - name: Subscribe to Red Hat Obervability Operator - k8s: + kubernetes.core.k8s: definition: apiVersion: operators.coreos.com/v1alpha1 kind: Subscription @@ -141,7 +141,7 @@ - __service_telemetry_observability_strategy in ['use_redhat', 'use_hybrid'] - name: Subscribe to Elastic Cloud on Kubernetes Operator - k8s: + kubernetes.core.k8s: definition: apiVersion: operators.coreos.com/v1alpha1 kind: Subscription @@ -156,7 +156,7 @@ sourceNamespace: openshift-marketplace - name: Wait for Elasticsearch CRD to appear - k8s_info: + kubernetes.core.k8s_info: api_version: apiextensions.k8s.io/v1 kind: CustomResourceDefinition name: elasticsearches.elasticsearch.k8s.elastic.co diff --git a/build/stf-run-ci/tasks/setup_elasticsearch.yml b/build/stf-run-ci/tasks/setup_elasticsearch.yml index 20638fd51..ce227537e 100644 --- a/build/stf-run-ci/tasks/setup_elasticsearch.yml +++ b/build/stf-run-ci/tasks/setup_elasticsearch.yml @@ -1,16 +1,16 @@ - name: Set default ElasticSearch manifest - set_fact: + ansible.builtin.set_fact: elasticsearch_manifest: "{{ lookup('template', './manifest_elasticsearch.j2') | from_yaml }}" when: elasticsearch_manifest is not defined - name: Create an instance of Elasticsearch - k8s: + kubernetes.core.k8s: state: present definition: '{{ elasticsearch_manifest }}' - name: Look up the newly generated ES Certs - k8s_info: + kubernetes.core.k8s_info: api_version: v1 kind: Secret name: elasticsearch-es-http-certs-public @@ -21,7 +21,7 @@ delay: 30 
- name: Copy the ES CA cert to our TLS secret - k8s: + kubernetes.core.k8s: definition: apiVersion: v1 kind: Secret diff --git a/build/stf-run-ci/tasks/setup_stf.yml b/build/stf-run-ci/tasks/setup_stf.yml index 66cc1f201..e76eb1734 100644 --- a/build/stf-run-ci/tasks/setup_stf.yml +++ b/build/stf-run-ci/tasks/setup_stf.yml @@ -1,6 +1,6 @@ --- - name: Set default InfraWatch OperatorSource manifest - set_fact: + ansible.builtin.set_fact: infrawatch_catalog_source_manifest: | apiVersion: operators.coreos.com/v1alpha1 kind: CatalogSource @@ -18,7 +18,7 @@ when: infrawatch_catalog_source_manifest is not defined - name: Set default Smart Gateway Operator Subscription manifest - set_fact: + ansible.builtin.set_fact: smart_gateway_operator_subscription_manifest: | apiVersion: operators.coreos.com/v1alpha1 kind: Subscription @@ -34,7 +34,7 @@ when: smart_gateway_operator_subscription_manifest is not defined - name: Set default Service Telemetry Operator Subscription manifest - set_fact: + ansible.builtin.set_fact: service_telemetry_operator_subscription_manifest: | apiVersion: operators.coreos.com/v1alpha1 kind: Subscription @@ -50,16 +50,16 @@ when: service_telemetry_operator_subscription_manifest is not defined - name: Subscribe to Smart Gateway Operator - k8s: + kubernetes.core.k8s: definition: '{{ smart_gateway_operator_subscription_manifest }}' - name: Subscribe to Service Telemetry Operator - k8s: + kubernetes.core.k8s: definition: '{{ service_telemetry_operator_subscription_manifest }}' - name: Enable InfraWatch Catalog Source - k8s: + kubernetes.core.k8s: definition: '{{ infrawatch_catalog_source_manifest }}' diff --git a/build/stf-run-ci/tasks/setup_stf_from_bundles.yml b/build/stf-run-ci/tasks/setup_stf_from_bundles.yml index 2e20ab726..c62eb468c 100644 --- a/build/stf-run-ci/tasks/setup_stf_from_bundles.yml +++ b/build/stf-run-ci/tasks/setup_stf_from_bundles.yml @@ -1,6 +1,6 @@ - block: - name: Get existing Pull Secret from openshift config - k8s_info: + 
kubernetes.core.k8s_info: api_version: v1 kind: Secret namespace: openshift-config @@ -8,11 +8,11 @@ register: pull_secret - name: Decode docker config json - set_fact: + ansible.builtin.set_fact: dockerconfigjson: "{{ pull_secret.resources[0].data['.dockerconfigjson'] | b64decode }}" - name: Merge registry creds into auth section of docker config - set_fact: + ansible.builtin.set_fact: new_dockerauths: "{{ dockerconfigjson['auths'] | combine( { pull_secret_registry:{ 'auth': (pull_secret_user ~ ':' ~ pull_secret_pass) | b64encode @@ -20,11 +20,11 @@ }) }}" - name: Create new docker config - set_fact: + ansible.builtin.set_fact: new_dockerconfigjson: "{{ dockerconfigjson | combine({'auths': new_dockerauths}) }}" - name: Create Pull Secret for bundle registry access (in the local namespace) - k8s: + kubernetes.core.k8s: state: present definition: apiVersion: v1 @@ -37,7 +37,7 @@ .dockerconfigjson: "{{ new_dockerconfigjson | tojson | b64encode }}" - name: Create Pull Secret for bundle registry access (in the global namespace) - k8s: + kubernetes.core.k8s: state: present definition: apiVersion: v1 @@ -53,7 +53,7 @@ - bundle_registry_auth - name: Create registry CA Cert - k8s: + kubernetes.core.k8s: state: present definition: apiVersion: v1 @@ -89,9 +89,9 @@ success_msg: "Bundle paths are defined and not None" - name: Deploy SGO via OLM bundle - shell: + ansible.builtin.shell: cmd: "{{ base_dir }}/working/operator-sdk run bundle {{__smart_gateway_bundle_image_path}} {% if pull_secret is defined %}--pull-secret-name=pull-secret --ca-secret-name=registry-tls-ca{% endif %} --namespace={{ namespace }} --timeout 600s" - name: Deploy STO via OLM bundle - shell: + ansible.builtin.shell: cmd: "{{ base_dir }}/working/operator-sdk run bundle {{ __service_telemetry_bundle_image_path}} {% if pull_secret is defined %}--pull-secret-name=pull-secret --ca-secret-name=registry-tls-ca{% endif %} --namespace={{ namespace }} --timeout 600s" diff --git 
a/build/stf-run-ci/tasks/setup_stf_local_build.yml b/build/stf-run-ci/tasks/setup_stf_local_build.yml index 32fe68c9d..a087f4cf0 100644 --- a/build/stf-run-ci/tasks/setup_stf_local_build.yml +++ b/build/stf-run-ci/tasks/setup_stf_local_build.yml @@ -4,7 +4,7 @@ # --- Smart Gateway Operator --- - name: Generate Smart Gateway Operator CSV - shell: + ansible.builtin.shell: chdir: working/smart-gateway-operator/build cmd: | WORKING_DIR="{{ base_dir }}/working/smart-gateway-operator-bundle" \ @@ -18,17 +18,17 @@ register: generate_bundle_sgo - name: Results of SGO bundle generation - debug: + ansible.builtin.debug: var: generate_bundle_sgo.stdout - name: Replace namespace in SGO role binding - replace: + ansible.builtin.replace: path: "{{ base_dir }}/working/smart-gateway-operator/deploy/role_binding.yaml" regexp: 'placeholder' replace: '{{ namespace }}' - name: Replace namespace in SGO CSV - replace: + ansible.builtin.replace: path: "{{ base_dir }}/working/smart-gateway-operator-bundle/manifests/smart-gateway-operator.clusterserviceversion.yaml" regexp: 'placeholder' replace: '{{ namespace }}' @@ -36,7 +36,8 @@ - when: not __deploy_from_index_enabled | bool block: - name: Load Smart Gateway Operator RBAC - command: oc apply -f working/smart-gateway-operator/deploy/{{ item }} -n "{{ namespace }}" + ansible.builtin.command: + cmd: oc apply -f working/smart-gateway-operator/deploy/{{ item }} -n "{{ namespace }}" loop: - service_account.yaml - role.yaml @@ -44,11 +45,12 @@ - olm-catalog/smart-gateway-operator/manifests/smartgateway.infra.watch_smartgateways_crd.yaml - name: Load Smart Gateway Operator CSV - shell: oc apply -f working/smart-gateway-operator-bundle/manifests/smart-gateway-operator.clusterserviceversion.yaml -n "{{ namespace }}" + ansible.builtin.shell: + cmd: oc apply -f working/smart-gateway-operator-bundle/manifests/smart-gateway-operator.clusterserviceversion.yaml -n "{{ namespace }}" # --- Service Telemetry Operator --- - name: Generate Service Telemetry 
Operator CSV - shell: + ansible.builtin.shell: chdir: "{{ base_dir }}" cmd: | WORKING_DIR="{{ base_dir }}/working/service-telemetry-operator-bundle" \ @@ -60,17 +62,17 @@ register: generate_bundle_sto - name: Results of STO bundle generation - debug: + ansible.builtin.debug: var: generate_bundle_sto.stdout - name: Replace namespace in STO role binding - replace: + ansible.builtin.replace: path: "{{ base_dir }}/../deploy/role_binding.yaml" regexp: 'placeholder' replace: '{{ namespace }}' - name: Replace namespace in STO CSV - replace: + ansible.builtin.replace: path: "{{ base_dir }}/working/service-telemetry-operator-bundle/manifests/service-telemetry-operator.clusterserviceversion.yaml" regexp: 'placeholder' replace: '{{ namespace }}' @@ -78,7 +80,8 @@ - when: not __deploy_from_index_enabled | bool block: - name: Load Service Telemetry Operator RBAC - command: oc apply -f ../deploy/{{ item }} -n "{{ namespace }}" + ansible.builtin.command: + cmd: oc apply -f ../deploy/{{ item }} -n "{{ namespace }}" loop: - service_account.yaml - role.yaml @@ -86,8 +89,10 @@ - olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml - name: Load Service Telemetry Operator CSV - shell: oc apply -f working/service-telemetry-operator-bundle/manifests/service-telemetry-operator.clusterserviceversion.yaml -n "{{ namespace }}" + ansible.builtin.shell: + cmd: oc apply -f working/service-telemetry-operator-bundle/manifests/service-telemetry-operator.clusterserviceversion.yaml -n "{{ namespace }}" # cleanup - name: Revert local change to role_binding.yaml - shell: git checkout -- "{{ base_dir }}/../deploy/role_binding.yaml" + ansible.builtin.shell: + cmd: git checkout -- "{{ base_dir }}/../deploy/role_binding.yaml" From bf93f82551317a55e0d683c754da7bf1ea666e78 Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Thu, 14 Sep 2023 18:03:19 +0100 Subject: [PATCH 50/95] [stf-run-ci] Update pre-flight checks to be time-limited (#461) If the CSV doesn't succeed, the task 
continues indefinitely. This will cause an infinite loop in CI if there is no timeout on the job. Adding in a 10 minute timeout allows the CI job to return a failure instead of running indefinitely. --- build/stf-run-ci/tasks/preflight_checks.yml | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/build/stf-run-ci/tasks/preflight_checks.yml b/build/stf-run-ci/tasks/preflight_checks.yml index 2664ea715..94352aded 100644 --- a/build/stf-run-ci/tasks/preflight_checks.yml +++ b/build/stf-run-ci/tasks/preflight_checks.yml @@ -1,4 +1,15 @@ --- -- name: Wait for Service Telemetry Operator to be Succeeded +# Try for 10 minutes to get an output +- name: "Wait for up to 10 minutes for Service Telemetry Operator to be Succeeded" ansible.builtin.shell: | - while ! oc get csv -n "{{ namespace }}" | grep service-telemetry-operator | grep Succeeded; do echo "waiting for Service Telemetry Operator..."; sleep 3; done + oc get csv -n "{{ namespace }}" | grep service-telemetry-operator | grep Succeeded + register: output + retries: 60 + delay: 10 + until: output.stdout | length != 0 + ignore_errors: true + +- name: "Show fail message if CSV isn't Succeeded after the alotted time" + ansible.builtin.fail: + msg: "Service Telemetry Operator CSV not Succeeded after 10 minutes" + when: output.rc != 0 \ No newline at end of file From 19f250502c0a03f4ad9cb876784fa3c4edb6ded0 Mon Sep 17 00:00:00 2001 From: Victoria Martinez de la Cruz Date: Fri, 15 Sep 2023 19:21:46 +0200 Subject: [PATCH 51/95] Expose operator_sdk versions used in CI (#466) * Expose operator_sdk versions used in CI Currently we use two different versions of the operator_sdk in our CI. One is being used for building the operator bundles (v0.x) and the other is being used for deploying with operator bundles (v1.x). This change makes this fact more explicit across the code and also enables the possibility of configuring which versions to use. 
* Update build/stf-run-ci/tasks/main.yml * [run-ci] Set gather_facts true We are using ansible_env, so facts need to be gathered --------- Co-authored-by: Emma Foley --- build/get_new_operator_sdk.sh | 16 ------------- build/get_operator_sdk.sh | 24 +++++++++++++++++++ build/run-ci.yaml | 2 +- build/stf-run-ci/defaults/main.yml | 3 ++- build/stf-run-ci/tasks/main.yml | 11 +++++++-- .../tasks/setup_stf_from_bundles.yml | 4 ++-- .../tasks/setup_stf_local_build.yml | 4 ++++ 7 files changed, 42 insertions(+), 22 deletions(-) delete mode 100755 build/get_new_operator_sdk.sh create mode 100755 build/get_operator_sdk.sh diff --git a/build/get_new_operator_sdk.sh b/build/get_new_operator_sdk.sh deleted file mode 100755 index 124b09117..000000000 --- a/build/get_new_operator_sdk.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -REL=$(dirname "$0") -ARCH=$(case $(uname -m) in x86_64) echo -n amd64 ;; aarch64) echo -n arm64 ;; *) echo -n $(uname -m) ;; esac) -OS=$(uname | awk '{print tolower($0)}') -VERSION="${1:-v1.5.0}" -OPERATOR_SDK_DL_URL=https://github.com/operator-framework/operator-sdk/releases/download/${VERSION} - -if [[ ! -f ${REL}/working/operator-sdk-${VERSION} ]]; then - mkdir ${REL}/working - curl -L ${OPERATOR_SDK_DL_URL}/operator-sdk_${OS}_${ARCH} -o ${REL}/working/operator-sdk-${VERSION} - chmod +x ${REL}/working/operator-sdk-${VERSION} - rm ${REL}/working/operator-sdk - ln -s operator-sdk-${VERSION} ${REL}/working/operator-sdk -fi - diff --git a/build/get_operator_sdk.sh b/build/get_operator_sdk.sh new file mode 100755 index 000000000..d1ada222c --- /dev/null +++ b/build/get_operator_sdk.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +set -x + +REL=$(dirname "$0") +ARCH=$(case $(uname -m) in x86_64) echo -n amd64 ;; aarch64) echo -n arm64 ;; *) echo -n $(uname -m) ;; esac) +OS=$(uname | awk '{print tolower($0)}') +VERSION="${1:-v1.5.0}" +OPERATOR_SDK_DL_URL=https://github.com/operator-framework/operator-sdk/releases/download/${VERSION} + +if [[ ! 
-f ${REL}/working/operator-sdk-${VERSION} ]]; then + mkdir ${REL}/working + if [[ "${VERSION}" =~ "v0" ]]; then + # naming scheme for v0.x is operator-sdk-$VERSION-$ARCH-$OS e.g. operator-sdk-v0.19.4-x86_64-linux-gnu + curl -L ${OPERATOR_SDK_DL_URL}/operator-sdk-${VERSION}-x86_64-linux-gnu -o ${REL}/working/operator-sdk-${VERSION} + else + # naming scheme for v1.x is operator-sdk_$OS-$ARCH e.g. operator-sdk_linux_amd64 + curl -L ${OPERATOR_SDK_DL_URL}/operator-sdk_${OS}_${ARCH} -o ${REL}/working/operator-sdk-${VERSION} + fi + chmod +x ${REL}/working/operator-sdk-${VERSION} + rm -f ${REL}/working/operator-sdk +fi + +set +x \ No newline at end of file diff --git a/build/run-ci.yaml b/build/run-ci.yaml index 797957269..932ef10c0 100644 --- a/build/run-ci.yaml +++ b/build/run-ci.yaml @@ -1,7 +1,7 @@ --- # run STF CI setup in CRC (already provisioned) - hosts: localhost - gather_facts: no + gather_facts: yes connection: local tasks: - name: Run the STF CI system diff --git a/build/stf-run-ci/defaults/main.yml b/build/stf-run-ci/defaults/main.yml index 502eb8194..f9210e451 100644 --- a/build/stf-run-ci/defaults/main.yml +++ b/build/stf-run-ci/defaults/main.yml @@ -47,7 +47,8 @@ prometheus_webhook_snmp_image_tag: latest sgo_bundle_image_tag: latest sto_bundle_image_tag: latest stf_index_image_tag: latest -new_operator_sdk_version: v1.11.0 +operator_sdk_v0: v0.19.4 +operator_sdk_v1: v1.11.0 namespace: service-telemetry pull_secret_registry: pull_secret_user: diff --git a/build/stf-run-ci/tasks/main.yml b/build/stf-run-ci/tasks/main.yml index a0ee7b1fa..f12644a84 100644 --- a/build/stf-run-ci/tasks/main.yml +++ b/build/stf-run-ci/tasks/main.yml @@ -68,9 +68,16 @@ base_dir: "{{ playbook_dir }}" when: base_dir | length == 0 -- name: Get new operator sdk +- name: Get operator_sdk_v0 (build bundles) + ansible.builtin.command: + cmd: "./get_operator_sdk.sh {{ operator_sdk_v0 }}" + creates: "{{ base_dir }}/working/operator-sdk-{{ operator_sdk_v0 }}" + +- name: Get operator_sdk_v1 
(deploy from bundles) when: __local_build_enabled | bool or __deploy_from_bundles_enabled | bool or __deploy_from_index_enabled | bool - ansible.builtin.command: "{{ base_dir }}/get_new_operator_sdk.sh {{ new_operator_sdk_version }}" + ansible.builtin.command: + cmd: "{{ base_dir }}/get_operator_sdk.sh {{ operator_sdk_v1 }}" + creates: "{{ base_dir }}/working/operator-sdk-{{ operator_sdk_v1 }}" # -- create artifacts - when: __local_build_enabled | bool diff --git a/build/stf-run-ci/tasks/setup_stf_from_bundles.yml b/build/stf-run-ci/tasks/setup_stf_from_bundles.yml index c62eb468c..ae8948d75 100644 --- a/build/stf-run-ci/tasks/setup_stf_from_bundles.yml +++ b/build/stf-run-ci/tasks/setup_stf_from_bundles.yml @@ -90,8 +90,8 @@ - name: Deploy SGO via OLM bundle ansible.builtin.shell: - cmd: "{{ base_dir }}/working/operator-sdk run bundle {{__smart_gateway_bundle_image_path}} {% if pull_secret is defined %}--pull-secret-name=pull-secret --ca-secret-name=registry-tls-ca{% endif %} --namespace={{ namespace }} --timeout 600s" + cmd: "{{ base_dir }}/working/operator-sdk-{{ operator_sdk_v1 }} run bundle {{ __smart_gateway_bundle_image_path }} {% if pull_secret is defined %} --pull-secret-name=pull-secret --ca-secret-name=registry-tls-ca {% endif %} --namespace={{ namespace }} --timeout 600s" - name: Deploy STO via OLM bundle ansible.builtin.shell: - cmd: "{{ base_dir }}/working/operator-sdk run bundle {{ __service_telemetry_bundle_image_path}} {% if pull_secret is defined %}--pull-secret-name=pull-secret --ca-secret-name=registry-tls-ca{% endif %} --namespace={{ namespace }} --timeout 600s" + cmd: "{{ base_dir }}/working/operator-sdk-{{ operator_sdk_v1 }} run bundle {{ __service_telemetry_bundle_image_path }} {% if pull_secret is defined %} --pull-secret-name=pull-secret --ca-secret-name=registry-tls-ca {% endif %} --namespace={{ namespace }} --timeout 600s" diff --git a/build/stf-run-ci/tasks/setup_stf_local_build.yml b/build/stf-run-ci/tasks/setup_stf_local_build.yml 
index a087f4cf0..bf10fc2eb 100644 --- a/build/stf-run-ci/tasks/setup_stf_local_build.yml +++ b/build/stf-run-ci/tasks/setup_stf_local_build.yml @@ -7,6 +7,8 @@ ansible.builtin.shell: chdir: working/smart-gateway-operator/build cmd: | + LOGFILE="{{ ansible_env.HOME }}/sgo_gen_bundle.log" \ + OPERATOR_SDK="{{ base_dir }}/working/operator-sdk-{{ operator_sdk_v0 }}" \ WORKING_DIR="{{ base_dir }}/working/smart-gateway-operator-bundle" \ RELATED_IMAGE_CORE_SMARTGATEWAY={{ sg_core_image_path | parse_image | quote }} \ RELATED_IMAGE_BRIDGE_SMARTGATEWAY={{ sg_bridge_image_path | parse_image | quote }} \ @@ -53,6 +55,8 @@ ansible.builtin.shell: chdir: "{{ base_dir }}" cmd: | + LOGFILE="{{ ansible_env.HOME }}/sto_gen_bundle.log" \ + OPERATOR_SDK="{{ base_dir }}/working/operator-sdk-{{ operator_sdk_v0 }}" \ WORKING_DIR="{{ base_dir }}/working/service-telemetry-operator-bundle" \ RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP={{ prometheus_webhook_snmp_image_path | parse_image | quote }} \ RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP_TAG={{ prometheus_webhook_snmp_image_path | parse_tag | quote }} \ From 92c7a8b2c0d256070a4a1d356c4c0ab30c6234bb Mon Sep 17 00:00:00 2001 From: Victoria Martinez de la Cruz Date: Mon, 18 Sep 2023 16:24:43 +0200 Subject: [PATCH 52/95] Remove registry access config tags (#446) * Remove registry access config tags Remove "bundle_registry_tls_ca" and "bundle_registry_auth" tags in favor of "setup_bundle_registry_tls_ca" and "setup_bundle_registry_auth" config options. Values for these new options are set to true to keep backwards compatibility. 
* Add bundle registry setup options to README Adds setup_bundle_registry_tls_ca and setup_bundle_registry_auth options definitions to the README --------- Co-authored-by: Emma Foley --- build/stf-run-ci/README.md | 7 +++++-- build/stf-run-ci/defaults/main.yml | 3 +++ .../stf-run-ci/tasks/setup_stf_from_bundles.yml | 16 ++++++---------- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/build/stf-run-ci/README.md b/build/stf-run-ci/README.md index f0c47db15..959a6048a 100644 --- a/build/stf-run-ci/README.md +++ b/build/stf-run-ci/README.md @@ -23,6 +23,8 @@ choose to override: | `__deploy_from_index_enabled` | {true,false} | false | Whether to deploy STF from locally built bundles and index image. | | `__service_telemetry_bundle_image_path` | | `quay.io/infrawatch-operators/service-telemetry-operator-bundle:nightly-head` | Image path to Service Telemetry Operator bundle | | `__smart_gateway_bundle_image_path` | | `quay.io/infrawatch-operators/smart-gateway-operator-bundle:nightly-head` | Image path to Smart Gateway Operator bundle | +| `setup_bundle_registry_tls_ca` | {true,false} | true | Whether to setup or not a TLS CA cert for the bundle registry access | +| `setup_bundle_registry_auth` | {true,false} | true | Whether to setup or not the auth for the bundle registry access | | `prometheus_webhook_snmp_branch` | | master | Which Prometheus Webhook SNMP git branch to checkout | | `sgo_branch` | | master | Which Smart Gateway Operator git branch to checkout | | `sg_core_branch` | | master | Which Smart Gateway Core git branch to checkout | @@ -113,8 +115,9 @@ ansible-playbook -e __local_build_enabled=false -e __deploy_from_bundles_enabled NOTE: When deploying from bundles, you must have a _CA.pem_ for the registry already in place in the build directory, if required. If this is -not required, add `--skip-tags bundle_registry_tls_ca`. 
If no login is required -to your bundle image registry, add `--skip-tags bundle_registry_auth` +not required, set `setup_bundle_registry_tls_ca` to `false`. If no login is required +to your bundle image registry, set `setup_bundle_registry_auth` to `false`. +By default, those configuration options are set to `true`. ## Deployment from local artifacts, bundles, and index diff --git a/build/stf-run-ci/defaults/main.yml b/build/stf-run-ci/defaults/main.yml index f9210e451..4e00f8395 100644 --- a/build/stf-run-ci/defaults/main.yml +++ b/build/stf-run-ci/defaults/main.yml @@ -67,3 +67,6 @@ sg_bridge_repository: https://github.com/infrawatch/sg-bridge prometheus_webhook_snmp_repository: https://github.com/infrawatch/prometheus-webhook-snmp base_dir: '' + +setup_bundle_registry_auth: true +setup_bundle_registry_tls_ca: true diff --git a/build/stf-run-ci/tasks/setup_stf_from_bundles.yml b/build/stf-run-ci/tasks/setup_stf_from_bundles.yml index ae8948d75..7b8259a14 100644 --- a/build/stf-run-ci/tasks/setup_stf_from_bundles.yml +++ b/build/stf-run-ci/tasks/setup_stf_from_bundles.yml @@ -1,4 +1,5 @@ -- block: +- when: setup_bundle_registry_auth + block: - name: Get existing Pull Secret from openshift config kubernetes.core.k8s_info: api_version: v1 @@ -49,10 +50,8 @@ data: .dockerconfigjson: "{{ new_dockerconfigjson | tojson | b64encode }}" - tags: - - bundle_registry_auth - -- name: Create registry CA Cert +- when: setup_bundle_registry_tls_ca + name: Create registry CA Cert kubernetes.core.k8s: state: present definition: @@ -64,10 +63,9 @@ namespace: "{{ namespace }}" data: cert.pem: "{{ lookup('file', 'CA.pem') | b64encode }}" - tags: - - bundle_registry_tls_ca -- name: Patch the default service account to use our pull secret +- when: setup_bundle_registry_tls_ca + name: Patch the default service account to use our pull secret kubernetes.core.k8s_json_patch: kind: ServiceAccount namespace: "{{ namespace }}" @@ -77,8 +75,6 @@ path: /imagePullSecrets value: - name: 
pull-secret - tags: - - bundle_registry_tls_ca - name: "Ensure that the bundle paths are set." ansible.builtin.assert: From d41edf46fe32076a94dece911ba748ac8851fafb Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Mon, 18 Sep 2023 18:53:32 +0100 Subject: [PATCH 53/95] generate_bundle: add logging (#463) * use absolute dir for REL * specify default value for OPERATOR_SDK * pass a logfile * use -x for more debug info --- build/generate_bundle.sh | 12 ++++++++++-- build/stf-run-ci/tasks/create_catalog.yml | 6 ++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/build/generate_bundle.sh b/build/generate_bundle.sh index 3145eba13..186354cf4 100755 --- a/build/generate_bundle.sh +++ b/build/generate_bundle.sh @@ -1,6 +1,13 @@ #!/usr/bin/env bash set -e -REL=$(dirname "$0") +set -x + +LOGFILE=${LOGFILE:-/dev/null} +# If LOGFILE is /dev/null, this command fails, so ignore that error +truncate --size=0 ${LOGFILE} || true + +OPERATOR_SDK=${OPERATOR_SDK:-operator-sdk} +REL=$( readlink -f $(dirname "$0")) # shellcheck source=build/metadata.sh . 
"${REL}/metadata.sh" @@ -23,7 +30,7 @@ generate_bundle() { REPLACE_REGEX="s#<>#${CREATED_DATE}#g;s#<>#${OPERATOR_IMAGE}#g;s#<>#${OPERATOR_TAG}#g;s#<>#${RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP}#g;s#<>#${RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP_TAG}#g;s#<>#${RELATED_IMAGE_OAUTH_PROXY}#g;s#<>#${RELATED_IMAGE_OAUTH_PROXY_TAG}#g;s#<>#${OPERATOR_BUNDLE_VERSION}#g;s#1.99.0#${OPERATOR_BUNDLE_VERSION}#g;s#<>#${OPERATOR_DOCUMENTATION_URL}#g;s#<>#${BUNDLE_OLM_SKIP_RANGE_LOWER_BOUND}#g" pushd "${REL}/../" > /dev/null 2>&1 - ${OPERATOR_SDK} generate bundle --channels ${BUNDLE_CHANNELS} --default-channel ${BUNDLE_DEFAULT_CHANNEL} --manifests --metadata --version "${OPERATOR_BUNDLE_VERSION}" --output-dir "${WORKING_DIR}" > /dev/null 2>&1 + ${OPERATOR_SDK} generate bundle --verbose --channels ${BUNDLE_CHANNELS} --default-channel ${BUNDLE_DEFAULT_CHANNEL} --manifests --metadata --version "${OPERATOR_BUNDLE_VERSION}" --output-dir "${WORKING_DIR}" >> ${LOGFILE} 2>&1 popd > /dev/null 2>&1 sed -i -E "${REPLACE_REGEX}" "${WORKING_DIR}/manifests/${OPERATOR_NAME}.clusterserviceversion.yaml" @@ -57,5 +64,6 @@ copy_extra_metadata #build_bundle_instructions #echo "## End Bundle creation" +set +x JSON_OUTPUT='{"operator_bundle_image":"%s","operator_bundle_version":"%s","operator_image":"%s","bundle_channels":"%s","bundle_default_channel":"%s","operator_tag":"%s","working_dir":"%s"}' printf "$JSON_OUTPUT" "$OPERATOR_BUNDLE_IMAGE" "$OPERATOR_BUNDLE_VERSION" "$OPERATOR_IMAGE" "$BUNDLE_CHANNELS" "$BUNDLE_DEFAULT_CHANNEL" "$OPERATOR_TAG" "$WORKING_DIR" diff --git a/build/stf-run-ci/tasks/create_catalog.yml b/build/stf-run-ci/tasks/create_catalog.yml index d57825760..ea950f72c 100644 --- a/build/stf-run-ci/tasks/create_catalog.yml +++ b/build/stf-run-ci/tasks/create_catalog.yml @@ -5,10 +5,12 @@ state: directory mode: '0755' +# Updating to use stdout_lines[-1] so that any additional info that gets added to generate_bundles (e.g. 
for debug) doesn't break this task +# Adding from_json so that the JSON output is parsed into a dictionary - name: Create info variables from bundle generation output ansible.builtin.set_fact: - sto_bundle_info: "{{ generate_bundle_sto.stdout }}" - sgo_bundle_info: "{{ generate_bundle_sgo.stdout }}" + sto_bundle_info: "{{ generate_bundle_sto.stdout_lines[-1] | from_json }}" + sgo_bundle_info: "{{ generate_bundle_sgo.stdout_lines[-1] | from_json }}" - name: Get the builder-dockercfg Secret name ansible.builtin.command: oc get secret -n {{ namespace }} --field-selector='type==kubernetes.io/dockercfg' -ojsonpath='{.items[?(@.metadata.annotations.kubernetes\.io/service-account\.name=="builder")].metadata.name}' From aa95e093f074c61379bbc77e48b9a5b206971802 Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Tue, 19 Sep 2023 13:42:51 +0100 Subject: [PATCH 54/95] Improve logging (#471) * [better_logging][stf-run-ci] Add more output information and logfiles Update s{t,g}o_bundle_info Use stdout_lines to get the last line of the output, and from_json to parse it from json to a dict. 
Additional lines of debug in the generate_bundle script will not effect the value of *_bundle_info * main: add timeout and logfile for validate deployment * create_builds: always show output of the build (block/always) * Show more verbose output on builds * Always show the build output (block/always) * Name the tasks to describe what's happening * smoketest: set -x to see more details * pass LOGFILE to generate_bundle.sh * [improve_logging] Add logfile_dir var * Remove set -x from smoketests * Update build/generate_bundle.sh Co-authored-by: Emma Foley Co-authored-by: Leif Madsen Co-authored-by: Victoria Martinez de la Cruz --- build/generate_bundle.sh | 2 + build/stf-run-ci/tasks/create_builds.yml | 28 +++++++++--- build/stf-run-ci/tasks/main.yml | 18 ++++++-- .../tasks/setup_stf_local_build.yml | 43 ++++++++++--------- 4 files changed, 62 insertions(+), 29 deletions(-) diff --git a/build/generate_bundle.sh b/build/generate_bundle.sh index 186354cf4..14a635cf5 100755 --- a/build/generate_bundle.sh +++ b/build/generate_bundle.sh @@ -3,10 +3,12 @@ set -e set -x LOGFILE=${LOGFILE:-/dev/null} + # If LOGFILE is /dev/null, this command fails, so ignore that error truncate --size=0 ${LOGFILE} || true OPERATOR_SDK=${OPERATOR_SDK:-operator-sdk} + REL=$( readlink -f $(dirname "$0")) # shellcheck source=build/metadata.sh diff --git a/build/stf-run-ci/tasks/create_builds.yml b/build/stf-run-ci/tasks/create_builds.yml index 132f64e55..e54b77cb9 100644 --- a/build/stf-run-ci/tasks/create_builds.yml +++ b/build/stf-run-ci/tasks/create_builds.yml @@ -23,11 +23,25 @@ - name: Kill first build since it will always fail (triggered on BuildConfig creation) ansible.builtin.shell: sleep 10 ; oc delete build {{ artifact.name }}-1 -n "{{ namespace }}" + ignore_errors: true + retries: 3 + delay: 10 + register: kill_build + until: kill_build.rc == 0 -- name: Start local image build - ansible.builtin.command: oc start-build {{ artifact.name }} -n "{{ namespace }}" --wait --from-dir "{{ 
artifact.working_build_dir }}" - register: build_results - when: build_lookup.resources | length == 0 +- block: + - name: Start local image build + ansible.builtin.command: oc start-build {{ artifact.name }} -n "{{ namespace }}" --follow --wait --from-dir "{{ artifact.working_build_dir }}" + register: build_results + when: build_lookup.resources | length == 0 + ignore_errors: true + retries: 3 + delay: 10 + until: build_results.rc == 0 + always: + - name: "Show build results" + ansible.builtin.debug: + var: build_results - name: Get latest build information for artifact ansible.builtin.command: oc get build --selector build={{ artifact.name }} -n "{{ namespace }}" -ojsonpath='{.items[-1:]}' @@ -37,10 +51,12 @@ ansible.builtin.set_fact: build_describe: "{{ build_describe_results.stdout | from_json }}" -- ansible.builtin.debug: +- name: Get the build results + ansible.builtin.debug: var: build_describe -- ansible.builtin.debug: +- name: Show the outputDockerImageReference, which will be used for the image reference name + ansible.builtin.debug: var: build_describe.status.outputDockerImageReference - name: Set unique image reference for this artifact diff --git a/build/stf-run-ci/tasks/main.yml b/build/stf-run-ci/tasks/main.yml index f12644a84..78bf11d63 100644 --- a/build/stf-run-ci/tasks/main.yml +++ b/build/stf-run-ci/tasks/main.yml @@ -79,6 +79,16 @@ cmd: "{{ base_dir }}/get_operator_sdk.sh {{ operator_sdk_v1 }}" creates: "{{ base_dir }}/working/operator-sdk-{{ operator_sdk_v1 }}" +- name: Set logfile_dir + when: not (logfile_dir is defined) + ansible.builtin.set_fact: + logfile_dir: "{{ base_dir }}/working/logs" + +- name: Make sure the logging dir exists + ansible.builtin.command: + cmd: mkdir -p {{ logfile_dir }} + creates: "{{ logfile_dir }}" + # -- create artifacts - when: __local_build_enabled | bool tags: @@ -179,10 +189,12 @@ - name: Validate system is operational ansible.builtin.shell: | - OCP_PROJECT="{{ namespace }}" VALIDATION_SCOPE="{{ 
__service_telemetry_observability_strategy }}" "{{ base_dir }}/validate_deployment.sh" + OCP_PROJECT="{{ namespace }}" VALIDATION_SCOPE="{{ __service_telemetry_observability_strategy }}" "{{ base_dir }}/validate_deployment.sh" >> {{ logfile_dir }}/validate_deployment.log 2>&1 args: executable: /bin/bash register: validate_deployment - - ansible.builtin.debug: - var: validate_deployment.stdout_lines + - name: Show the result of the validate_deployment script + ansible.builtin.shell: + cmd: | + cat {{ logfile_dir }}/validate_deployment.log diff --git a/build/stf-run-ci/tasks/setup_stf_local_build.yml b/build/stf-run-ci/tasks/setup_stf_local_build.yml index bf10fc2eb..70fc3f1e3 100644 --- a/build/stf-run-ci/tasks/setup_stf_local_build.yml +++ b/build/stf-run-ci/tasks/setup_stf_local_build.yml @@ -3,25 +3,28 @@ # to use the generate_bundle_ content for use in other places # --- Smart Gateway Operator --- -- name: Generate Smart Gateway Operator CSV - ansible.builtin.shell: - chdir: working/smart-gateway-operator/build - cmd: | - LOGFILE="{{ ansible_env.HOME }}/sgo_gen_bundle.log" \ - OPERATOR_SDK="{{ base_dir }}/working/operator-sdk-{{ operator_sdk_v0 }}" \ - WORKING_DIR="{{ base_dir }}/working/smart-gateway-operator-bundle" \ - RELATED_IMAGE_CORE_SMARTGATEWAY={{ sg_core_image_path | parse_image | quote }} \ - RELATED_IMAGE_BRIDGE_SMARTGATEWAY={{ sg_bridge_image_path | parse_image | quote }} \ - RELATED_IMAGE_CORE_SMARTGATEWAY_TAG={{ sg_core_image_path | parse_tag | quote }} \ - RELATED_IMAGE_BRIDGE_SMARTGATEWAY_TAG={{ sg_bridge_image_path | parse_tag | quote }} \ - OPERATOR_IMAGE={{ sgo_image_path | parse_image | quote }} \ - OPERATOR_TAG={{ sgo_image_path | parse_tag | quote }} \ - ./generate_bundle.sh - register: generate_bundle_sgo - -- name: Results of SGO bundle generation - ansible.builtin.debug: - var: generate_bundle_sgo.stdout +- block: + - name: Generate Smart Gateway Operator CSV + ansible.builtin.shell: + chdir: working/smart-gateway-operator/build + cmd: 
| + LOGFILE="{{ logfile_dir }}/sgo_gen_bundle.log" \ + OPERATOR_SDK="{{ base_dir }}/working/operator-sdk-{{ operator_sdk_v0 }}" \ + WORKING_DIR="{{ base_dir }}/working/smart-gateway-operator-bundle" \ + RELATED_IMAGE_CORE_SMARTGATEWAY={{ sg_core_image_path | parse_image | quote }} \ + RELATED_IMAGE_BRIDGE_SMARTGATEWAY={{ sg_bridge_image_path | parse_image | quote }} \ + RELATED_IMAGE_CORE_SMARTGATEWAY_TAG={{ sg_core_image_path | parse_tag | quote }} \ + RELATED_IMAGE_BRIDGE_SMARTGATEWAY_TAG={{ sg_bridge_image_path | parse_tag | quote }} \ + OPERATOR_IMAGE={{ sgo_image_path | parse_image | quote }} \ + OPERATOR_TAG={{ sgo_image_path | parse_tag | quote }} \ + ./generate_bundle.sh + register: generate_bundle_sgo + always: + # "|| true" is needed until https://github.com/infrawatch/smart-gateway-operator/pull/143 is merged + - name: Show generate bundle log + ansible.builtin.shell: + cmd: | + cat {{ logfile_dir }}/sgo_gen_bundle.log || true - name: Replace namespace in SGO role binding ansible.builtin.replace: @@ -55,7 +58,7 @@ ansible.builtin.shell: chdir: "{{ base_dir }}" cmd: | - LOGFILE="{{ ansible_env.HOME }}/sto_gen_bundle.log" \ + LOGFILE="{{ logfile_dir }}/sto_gen_bundle.log" \ OPERATOR_SDK="{{ base_dir }}/working/operator-sdk-{{ operator_sdk_v0 }}" \ WORKING_DIR="{{ base_dir }}/working/service-telemetry-operator-bundle" \ RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP={{ prometheus_webhook_snmp_image_path | parse_image | quote }} \ From 59a40d2f17518dc35b83e6df537daeeed05053e7 Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Tue, 19 Sep 2023 10:18:36 -0400 Subject: [PATCH 55/95] Relax pod admission controls when using a local catalog index (#450) * Relax pod admission controls when using a local catalog index See https://docs.openshift.com/container-platform/4.13/operators/admin/olm-managing-custom-catalogs.html#olm-catalog-sources-and-psa_olm-managing-custom-catalogs * Fix FQCN lint --- build/stf-run-ci/tasks/create_catalog.yml | 2 ++ 
build/stf-run-ci/tasks/main.yml | 14 ++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/build/stf-run-ci/tasks/create_catalog.yml b/build/stf-run-ci/tasks/create_catalog.yml index ea950f72c..459c28d8b 100644 --- a/build/stf-run-ci/tasks/create_catalog.yml +++ b/build/stf-run-ci/tasks/create_catalog.yml @@ -142,6 +142,8 @@ image: "{{ stf_index_image_path }}" publisher: CloudOps sourceType: grpc + grpcPodConfig: + securityContextConfig: legacy updateStrategy: registryPoll: interval: 1m diff --git a/build/stf-run-ci/tasks/main.yml b/build/stf-run-ci/tasks/main.yml index 78bf11d63..44157f075 100644 --- a/build/stf-run-ci/tasks/main.yml +++ b/build/stf-run-ci/tasks/main.yml @@ -124,6 +124,20 @@ tags: - deploy +- when: __deploy_from_index_enabled | bool or __deploy_from_bundles_enabled | bool + name: Relax the pod security admission controls to allow local catalog index registry pods + kubernetes.core.k8s: + definition: + apiVersion: v1 + kind: Namespace + metadata: + name: "{{ namespace }}" + labels: + security.openshift.io/scc.podSecurityLabelSync: "false" + pod-security.kubernetes.io/enforce: baseline + pod-security.kubernetes.io/audit: restricted + pod-security.kubernetes.io/warn: restricted + - when: __deploy_from_index_enabled | bool tags: - create_bundles From a4b4b7a059687f29d1a2eaa944bf0949c67b2bce Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Thu, 21 Sep 2023 11:14:40 +0100 Subject: [PATCH 56/95] [stf-run-ci] Update pull_secret check for running bundles (#474) When switching from tags to skipping using a bool, the check ``pull_secret is defined`` became invalid. When a task is skipped, it still returns a value, which is captured with `register: pull_secret` The run bundle command was using `pull_secret is defined` to determine whether to pass the pull-secret arg, and this check was defunct, since pull_secret is always defined. The solution for this is to re-set the value back to a 0-length string, and change the conditional to check the length. 
Additionally, the boolean values needed to be explicitly treated as bools so that the registry setup tasks would be skipped --- .../tasks/setup_stf_from_bundles.yml | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/build/stf-run-ci/tasks/setup_stf_from_bundles.yml b/build/stf-run-ci/tasks/setup_stf_from_bundles.yml index 7b8259a14..8439dce4a 100644 --- a/build/stf-run-ci/tasks/setup_stf_from_bundles.yml +++ b/build/stf-run-ci/tasks/setup_stf_from_bundles.yml @@ -1,4 +1,4 @@ -- when: setup_bundle_registry_auth +- when: setup_bundle_registry_auth | bool block: - name: Get existing Pull Secret from openshift config kubernetes.core.k8s_info: @@ -50,7 +50,7 @@ data: .dockerconfigjson: "{{ new_dockerconfigjson | tojson | b64encode }}" -- when: setup_bundle_registry_tls_ca +- when: setup_bundle_registry_tls_ca | bool name: Create registry CA Cert kubernetes.core.k8s: state: present @@ -76,6 +76,17 @@ value: - name: pull-secret + # When the task is skipped, pull_secret is still defined. It is set to the task output i.e. + # "pull_secret": { + # "changed": false, + # "skip_reason": "Conditional result was False", + # "skipped": true + # } +- name: "Set pull_secret to a zero-length string, if setup_bundle_registry_auth is false" + when: not (setup_bundle_registry_auth | bool) + ansible.builtin.set_fact: + pull_secret: '' + - name: "Ensure that the bundle paths are set." 
ansible.builtin.assert: that: @@ -86,8 +97,8 @@ - name: Deploy SGO via OLM bundle ansible.builtin.shell: - cmd: "{{ base_dir }}/working/operator-sdk-{{ operator_sdk_v1 }} run bundle {{ __smart_gateway_bundle_image_path }} {% if pull_secret is defined %} --pull-secret-name=pull-secret --ca-secret-name=registry-tls-ca {% endif %} --namespace={{ namespace }} --timeout 600s" + cmd: "{{ base_dir }}/working/operator-sdk-{{ operator_sdk_v1 }} --verbose run bundle {{ __smart_gateway_bundle_image_path }} {% if pull_secret | length > 0 %} --pull-secret-name=pull-secret --ca-secret-name=registry-tls-ca {% endif %} --namespace={{ namespace }} --timeout 600s" - name: Deploy STO via OLM bundle ansible.builtin.shell: - cmd: "{{ base_dir }}/working/operator-sdk-{{ operator_sdk_v1 }} run bundle {{ __service_telemetry_bundle_image_path }} {% if pull_secret is defined %} --pull-secret-name=pull-secret --ca-secret-name=registry-tls-ca {% endif %} --namespace={{ namespace }} --timeout 600s" + cmd: "{{ base_dir }}/working/operator-sdk-{{ operator_sdk_v1 }} --verbose run bundle {{ __service_telemetry_bundle_image_path }} {% if pull_secret | length > 0 %} --pull-secret-name=pull-secret --ca-secret-name=registry-tls-ca {% endif %} --namespace={{ namespace }} --timeout 600s" From dcf6964cbf0e571022e29fb500ae355e1706a3ae Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Thu, 21 Sep 2023 07:33:56 -0400 Subject: [PATCH 57/95] Install ObO during index-based deployments (#472) Because Observability Operator (ObO) is a cluster-scoped Operator, the OLM dependency management can't resolve the dependency for us. Update setup_base to pre-install ObO when observabilityStrategy is use_redhat or use_hybrid, even when when index-based deployment is enabled. 
Resolves: STF-1483 --- build/stf-run-ci/tasks/setup_base.yml | 36 +++++++++++++-------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/build/stf-run-ci/tasks/setup_base.yml b/build/stf-run-ci/tasks/setup_base.yml index b25826f12..cf9c92fdf 100644 --- a/build/stf-run-ci/tasks/setup_base.yml +++ b/build/stf-run-ci/tasks/setup_base.yml @@ -121,24 +121,24 @@ when: - __service_telemetry_observability_strategy == "use_community" - - name: Subscribe to Red Hat Obervability Operator - kubernetes.core.k8s: - definition: - apiVersion: operators.coreos.com/v1alpha1 - kind: Subscription - metadata: - labels: - operators.coreos.com/observability-operator.openshift-operators: "" - name: observability-operator - namespace: openshift-operators - spec: - channel: stable - installPlanApproval: Automatic - name: observability-operator - source: community-operators - sourceNamespace: openshift-marketplace - when: - - __service_telemetry_observability_strategy in ['use_redhat', 'use_hybrid'] +- name: Subscribe to Red Hat Obervability Operator + kubernetes.core.k8s: + definition: + apiVersion: operators.coreos.com/v1alpha1 + kind: Subscription + metadata: + labels: + operators.coreos.com/observability-operator.openshift-operators: "" + name: observability-operator + namespace: openshift-operators + spec: + channel: stable + installPlanApproval: Automatic + name: observability-operator + source: community-operators + sourceNamespace: openshift-marketplace + when: + - __service_telemetry_observability_strategy in ['use_redhat', 'use_hybrid'] - name: Subscribe to Elastic Cloud on Kubernetes Operator kubernetes.core.k8s: From 3765a658aebdaaf62614a5a7ecea38c9878c6ac7 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Thu, 21 Sep 2023 07:45:55 -0400 Subject: [PATCH 58/95] Update 17.1 helper scripts README (#476) Update the README to move the OCP_ROUTE_IP to the top making it easier to not include it when copying the command (since it needs to be populated manually). 
Update the default path contents with an oc command to fill that in automatically. --- tests/infrared/17.1/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/infrared/17.1/README.md b/tests/infrared/17.1/README.md index 576e91c94..15bcf37a9 100644 --- a/tests/infrared/17.1/README.md +++ b/tests/infrared/17.1/README.md @@ -3,9 +3,9 @@ ## Basic deployment ```bash -CA_CERT_FILE_CONTENT="$(oc get secret/default-interconnect-selfsigned -o jsonpath='{.data.ca\.crt}' | base64 -d)" \ OCP_ROUTE_IP="10.0.100.50" \ -AMQP_HOST="default-interconnect-5671-service-telemetry.apps.stf15.localhost" \ +CA_CERT_FILE_CONTENT="$(oc get secret/default-interconnect-selfsigned -o jsonpath='{.data.ca\.crt}' | base64 -d)" \ +AMQP_HOST="$(oc get route default-interconnect-5671 -ojsonpath='{.spec.host}')" \ ENABLE_STF_CONNECTORS=true \ ENABLE_GNOCCHI_CONNECTORS=false \ CONTROLLER_MEMORY="24000" \ From 805ada4bf45074bf1ad513eb2e01d0568a1bff80 Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Thu, 21 Sep 2023 15:54:30 +0100 Subject: [PATCH 59/95] [issue#306] Add missing ClusterRoles (#465) * [issue#306] Add missing ClusterRoles The cluster-monitoring-operator is required for STF to install. It creates the required alertmanager-main and prometheus-k8s. ClusterRoles, and STF relies on these being present. These are not present when using CRC, so ClusterRoles need to be explicitly created. The names of the ClusterRoles have been updated, in case there is some conflict when cluster-monitoring-operator is installed after STF. This is a workaround for not having cluster-monitoring-operator installed: https://github.com/infrawatch/service-telemetry-operator/issues/306 resolves #306 * Fix up the RBAC setup for prometheus-stf (#467) Fix up the RBAC changes to fully get prometheus-stf working and decoupled from prometheus-k8s. 
Changes to using a separate prometheus-stf ClusterRole, ClusterRoleBinding, and ServiceAccount, along with a Role and RoleBinding, all using prometheus-stf as the ServiceAccount. Also updates the Alertmanager configuration to use alertmanager-stf instead of alertmanager-main. * Fix smoketest to use prometheus-stf for token retrieval * Refactor smoketest script (#468) * Refactor smoketest script Perform a bit of smoketest refactoring and fix up a few bugs. * Update alert trigger to use startsAt in order to potentially speed up delivery of the alerts. Failures in the SNMP_WEBHOOK_STATUS seems to be primarily to delayed alert notification through prometheus-snmp-webhook. * Add an alert clean up task as part of the clean up logic at the end. * Update openssl x509 to not use the -in flag which seems unnecessary and on some systems causes a failure. * Add new SMOKETEST_VERBOSE boolean so local testing can skip massive amounts of information dumped to stdout. * Remove curl pod using label selector for slightly cleaner output. * Update failure check to combine RET and SNMP_WEBHOOK_STATUS since testing seems to show changes are slightly more reliable. * Show logs from curl * Remove nodes/metrics permission from ClusterRole As part of least priviledge work, remove the nodes/metrics permission as we're not scraping nodes for information. Everything appears to continue working in STF without this permission. * Move SCC RBAC from ClusterRole to Role Working on simplifying and reducing our access scope as much as possible. It appears moving SCC RBAC from ClusterRole to Role allows things to continue to work with Prometheus. It's possible further testing may reveal this will need to reverted. * Convert alertmanager-stf Role to ClusterRole (#473) Convert alertmanager-stf Role to ClusterRole as the tokenreviews and subjectaccessreviews resources need to be accessable at the cluster scope. 
* Create ClusterRoleBinding and Role for alertmanager (#475) * Create ClusterRoleBinding and Role for alertmanager Create appropriate ClusterRoleBinding and Role for alertmanager-stf, breaking out SCC into a Role vs ClusterRole to keep things in alignment to prometheus-stf RBAC setup. * Adjust smoketest.sh for SNMP webhook test failures Adjust the smoketest script to also fail when the SNMP webhook test has failed. Add a wait condition for the curl pod to complete so logs can be retrieved. * Add *RoleBinding rescue capabilities If changes happen to the ClusterRoleBinding or RoleBinding then generally the system is not going to allow you to patch the object. Adds block/rescue logic to remove the existing ClusterRoleBinding or RoleBinding before creating it when patching the object fails. --------- Co-authored-by: Leif Madsen --- deploy/role.yaml | 3 +- .../tasks/component_alertmanager.yml | 119 ++++++++++- .../tasks/component_prometheus.yml | 200 ++++++++++++------ .../templates/manifest_prometheus.j2 | 4 +- tests/smoketest/smoketest.sh | 148 ++++++------- 5 files changed, 327 insertions(+), 147 deletions(-) diff --git a/deploy/role.yaml b/deploy/role.yaml index 1d0a841fa..58966b6f7 100644 --- a/deploy/role.yaml +++ b/deploy/role.yaml @@ -22,6 +22,7 @@ rules: - watch - update - patch + - delete - apiGroups: - authorization.k8s.io resources: @@ -185,4 +186,4 @@ rules: verbs: - get - list - - watch \ No newline at end of file + - watch diff --git a/roles/servicetelemetry/tasks/component_alertmanager.yml b/roles/servicetelemetry/tasks/component_alertmanager.yml index bcb63e44f..1e5551311 100644 --- a/roles/servicetelemetry/tasks/component_alertmanager.yml +++ b/roles/servicetelemetry/tasks/component_alertmanager.yml @@ -66,7 +66,7 @@ kind: Route name: '{{ ansible_operator_meta.name }}-alertmanager-proxy' -- name: Add a service account to used by Alertmanager +- name: Create ServiceAccount/alertmanager-stf with oauth redirect annotation k8s: definition: apiVersion: v1 
@@ -77,22 +77,121 @@ annotations: serviceaccounts.openshift.io/oauth-redirectreference.alertmanager: '{{ alertmanager_oauth_redir_ref | to_json }}' -- name: Bind role +- name: Create ClusterRole/alertmanager-stf k8s: definition: apiVersion: rbac.authorization.k8s.io/v1 - kind: ClusterRoleBinding + kind: ClusterRole metadata: name: alertmanager-stf - namespace: '{{ ansible_operator_meta.namespace }}' - roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: alertmanager-main - subjects: - - kind: ServiceAccount + rules: + - apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create + - apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create + +- name: Setup ClusterRoleBinding for Alertmanager + block: + - name: Define ClusterRoleBinding/alertmanager-stf + set_fact: + def_alertmanager_stf_crb: | + apiVersion: rbac.authorization.k8s.io/v1 + kind: ClusterRoleBinding + metadata: + name: alertmanager-stf + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: alertmanager-stf + subjects: + - kind: ServiceAccount + name: alertmanager-stf + namespace: '{{ ansible_operator_meta.namespace }}' + + - name: Create ClusterRoleBinding/alertmanager-stf + k8s: + definition: + "{{ def_alertmanager_stf_crb }}" + rescue: + - name: Remove ClusterRoleBinding/alertmanager-stf when fail to update + k8s: + state: absent + definition: + apiVersion: rbac.authorization.k8s.io/v1 + kind: ClusterRoleBinding + metadata: + name: alertmanager-stf + + - name: Create ClusterRoleBinding/alertmanager-stf + k8s: + definition: + "{{ def_alertmanager_stf_crb }}" + +- name: Create Role/alertmanager-stf + k8s: + definition: + apiVersion: rbac.authorization.k8s.io/v1 + kind: Role + metadata: name: alertmanager-stf namespace: '{{ ansible_operator_meta.namespace }}' + rules: + - apiGroups: + - security.openshift.io + resourceNames: + - nonroot + resources: + - securitycontextconstraints + verbs: + - use + 
+- name: Setup RoleBinding for Alertmanager + block: + - name: Define RoleBinding/alertmanager-stf + set_fact: + def_alertmanager_stf_rb: | + apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + metadata: + name: alertmanager-stf + namespace: '{{ ansible_operator_meta.namespace }}' + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: alertmanager-stf + namespace: '{{ ansible_operator_meta.namespace }}' + subjects: + - kind: ServiceAccount + name: alertmanager-stf + namespace: '{{ ansible_operator_meta.namespace }}' + + - name: Create RoleBinding/alertmanager-stf + k8s: + definition: + "{{ def_alertmanager_stf_rb }}" + rescue: + - name: Remove RoleBinding/alertmanager-stf when fail to update + k8s: + state: absent + definition: + apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + metadata: + name: alertmanager-stf + namespace: '{{ ansible_operator_meta.namespace }}' + + - name: Create RoleBinding/alertmanager-stf + k8s: + definition: + "{{ def_alertmanager_stf_rb }}" - name: Set default alertmanager service template set_fact: diff --git a/roles/servicetelemetry/tasks/component_prometheus.yml b/roles/servicetelemetry/tasks/component_prometheus.yml index eb890c1be..2e865abd2 100644 --- a/roles/servicetelemetry/tasks/component_prometheus.yml +++ b/roles/servicetelemetry/tasks/component_prometheus.yml @@ -7,91 +7,171 @@ kind: Route name: '{{ ansible_operator_meta.name }}-prometheus-proxy' -- name: Add oauth redirect annotation to prometheus-k8s service account +- name: Create ServiceAccount/prometheus-stf with oauth redirect annotation k8s: definition: apiVersion: v1 kind: ServiceAccount metadata: - name: prometheus-k8s + name: prometheus-stf namespace: '{{ ansible_operator_meta.namespace }}' annotations: serviceaccounts.openshift.io/oauth-redirectreference.prometheus: '{{ prom_oauth_redir_ref | to_json }}' -- block: - - name: Install RBAC Role for prometheus operations +- name: Create ClusterRole/prometheus-stf for non-resource 
URL /metrics access + k8s: + definition: + apiVersion: rbac.authorization.k8s.io/v1 + kind: ClusterRole + metadata: + name: prometheus-stf + rules: + - nonResourceURLs: + - /metrics + verbs: + - get + - apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create + - apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get + +- name: Setup ClusterRoleBinding for Prometheus + block: + - name: Define ClusterRoleBinding/prometheus-stf + set_fact: + def_prometheus_stf_crb: | + apiVersion: rbac.authorization.k8s.io/v1 + kind: ClusterRoleBinding + metadata: + name: prometheus-stf + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus-stf + subjects: + - kind: ServiceAccount + name: prometheus-stf + namespace: '{{ ansible_operator_meta.namespace }}' + + - name: Create ClusterRoleBinding/prometheus-stf + k8s: + definition: + "{{ def_prometheus_stf_crb }}" + rescue: + - name: Remove ClusterRoleBinding/prometheus-stf when fail to update + k8s: + state: absent + definition: + apiVersion: rbac.authorization.k8s.io/v1 + kind: ClusterRoleBinding + metadata: + name: prometheus-stf + + - name: Create ClusterRoleBinding/prometheus-stf + k8s: + definition: + "{{ def_prometheus_stf_crb }}" + +- name: Create Role/prometheus-stf for Prometheus operations + k8s: + definition: + apiVersion: rbac.authorization.k8s.io/v1 + kind: Role + metadata: + name: prometheus-stf + namespace: '{{ ansible_operator_meta.namespace }}' + rules: + - apiGroups: + - "" + resources: + - services + - endpoints + - pods + verbs: + - get + - list + - watch + - apiGroups: + - extensions + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - list + - watch + - apiGroups: + - security.openshift.io + resourceNames: + - nonroot + - nonroot-v2 + resources: + - securitycontextconstraints + verbs: + - use + +- name: Setup RoleBinding for 
Prometheus + block: + - name: Define RoleBinding/prometheus-stf + set_fact: + def_prometheus_stf_rb: | + apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + metadata: + name: prometheus-stf + namespace: '{{ ansible_operator_meta.namespace }}' + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-stf + subjects: + - kind: ServiceAccount + name: prometheus-stf + namespace: '{{ ansible_operator_meta.namespace }}' + + - name: Create RoleBinding/prometheus-stf + k8s: + definition: + "{{ def_prometheus_stf_rb }}" + rescue: + - name: Remove RoleBinding/prometheus-stf on failure to update k8s: + state: absent definition: apiVersion: rbac.authorization.k8s.io/v1 - kind: Role + kind: RoleBinding metadata: name: prometheus-stf namespace: '{{ ansible_operator_meta.namespace }}' - rules: - - apiGroups: - - "" - resources: - - services - - endpoints - - pods - verbs: - - get - - list - - watch - - apiGroups: - - extensions - - networking.k8s.io - resources: - - ingresses - verbs: - - get - - list - - watch - - apiGroups: - - security.openshift.io - resourceNames: - - nonroot - - nonroot-v2 - resources: - - securitycontextconstraints - verbs: - - use - - - name: Bind the local prometheus SA to our new role + + - name: Create RoleBinding/prometheus-stf k8s: definition: - apiVersion: rbac.authorization.k8s.io/v1 - kind: RoleBinding - metadata: - name: prometheus-k8s-stf - namespace: '{{ ansible_operator_meta.namespace }}' - roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: prometheus-stf - subjects: - - kind: ServiceAccount - name: prometheus-k8s - namespace: '{{ ansible_operator_meta.namespace }}' - when: - - observability_strategy in ['use_redhat', 'use_hybrid'] + "{{ def_prometheus_stf_rb }}" -- name: Bind the local prometheus SA to prometheus cluster role (for oauth perms) +- name: Remove old ClusterRoleBinding for prometheus-k8s using CMO roleRef k8s: + state: absent definition: apiVersion: rbac.authorization.k8s.io/v1 
kind: ClusterRoleBinding metadata: name: prometheus-k8s-{{ ansible_operator_meta.namespace }} namespace: '{{ ansible_operator_meta.namespace }}' - roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: prometheus-k8s - subjects: - - kind: ServiceAccount - name: prometheus-k8s - namespace: '{{ ansible_operator_meta.namespace }}' - name: Check for existing prometheus htpasswd user secret k8s_info: diff --git a/roles/servicetelemetry/templates/manifest_prometheus.j2 b/roles/servicetelemetry/templates/manifest_prometheus.j2 index 4e8651def..2bdf408b9 100644 --- a/roles/servicetelemetry/templates/manifest_prometheus.j2 +++ b/roles/servicetelemetry/templates/manifest_prometheus.j2 @@ -11,7 +11,7 @@ spec: replicas: {{ servicetelemetry_vars.backends.metrics.prometheus.deployment_size }} ruleSelector: {} securityContext: {} - serviceAccountName: prometheus-k8s + serviceAccountName: prometheus-stf serviceMonitorSelector: matchLabels: app: smart-gateway @@ -44,7 +44,7 @@ spec: - -upstream=http://localhost:9090/ - -htpasswd-file=/etc/proxy/htpasswd/auth - -cookie-secret-file=/etc/proxy/secrets/session_secret - - -openshift-service-account=prometheus-k8s + - -openshift-service-account=prometheus-stf - '-openshift-sar={"resource": "namespaces", "verb": "get"}' ports: - containerPort: 9092 diff --git a/tests/smoketest/smoketest.sh b/tests/smoketest/smoketest.sh index 8a801c004..4204398f2 100755 --- a/tests/smoketest/smoketest.sh +++ b/tests/smoketest/smoketest.sh @@ -28,6 +28,7 @@ if [ "${OC_CLIENT_VERSION_Y}" -lt "${OC_CLIENT_VERSION_Y_REQUIRED}" ] || [ "${OC fi CLEANUP=${CLEANUP:-true} +SMOKETEST_VERBOSE=${SMOKETEST_VERBOSE:-true} for ((i=1; i<=NUMCLOUDS; i++)); do NAME="smoke${i}" @@ -71,33 +72,18 @@ echo "*** [INFO] Triggering an alertmanager notification..." 
# check if the oc client version is less than 4.11 and adjust the token command to match available commands if [ 0${OC_CLIENT_VERSION_Y} -lt 011 ]; then - PROMETHEUS_K8S_TOKEN=$(oc serviceaccounts get-token prometheus-k8s) + PROMETHEUS_K8S_TOKEN=$(oc serviceaccounts get-token prometheus-stf) else - PROMETHEUS_K8S_TOKEN=$(oc create token prometheus-k8s) + PROMETHEUS_K8S_TOKEN=$(oc create token prometheus-stf) fi -oc run curl --restart='Never' --image=quay.io/infrawatch/busyboxplus:curl -- sh -c "curl -k -H \"Content-Type: application/json\" -H \"Authorization: Bearer ${PROMETHEUS_K8S_TOKEN}\" -d '[{\"labels\":{\"alertname\":\"Testalert1\"}}]' https://default-alertmanager-proxy:9095/api/v1/alerts" -# it takes some time to get the alert delivered, continuing with other tests - - -# Trying to find a less brittle test than a timeout -JOB_TIMEOUT=300s -for NAME in "${CLOUDNAMES[@]}"; do - echo "*** [INFO] Waiting on job/stf-smoketest-${NAME}..." - oc wait --for=condition=complete --timeout=${JOB_TIMEOUT} "job/stf-smoketest-${NAME}" - RET=$((RET || $?)) # Accumulate exit codes -done - -echo "*** [INFO] Checking that the qdr certificate has a long expiry" -EXPIRETIME=$(oc get secret default-interconnect-openstack-ca -o json | grep \"tls.crt\"\: | awk -F '": "' '{print $2}' | rev | cut -c3- | rev | base64 -d | openssl x509 -in - -text | grep "Not After" | awk -F " : " '{print $2}') -EXPIRETIME_UNIX=$(date -d "${EXPIRETIME}" "+%s") -TARGET_UNIX=$(date -d "now + 7 years" "+%s") -if [ ${EXPIRETIME_UNIX} -lt ${TARGET_UNIX} ]; then - echo "[FAILURE] Certificate expire time (${EXPIRETIME}) less than 7 years from now" -fi +# create the alert using startsAt which in theory may cause trigger to be faster +echo "*** [INFO] Create alert" +oc delete pod -l run=curl ; oc run curl --wait --restart='Never' --image=quay.io/infrawatch/busyboxplus:curl -- sh -c "curl -v -k -H \"Content-Type: application/json\" -H \"Authorization: Bearer ${PROMETHEUS_K8S_TOKEN}\" -d 
'[{\"status\":\"firing\",\"labels\":{\"alertname\":\"smoketest\",\"severity\":\"warning\"},\"startsAt\":\"$(date --rfc-3339=seconds | sed 's/ /T/')\"}]' https://default-alertmanager-proxy:9095/api/v1/alerts" +oc wait --for=jsonpath='{.status.phase}'=Succeeded pod/curl +oc logs curl echo "*** [INFO] Waiting to see SNMP trap message in webhook pod" -oc delete pod curl SNMP_WEBHOOK_POD=$(oc get pod -l "app=default-snmp-webhook" -ojsonpath='{.items[0].metadata.name}') SNMP_WEBHOOK_CHECK_MAX_TRIES=5 SNMP_WEBHOOK_CHECK_TIMEOUT=30 @@ -112,74 +98,88 @@ while [ $SNMP_WEBHOOK_CHECK_COUNT -lt $SNMP_WEBHOOK_CHECK_MAX_TRIES ]; do sleep $SNMP_WEBHOOK_CHECK_TIMEOUT done -echo "*** [INFO] Showing oc get all..." -oc get all -echo - -echo "*** [INFO] Showing servicemonitors..." -oc get servicemonitor -o yaml -echo - -echo "*** [INFO] Logs from smoketest containers..." +# Trying to find a less brittle test than a timeout +JOB_TIMEOUT=300s for NAME in "${CLOUDNAMES[@]}"; do - oc logs "$(oc get pod -l "job-name=stf-smoketest-${NAME}" -o jsonpath='{.items[0].metadata.name}')" -c smoketest-collectd - oc logs "$(oc get pod -l "job-name=stf-smoketest-${NAME}" -o jsonpath='{.items[0].metadata.name}')" -c smoketest-ceilometer + echo "*** [INFO] Waiting on job/stf-smoketest-${NAME}..." + oc wait --for=condition=complete --timeout=${JOB_TIMEOUT} "job/stf-smoketest-${NAME}" + RET=$((RET || $?)) # Accumulate exit codes done -echo - -echo "*** [INFO] Logs from qdr..." -oc logs "$(oc get pod -l application=default-interconnect -o jsonpath='{.items[0].metadata.name}')" -echo -echo "*** [INFO] Logs from smart gateways..." 
-oc logs "$(oc get pod -l "smart-gateway=default-cloud1-coll-meter" -o jsonpath='{.items[0].metadata.name}')" -c bridge -oc logs "$(oc get pod -l "smart-gateway=default-cloud1-coll-meter" -o jsonpath='{.items[0].metadata.name}')" -c sg-core -oc logs "$(oc get pod -l "smart-gateway=default-cloud1-coll-event" -o jsonpath='{.items[0].metadata.name}')" -c bridge -oc logs "$(oc get pod -l "smart-gateway=default-cloud1-coll-event" -o jsonpath='{.items[0].metadata.name}')" -c sg-core -oc logs "$(oc get pod -l "smart-gateway=default-cloud1-ceil-meter" -o jsonpath='{.items[0].metadata.name}')" -c bridge -oc logs "$(oc get pod -l "smart-gateway=default-cloud1-ceil-meter" -o jsonpath='{.items[0].metadata.name}')" -c sg-core -oc logs "$(oc get pod -l "smart-gateway=default-cloud1-ceil-event" -o jsonpath='{.items[0].metadata.name}')" -c bridge -oc logs "$(oc get pod -l "smart-gateway=default-cloud1-ceil-event" -o jsonpath='{.items[0].metadata.name}')" -c sg-core -oc logs "$(oc get pod -l "smart-gateway=default-cloud1-sens-meter" -o jsonpath='{.items[0].metadata.name}')" -c bridge -oc logs "$(oc get pod -l "smart-gateway=default-cloud1-sens-meter" -o jsonpath='{.items[0].metadata.name}')" -c sg-core -echo - -echo "*** [INFO] Logs from smart gateway operator..." -oc logs "$(oc get pod -l app=smart-gateway-operator -o jsonpath='{.items[0].metadata.name}')" -echo - -echo "*** [INFO] Logs from prometheus..." 
-oc logs "$(oc get pod -l prometheus=default -o jsonpath='{.items[0].metadata.name}')" -c prometheus -echo +echo "*** [INFO] Checking that the qdr certificate has a long expiry" +EXPIRETIME=$(oc get secret default-interconnect-openstack-ca -o json | grep \"tls.crt\"\: | awk -F '": "' '{print $2}' | rev | cut -c3- | rev | base64 -d | openssl x509 -text | grep "Not After" | awk -F " : " '{print $2}') +EXPIRETIME_UNIX=$(date -d "${EXPIRETIME}" "+%s") +TARGET_UNIX=$(date -d "now + 7 years" "+%s") +if [ ${EXPIRETIME_UNIX} -lt ${TARGET_UNIX} ]; then + echo "[FAILURE] Certificate expire time (${EXPIRETIME}) less than 7 years from now" +fi -echo "*** [INFO] Logs from elasticsearch..." -oc logs "$(oc get pod -l common.k8s.elastic.co/type=elasticsearch -o jsonpath='{.items[0].metadata.name}')" +echo "*** [INFO] Showing oc get all..." +oc get all echo -echo "*** [INFO] Logs from snmp webhook..." -oc logs "$(oc get pod -l app=default-snmp-webhook -o jsonpath='{.items[0].metadata.name}')" +echo "*** [INFO] Showing servicemonitors..." +oc get servicemonitors.monitoring.rhobs -o yaml echo -echo "*** [INFO] Logs from alertmanager..." -oc logs "$(oc get pod -l app.kubernetes.io/name=alertmanager -o jsonpath='{.items[0].metadata.name}')" -c alertmanager -echo +if [ "$SMOKETEST_VERBOSE" = "true" ]; then + echo "*** [INFO] Logs from smoketest containers..." + for NAME in "${CLOUDNAMES[@]}"; do + oc logs "$(oc get pod -l "job-name=stf-smoketest-${NAME}" -o jsonpath='{.items[0].metadata.name}')" -c smoketest-collectd + oc logs "$(oc get pod -l "job-name=stf-smoketest-${NAME}" -o jsonpath='{.items[0].metadata.name}')" -c smoketest-ceilometer + done + echo + + echo "*** [INFO] Logs from qdr..." + oc logs "$(oc get pod -l application=default-interconnect -o jsonpath='{.items[0].metadata.name}')" + echo + + echo "*** [INFO] Logs from smart gateways..." 
+ oc logs "$(oc get pod -l "smart-gateway=default-cloud1-coll-meter" -o jsonpath='{.items[0].metadata.name}')" -c bridge + oc logs "$(oc get pod -l "smart-gateway=default-cloud1-coll-meter" -o jsonpath='{.items[0].metadata.name}')" -c sg-core + oc logs "$(oc get pod -l "smart-gateway=default-cloud1-coll-event" -o jsonpath='{.items[0].metadata.name}')" -c bridge + oc logs "$(oc get pod -l "smart-gateway=default-cloud1-coll-event" -o jsonpath='{.items[0].metadata.name}')" -c sg-core + oc logs "$(oc get pod -l "smart-gateway=default-cloud1-ceil-meter" -o jsonpath='{.items[0].metadata.name}')" -c bridge + oc logs "$(oc get pod -l "smart-gateway=default-cloud1-ceil-meter" -o jsonpath='{.items[0].metadata.name}')" -c sg-core + oc logs "$(oc get pod -l "smart-gateway=default-cloud1-ceil-event" -o jsonpath='{.items[0].metadata.name}')" -c bridge + oc logs "$(oc get pod -l "smart-gateway=default-cloud1-ceil-event" -o jsonpath='{.items[0].metadata.name}')" -c sg-core + oc logs "$(oc get pod -l "smart-gateway=default-cloud1-sens-meter" -o jsonpath='{.items[0].metadata.name}')" -c bridge + oc logs "$(oc get pod -l "smart-gateway=default-cloud1-sens-meter" -o jsonpath='{.items[0].metadata.name}')" -c sg-core + echo + + echo "*** [INFO] Logs from smart gateway operator..." + oc logs "$(oc get pod -l app=smart-gateway-operator -o jsonpath='{.items[0].metadata.name}')" + echo + + echo "*** [INFO] Logs from prometheus..." + oc logs "$(oc get pod -l prometheus=default -o jsonpath='{.items[0].metadata.name}')" -c prometheus + echo + + echo "*** [INFO] Logs from elasticsearch..." + oc logs "$(oc get pod -l common.k8s.elastic.co/type=elasticsearch -o jsonpath='{.items[0].metadata.name}')" + echo + + echo "*** [INFO] Logs from snmp webhook..." + oc logs "$(oc get pod -l app=default-snmp-webhook -o jsonpath='{.items[0].metadata.name}')" + echo + + echo "*** [INFO] Logs from alertmanager..." 
+ oc logs "$(oc get pod -l app.kubernetes.io/name=alertmanager -o jsonpath='{.items[0].metadata.name}')" -c alertmanager + echo +fi echo "*** [INFO] Cleanup resources..." if $CLEANUP; then oc delete "job/stf-smoketest-${NAME}" + # resolve the alert to clean up the system, otherwise this expires in 5 minutes + oc delete pod -l run=curl ; oc run curl --restart='Never' --image=quay.io/infrawatch/busyboxplus:curl -- sh -c "curl -v -k -H \"Content-Type: application/json\" -H \"Authorization: Bearer ${PROMETHEUS_K8S_TOKEN}\" -d '[{\"status\":\"firing\",\"labels\":{\"alertname\":\"smoketest\",\"severity\":\"warning\"},\"startsAt\":\"$(date --rfc-3339=seconds | sed 's/ /T/')\",\"endsAt\":\"$(date --rfc-3339=seconds | sed 's/ /T/')\"}]' https://default-alertmanager-proxy:9095/api/v1/alerts" fi echo -if [ $SNMP_WEBHOOK_STATUS -ne 0 ]; then - echo "*** [FAILURE] SNMP Webhook failed" - exit 1 -fi - -if [ $RET -eq 0 ]; then +if [ $RET -eq 0 ] && [ $SNMP_WEBHOOK_STATUS -eq 0 ]; then echo "*** [SUCCESS] Smoke test job completed successfully" + exit 0 else echo "*** [FAILURE] Smoke test job still not succeeded after ${JOB_TIMEOUT}" + exit 1 fi -echo - -exit $RET From 1f6aa2c7ed8fc32211f824d0d44a01152570e507 Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Fri, 22 Sep 2023 12:07:23 +0100 Subject: [PATCH 60/95] [stf-run-ci] Use {{ base_dir }} instead of relative paths (#460) * [stf-run-ci] Use {{ base_dir }} instead of relative paths * Add base_dir for script imports in stf-run-ci * Update destination in stf-run-ci/{clone_repos,setup_stf_local_build} * Use base_dir instead of relative paths * main: build_list * create_stf_local_build: use base_dir for logfile in generate bundle get_operator_sdk.sh: Use readlink to use absolute dir * [add_base_dir] specify chdir for get_operator_sdk The get_operator_sdk.sh script downloads the operator-sdk into a directory relative to directory it was called from. 
To get the expected behaviour, it needs to be run from base_dir, since that value is used later, when operator-sdk is called * [add_base_dir] Update shell task to use chdir for git command --- build/stf-run-ci/tasks/clone_repos.yml | 16 ++++++++-------- build/stf-run-ci/tasks/create_catalog.yml | 10 +++++----- build/stf-run-ci/tasks/main.yml | 16 +++++++++------- build/stf-run-ci/tasks/setup_stf_local_build.yml | 7 ++++--- 4 files changed, 26 insertions(+), 23 deletions(-) diff --git a/build/stf-run-ci/tasks/clone_repos.yml b/build/stf-run-ci/tasks/clone_repos.yml index f7b5dab27..2bb2871bf 100644 --- a/build/stf-run-ci/tasks/clone_repos.yml +++ b/build/stf-run-ci/tasks/clone_repos.yml @@ -8,14 +8,14 @@ - name: Try cloning same-named branch or override branch from SGO repository ansible.builtin.git: repo: "{{ sgo_repository }}" - dest: working/smart-gateway-operator + dest: "{{ base_dir }}/working/smart-gateway-operator" version: "{{ sgo_branch | default(branch, true) }}" force: true rescue: - name: "Get {{ version_branches.sgo }} upstream branch because specified branch or repository doesn't exist" ansible.builtin.git: repo: https://github.com/infrawatch/smart-gateway-operator - dest: working/smart-gateway-operator + dest: "{{ base_dir }}/working/smart-gateway-operator" version: "{{ version_branches.sgo }}" - name: Get sg-core @@ -23,13 +23,13 @@ - name: Try cloning same-named branch or override branch from sg-core repository ansible.builtin.git: repo: "{{ sg_core_repository }}" - dest: working/sg-core + dest: "{{ base_dir }}/working/sg-core" version: "{{ sg_core_branch | default(branch, true) }}" rescue: - name: "Get {{ version_branches.sg_core }} upstream branch because specified branch or repository doesn't exist" ansible.builtin.git: repo: https://github.com/infrawatch/sg-core - dest: working/sg-core + dest: "{{ base_dir }}/working/sg-core" version: "{{ version_branches.sg_core }}" - name: Get sg-bridge @@ -37,13 +37,13 @@ - name: Try cloning same-named branch 
or override branch from sg-bridge repository ansible.builtin.git: repo: "{{ sg_bridge_repository }}" - dest: working/sg-bridge + dest: "{{ base_dir }}/working/sg-bridge" version: "{{ sg_bridge_branch | default(branch, true) }}" rescue: - name: "Get {{ version_branches.sg_bridge }} upstream branch because specified branch or repository doesn't exist" ansible.builtin.git: repo: https://github.com/infrawatch/sg-bridge - dest: working/sg-bridge + dest: "{{ base_dir }}/working/sg-bridge" version: "{{ version_branches.sg_bridge }}" - name: Get prometheus-webhook-snmp @@ -51,12 +51,12 @@ - name: Try cloning same-named branch or override branch from prometheus-webhook-snmp repository ansible.builtin.git: repo: "{{ prometheus_webhook_snmp_repository }}" - dest: working/prometheus-webhook-snmp + dest: "{{ base_dir }}/working/prometheus-webhook-snmp" version: "{{ prometheus_webhook_snmp_branch | default(branch, true) }}" rescue: - name: "Get {{ version_branches.prometheus_webhook_snmp }} upstream branch because specified branch or repository doesn't exist" ansible.builtin.git: repo: https://github.com/infrawatch/prometheus-webhook-snmp - dest: working/prometheus-webhook-snmp + dest: "{{ base_dir }}/working/prometheus-webhook-snmp" version: "{{ version_branches.prometheus_webhook_snmp }}" diff --git a/build/stf-run-ci/tasks/create_catalog.yml b/build/stf-run-ci/tasks/create_catalog.yml index 459c28d8b..f49586950 100644 --- a/build/stf-run-ci/tasks/create_catalog.yml +++ b/build/stf-run-ci/tasks/create_catalog.yml @@ -1,7 +1,7 @@ --- - name: Create service-telemetry-framework-index working directory ansible.builtin.file: - path: working/service-telemetry-framework-index + path: "{{ base_dir }}/working/service-telemetry-framework-index" state: directory mode: '0755' @@ -39,10 +39,10 @@ variable_start_string: "<<" variable_end_string: ">>" src: config-json.j2 - dest: working/service-telemetry-framework-index/config.json + dest: "{{ base_dir 
}}/working/service-telemetry-framework-index/config.json" - name: Create a Secret for the dockercfg - ansible.builtin.command: oc create secret generic -n {{ namespace }} service-telemetry-framework-index-dockercfg --from-file=.dockerconfigjson=working/service-telemetry-framework-index/config.json --type=kubernetes.io/dockerconfigjson + ansible.builtin.command: oc create secret generic -n {{ namespace }} service-telemetry-framework-index-dockercfg --from-file=.dockerconfigjson={{ base_dir }}/working/service-telemetry-framework-index/config.json --type=kubernetes.io/dockerconfigjson - name: Create ImageStream for ose-operator-registry ansible.builtin.command: oc import-image -n {{ namespace }} ose-operator-registry:{{ default_operator_registry_image_tag }} --from={{ default_operator_registry_image_base }}:{{ default_operator_registry_image_tag }} --confirm @@ -124,10 +124,10 @@ - name: Create index.yaml base for index image ansible.builtin.template: src: index-yaml.j2 - dest: working/service-telemetry-framework-index/index.yaml + dest: "{{ base_dir }}/working/service-telemetry-framework-index/index.yaml" - name: Build service-telemetry-framework-index - ansible.builtin.command: oc start-build -n "{{ namespace }}" service-telemetry-framework-index --wait --from-dir working/service-telemetry-framework-index + ansible.builtin.command: oc start-build -n "{{ namespace }}" service-telemetry-framework-index --wait --from-dir {{ base_dir }}/working/service-telemetry-framework-index - name: Create CloudOps CatalogSource kubernetes.core.k8s: diff --git a/build/stf-run-ci/tasks/main.yml b/build/stf-run-ci/tasks/main.yml index 44157f075..b8769efeb 100644 --- a/build/stf-run-ci/tasks/main.yml +++ b/build/stf-run-ci/tasks/main.yml @@ -72,12 +72,14 @@ ansible.builtin.command: cmd: "./get_operator_sdk.sh {{ operator_sdk_v0 }}" creates: "{{ base_dir }}/working/operator-sdk-{{ operator_sdk_v0 }}" + chdir: "{{ base_dir }}" - name: Get operator_sdk_v1 (deploy from bundles) when: 
__local_build_enabled | bool or __deploy_from_bundles_enabled | bool or __deploy_from_index_enabled | bool ansible.builtin.command: cmd: "{{ base_dir }}/get_operator_sdk.sh {{ operator_sdk_v1 }}" creates: "{{ base_dir }}/working/operator-sdk-{{ operator_sdk_v1 }}" + chdir: "{{ base_dir }}" - name: Set logfile_dir when: not (logfile_dir is defined) @@ -102,11 +104,11 @@ - name: Create base build list ansible.builtin.set_fact: build_list: - - { name: service-telemetry-operator, dockerfile_path: build/Dockerfile, image_reference_name: sto_image_path, working_build_dir: ../ } - - { name: smart-gateway-operator, dockerfile_path: build/Dockerfile, image_reference_name: sgo_image_path, working_build_dir: ./working/smart-gateway-operator } - - { name: sg-core, dockerfile_path: build/Dockerfile, image_reference_name: sg_core_image_path, working_build_dir: ./working/sg-core } - - { name: sg-bridge, dockerfile_path: build/Dockerfile, image_reference_name: sg_bridge_image_path, working_build_dir: ./working/sg-bridge } - - { name: prometheus-webhook-snmp, dockerfile_path: Dockerfile, image_reference_name: prometheus_webhook_snmp_image_path, working_build_dir: ./working/prometheus-webhook-snmp } + - {name: service-telemetry-operator, dockerfile_path: build/Dockerfile, image_reference_name: sto_image_path, working_build_dir: "{{ base_dir }}/../"} + - {name: smart-gateway-operator, dockerfile_path: build/Dockerfile, image_reference_name: sgo_image_path, working_build_dir: "{{ base_dir }}/working/smart-gateway-operator"} + - {name: sg-core, dockerfile_path: build/Dockerfile, image_reference_name: sg_core_image_path, working_build_dir: "{{ base_dir }}/working/sg-core"} + - {name: sg-bridge, dockerfile_path: build/Dockerfile, image_reference_name: sg_bridge_image_path, working_build_dir: "{{ base_dir }}/working/sg-bridge"} + - {name: prometheus-webhook-snmp, dockerfile_path: Dockerfile, image_reference_name: prometheus_webhook_snmp_image_path, working_build_dir: "{{ base_dir 
}}/working/prometheus-webhook-snmp"} - ansible.builtin.debug: var: build_list @@ -145,8 +147,8 @@ - name: Create base build list ansible.builtin.set_fact: bundle_build_list: - - { name: service-telemetry-operator-bundle, dockerfile_path: Dockerfile, image_reference_name: sto_bundle_image_path, working_build_dir: ./working/service-telemetry-operator-bundle } - - { name: smart-gateway-operator-bundle, dockerfile_path: Dockerfile, image_reference_name: sgo_bundle_image_path, working_build_dir: ./working/smart-gateway-operator-bundle } + - { name: service-telemetry-operator-bundle, dockerfile_path: Dockerfile, image_reference_name: sto_bundle_image_path, working_build_dir: "{{ base_dir }}/working/service-telemetry-operator-bundle" } + - { name: smart-gateway-operator-bundle, dockerfile_path: Dockerfile, image_reference_name: sgo_bundle_image_path, working_build_dir: "{{ base_dir }}/working/smart-gateway-operator-bundle" } - ansible.builtin.debug: var: bundle_build_list diff --git a/build/stf-run-ci/tasks/setup_stf_local_build.yml b/build/stf-run-ci/tasks/setup_stf_local_build.yml index 70fc3f1e3..6aff1bcb7 100644 --- a/build/stf-run-ci/tasks/setup_stf_local_build.yml +++ b/build/stf-run-ci/tasks/setup_stf_local_build.yml @@ -42,7 +42,7 @@ block: - name: Load Smart Gateway Operator RBAC ansible.builtin.command: - cmd: oc apply -f working/smart-gateway-operator/deploy/{{ item }} -n "{{ namespace }}" + cmd: oc apply -f {{ base_dir }}/working/smart-gateway-operator/deploy/{{ item }} -n "{{ namespace }}" loop: - service_account.yaml - role.yaml @@ -51,7 +51,7 @@ - name: Load Smart Gateway Operator CSV ansible.builtin.shell: - cmd: oc apply -f working/smart-gateway-operator-bundle/manifests/smart-gateway-operator.clusterserviceversion.yaml -n "{{ namespace }}" + cmd: oc apply -f {{ base_dir }}/working/smart-gateway-operator-bundle/manifests/smart-gateway-operator.clusterserviceversion.yaml -n "{{ namespace }}" # --- Service Telemetry Operator --- - name: Generate Service 
Telemetry Operator CSV @@ -97,9 +97,10 @@ - name: Load Service Telemetry Operator CSV ansible.builtin.shell: - cmd: oc apply -f working/service-telemetry-operator-bundle/manifests/service-telemetry-operator.clusterserviceversion.yaml -n "{{ namespace }}" + cmd: oc apply -f {{ base_dir }}/working/service-telemetry-operator-bundle/manifests/service-telemetry-operator.clusterserviceversion.yaml -n "{{ namespace }}" # cleanup - name: Revert local change to role_binding.yaml ansible.builtin.shell: cmd: git checkout -- "{{ base_dir }}/../deploy/role_binding.yaml" + chdir: "{{ base_dir }}" From 9dbcf3b0b7c2616b556bed24815b1c2cbc489541 Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Mon, 25 Sep 2023 19:18:33 +0100 Subject: [PATCH 61/95] [stf-run-ci] Update base_dir in setup_stf_local_build.yml (#478) --- build/stf-run-ci/tasks/setup_stf_local_build.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/build/stf-run-ci/tasks/setup_stf_local_build.yml b/build/stf-run-ci/tasks/setup_stf_local_build.yml index 6aff1bcb7..999c2902a 100644 --- a/build/stf-run-ci/tasks/setup_stf_local_build.yml +++ b/build/stf-run-ci/tasks/setup_stf_local_build.yml @@ -6,7 +6,7 @@ - block: - name: Generate Smart Gateway Operator CSV ansible.builtin.shell: - chdir: working/smart-gateway-operator/build + chdir: "{{ base_dir }}/working/smart-gateway-operator/build" cmd: | LOGFILE="{{ logfile_dir }}/sgo_gen_bundle.log" \ OPERATOR_SDK="{{ base_dir }}/working/operator-sdk-{{ operator_sdk_v0 }}" \ @@ -89,6 +89,7 @@ - name: Load Service Telemetry Operator RBAC ansible.builtin.command: cmd: oc apply -f ../deploy/{{ item }} -n "{{ namespace }}" + chdir: "{{ base_dir }}" loop: - service_account.yaml - role.yaml From 48aad2e14b07aa84a3f560f02c810b255acc75ba Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Tue, 26 Sep 2023 11:29:58 +0100 Subject: [PATCH 62/95] [stf-run-ci] Add timeout to validate deployment (#477) * add timeout for validate deployment --- build/stf-run-ci/tasks/main.yml | 2 +- 
1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/stf-run-ci/tasks/main.yml b/build/stf-run-ci/tasks/main.yml index b8769efeb..a78431713 100644 --- a/build/stf-run-ci/tasks/main.yml +++ b/build/stf-run-ci/tasks/main.yml @@ -205,7 +205,7 @@ - name: Validate system is operational ansible.builtin.shell: | - OCP_PROJECT="{{ namespace }}" VALIDATION_SCOPE="{{ __service_telemetry_observability_strategy }}" "{{ base_dir }}/validate_deployment.sh" >> {{ logfile_dir }}/validate_deployment.log 2>&1 + OCP_PROJECT="{{ namespace }}" VALIDATION_SCOPE="{{ __service_telemetry_observability_strategy }}" timeout 600 "{{ base_dir }}/validate_deployment.sh" >> {{ logfile_dir }}/validate_deployment.log 2>&1 args: executable: /bin/bash register: validate_deployment From 0b33d76f2dba877e5671e5190d4f802b0a9bb073 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Tue, 26 Sep 2023 09:29:08 -0400 Subject: [PATCH 63/95] Adjust the default scrape interval (#454) * Adjust the default scrape interval Adjust the default scrape interval to be 15s in alignment with default configuration changes setting the polling rate in RHOSP to 30 seconds. Related STF-1512 * Update 17.1 helper script for poll frequency Update the 17.1 helper script for poll frequency and match upstream documentation for event-less deployment. * Update tests/infrared/17.1/enable-stf.yaml.template * Align 17.1 help stf-connectors to docs Update the 17.1 helper scripts stf-connectors.yaml.template file to match the default configuration we're pursuing for STF 1.5.3 and beyond. 
* Update scrape_interval to 30s in alignment with data collectors --- roles/servicetelemetry/defaults/main.yml | 2 +- tests/infrared/17.1/enable-stf.yaml.template | 11 +++---- .../17.1/stf-connectors.yaml.template | 30 ------------------- 3 files changed, 5 insertions(+), 38 deletions(-) diff --git a/roles/servicetelemetry/defaults/main.yml b/roles/servicetelemetry/defaults/main.yml index 4f4e1ac54..0d77fc5ef 100644 --- a/roles/servicetelemetry/defaults/main.yml +++ b/roles/servicetelemetry/defaults/main.yml @@ -40,7 +40,7 @@ servicetelemetry_defaults: prometheus: enabled: true deployment_size: 1 - scrape_interval: 10s + scrape_interval: 30s storage: strategy: persistent retention: 24h diff --git a/tests/infrared/17.1/enable-stf.yaml.template b/tests/infrared/17.1/enable-stf.yaml.template index a1037f213..baf02648a 100644 --- a/tests/infrared/17.1/enable-stf.yaml.template +++ b/tests/infrared/17.1/enable-stf.yaml.template @@ -6,16 +6,14 @@ custom_templates: # matches the documentation for enable-stf.yaml in stable-1.3 documentation parameter_defaults: # only send to STF, not other publishers - EventPipelinePublishers: [] PipelinePublishers: [] # manage the polling and pipeline configuration files for Ceilometer agents ManagePolling: true ManagePipeline: true - # enable Ceilometer metrics and events + # enable Ceilometer metrics CeilometerQdrPublishMetrics: true - CeilometerQdrPublishEvents: true # enable collection of API status CollectdEnableSensubility: true @@ -27,12 +25,12 @@ custom_templates: # set collectd overrides for higher telemetry resolution and extra plugins # to load CollectdConnectionType: amqp1 - CollectdAmqpInterval: 5 - CollectdDefaultPollingInterval: 5 + CollectdAmqpInterval: 30 + CollectdDefaultPollingInterval: 30 CollectdExtraPlugins: - vmem - # set standard prefixes for where metrics and events are published to QDR + # set standard prefixes for where metrics are published to QDR MetricsQdrAddresses: - prefix: 'collectd' distribution: multicast 
@@ -78,4 +76,3 @@ custom_templates: local: host: "%{hiera('fqdn_canonical')}" port: 11211 - diff --git a/tests/infrared/17.1/stf-connectors.yaml.template b/tests/infrared/17.1/stf-connectors.yaml.template index 30e119b3c..1031e097b 100644 --- a/tests/infrared/17.1/stf-connectors.yaml.template +++ b/tests/infrared/17.1/stf-connectors.yaml.template @@ -3,11 +3,9 @@ tripleo_heat_templates: [] custom_templates: - # don't load collectd-write-qdr.yaml when using multi-cloud and instead load collectd service directly resource_registry: OS::TripleO::Services::Collectd: /usr/share/openstack-tripleo-heat-templates/deployment/metrics/collectd-container-puppet.yaml - # set parameter defaults to match stable-1.3 documentation parameter_defaults: ExtraConfig: qdr::router_id: "%{::hostname}.<>" @@ -24,19 +22,11 @@ custom_templates: caCertFileContent: | <> - CeilometerQdrEventsConfig: - driver: amqp - topic: <>-event - CeilometerQdrMetricsConfig: driver: amqp topic: <>-metering CollectdAmqpInstances: - <>-notify: - format: JSON - notify: true - presettle: false <>-telemetry: format: JSON presettle: false @@ -45,23 +35,3 @@ custom_templates: # --- below here, extended configuration for environment beyond what is documented in stable-1.3 CollectdSensubilityLogLevel: DEBUG - CephStorageExtraConfig: - tripleo::profile::base::metrics::collectd::amqp_host: "%{hiera('storage')}" - tripleo::profile::base::metrics::qdr::listener_addr: "%{hiera('storage')}" - - collectd::plugin::ceph::daemons: - - ceph-osd.0 - - ceph-osd.1 - - ceph-osd.2 - - ceph-osd.3 - - ceph-osd.4 - - ceph-osd.5 - - ceph-osd.6 - - ceph-osd.7 - - ceph-osd.8 - - ceph-osd.9 - - ceph-osd.10 - - ceph-osd.11 - - ceph-osd.12 - - ceph-osd.13 - - ceph-osd.14 From e20f8a5fbc21737d7bb162942a00a45bf0dab864 Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Tue, 26 Sep 2023 15:59:56 +0100 Subject: [PATCH 64/95] [stf-run-ci] Add requirements.txt (#484) --- build/stf-run-ci/requirements.txt | 8 ++++++++ 1 file changed, 8 insertions(+) 
create mode 100644 build/stf-run-ci/requirements.txt diff --git a/build/stf-run-ci/requirements.txt b/build/stf-run-ci/requirements.txt new file mode 100644 index 000000000..a5727b04b --- /dev/null +++ b/build/stf-run-ci/requirements.txt @@ -0,0 +1,8 @@ +# https://stackoverflow.com/questions/64073422/importerror-cannot-import-name-oauth1session-from-requests-oauthlib +requests==2.27.1 +requests_oauthlib==1.3.0 +# https://github.com/domainaware/parsedmarc/issues/318 +oauthlib==3.2.0 +kubernetes==24.2.0 +openshift==0.13.1 +ansible-core==2.12.10 From 18e9161eb335065e3ea7f0ed89a9dc17ab36d17a Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Tue, 26 Sep 2023 12:33:19 -0400 Subject: [PATCH 65/95] Add local generation difference check (#482) * Add local generation difference check Check if generating a bundle locally would result in a git diff. If so, then we should fail since that means changes are missed in the resulting bundle. * Fix missed bundle generation Fix missed bundle generation in 805ada4bf45074bf1ad513eb2e01d0568a1bff80 which I introduced by not running a local bundle generation before merging. --------- Co-authored-by: Chris Sibbitt --- .github/workflows/main.yml | 25 +++++++++++++++++++ ...emetry-operator.clusterserviceversion.yaml | 1 + 2 files changed, 26 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index aef3d0796..2b04f59a5 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -38,6 +38,31 @@ jobs: run: ${HOME}/.local/bin/ansible-lint . 
working-directory: ./build/stf-run-ci + generate-bundle-diff-check: + name: Check if generating the bundle would result in local changes + runs-on: ubuntu-latest + env: + RELEASE_VERSION: v0.19.4 + + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Get operator-sdk image 0.19.4 + run: curl --output operator-sdk -JL https://github.com/operator-framework/operator-sdk/releases/download/$RELEASE_VERSION/operator-sdk-$RELEASE_VERSION-x86_64-linux-gnu + + - name: Make operator-sdk executable + run: chmod +x operator-sdk + + - name: Move operator-sdk binary + run: sudo mv operator-sdk /usr/local/bin + + - name: Generate bundle locally + run: operator-sdk generate bundle --manifests --metadata --default-channel unstable --channels unstable + + - name: Check if bundle generation results in local changes + run: git diff --exit-code + build-operator-check: name: Build Operator check runs-on: ubuntu-20.04 diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml index b06ea07a2..7722b2ba9 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml @@ -253,6 +253,7 @@ spec: - watch - update - patch + - delete - apiGroups: - authorization.k8s.io resources: From 761bd9aaf14fef4a28e823c2eddf9c2db08b0a54 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Tue, 26 Sep 2023 12:47:49 -0400 Subject: [PATCH 66/95] Bump origin-ansible-operator base image (#487) Bump the origin-ansible-operator base image from 4.10 to 4.12. 
Related STF-1524 --- build/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/Dockerfile b/build/Dockerfile index 420d86a8b..7242e664f 100644 --- a/build/Dockerfile +++ b/build/Dockerfile @@ -1,4 +1,4 @@ -FROM quay.io/openshift/origin-ansible-operator:4.10 +FROM quay.io/openshift/origin-ansible-operator:4.12 USER 0 # Upstream CI builds need the additional EPEL sources for python3-passlib and python3-bcrypt but have no working repos to install epel-release From 76156ccd7b248165da60f9667aae32b9f81cf06b Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Wed, 27 Sep 2023 09:20:38 -0400 Subject: [PATCH 67/95] Remove logs API interface from STF (#483) * Remove logs API interface from STF The logs interface in STF has never been supported or productized. A recent bug was found where Loki can be interfered with when deployed on the same system as STF. Since this has never been documented or productized, it is being removed, as OpenStack logging should be sent off-cluster from rsyslog directly to Elasticsearch. Closes STF-1504 * Remove logs interface from CRD Missed removing the logs interface from the CRD and defaults. 
--- build/stf-run-ci/README.md | 1 - build/stf-run-ci/defaults/main.yml | 1 - build/stf-run-ci/tasks/deploy_stf.yml | 7 -- .../infra.watch_servicetelemetrys_crd.yaml | 104 ---------------- ...fra.watch_v1beta1_servicetelemetry_cr.yaml | 8 -- .../infra.watch_servicetelemetrys_crd.yaml | 114 ------------------ ...emetry-operator.clusterserviceversion.yaml | 12 -- deploy/role.yaml | 1 - docs/loki.md | 75 ------------ roles/servicetelemetry/defaults/main.yml | 28 ----- roles/servicetelemetry/meta/main.yml | 4 +- .../tasks/component_clouds.yml | 18 --- .../servicetelemetry/tasks/component_loki.yml | 14 --- roles/servicetelemetry/tasks/main.yml | 11 -- .../templates/manifest_grafana_ds.j2 | 10 -- .../templates/manifest_loki.j2 | 39 ------ .../templates/manifest_smartgateway_logs.j2 | 38 ------ 17 files changed, 2 insertions(+), 483 deletions(-) delete mode 100644 docs/loki.md delete mode 100644 roles/servicetelemetry/tasks/component_loki.yml delete mode 100644 roles/servicetelemetry/templates/manifest_loki.j2 delete mode 100644 roles/servicetelemetry/templates/manifest_smartgateway_logs.j2 diff --git a/build/stf-run-ci/README.md b/build/stf-run-ci/README.md index 959a6048a..f1e141878 100644 --- a/build/stf-run-ci/README.md +++ b/build/stf-run-ci/README.md @@ -51,7 +51,6 @@ choose to override: | `__service_telemetry_trap_oid_prefix` | | 1.3.6.1.4.1.50495.15 | The OID prefix for trap variable bindings. | | `__service_telemetry_trap_default_oid` | | 1.3.6.1.4.1.50495.15.1.2.1 | The trap OID if none is found in the Prometheus alert labels. | | `__service_telemetry_trap_default_severity` | | | The trap severity if none is found in the Prometheus alert labels. | -| `__service_telemetry_logs_enabled` | {true,false} | false | Whether to enable logs support in ServiceTelemetry | | `__service_telemetry_observability_strategy` | | `use_redhat` | Which observability strategy to use for deployment. Default is 'use_redhat'. 
Also supported are 'use_hybrid', 'use_community', and 'none' | | `__service_telemetry_transports_certificates_endpoint_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the QDR endpoint certificate (minimum duration is 1h) | | `__service_telemetry_transports_certificates_ca_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the QDR CA certificate (minimum duration is 1h) | diff --git a/build/stf-run-ci/defaults/main.yml b/build/stf-run-ci/defaults/main.yml index 4e00f8395..da9834ecf 100644 --- a/build/stf-run-ci/defaults/main.yml +++ b/build/stf-run-ci/defaults/main.yml @@ -26,7 +26,6 @@ __service_telemetry_snmptraps_alert_oid_label: "oid" __service_telemetry_snmptraps_trap_oid_prefix: "1.3.6.1.4.1.50495.15" __service_telemetry_snmptraps_trap_default_oid: "1.3.6.1.4.1.50495.15.1.2.1" __service_telemetry_snmptraps_trap_default_severity: "" -__service_telemetry_logs_enabled: false __service_telemetry_observability_strategy: use_redhat __service_telemetry_transports_certificates_endpoint_cert_duration: 70080h __service_telemetry_transports_certificates_ca_cert_duration: 70080h diff --git a/build/stf-run-ci/tasks/deploy_stf.yml b/build/stf-run-ci/tasks/deploy_stf.yml index da8a9781a..ae5985374 100644 --- a/build/stf-run-ci/tasks/deploy_stf.yml +++ b/build/stf-run-ci/tasks/deploy_stf.yml @@ -43,13 +43,6 @@ persistent: storageClass: {{ __service_telemetry_storage_persistent_storage_class }} {% endif %} - logs: - loki: - enabled: false - replicationFactor: 1 - flavor: 1x.extra-small - storage: - objectStorageSecret: test {% if __service_telemetry_storage_persistent_storage_class is defined %} storageClass: {{ __service_telemetry_storage_persistent_storage_class }} {% endif %} diff --git a/deploy/crds/infra.watch_servicetelemetrys_crd.yaml b/deploy/crds/infra.watch_servicetelemetrys_crd.yaml index 5821808aa..f45dc44bd 100644 --- 
a/deploy/crds/infra.watch_servicetelemetrys_crd.yaml +++ b/deploy/crds/infra.watch_servicetelemetrys_crd.yaml @@ -238,74 +238,6 @@ spec: type: object type: object type: object - logs: - description: Logs related backend configuration. - properties: - loki: - description: Logs storage backend Loki - properties: - enabled: - description: '[TESTING ONLY] Enable Loki as a storage backend for logs' - type: boolean - replicationFactor: - description: Loki replication factor - format: int32 - minimum: 1 - type: integer - flavor: - description: Loki flavor - enum: - - 1x.extra-small - - 1x.small - - 1x.medium - type: string - storage: - description: Logs storage configuration for Loki - properties: - objectStorageSecret: - description: Secret containing informaiton required for S3 object storage - type: string - storageClass: - description: Storage class used for temporary log storage before they are forwarded to object storage or when querying. - type: string - type: object - compactor: - description: Template for the compactor microservice - properties: - replicas: - description: Number of replicas for this microservice - type: string - type: object - distributor: - description: Template for the distributor microservice - properties: - replicas: - description: Number of replicas for this microservice - type: string - type: object - ingester: - description: Template for the ingester microservice - properties: - replicas: - description: Number of replicas for this microservice - type: string - type: object - querier: - description: Template for the querier microservice - properties: - replicas: - description: Number of replicas for this microservice - type: string - type: object - queryFrontend: - description: Template for the query frontend microservice - properties: - replicas: - description: Number of replicas for this microservice - type: string - type: object - type: object - type: object type: object transports: description: Data transport configuration @@ -447,42 
+379,6 @@ spec: type: object type: array type: object - logs: - description: Logs related configuration for this cloud object. - properties: - collectors: - description: List of available logs collectors for this cloud - object. - items: - properties: - collectorType: - description: Set the collector type, value of 'rsyslog' - enum: - - rsyslog - type: string - debugEnabled: - description: Enable console debugging. Default is 'false'. - type: boolean - subscriptionAddress: - description: Address to subscribe on the data transport - to receive notifications. - type: string - bridge: - description: Bridge configuration and tuning configurations. - properties: - ringBufferCount: - description: sg-bridge ring buffer count. This affects the potential number of messages in queue, which can result in increased memory usage within the sg-bridge container. - type: integer - ringBufferSize: - description: sg-bridge ring buffer size. This affects the size of messages that can be passed between sg-bridge and sg-core. - type: integer - verbose: - description: Enable verbosity for debugging purposes. 
- type: boolean - type: object - type: object - type: array - type: object type: object type: array type: object diff --git a/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml b/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml index 683019f2d..82814f8df 100644 --- a/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml +++ b/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml @@ -51,14 +51,6 @@ spec: certificates: endpointCertDuration: 70080h caCertDuration: 70080h - logs: - loki: - enabled: false - flavor: 1x.extra-small - replicationFactor: 1 - storage: - objectStorageSecret: test - storageClass: standard clouds: - name: cloud1 metrics: diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml index 1841298e1..f6cf302b2 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml @@ -219,78 +219,6 @@ spec: type: string type: object type: object - logs: - description: Logs related backend configuration. 
- properties: - loki: - description: Logs storage backend Loki - properties: - compactor: - description: Template for the compactor microservice - properties: - replicas: - description: Number of replicas for this microservice - type: string - type: object - distributor: - description: Template for the distributor microservice - properties: - replicas: - description: Number of replicas for this microservice - type: string - type: object - enabled: - description: '[TESTING ONLY] Enable Loki as a storage - backend for logs' - type: boolean - flavor: - description: Loki flavor - enum: - - 1x.extra-small - - 1x.small - - 1x.medium - type: string - ingester: - description: Template for the ingester microservice - properties: - replicas: - description: Number of replicas for this microservice - type: string - type: object - querier: - description: Template for the querier microservice - properties: - replicas: - description: Number of replicas for this microservice - type: string - type: object - queryFrontend: - description: Template for the query frontend microservice - properties: - replicas: - description: Number of replicas for this microservice - type: string - type: object - replicationFactor: - description: Loki replication factor - format: int32 - minimum: 1 - type: integer - storage: - description: Logs storage configuration for Loki - properties: - objectStorageSecret: - description: Secret containing informaiton required - for S3 object storage - type: string - storageClass: - description: Storage class used for temporary log - storage before they are forwarded to object storage - or when querying. - type: string - type: object - type: object - type: object metrics: description: Metrics related backend configuration. properties: @@ -390,48 +318,6 @@ spec: type: object type: array type: object - logs: - description: Logs related configuration for this cloud object. 
- properties: - collectors: - description: List of available logs collectors for this - cloud object. - items: - properties: - bridge: - description: Bridge configuration and tuning configurations. - properties: - ringBufferCount: - description: sg-bridge ring buffer count. This - affects the potential number of messages in - queue, which can result in increased memory - usage within the sg-bridge container. - type: integer - ringBufferSize: - description: sg-bridge ring buffer size. This - affects the size of messages that can be passed - between sg-bridge and sg-core. - type: integer - verbose: - description: Enable verbosity for debugging purposes. - type: boolean - type: object - collectorType: - description: Set the collector type, value of 'rsyslog' - enum: - - rsyslog - type: string - debugEnabled: - description: Enable console debugging. Default is - 'false'. - type: boolean - subscriptionAddress: - description: Address to subscribe on the data transport - to receive notifications. - type: string - type: object - type: array - type: object metrics: description: Metrics related configuration for this cloud object. 
properties: diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml index 7722b2ba9..0e947081d 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml @@ -61,17 +61,6 @@ metadata: "version": "7.16.1" } }, - "logs": { - "loki": { - "enabled": false, - "flavor": "1x.extra-small", - "replicationFactor": 1, - "storage": { - "objectStorageSecret": "test", - "storageClass": "standard" - } - } - }, "metrics": { "prometheus": { "enabled": true, @@ -388,7 +377,6 @@ spec: - monitoring.rhobs - elasticsearch.k8s.elastic.co - integreatly.org - - loki.grafana.com resources: - '*' verbs: diff --git a/deploy/role.yaml b/deploy/role.yaml index 58966b6f7..6e22854e4 100644 --- a/deploy/role.yaml +++ b/deploy/role.yaml @@ -121,7 +121,6 @@ rules: - monitoring.rhobs - elasticsearch.k8s.elastic.co - integreatly.org - - loki.grafana.com resources: - '*' verbs: diff --git a/docs/loki.md b/docs/loki.md deleted file mode 100644 index abdf8057a..000000000 --- a/docs/loki.md +++ /dev/null @@ -1,75 +0,0 @@ -# How to run SGO with Loki -A few examples about how to deploy with Loki for logging support. - -## Deploy SGO + Loki with minio for storage -This is less resource intensive. Useful for development in crc. -``` -ansible-playbook --extra-vars __service_telemetry_logs_enabled=true --extra-vars __deploy_minio_enabled=true run-ci.yaml -``` - -## Deploy SGO + Loki with OCS for storage -This is more a production-like setup. It's more resource demanding and cannot be run in crc. This assumes OCS is already deployed. 
- -### Create an object bucket claim -``` -oc apply -f - < -stringData: - endpoint: https://: - bucketnames: - access_key_id: - access_key_secret: -type: Opaque -EOF -``` - -### Deploy SGO + Loki -``` -ansible-playbook --extra-vars __service_telemetry_logs_enabled=true --extra-vars __loki_skip_tls_verify=true run-ci.yaml -``` - diff --git a/roles/servicetelemetry/defaults/main.yml b/roles/servicetelemetry/defaults/main.yml index 0d77fc5ef..fd8074c3f 100644 --- a/roles/servicetelemetry/defaults/main.yml +++ b/roles/servicetelemetry/defaults/main.yml @@ -67,24 +67,6 @@ servicetelemetry_defaults: certificates: endpoint_cert_duration: 70080h ca_cert_duration: 70080h - logs: - loki: - enabled: false - replication_factor: 1 - flavor: 1x.extra-small - storage: - object_storage_secret: "" - storage_class: "" - compactor: - replicas: "" - distributor: - replicas: "" - ingester: - replicas: "" - querier: - replicas: "" - query_frontend: - replicas: "" transports: qdr: @@ -152,16 +134,6 @@ servicetelemetry_defaults: ring_buffer_size: 16384 ring_buffer_count: 15000 verbose: false - logs: - collectors: - - collector_type: rsyslog - subscription_address: rsyslog/cloud1-logs - debug_enabled: false - bridge: - ring_buffer_size: 135048 - ring_buffer_count: 15000 - verbose: false - # These variables are outside of the defaults. Their values will be # auto-detected by the role and are not meant to be set by the user. 
However, diff --git a/roles/servicetelemetry/meta/main.yml b/roles/servicetelemetry/meta/main.yml index 2c0d83024..d7c0f7585 100644 --- a/roles/servicetelemetry/meta/main.yml +++ b/roles/servicetelemetry/meta/main.yml @@ -9,10 +9,10 @@ galaxy_info: platforms: - name: RHEL versions: - - 7 + - 8 - name: CentOS versions: - - 7 + - 8 galaxy_tags: - monitoring diff --git a/roles/servicetelemetry/tasks/component_clouds.yml b/roles/servicetelemetry/tasks/component_clouds.yml index 2dd352c38..8a745440e 100644 --- a/roles/servicetelemetry/tasks/component_clouds.yml +++ b/roles/servicetelemetry/tasks/component_clouds.yml @@ -90,21 +90,3 @@ - this_cloud.events is defined - this_cloud.events.collectors is defined - this_cloud.events is iterable - -- name: Deploy Logs Smart Gateway instance - vars: - data_type: 'logs' - manifest: './manifest_smartgateway_logs.j2' - this_smartgateway: "{{ ansible_operator_meta.name }}-{{ this_cloud.name }}-{{ this_collector.collector_type[:4] }}-log" - include_tasks: base_smartgateway.yml - loop: "{{ this_cloud.logs.collectors }}" - loop_control: - loop_var: this_collector - label: "{{ this_collector.collector_type }}" - when: - - has_loki_api | bool - - observability_strategy in ['use_community', 'use_hybrid'] - - servicetelemetry_vars.backends.logs.loki.enabled - - this_cloud.logs is defined - - this_cloud.logs.collectors is defined - - this_cloud.logs is iterable diff --git a/roles/servicetelemetry/tasks/component_loki.yml b/roles/servicetelemetry/tasks/component_loki.yml deleted file mode 100644 index 53dc2b812..000000000 --- a/roles/servicetelemetry/tasks/component_loki.yml +++ /dev/null @@ -1,14 +0,0 @@ -- name: Lookup template - debug: - msg: "{{ lookup('template', './manifest_loki.j2') | from_yaml }}" - -- name: Set default Loki manifest - set_fact: - loki_manifest: "{{ lookup('template', './manifest_loki.j2') | from_yaml }}" - when: loki_manifest is not defined - -- name: Create an instance of Loki - k8s: - state: '{{ "present" if 
servicetelemetry_vars.backends.logs.loki.enabled else "absent" }}' - definition: - '{{ loki_manifest }}' diff --git a/roles/servicetelemetry/tasks/main.yml b/roles/servicetelemetry/tasks/main.yml index 20991f50e..bc33df647 100644 --- a/roles/servicetelemetry/tasks/main.yml +++ b/roles/servicetelemetry/tasks/main.yml @@ -73,17 +73,6 @@ - observability_strategy in ['use_community', 'use_hybrid'] - servicetelemetry_vars.backends.events.elasticsearch.enabled | bool -# --> backends.logs -- name: Check if we have loki API - set_fact: - has_loki_api: "{{ True if 'loki.grafana.com' in api_groups else False }}" - -- name: Create Loki instance - include_tasks: component_loki.yml - when: - - has_loki_api | bool - - observability_strategy in ['use_community', 'use_hybrid'] - # --> clouds - name: Get data about clouds debug: diff --git a/roles/servicetelemetry/templates/manifest_grafana_ds.j2 b/roles/servicetelemetry/templates/manifest_grafana_ds.j2 index 7ae2b392a..d0f0478d1 100644 --- a/roles/servicetelemetry/templates/manifest_grafana_ds.j2 +++ b/roles/servicetelemetry/templates/manifest_grafana_ds.j2 @@ -55,14 +55,4 @@ spec: timeField: generated esVersion: 70 {% endif %} - -{% if servicetelemetry_vars.backends.logs.loki.enabled %} - - access: proxy - editable: true - isDefault: false - name: STFLoki - type: loki - url: 'http://loki-query-frontend-http-lokistack:3100' - version: 1 -{% endif %} name: {{ ansible_operator_meta.name }}-ds-stf.yaml diff --git a/roles/servicetelemetry/templates/manifest_loki.j2 b/roles/servicetelemetry/templates/manifest_loki.j2 deleted file mode 100644 index ee8a0b5d8..000000000 --- a/roles/servicetelemetry/templates/manifest_loki.j2 +++ /dev/null @@ -1,39 +0,0 @@ -apiVersion: loki.grafana.com/v1beta1 -kind: LokiStack -metadata: - name: lokistack - namespace: '{{ ansible_operator_meta.namespace }}' -spec: - size: {{ servicetelemetry_vars.backends.logs.loki.flavor }} - replicationFactor: {{ 
servicetelemetry_vars.backends.logs.loki.replication_factor }} - storage: - secret: - name: {{ servicetelemetry_vars.backends.logs.loki.storage.object_storage_secret }} - storageClassName: {{ servicetelemetry_vars.backends.logs.loki.storage.storage_class }} -{% if servicetelemetry_vars.backends.logs.loki.compactor.replicas | length or - servicetelemetry_vars.backends.logs.loki.distributor.replicas | length or - servicetelemetry_vars.backends.logs.loki.ingester.replicas | length or - servicetelemetry_vars.backends.logs.loki.querier.replicas | length or - servicetelemetry_vars.backends.logs.loki.query_frontend.replicas | length %} - template: -{% if servicetelemetry_vars.backends.logs.loki.compactor.replicas | length %} - compactor: - replicas: {{ servicetelemetry_vars.backends.logs.loki.compactor.replicas }} -{% endif %} -{% if servicetelemetry_vars.backends.logs.loki.distributor.replicas | length %} - distributor: - replicas: {{ servicetelemetry_vars.backends.logs.loki.distributor.replicas }} -{% endif %} -{% if servicetelemetry_vars.backends.logs.loki.ingester.replicas | length %} - ingester: - replicas: {{ servicetelemetry_vars.backends.logs.loki.ingester.replicas }} -{% endif %} -{% if servicetelemetry_vars.backends.logs.loki.querier.replicas | length %} - querier: - replicas: {{ servicetelemetry_vars.backends.logs.loki.querier.replicas }} -{% endif %} -{% if servicetelemetry_vars.backends.logs.loki.query_frontend.replicas | length %} - queryFrontend: - replicas: {{ servicetelemetry_vars.backends.logs.loki.query_frontend.replicas }} -{% endif %} -{% endif %} diff --git a/roles/servicetelemetry/templates/manifest_smartgateway_logs.j2 b/roles/servicetelemetry/templates/manifest_smartgateway_logs.j2 deleted file mode 100644 index ccacb002f..000000000 --- a/roles/servicetelemetry/templates/manifest_smartgateway_logs.j2 +++ /dev/null @@ -1,38 +0,0 @@ -apiVersion: smartgateway.infra.watch/v2 -kind: SmartGateway -metadata: - name: '{{ this_smartgateway }}' - namespace: 
'{{ ansible_operator_meta.namespace }}' -spec: -{% if this_collector.debug_enabled is defined and this_collector.debug_enabled %} - logLevel: "debug" -{% else %} - logLevel: "info" -{% endif %} - handleErrors: true - blockEventBus: true - size: {{ smartgateway_deployment_size }} - applications: - - config: | - connection: http://loki-distributor-http-lokistack.{{ ansible_operator_meta.namespace }}.svc.cluster.local:3100 - batchSize: {{ loki_batch_size | default('2000') }} - maxWaitTime: {{ loki_max_wait_time | default('1s') }} - name: loki - bridge: - amqpUrl: amqp://{{ ansible_operator_meta.name }}-interconnect.{{ ansible_operator_meta.namespace }}.svc.cluster.local:5673/{{ this_collector.subscription_address }} - amqpBlock: true - socketBlock: true - ringBufferSize: {{ this_collector.bridge.ring_buffer_size | default(135048) }} - ringBufferCount: {{ this_collector.bridge.ring_buffer_count | default(15000) }} - verbose: {{ this_collector.bridge.verbose | default(false) }} - transports: - - config: | - path: /tmp/smartgateway - handlers: - - name: logs - config: | - timestampField: "@timestamp" - messageField: "message" - severityField: "severity" - hostnameField: "host" - name: socket From bb460120812730b6f95c747f50f5c5d265da0008 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Wed, 27 Sep 2023 12:39:12 -0400 Subject: [PATCH 68/95] Reduce the Ceilometer pollsters in helper script (#489) Update the ceilometer pollsters used by the 17.1 deployment helper script. 
Related: rhbz#2239390 --- tests/infrared/17.1/enable-stf.yaml.template | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/tests/infrared/17.1/enable-stf.yaml.template b/tests/infrared/17.1/enable-stf.yaml.template index baf02648a..df83a4365 100644 --- a/tests/infrared/17.1/enable-stf.yaml.template +++ b/tests/infrared/17.1/enable-stf.yaml.template @@ -41,20 +41,7 @@ custom_templates: ceilometer::agent::polling::polling_interval: 30 ceilometer::agent::polling::polling_meters: - cpu - - disk.* - - ip.* - - image.* - - memory - - memory.* - - network.services.vpn.* - - network.services.firewall.* - - perf.* - - port - - port.* - - switch - - switch.* - - storage.* - - volume.* + - memory.usage # to avoid filling the memory buffers if disconnected from the message bus # note: this may need an adjustment if there are many metrics to be sent. From 511a94f55eeed0c45725b58f6d312468fea12fda Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Wed, 27 Sep 2023 19:25:20 +0100 Subject: [PATCH 69/95] [zuul] Add base job for testing with zuul (#479) * [zuul] Add base job for testing with zuul * Update default crc_parameters, so that there's enough memory for cluster_monitoring and for schedulling all the pods [1] * Add a 60 minute timeout to the job, since the default (~30 minutes) is not enough to run through all the job stages * galaxy collection and pip package versions are pinned to known-working versions * post task: Gathers some information from oc (builds, images, CSVs, etc) Copies logs from the job nodes so they can be viewed in the job buildresults --- .zuul.yaml | 27 ++++++++++ ci/deploy_stf.yml | 25 +++++++++ ci/post-collect_logs.yml | 109 +++++++++++++++++++++++++++++++++++++++ ci/prepare.yml | 42 +++++++++++++++ ci/test_stf.yml | 28 ++++++++++ ci/vars-zuul-common.yml | 6 +++ 6 files changed, 237 insertions(+) create mode 100644 ci/deploy_stf.yml create mode 100644 ci/post-collect_logs.yml create mode 100644 ci/prepare.yml create mode 100644 
ci/test_stf.yml create mode 100644 ci/vars-zuul-common.yml diff --git a/.zuul.yaml b/.zuul.yaml index 28a9499a6..a9d034504 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -1,4 +1,31 @@ --- +- job: + name: stf-base + # defined in: https://review.rdoproject.org/cgit/config/tree/zuul.d/_jobs-crc.yaml + parent: base-simple-crc + abstract: true + description: | + Run the stf-run-ci role, and then test stf + roles: # adds in dependent roles i.e. put it in the role path + - zuul: github.com/openstack-k8s-operators/ci-framework + # These are the additional repos that zuul will clone + required-projects: + - name: openstack-k8s-operators/ci-framework + override-checkout: main + pre-run: + - ci/prepare.yml + run: + - ci/deploy_stf.yml + - ci/test_stf.yml + post-run: + - ci/post-collect_logs.yml + nodeset: centos-9-crc-xxl + # The default (~30 minutes) is not enough to run through all the job stages + timeout: 3600 + vars: + # Pass vars to crc cli https://review.rdoproject.org/cgit/config/tree/playbooks/crc/simple-start.yaml#n30 + crc_parameters: '--memory 16000 --disk-size 80 --cpus 6' # Increase from 14336 + - project: name: infrawatch/service-telemetry-operator github-check: diff --git a/ci/deploy_stf.yml b/ci/deploy_stf.yml new file mode 100644 index 000000000..170e8590a --- /dev/null +++ b/ci/deploy_stf.yml @@ -0,0 +1,25 @@ +--- +- name: "Deploy STF" + hosts: controller + tasks: + - name: "Set the sto_dir if it isn't already set" + ansible.builtin.set_fact: + sto_dir: '{{ ansible_env.HOME }}/{{ zuul.project.src_dir }}' + when: sto_dir | default('') | length == 0 + + - name: "Get vars common to all jobs" + ansible.builtin.include_vars: + file: "vars-zuul-common.yml" + + - name: "Get scenario-specific vars" + ansible.builtin.include_vars: + file: "vars-{{ scenario }}.yml" + + - name: "Log into the cluster" + ansible.builtin.import_role: + name: rhol_crc + tasks_from: add_crc_creds.yml + + - name: "Deploy STF using stf-run-ci" + ansible.builtin.import_role: + name: 
'../build/stf-run-ci' diff --git a/ci/post-collect_logs.yml b/ci/post-collect_logs.yml new file mode 100644 index 000000000..614f8728b --- /dev/null +++ b/ci/post-collect_logs.yml @@ -0,0 +1,109 @@ +--- +# Based on https://raw.githubusercontent.com/openstack-k8s-operators/nova-operator/bc10c4f579f8538899ac7bc5f87bfdb62d7042a4/ci/nova-operator-base/playbooks/collect-logs.yaml +- hosts: all + name: Create zuul-output log dir + gather_facts: false + tasks: + - name: Create log dir + ansible.builtin.file: + path: "{{ ansible_user_dir }}/zuul-output/logs" + state: directory + mode: "0755" + +- hosts: controller + name: Collect logs on the controller + gather_facts: false + tasks: + - name: "Set the sto_dir if it isn't already set" + ansible.builtin.set_fact: + sto_dir: '{{ ansible_env.HOME }}/{{ zuul.project.src_dir }}' + when: sto_dir | default('') | length == 0 + + - name: "Get vars common to all jobs" + ansible.builtin.include_vars: + file: "vars-zuul-common.yml" + + - name: "Get scenario-specific vars" + ansible.builtin.include_vars: + file: "vars-{{ scenario }}.yml" + + - name: "Create log dir" + ansible.builtin.file: + path: "{{ logfile_dir }}" + state: directory + mode: "0755" + + - name: "Log into the cluster" + ansible.builtin.import_role: + name: rhol_crc + tasks_from: add_crc_creds.yml + + - name: "Get builds" + ansible.builtin.shell: + cmd: | + echo "*** [INFO] Showing oc get builds" > {{ logfile_dir }}/post_oc_get_builds.log 2>&1 + oc get builds -oyaml >> {{ logfile_dir }}/post_oc_get_builds.log 2>&1 + echo "*** [INFO] Showing oc get builds -oyaml" >> {{ logfile_dir }}/post_oc_get_builds.log 2>&1 + oc get builds -oyaml >> {{ logfile_dir }}/post_oc_get_builds.log 2>&1 + cat {{ logfile_dir }}/post_oc_get_builds.log + ignore_errors: true + changed_when: false + + - name: "Get subscription details" + ansible.builtin.shell: + cmd: | + oc get subscriptions > {{ logfile_dir }}/post_oc_get_subscriptions.log 2>&1 + oc describe subscription service-telemetry-operator 
>> {{ logfile_dir }}/post_oc_get_subscriptions.log 2>&1 + cat {{ logfile_dir}}/post_oc_get_subscriptions.log + + - name: "Get image infos" + ansible.builtin.shell: + cmd: | + echo "[INFO] oc get images" > {{ logfile_dir }}/post_oc_get_images.log 2>&1 + oc get images >> {{ logfile_dir }}/post_oc_get_images.log 2>&1 + echo "[INFO] oc get imagestreams" >> {{ logfile_dir }}/post_oc_get_images.log 2>&1 + oc get imagestream >> {{ logfile_dir }}/post_oc_get_images.log 2>&1 + #echo "[INFO] oc get images -oyaml" >> {{ logfile_dir }}/post_oc_get_images.log 2>&1 + #oc get images -oyaml >> {{ logfile_dir }}/post_oc_get_images.log 2>&1 + echo "[INFO] oc get imagestream -oyaml" >> {{ logfile_dir }}/post_oc_get_images.log 2>&1 + oc get imagestream -oyaml >> {{ logfile_dir }}/post_oc_get_images.log 2>&1 + cat {{ logfile_dir }}/post_oc_get_images.log 2>&1 + register: post_oc_get_images + retries: 3 + delay: 10 + + - name: "Get STO info" + ansible.builtin.shell: + cmd: | + oc describe pod $(oc get pod -l name=service-telemetry-operator -ojsonpath='{ .items[].metadata.name }') >> {{ logfile_dir }}/describe_sto.log 2>&1 + ignore_errors: true + retries: 3 + delay: 10 + + - name: "Question the deployment" + ansible.builtin.shell: + cmd: | + echo "What images were created in the internal registry?" > {{ logfile_dir }}/post_question_deployment.log 2>&1 + oc get images | grep $(oc registry info --internal) >> {{ logfile_dir }}/post_question_deployment.log 2>&1 + echo "What state is the STO csv in?" >> {{ logfile_dir }}/post_question_deployment.log 2>&1 + oc get csv -n service-telemetry | grep service-telemetry-operator >> {{ logfile_dir }}/post_question_deployment.log 2>&1 + oc get csv -n service-telemetry-operator -oyaml >> {{ logfile_dir }}/post_question_deployment.log 2>&1 + register: output + retries: 3 + delay: 10 + + - name: "Copy generated logs" + ansible.builtin.shell: | + cp {{ ansible_env.HOME }}/*.log . 
+ args: + chdir: "{{ ansible_user_dir }}/zuul-output/logs/controller" + changed_when: true + ignore_errors: true + +- hosts: all + name: Copy files from controller on node + gather_facts: false + tasks: + - name: Copy files from controller on node + ansible.builtin.include_role: + name: fetch-output diff --git a/ci/prepare.yml b/ci/prepare.yml new file mode 100644 index 000000000..c226ccdfd --- /dev/null +++ b/ci/prepare.yml @@ -0,0 +1,42 @@ +--- +- name: "Prepare the environment for running stf" + hosts: controller + tasks: + - name: "Update pip" + ansible.builtin.pip: + name: pip + state: latest + extra_args: "-U" + + - name: "Set the value of sto_dir, if it's not already defined" + ansible.builtin.set_fact: + sto_dir: "{{ ansible_env.HOME }}/{{ zuul.project.src_dir }}" + when: not (sto_dir is defined) + + - name: "Install pre-reqs from pip" + ansible.builtin.pip: + requirements: "build/stf-run-ci/requirements.txt" + chdir: "{{ sto_dir }}" + state: present + + - name: "Install ansible collections" + community.general.ansible_galaxy_install: + type: collection + name: "{{ item }}" + with_items: + - "kubernetes.core:2.3.2" + - "community.general:6.2.0" + + - name: "Log into the cluster" + ansible.builtin.import_role: + name: rhol_crc + tasks_from: add_crc_creds.yml + + - name: "Create the service-telemetry project" + kubernetes.core.k8s: + api_version: v1 + kind: Namespace + name: "{{ namespace }}" + state: present + retries: 3 + delay: 30 diff --git a/ci/test_stf.yml b/ci/test_stf.yml new file mode 100644 index 000000000..7f196e860 --- /dev/null +++ b/ci/test_stf.yml @@ -0,0 +1,28 @@ +--- +- name: "Run tests to verify that STF runs as expected" + hosts: controller + tasks: + - name: "Set the sto_dir if it isn't already set" + ansible.builtin.set_fact: + sto_dir: '{{ ansible_env.HOME }}/{{ zuul.project.src_dir }}' + when: sto_dir | default('') | length == 0 + + - name: "Get vars common to all jobs" + ansible.builtin.include_vars: + file: "vars-zuul-common.yml" + + 
- name: "Get scenario-specific vars" + ansible.builtin.include_vars: + file: "vars-{{ scenario }}.yml" + + - name: "Log into the cluster" + ansible.builtin.import_role: + name: rhol_crc + tasks_from: add_crc_creds.yml + + - name: "Run STF smoketests" + ansible.builtin.shell: + cmd: | + OCP_PROJECT={{ namespace }} CLEANUP=false ./tests/smoketest/smoketest.sh > {{ logfile_dir }}/smoketest.log 2>&1 + chdir: "{{ sto_dir }}" + changed_when: false diff --git a/ci/vars-zuul-common.yml b/ci/vars-zuul-common.yml new file mode 100644 index 000000000..b7772aea5 --- /dev/null +++ b/ci/vars-zuul-common.yml @@ -0,0 +1,6 @@ +--- +namespace: "service-telemetry-PR#{{ zuul.change }}-{{ zuul.build }}" +setup_bundle_registry_tls_ca: false +setup_bundle_registry_auth: false +base_dir: "{{ sto_dir }}/build" +logfile_dir: "{{ ansible_user_dir }}/zuul-output/logs/controller" From eea83b37c9be0971bed9bfe8a604f07c3f898599 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Thu, 28 Sep 2023 15:07:12 -0400 Subject: [PATCH 70/95] Move the metrics_result retrieve (#493) * Move the metrics_result retrieve Move the metrics_result value retrieval to above an echo command so that the result is of the curl command and not the echo command. Also change the scripts to set +e so that the scripts don't exist right away since there are checks at the end that set an appropriate exit code (and gives more information since the script will complete vs exit immediately). 
Reported by Chris in Slack * Update tests/smoketest/smoketest_ceilometer_entrypoint.sh Co-authored-by: Chris Sibbitt --------- Co-authored-by: Chris Sibbitt --- tests/smoketest/smoketest_ceilometer_entrypoint.sh | 3 +-- tests/smoketest/smoketest_collectd_entrypoint.sh | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/smoketest/smoketest_ceilometer_entrypoint.sh b/tests/smoketest/smoketest_ceilometer_entrypoint.sh index 674a6e203..8e2ac7f6f 100644 --- a/tests/smoketest/smoketest_ceilometer_entrypoint.sh +++ b/tests/smoketest/smoketest_ceilometer_entrypoint.sh @@ -1,5 +1,5 @@ #!/bin/sh -set -e +set +e # Executes inside the test harness container to start collectd and look for resulting metrics in prometheus PROMETHEUS=${PROMETHEUS:-"https://default-prometheus-proxy:9092"} @@ -28,7 +28,6 @@ echo "*** [INFO] Checking for recent image metrics..." echo "[DEBUG] Running the curl command to return a query" curl -k -u "internal:${PROMETHEUS_AUTH_PASS}" -g "${PROMETHEUS}/api/v1/query?" --data-urlencode 'query=ceilometer_image_size' 2>&1 | grep '"result":\[{"metric":{"__name__":"ceilometer_image_size"' -echo "[DEBUG] Query returned" metrics_result=$? 
echo "[DEBUG] Set metrics_result to $metrics_result" diff --git a/tests/smoketest/smoketest_collectd_entrypoint.sh b/tests/smoketest/smoketest_collectd_entrypoint.sh index a8ce1103f..d7f5132e8 100755 --- a/tests/smoketest/smoketest_collectd_entrypoint.sh +++ b/tests/smoketest/smoketest_collectd_entrypoint.sh @@ -1,5 +1,5 @@ #!/bin/sh -set -e +set +e # Executes inside the test harness container to start collectd and look for resulting metrics in prometheus PROMETHEUS=${PROMETHEUS:-"https://default-prometheus-proxy:9092"} From 624776e4b900064beb3275b2f04033db1760033a Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Thu, 28 Sep 2023 15:13:09 -0400 Subject: [PATCH 71/95] Increase sensubility smart-gateway ring buffer (#490) Increase the ring buffer size of the sensubility Smart Gateway as messages often exceed the default of 16384 bytes, resulting in no API healt check data from controllers arriving. Closes: rhbz#2241033 --- deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml | 2 +- .../service-telemetry-operator.clusterserviceversion.yaml | 2 +- roles/servicetelemetry/defaults/main.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml b/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml index 82814f8df..8c1bf5b0c 100644 --- a/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml +++ b/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml @@ -73,7 +73,7 @@ spec: subscriptionAddress: sensubility/cloud1-telemetry debugEnabled: false bridge: - ringBufferSize: 16384 + ringBufferSize: 65535 ringBufferCount: 15000 verbose: false events: diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml index 0e947081d..86fd01669 100644 --- 
a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml @@ -126,7 +126,7 @@ metadata: { "bridge": { "ringBufferCount": 15000, - "ringBufferSize": 16384, + "ringBufferSize": 65535, "verbose": false }, "collectorType": "sensubility", diff --git a/roles/servicetelemetry/defaults/main.yml b/roles/servicetelemetry/defaults/main.yml index fd8074c3f..5ac3b31c7 100644 --- a/roles/servicetelemetry/defaults/main.yml +++ b/roles/servicetelemetry/defaults/main.yml @@ -115,7 +115,7 @@ servicetelemetry_defaults: subscription_address: sensubility/cloud1-telemetry debug_enabled: false bridge: - ring_buffer_size: 16384 + ring_buffer_size: 65535 ring_buffer_count: 15000 verbose: false events: From 48f452f594f35f01f520033b23f905b6d4836643 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Thu, 28 Sep 2023 15:13:20 -0400 Subject: [PATCH 72/95] Disable event pipeline management (#491) Disable the event pipeline management in the enable-stf.yaml file for the RHOSP 17.1 helper script to align to STF 1.5.3 disabling eventing by default, matching our documentation changes. 
Related STF-1498 --- tests/infrared/17.1/enable-stf.yaml.template | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/infrared/17.1/enable-stf.yaml.template b/tests/infrared/17.1/enable-stf.yaml.template index df83a4365..c1c5215da 100644 --- a/tests/infrared/17.1/enable-stf.yaml.template +++ b/tests/infrared/17.1/enable-stf.yaml.template @@ -11,6 +11,7 @@ custom_templates: # manage the polling and pipeline configuration files for Ceilometer agents ManagePolling: true ManagePipeline: true + ManageEventPipeline: false # enable Ceilometer metrics CeilometerQdrPublishMetrics: true From 30461a870e2a98e51d293334adef9cd7bf31aef9 Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Fri, 29 Sep 2023 11:43:59 +0100 Subject: [PATCH 73/95] [zuul] Use include_vars in prepare stage (#496) Add the common and scenario vars to prepare stage so that namespace is defined before being used --- ci/prepare.yml | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/ci/prepare.yml b/ci/prepare.yml index c226ccdfd..7b65362d6 100644 --- a/ci/prepare.yml +++ b/ci/prepare.yml @@ -2,17 +2,25 @@ - name: "Prepare the environment for running stf" hosts: controller tasks: + - name: "Set the sto_dir if it isn't already set" + ansible.builtin.set_fact: + sto_dir: '{{ ansible_env.HOME }}/{{ zuul.project.src_dir }}' + when: sto_dir | default('') | length == 0 + + - name: "Get vars common to all jobs" + ansible.builtin.include_vars: + file: "vars-zuul-common.yml" + + - name: "Get scenario-specific vars" + ansible.builtin.include_vars: + file: "vars-{{ scenario }}.yml" + - name: "Update pip" ansible.builtin.pip: name: pip state: latest extra_args: "-U" - - name: "Set the value of sto_dir, if it's not already defined" - ansible.builtin.set_fact: - sto_dir: "{{ ansible_env.HOME }}/{{ zuul.project.src_dir }}" - when: not (sto_dir is defined) - - name: "Install pre-reqs from pip" ansible.builtin.pip: requirements: "build/stf-run-ci/requirements.txt" From 
be41737256d8a75488c9631ce28968ab1ceb65aa Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Fri, 29 Sep 2023 08:45:13 -0400 Subject: [PATCH 74/95] Initial changes for QDR basicAuth (#481) * Initial changes for QDR basicAuth * Update roles/servicetelemetry/tasks/pre.yml Co-authored-by: Leif Madsen * correct API version on secret * Touchups from fresh environment test * swap ansible_date_time for a filter that doesnt required facts ...and adheres to the rules for label text * Update CSV * Disable qdr auth in smoketests See: https://github.com/infrawatch/service-telemetry-operator/pull/492 --------- Co-authored-by: Leif Madsen --- Jenkinsfile | 1 + .../infra.watch_servicetelemetrys_crd.yaml | 6 +++ ...fra.watch_v1beta1_servicetelemetry_cr.yaml | 1 + .../infra.watch_servicetelemetrys_crd.yaml | 7 +++ ...emetry-operator.clusterserviceversion.yaml | 1 + roles/servicetelemetry/defaults/main.yml | 1 + .../servicetelemetry/tasks/component_qdr.yml | 31 +++++++++++++ roles/servicetelemetry/tasks/pre.yml | 45 +++++++++++++++++++ tests/smoketest/smoketest.sh | 7 +++ 9 files changed, 100 insertions(+) diff --git a/Jenkinsfile b/Jenkinsfile index f94b64b1e..f3a13d571 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -36,6 +36,7 @@ spec: strategy: ephemeral transports: qdr: + auth: none enabled: true deploymentSize: 1 web: diff --git a/deploy/crds/infra.watch_servicetelemetrys_crd.yaml b/deploy/crds/infra.watch_servicetelemetrys_crd.yaml index f45dc44bd..286d2c74b 100644 --- a/deploy/crds/infra.watch_servicetelemetrys_crd.yaml +++ b/deploy/crds/infra.watch_servicetelemetrys_crd.yaml @@ -248,6 +248,12 @@ spec: enabled: description: Enable QDR data transort type: boolean + auth: + description: 'Auth type to use for incoming OSP connections. 
Options are "none", or "basic"' + type: string + enum: + - none + - basic web: description: QDR web configuration properties: diff --git a/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml b/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml index 8c1bf5b0c..e311546ce 100644 --- a/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml +++ b/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml @@ -102,6 +102,7 @@ spec: transports: qdr: enabled: true + auth: basic web: enabled: false certificates: diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml index f6cf302b2..f26cbc7b9 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml @@ -429,6 +429,13 @@ spec: qdr: description: QDR configuration for data transport properties: + auth: + description: Auth type to use for incoming OSP connections. 
+ Options are "none", or "basic" + enum: + - none + - basic + type: string certificates: properties: caCertDuration: diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml index 86fd01669..8ba230536 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml @@ -153,6 +153,7 @@ metadata: "observabilityStrategy": "use_redhat", "transports": { "qdr": { + "auth": "basic", "certificates": { "caCertDuration": "70080h", "endpointCertDuration": "70080h" diff --git a/roles/servicetelemetry/defaults/main.yml b/roles/servicetelemetry/defaults/main.yml index 5ac3b31c7..e8e92d855 100644 --- a/roles/servicetelemetry/defaults/main.yml +++ b/roles/servicetelemetry/defaults/main.yml @@ -74,6 +74,7 @@ servicetelemetry_defaults: deployment_size: 1 web: enabled: false + auth: basic certificates: endpoint_cert_duration: 70080h ca_cert_duration: 70080h diff --git a/roles/servicetelemetry/tasks/component_qdr.yml b/roles/servicetelemetry/tasks/component_qdr.yml index 84fcd1beb..64489ff74 100644 --- a/roles/servicetelemetry/tasks/component_qdr.yml +++ b/roles/servicetelemetry/tasks/component_qdr.yml @@ -149,6 +149,32 @@ sasldb_path: /tmp/qdrouterd.sasldb when: interconnect_manifest is not defined +- when: + - servicetelemetry_vars.transports.qdr.auth == "basic" + block: + - name: Get QDR BasicAuth secret + k8s_info: + api_version: interconnectedcloud.github.io/v1alpha1 + kind: Interconnect + name: "{{ ansible_operator_meta.name }}-interconnect" + namespace: "{{ ansible_operator_meta.namespace }}" + register: _qdr_basicauth_object + + # Because 
https://github.com/interconnectedcloud/qdr-operator/blob/576d2b33dac71437ea2b165caaaf6413220767fe/pkg/controller/interconnect/interconnect_controller.go#L634 + - name: Perform a one-time upgrade to the default generated password for QDR BasicAuth + k8s: + definition: + kind: Secret + apiVersion: v1 + metadata: + name: "{{ ansible_operator_meta.name }}-interconnect-users" + namespace: "{{ ansible_operator_meta.namespace }}" + labels: + stf_one_time_upgrade: "{{ lookup('pipe', 'date +%s') }}" + stringData: + guest: "{{ lookup('password', '/dev/null') }}" + when: + - _qdr_basicauth_object.resources[0] is defined and _qdr_basicauth_object[0].metadata.labels.stf_one_time_upgrade is not defined - name: Set default Interconnect manifest set_fact: @@ -183,7 +209,12 @@ - expose: true host: 0.0.0.0 port: 5671 + {% if servicetelemetry_vars.transports.qdr.auth == "basic" %} + saslMechanisms: PLAIN + authenticatePeer: true + {% elif servicetelemetry_vars.transports.qdr.auth == "none" %} saslMechanisms: ANONYMOUS + {% endif %} sslProfile: openstack - port: 5673 linkCapacity: 25000 diff --git a/roles/servicetelemetry/tasks/pre.yml b/roles/servicetelemetry/tasks/pre.yml index 0fd1bb59b..38477b02b 100644 --- a/roles/servicetelemetry/tasks/pre.yml +++ b/roles/servicetelemetry/tasks/pre.yml @@ -127,6 +127,51 @@ - _community_prom_object.resources[0] is not defined - _stf_object.resources[0].spec.observabilityStrategy is not defined +- name: Get QDR objects + k8s_info: + api_version: interconnectedcloud.github.io/v1alpha1 + kind: Interconnect + name: "{{ ansible_operator_meta.name }}-interconnect" + namespace: "{{ ansible_operator_meta.namespace }}" + register: _qdr_object + +- block: + - name: Apply legacy auth=none for QDR if missing on the STF object and it's currently deployed that way + k8s: + definition: + apiVersion: infra.watch/v1beta1 + kind: ServiceTelemetry + metadata: + name: "{{ ansible_operator_meta.name }}" + namespace: "{{ ansible_operator_meta.namespace }}" + spec: + 
transports: + qdr: + auth: none + + - name: Set auth=none for remainder of this run + set_fact: + servicetelemetry_vars: "{{ servicetelemetry_vars|combine({'transports':{'qdr':{'auth': 'none'}}}, recursive=True) }}" # noqa 206 + when: + - _stf_object.resources[0].spec.transports.qdr.auth is not defined + - _qdr_object.resources[0] is defined and _qdr_object.resources[0].spec.edgeListeners[0].saslMechanisms == "ANONYMOUS" + +- name: Apply default auth for QDR if missing on a new STF object with no associated auth=none QDR + k8s: + definition: + apiVersion: infra.watch/v1beta1 + kind: ServiceTelemetry + metadata: + name: "{{ ansible_operator_meta.name }}" + namespace: "{{ ansible_operator_meta.namespace }}" + spec: + transports: + qdr: + auth: "{{ servicetelemetry_defaults.transports.qdr.auth }}" + when: + - _stf_object.resources[0].spec.transports.qdr.auth is not defined + - _qdr_object.resources[0] is defined and _qdr_object.resources[0].spec.edgeListeners[0].saslMechanisms != "ANONYMOUS" + - name: Set ephemeral_storage_enabled to true when storage strategy is ephemeral set_fact: _ephemeral_storage_enabled: true diff --git a/tests/smoketest/smoketest.sh b/tests/smoketest/smoketest.sh index 4204398f2..2909e694f 100755 --- a/tests/smoketest/smoketest.sh +++ b/tests/smoketest/smoketest.sh @@ -27,6 +27,13 @@ if [ "${OC_CLIENT_VERSION_Y}" -lt "${OC_CLIENT_VERSION_Y_REQUIRED}" ] || [ "${OC exit 1 fi +if [ "$(oc get stf default -o=jsonpath='{.spec.transports.qdr.auth}')" != "none" ]; then + echo "*** QDR authentication is currently not supported in smoketests." 
+ echo "To disable it, use: oc patch stf default --patch '{\"spec\":{\"transports\":{\"qdr\":{\"auth\":\"none\"}}}}' --type=merge" + echo "For more info: https://github.com/infrawatch/service-telemetry-operator/pull/492" + exit 1 +fi + CLEANUP=${CLEANUP:-true} SMOKETEST_VERBOSE=${SMOKETEST_VERBOSE:-true} From 713be16defc79d3597bbc99ecff62ce00a15ff4f Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Fri, 29 Sep 2023 17:31:40 +0100 Subject: [PATCH 75/95] [zuul] use 'service-telemetry' as namespace (#497) The format was incorrect, and cannot include '#' or capital letters. --- ci/vars-zuul-common.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/vars-zuul-common.yml b/ci/vars-zuul-common.yml index b7772aea5..39d43a29d 100644 --- a/ci/vars-zuul-common.yml +++ b/ci/vars-zuul-common.yml @@ -1,5 +1,5 @@ --- -namespace: "service-telemetry-PR#{{ zuul.change }}-{{ zuul.build }}" +namespace: "service-telemetry" setup_bundle_registry_tls_ca: false setup_bundle_registry_auth: false base_dir: "{{ sto_dir }}/build" From fe97f48a77f1fea1e9cf8dbdb23f56893dcf93e4 Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Fri, 29 Sep 2023 19:13:08 +0100 Subject: [PATCH 76/95] Add var to set QDR auth to none (#498) * Add var to set QDR auth to none The smoketests don't support spec.transport.qdr.auth = basic. Add a var to allow us to set this value to "none" in ci. This commit can be reverted when PR#492 is merged. --- build/stf-run-ci/README.md | 1 + build/stf-run-ci/tasks/deploy_stf.yml | 3 +++ ci/vars-zuul-common.yml | 1 + 3 files changed, 5 insertions(+) diff --git a/build/stf-run-ci/README.md b/build/stf-run-ci/README.md index f1e141878..353a8f81f 100644 --- a/build/stf-run-ci/README.md +++ b/build/stf-run-ci/README.md @@ -52,6 +52,7 @@ choose to override: | `__service_telemetry_trap_default_oid` | | 1.3.6.1.4.1.50495.15.1.2.1 | The trap OID if none is found in the Prometheus alert labels. 
| | `__service_telemetry_trap_default_severity` | | | The trap severity if none is found in the Prometheus alert labels. | | `__service_telemetry_observability_strategy` | | `use_redhat` | Which observability strategy to use for deployment. Default is 'use_redhat'. Also supported are 'use_hybrid', 'use_community', and 'none' | +| `__service_telemetry_transports_qdr_auth` | {'none', 'basic'} | `none` | Which auth method to use for QDR. Can be 'none' or 'basic'. Note: 'basic' is not yet supported in smoketests. | | `__service_telemetry_transports_certificates_endpoint_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the QDR endpoint certificate (minimum duration is 1h) | | `__service_telemetry_transports_certificates_ca_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the QDR CA certificate (minimum duration is 1h) | | `__internal_registry_path` | | image-registry.openshift-image-registry.svc:5000 | Path to internal registry for image path | diff --git a/build/stf-run-ci/tasks/deploy_stf.yml b/build/stf-run-ci/tasks/deploy_stf.yml index ae5985374..097906ec3 100644 --- a/build/stf-run-ci/tasks/deploy_stf.yml +++ b/build/stf-run-ci/tasks/deploy_stf.yml @@ -49,6 +49,9 @@ transports: qdr: enabled: true + {% if __service_telemetry_transports_qdr_auth is defined %} + auth: "{{ __service_telemetry_transports_qdr_auth }}" + {% endif %} certificates: endpointCertDuration: {{ __service_telemetry_transports_certificates_endpoint_cert_duration }} caCertDuration: {{ __service_telemetry_transports_certificates_ca_cert_duration }} diff --git a/ci/vars-zuul-common.yml b/ci/vars-zuul-common.yml index 39d43a29d..dfd64e7ad 100644 --- a/ci/vars-zuul-common.yml +++ b/ci/vars-zuul-common.yml @@ -2,5 +2,6 @@ namespace: "service-telemetry" setup_bundle_registry_tls_ca: false setup_bundle_registry_auth: false +__service_telemetry_transports_qdr_auth: none base_dir: "{{ sto_dir }}/build" 
logfile_dir: "{{ ansible_user_dir }}/zuul-output/logs/controller" From 79d89682d5580b50fead0606423577ad92414446 Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Tue, 3 Oct 2023 17:43:00 +0100 Subject: [PATCH 77/95] [zuul] Add a job that does a local build and deploys an STF object (#480) * [zuul] Add a job that does a local build and deploys an STF object This job replicates the job in Jenkinsfile --- .zuul.yaml | 10 +++++++++- build/stf-run-ci/tasks/main.yml | 4 ++-- ci/post-collect_logs.yml | 7 +++++-- ci/vars-local_build.yml | 5 +++++ 4 files changed, 21 insertions(+), 5 deletions(-) create mode 100644 ci/vars-local_build.yml diff --git a/.zuul.yaml b/.zuul.yaml index a9d034504..58de2c46c 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -26,8 +26,16 @@ # Pass vars to crc cli https://review.rdoproject.org/cgit/config/tree/playbooks/crc/simple-start.yaml#n30 crc_parameters: '--memory 16000 --disk-size 80 --cpus 6' # Increase from 14336 +- job: + name: stf-crc-latest-local_build + parent: stf-base + description: | + Build images locally and deploy STF + vars: + scenario: "local_build" + - project: name: infrawatch/service-telemetry-operator github-check: jobs: - - noop + - stf-crc-latest-local_build diff --git a/build/stf-run-ci/tasks/main.yml b/build/stf-run-ci/tasks/main.yml index a78431713..bd0821959 100644 --- a/build/stf-run-ci/tasks/main.yml +++ b/build/stf-run-ci/tasks/main.yml @@ -5,7 +5,7 @@ - name: Setup default values ansible.builtin.set_fact: branch: "{{ working_branch | default('master') }}" - namespace: "{{ working_namespace | default('service-telemetry') }}" + namespace: "{{ namespace if namespace is defined else (working_namespace | default('service-telemetry'))}}" - name: Set default image paths for local builds ansible.builtin.set_fact: @@ -205,7 +205,7 @@ - name: Validate system is operational ansible.builtin.shell: | - OCP_PROJECT="{{ namespace }}" VALIDATION_SCOPE="{{ __service_telemetry_observability_strategy }}" timeout 600 "{{ base_dir 
}}/validate_deployment.sh" >> {{ logfile_dir }}/validate_deployment.log 2>&1 + OCP_PROJECT="{{ namespace }}" VALIDATION_SCOPE="{{ __service_telemetry_observability_strategy }}" timeout 1200 "{{ base_dir }}/validate_deployment.sh" >> {{ logfile_dir }}/validate_deployment.log 2>&1 args: executable: /bin/bash register: validate_deployment diff --git a/ci/post-collect_logs.yml b/ci/post-collect_logs.yml index 614f8728b..50addf867 100644 --- a/ci/post-collect_logs.yml +++ b/ci/post-collect_logs.yml @@ -55,6 +55,7 @@ oc get subscriptions > {{ logfile_dir }}/post_oc_get_subscriptions.log 2>&1 oc describe subscription service-telemetry-operator >> {{ logfile_dir }}/post_oc_get_subscriptions.log 2>&1 cat {{ logfile_dir}}/post_oc_get_subscriptions.log + ignore_errors: true - name: "Get image infos" ansible.builtin.shell: @@ -69,6 +70,7 @@ oc get imagestream -oyaml >> {{ logfile_dir }}/post_oc_get_images.log 2>&1 cat {{ logfile_dir }}/post_oc_get_images.log 2>&1 register: post_oc_get_images + ignore_errors: true retries: 3 delay: 10 @@ -86,9 +88,10 @@ echo "What images were created in the internal registry?" > {{ logfile_dir }}/post_question_deployment.log 2>&1 oc get images | grep $(oc registry info --internal) >> {{ logfile_dir }}/post_question_deployment.log 2>&1 echo "What state is the STO csv in?" 
>> {{ logfile_dir }}/post_question_deployment.log 2>&1 - oc get csv -n service-telemetry | grep service-telemetry-operator >> {{ logfile_dir }}/post_question_deployment.log 2>&1 - oc get csv -n service-telemetry-operator -oyaml >> {{ logfile_dir }}/post_question_deployment.log 2>&1 + oc get csv -n {{ namespace }} | grep service-telemetry-operator >> {{ logfile_dir }}/post_question_deployment.log 2>&1 + oc get csv -n {{ namespace }} service-telemetry-operator -oyaml >> {{ logfile_dir }}/post_question_deployment.log 2>&1 register: output + ignore_errors: true retries: 3 delay: 10 diff --git a/ci/vars-local_build.yml b/ci/vars-local_build.yml new file mode 100644 index 000000000..3126605a4 --- /dev/null +++ b/ci/vars-local_build.yml @@ -0,0 +1,5 @@ +--- +__deploy_stf: true +__local_build_enabled: true +__service_telemetry_snmptraps_enabled: true +__service_telemetry_storage_ephemeral_enabled: true From 863f45935145207e6060be0fb47deb58dc037815 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 5 Oct 2023 11:19:20 -0400 Subject: [PATCH 78/95] Bump requests from 2.27.1 to 2.31.0 in /build/stf-run-ci (#486) Bumps [requests](https://github.com/psf/requests) from 2.27.1 to 2.31.0. - [Release notes](https://github.com/psf/requests/releases) - [Changelog](https://github.com/psf/requests/blob/main/HISTORY.md) - [Commits](https://github.com/psf/requests/compare/v2.27.1...v2.31.0) --- updated-dependencies: - dependency-name: requests dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Emma Foley --- build/stf-run-ci/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/stf-run-ci/requirements.txt b/build/stf-run-ci/requirements.txt index a5727b04b..f838b6573 100644 --- a/build/stf-run-ci/requirements.txt +++ b/build/stf-run-ci/requirements.txt @@ -1,5 +1,5 @@ # https://stackoverflow.com/questions/64073422/importerror-cannot-import-name-oauth1session-from-requests-oauthlib -requests==2.27.1 +requests==2.31.0 requests_oauthlib==1.3.0 # https://github.com/domainaware/parsedmarc/issues/318 oauthlib==3.2.0 From 0dd478b63ca74257cfec7592ef820bc542281170 Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Mon, 9 Oct 2023 11:25:42 -0400 Subject: [PATCH 79/95] Use vars for registry image container name (#452) --- build/stf-run-ci/tasks/create_catalog.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build/stf-run-ci/tasks/create_catalog.yml b/build/stf-run-ci/tasks/create_catalog.yml index f49586950..cdb000870 100644 --- a/build/stf-run-ci/tasks/create_catalog.yml +++ b/build/stf-run-ci/tasks/create_catalog.yml @@ -78,7 +78,7 @@ dockerfile: | # The base image is expected to contain # /bin/opm (with a serve subcommand) and /bin/grpc_health_probe - FROM registry.redhat.io/openshift4/ose-operator-registry:v4.13 + FROM {{default_operator_registry_image_base}}:{{default_operator_registry_image_tag}} COPY --chmod=666 index.yaml /configs/ @@ -97,7 +97,7 @@ dockerStrategy: from: kind: ImageStreamTag - name: ose-operator-registry:v4.13 + name: "ose-operator-registry:{{default_operator_registry_image_tag}}" volumes: - mounts: - destinationPath: /opt/app-root/auth From 6517ac5e11d7c3519d72dab68826183cf4f2d8e3 Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Wed, 11 Oct 2023 15:50:16 +0100 Subject: [PATCH 80/95] [stf-run-ci] Generate extra logs if preflight checks fail (#500) * 
[stf-run-ci] Generate extra logs if preflight checks fail * Update preflight checks --- build/stf-run-ci/tasks/preflight_checks.yml | 36 ++++++++++++++------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/build/stf-run-ci/tasks/preflight_checks.yml b/build/stf-run-ci/tasks/preflight_checks.yml index 94352aded..5c68b5405 100644 --- a/build/stf-run-ci/tasks/preflight_checks.yml +++ b/build/stf-run-ci/tasks/preflight_checks.yml @@ -1,15 +1,27 @@ --- # Try for 10 minutes to get an output -- name: "Wait for up to 10 minutes for Service Telemetry Operator to be Succeeded" - ansible.builtin.shell: | - oc get csv -n "{{ namespace }}" | grep service-telemetry-operator | grep Succeeded - register: output - retries: 60 - delay: 10 - until: output.stdout | length != 0 - ignore_errors: true +- block: + - name: "Wait for up to 10 minutes for Service Telemetry Operator to be Succeeded" + ansible.builtin.shell: | + oc get csv -n "{{ namespace }}" | grep service-telemetry-operator | grep Succeeded + register: output + retries: 60 + delay: 10 + until: output.stdout | length != 0 + ignore_errors: true + rescue: + - name: "Show CSV statuses" + ansible.builtin.command: + cmd: | + oc get csv -n "{{ namespace }}" -- name: "Show fail message if CSV isn't Succeeded after the alotted time" - ansible.builtin.fail: - msg: "Service Telemetry Operator CSV not Succeeded after 10 minutes" - when: output.rc != 0 \ No newline at end of file + - name: "Get service-telemetry-operator CSV information" + ansible.builtin.command: + cmd: | + oc describe csv $(oc get csv | grep "service-telemetry-operator" | awk '{print $1}') > {{ logfile_dir }}/oc_get_csv_sto.log 2>&1 + cat {{ logfile_dir }} + + - name: "Show fail message if CSV isn't Succeeded after the alotted time" + ansible.builtin.fail: + msg: "Service Telemetry Operator CSV not Succeeded after 10 minutes. 
Check {{ logfile_dir }}/oc_get_csv_sto.log for more information" + when: output.rc != 0 From 0a0065d0e197edd36dc8ad3ff8d3e0b802bec6a9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 11 Oct 2023 14:10:51 -0400 Subject: [PATCH 81/95] Bump oauthlib from 3.2.0 to 3.2.2 in /build/stf-run-ci (#485) Bumps [oauthlib](https://github.com/oauthlib/oauthlib) from 3.2.0 to 3.2.2. - [Release notes](https://github.com/oauthlib/oauthlib/releases) - [Changelog](https://github.com/oauthlib/oauthlib/blob/master/CHANGELOG.rst) - [Commits](https://github.com/oauthlib/oauthlib/compare/v3.2.0...v3.2.2) --- updated-dependencies: - dependency-name: oauthlib dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Emma Foley --- build/stf-run-ci/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/stf-run-ci/requirements.txt b/build/stf-run-ci/requirements.txt index f838b6573..70c742e0e 100644 --- a/build/stf-run-ci/requirements.txt +++ b/build/stf-run-ci/requirements.txt @@ -2,7 +2,7 @@ requests==2.31.0 requests_oauthlib==1.3.0 # https://github.com/domainaware/parsedmarc/issues/318 -oauthlib==3.2.0 +oauthlib==3.2.2 kubernetes==24.2.0 openshift==0.13.1 ansible-core==2.12.10 From 1823727f51edbf1985ddd383aadfd7e4daf70d3f Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Wed, 11 Oct 2023 19:29:22 +0100 Subject: [PATCH 82/95] [stf-run-ci][create_catalog] Swap query for a command task (#470) * [stf-run-ci][create_catalog] Swap query for a command task Using query looks up the kubeconfig on localhost, rather than the host that ansible is executing against. This behaviour is different from either using the shell/command modules or using k8s modules. 
For consistent behaviour, the queries are replaced with an alternative way to get the same information that will have consistent behahaviour whether executing against localhost or a remote host. --- build/stf-run-ci/tasks/create_catalog.yml | 32 ++++++++++++++++++++--- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/build/stf-run-ci/tasks/create_catalog.yml b/build/stf-run-ci/tasks/create_catalog.yml index cdb000870..6a464afd9 100644 --- a/build/stf-run-ci/tasks/create_catalog.yml +++ b/build/stf-run-ci/tasks/create_catalog.yml @@ -32,7 +32,14 @@ ansible.builtin.set_fact: internal_registry: "{{ builder_dockercfg_auth_results['image-registry.openshift-image-registry.svc:5000'] | to_json }}" -- when: query('kubernetes.core.k8s', api_version='v1', kind='Secret', resource_name='service-telemetry-framework-index-dockercfg', namespace=namespace) | length == 0 +- name: Get Secrets to check for service-telemetry-framework-index-dockercfg + ansible.builtin.command: + cmd: oc get secret -n {{ namespace }} service-telemetry-framework-index-dockercfg + register: index_dockercfg_secret + ignore_errors: true + +# There's an error when the requested resource doesn't exist, so check the rc +- when: index_dockercfg_secret.rc != 0 block: - name: Create config.json to import as Secret ansible.builtin.template: @@ -43,16 +50,33 @@ - name: Create a Secret for the dockercfg ansible.builtin.command: oc create secret generic -n {{ namespace }} service-telemetry-framework-index-dockercfg --from-file=.dockerconfigjson={{ base_dir }}/working/service-telemetry-framework-index/config.json --type=kubernetes.io/dockerconfigjson + ignore_errors: true + +- name: Get the ose-operator-registry ImageStream + ansible.builtin.command: + cmd: oc get -n {{ namespace }} ImageStream ose-operator-registry + register: ose_op_registry_is + ignore_errors: true - name: Create ImageStream for ose-operator-registry ansible.builtin.command: oc import-image -n {{ namespace }} ose-operator-registry:{{ 
default_operator_registry_image_tag }} --from={{ default_operator_registry_image_base }}:{{ default_operator_registry_image_tag }} --confirm - when: query('kubernetes.core.k8s', api_version='v1', kind='ImageStream', resource_name='ose-operator-registry', namespace=namespace) | length == 0 + when: ose_op_registry_is.rc != 0 + +- name: Delete the existing imagestream, if it exists + ansible.builtin.command: oc delete imagestream -n {{ namespace }} service-telemetry-framework-index + ignore_errors: true - name: Create ImageStream for service-telemetry-framework-index ansible.builtin.command: oc create imagestream -n {{ namespace }} service-telemetry-framework-index - when: query('kubernetes.core.k8s', api_version='v1', kind='ImageStream', resource_name='service-telemetry-framework-index', namespace=namespace) | length == 0 -- name: Create BuildConfig for service-telemetry-framework-index +- name: Get STF index image stream + ansible.builtin.command: + cmd: oc get -n {{ namespace }} ImageStream service-telemetry-framework-index + register: stf_index_imagestream + ignore_errors: true + +- when: stf_index_imagestream.rc != 0 + name: Create BuildConfig for service-telemetry-framework-index kubernetes.core.k8s: definition: apiVersion: build.openshift.io/v1 From ad37387f628dc35c5f6bf9915834c55266a375d5 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Thu, 12 Oct 2023 09:36:08 -0400 Subject: [PATCH 83/95] Add requires infrastructure annotations (#499) * Add requires infrastructure annotations Add required infrastructure annotations for the bundle. Implementation is done in generate_bundle.sh because the annotations.yaml file in the deploy/olm-catalog/ directory is not read by operator-sdk-0.19.4. Append required additional feature annotations to the generated annotations.yaml by operator-sdk generate bundle. 
Related STF-1530 * Include annotations in the CSV directly * Revert "Add requires infrastructure annotations" This reverts commit c9e9b2aacb73320d0f229dac63f60068ccd84147. * Generate CSV contents with operator-sdk --- ...rvice-telemetry-operator.clusterserviceversion.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml index 8ba230536..29f5ac370 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml @@ -175,6 +175,16 @@ metadata: description: Service Telemetry Framework. Umbrella Operator for instantiating the required dependencies and configuration of various components to build a Service Telemetry platform for telco grade monitoring. 
+ features.operators.openshift.io/cnf: "false" + features.operators.openshift.io/cni: "false" + features.operators.openshift.io/csi: "false" + features.operators.openshift.io/disconnected: "false" + features.operators.openshift.io/fips-compliant: "false" + features.operators.openshift.io/proxy-aware: "false" + features.operators.openshift.io/tls-profiles: "false" + features.operators.openshift.io/token-auth-aws: "false" + features.operators.openshift.io/token-auth-azure: "false" + features.operators.openshift.io/token-auth-gcp: "false" olm.skipRange: '>=<> <<>' operatorframework.io/suggested-namespace: service-telemetry operators.openshift.io/valid-subscription: '["OpenStack Platform", "Cloud Infrastructure", From d7f1708b368347819e2ee951e8dd6f5a3a745a64 Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Tue, 17 Oct 2023 15:08:17 +0100 Subject: [PATCH 84/95] [stf-collect-logs] Add a role for log collection (#488) * [stf-collect-logs] Add a role for log collection * Update build/run-ci.yaml * [stf-collect-logs] Update the resource name in * [stf-collect-logs] Update README * [stf-collect-logs] Remove unnecessary lines * ci/post-collect_logs: Use stf-collect-logs role * [stf-collect-logs]: Use namespace in oc commands --------- Co-authored-by: Chris Sibbitt --- build/run-ci.yaml | 8 +++- build/stf-collect-logs/README.md | 38 +++++++++++++++ build/stf-collect-logs/defaults/main.yml | 2 + build/stf-collect-logs/meta/main.yml | 15 ++++++ build/stf-collect-logs/tasks/main.yml | 61 ++++++++++++++++++++++++ build/stf-collect-logs/vars/main.yml | 2 + ci/post-collect_logs.yml | 59 ++--------------------- 7 files changed, 128 insertions(+), 57 deletions(-) create mode 100644 build/stf-collect-logs/README.md create mode 100644 build/stf-collect-logs/defaults/main.yml create mode 100644 build/stf-collect-logs/meta/main.yml create mode 100644 build/stf-collect-logs/tasks/main.yml create mode 100644 build/stf-collect-logs/vars/main.yml diff --git a/build/run-ci.yaml 
b/build/run-ci.yaml index 932ef10c0..bfd07c3cb 100644 --- a/build/run-ci.yaml +++ b/build/run-ci.yaml @@ -6,4 +6,10 @@ tasks: - name: Run the STF CI system import_role: - name: stf-run-ci + name: stf-run-ci + + - name: Collect the logs + import_role: + name: stf-collect-logs + vars: + logfile_dir: "{{ playbook_dir }}/" diff --git a/build/stf-collect-logs/README.md b/build/stf-collect-logs/README.md new file mode 100644 index 000000000..66b8739d6 --- /dev/null +++ b/build/stf-collect-logs/README.md @@ -0,0 +1,38 @@ +stf-collect-logs +================ + +This role collects logs that are useful for debugging an STF deployment. + +Once the logs are collected, the user will need to fetch the logs themselves. + +Requirements +------------ + + +Role Variables +-------------- + +* `logfile_dir` - The location that the logs will be created in on the remote host(s). + +Dependencies +------------ + + +Example Playbook +---------------- + +Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: + + - hosts: servers + roles: + - { role: username.rolename, x: 42 } + +License +------- + +Apache 2 + +Author Information +------------------ + +Red Hat diff --git a/build/stf-collect-logs/defaults/main.yml b/build/stf-collect-logs/defaults/main.yml new file mode 100644 index 000000000..9855190f1 --- /dev/null +++ b/build/stf-collect-logs/defaults/main.yml @@ -0,0 +1,2 @@ +--- +# defaults file for stf-collect-logs diff --git a/build/stf-collect-logs/meta/main.yml b/build/stf-collect-logs/meta/main.yml new file mode 100644 index 000000000..9a11606cd --- /dev/null +++ b/build/stf-collect-logs/meta/main.yml @@ -0,0 +1,15 @@ +galaxy_info: + role_name: stf-collect-logs + namespace: infrawatch + + author: InfraWatch + description: Log collection role for Service Telemetry Framework + company: Red Hat + + license: Apache-2.0 + + min_ansible_version: 2.1 + + galaxy_tags: [] + +dependencies: [] diff --git 
a/build/stf-collect-logs/tasks/main.yml b/build/stf-collect-logs/tasks/main.yml new file mode 100644 index 000000000..dde52761f --- /dev/null +++ b/build/stf-collect-logs/tasks/main.yml @@ -0,0 +1,61 @@ +--- +- name: "Get builds" + ansible.builtin.shell: + cmd: | + echo "*** [INFO] Showing oc get builds" > {{ logfile_dir }}/post_oc_get_builds.log 2>&1 + oc -n {{ namespace }} get builds -oyaml >> {{ logfile_dir }}/post_oc_get_builds.log 2>&1 + echo "*** [INFO] Showing oc get builds -oyaml" >> {{ logfile_dir }}/post_oc_get_builds.log 2>&1 + oc -n {{ namespace }} get builds -oyaml >> {{ logfile_dir }}/post_oc_get_builds.log 2>&1 + cat {{ logfile_dir }}/post_oc_get_builds.log + ignore_errors: true + changed_when: false + +- name: "Get subscription details" + ansible.builtin.shell: + cmd: | + oc -n {{ namespace }} get subscriptions > {{ logfile_dir }}/post_oc_get_subscriptions.log 2>&1 + oc -n {{ namespace }} describe subscription service-telemetry-operator >> {{ logfile_dir }}/post_oc_get_subscriptions.log 2>&1 + ignore_errors: true + +- name: "Get image infos" + ansible.builtin.shell: + cmd: | + echo "[INFO] oc get images" > {{ logfile_dir }}/post_oc_get_images.log 2>&1 + oc -n {{ namespace }} get images >> {{ logfile_dir }}/post_oc_get_images.log 2>&1 + echo "[INFO] oc get imagestreams" >> {{ logfile_dir }}/post_oc_get_images.log 2>&1 + oc -n {{ namespace }} get imagestream >> {{ logfile_dir }}/post_oc_get_images.log 2>&1 + echo "[INFO] oc get imagestream -oyaml" >> {{ logfile_dir }}/post_oc_get_images.log 2>&1 + oc -n {{ namespace }} get imagestream -oyaml >> {{ logfile_dir }}/post_oc_get_images.log 2>&1 + retries: 3 + delay: 10 + ignore_errors: true + +- name: "Get STO info" + ansible.builtin.shell: + cmd: | + oc -n {{ namespace }} describe pod $(oc -n {{ namespace }} get pod -l name=service-telemetry-operator -ojsonpath='{ .items[].metadata.name }') >> {{ logfile_dir }}/describe_sto.log 2>&1 + ignore_errors: true + retries: 3 + delay: 10 + +- name: "Question the 
deployment" + ansible.builtin.shell: + cmd: | + echo "What images were created in the internal registry?" > {{ logfile_dir }}/post_question_deployment.log 2>&1 + oc -n {{ namespace }} get images | grep $(oc -n {{ namespace }} registry info --internal) >> {{ logfile_dir }}/post_question_deployment.log 2>&1 + echo "What state is the STO csv in?" >> {{ logfile_dir }}/post_question_deployment.log 2>&1 + oc -n {{ namespace }} get csv | grep service-telemetry-operator >> {{ logfile_dir }}/post_question_deployment.log 2>&1 + oc -n {{ namespace }} get csv $(oc -n {{ namespace }} get csv | grep "service-telemetry-operator" | awk '{ print $1}') -oyaml >> {{ logfile_dir }}/post_question_deployment.log 2>&1 + register: output + retries: 3 + delay: 10 + +- name: "Get pods" + ansible.builtin.command: + cmd: | + oc -n {{ namespace }} get pods > {{ logfile_dir }}/post_oc_get_pods.log 2>&1 + echo "Additional information" >> {{ logfile_dir }}/post_oc_get_pods.log + oc -n {{ namespace }} describe pods >> {{ logfile_dir }}/post_oc_get_pods.log 2>&1 + ignore_errors: true + retries: 3 + delay: 10 diff --git a/build/stf-collect-logs/vars/main.yml b/build/stf-collect-logs/vars/main.yml new file mode 100644 index 000000000..5197b0284 --- /dev/null +++ b/build/stf-collect-logs/vars/main.yml @@ -0,0 +1,2 @@ +--- +# vars file for stf-collect-logs diff --git a/ci/post-collect_logs.yml b/ci/post-collect_logs.yml index 50addf867..78526cd1b 100644 --- a/ci/post-collect_logs.yml +++ b/ci/post-collect_logs.yml @@ -38,62 +38,9 @@ name: rhol_crc tasks_from: add_crc_creds.yml - - name: "Get builds" - ansible.builtin.shell: - cmd: | - echo "*** [INFO] Showing oc get builds" > {{ logfile_dir }}/post_oc_get_builds.log 2>&1 - oc get builds -oyaml >> {{ logfile_dir }}/post_oc_get_builds.log 2>&1 - echo "*** [INFO] Showing oc get builds -oyaml" >> {{ logfile_dir }}/post_oc_get_builds.log 2>&1 - oc get builds -oyaml >> {{ logfile_dir }}/post_oc_get_builds.log 2>&1 - cat {{ logfile_dir 
}}/post_oc_get_builds.log - ignore_errors: true - changed_when: false - - - name: "Get subscription details" - ansible.builtin.shell: - cmd: | - oc get subscriptions > {{ logfile_dir }}/post_oc_get_subscriptions.log 2>&1 - oc describe subscription service-telemetry-operator >> {{ logfile_dir }}/post_oc_get_subscriptions.log 2>&1 - cat {{ logfile_dir}}/post_oc_get_subscriptions.log - ignore_errors: true - - - name: "Get image infos" - ansible.builtin.shell: - cmd: | - echo "[INFO] oc get images" > {{ logfile_dir }}/post_oc_get_images.log 2>&1 - oc get images >> {{ logfile_dir }}/post_oc_get_images.log 2>&1 - echo "[INFO] oc get imagestreams" >> {{ logfile_dir }}/post_oc_get_images.log 2>&1 - oc get imagestream >> {{ logfile_dir }}/post_oc_get_images.log 2>&1 - #echo "[INFO] oc get images -oyaml" >> {{ logfile_dir }}/post_oc_get_images.log 2>&1 - #oc get images -oyaml >> {{ logfile_dir }}/post_oc_get_images.log 2>&1 - echo "[INFO] oc get imagestream -oyaml" >> {{ logfile_dir }}/post_oc_get_images.log 2>&1 - oc get imagestream -oyaml >> {{ logfile_dir }}/post_oc_get_images.log 2>&1 - cat {{ logfile_dir }}/post_oc_get_images.log 2>&1 - register: post_oc_get_images - ignore_errors: true - retries: 3 - delay: 10 - - - name: "Get STO info" - ansible.builtin.shell: - cmd: | - oc describe pod $(oc get pod -l name=service-telemetry-operator -ojsonpath='{ .items[].metadata.name }') >> {{ logfile_dir }}/describe_sto.log 2>&1 - ignore_errors: true - retries: 3 - delay: 10 - - - name: "Question the deployment" - ansible.builtin.shell: - cmd: | - echo "What images were created in the internal registry?" > {{ logfile_dir }}/post_question_deployment.log 2>&1 - oc get images | grep $(oc registry info --internal) >> {{ logfile_dir }}/post_question_deployment.log 2>&1 - echo "What state is the STO csv in?" 
>> {{ logfile_dir }}/post_question_deployment.log 2>&1 - oc get csv -n {{ namespace }} | grep service-telemetry-operator >> {{ logfile_dir }}/post_question_deployment.log 2>&1 - oc get csv -n {{ namespace }} service-telemetry-operator -oyaml >> {{ logfile_dir }}/post_question_deployment.log 2>&1 - register: output - ignore_errors: true - retries: 3 - delay: 10 + - name: "Gather logs from stf deployment" + ansible.builtin.import_role: + name: '../build/stf-collect-logs' - name: "Copy generated logs" ansible.builtin.shell: | From 542ac47c6a2c1260ea4e1a07fc901f13539515b5 Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Tue, 17 Oct 2023 15:44:35 +0100 Subject: [PATCH 85/95] [zuul] Add job to deploy from nightly bundles (#494) * [zuul] Add job to deploy from nightly bundles This job doesn't build STF, but deploys from the pre-built and published bundles. This is useful to be able to do periodically to make sure our latest bundles are deploying, and no dependencies are out-of-date, for example --- .zuul.yaml | 9 +++++++++ ci/post-collect_logs.yml | 19 +++++++++++++++++++ ci/vars-nightly_bundles.yml | 7 +++++++ 3 files changed, 35 insertions(+) create mode 100644 ci/vars-nightly_bundles.yml diff --git a/.zuul.yaml b/.zuul.yaml index 58de2c46c..91d848359 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -26,6 +26,14 @@ # Pass vars to crc cli https://review.rdoproject.org/cgit/config/tree/playbooks/crc/simple-start.yaml#n30 crc_parameters: '--memory 16000 --disk-size 80 --cpus 6' # Increase from 14336 +- job: + name: stf-crc-latest-nightly_bundles + parent: stf-base + description: + Deploy STF nightly bundles + vars: + scenario: "nightly_bundles" + - job: name: stf-crc-latest-local_build parent: stf-base @@ -38,4 +46,5 @@ name: infrawatch/service-telemetry-operator github-check: jobs: + - stf-crc-latest-nightly_bundles - stf-crc-latest-local_build diff --git a/ci/post-collect_logs.yml b/ci/post-collect_logs.yml index 78526cd1b..58552b618 100644 --- a/ci/post-collect_logs.yml +++ 
b/ci/post-collect_logs.yml @@ -42,6 +42,25 @@ ansible.builtin.import_role: name: '../build/stf-collect-logs' + - name: "Get pods and describe non-completed, non-running pods" + ansible.builtin.shell: + cmd: | + echo "*** oc get pods ***" > {{ logfile_dir }}/oc_get_pods.log 2>&1 + oc -n {{ namespace }} get pods >> {{ logfile_dir }}/oc_get_pods.log 2>&1 + + for pod in $(oc get pods | grep -v NAME | grep -v Running | awk '{ print $1 }'); + do + oc -n {{ namespace }} describe pod $pod > {{ logfile_dir }}/post_oc_describe_pod_${pod}.log 2>&1 + done + ignore_errors: true + retries: 3 + delay: 10 + + - name: "Get build details" + ansible.builtin.shell: + cmd: | + for build in $(oc -n {{ namespace }} get builds -o json| jq -r '.items[].metadata.name'); do oc -n {{ namespace }} describe build $build > {{ logfile_dir }}/post_oc_describe_build_${build}.log 2>&1; done + - name: "Copy generated logs" ansible.builtin.shell: | cp {{ ansible_env.HOME }}/*.log . diff --git a/ci/vars-nightly_bundles.yml b/ci/vars-nightly_bundles.yml new file mode 100644 index 000000000..ca49656f3 --- /dev/null +++ b/ci/vars-nightly_bundles.yml @@ -0,0 +1,7 @@ +--- +# from: https://github.com/infrawatch/service-telemetry-operator/pull/437 +# ansible-playbook -e __service_telemetry_storage_ephemeral_enabled=true -e __local_build_enabled=false -e __deploy_from_bundles_enabled=true -e __service_telemetry_bundle_image_path=quay.io/infrawatch-operators/service-telemetry-operator-bundle:nightly-head -e __smart_gateway_bundle_image_path=quay.io/infrawatch-operators/smart-gateway-operator-bundle:nightly-head --skip-tags bundle_registry_tls_ca --skip-tags bundle_registry_auth build/run-ci.yaml + +__local_build_enabled: false +__deploy_from_bundles_enabled: true +__service_telemetry_storage_ephemeral_enabled: true From 7c61bbdfdf88bb4bd2cb4d46926f6015f97277f3 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Tue, 17 Oct 2023 14:01:28 -0400 Subject: [PATCH 86/95] Next release of STF will start at OCP 4.11 (#503) 
Support STF 1.5 from OCP 4.11 through 4.14 for the next release as OCP 4.10 is now EOL. --- deploy/olm-catalog/service-telemetry-operator/Dockerfile.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in b/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in index c3e7aa29e..cbe2ccbf3 100644 --- a/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in +++ b/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in @@ -13,7 +13,7 @@ LABEL operators.operatorframework.io.metrics.mediatype.v1=metrics+v1 LABEL operators.operatorframework.io.metrics.builder=operator-sdk-v0.19.4 LABEL operators.operatorframework.io.metrics.project_layout=ansible LABEL com.redhat.delivery.operator.bundle=true -LABEL com.redhat.openshift.versions="v4.10-v4.14" +LABEL com.redhat.openshift.versions="v4.11-v4.14" LABEL com.redhat.delivery.backport=false LABEL com.redhat.component="service-telemetry-operator-bundle-container" \ From c9df56139ce5ff1cf844e8e0e4cb039d3fc92d7b Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Wed, 18 Oct 2023 14:23:55 -0400 Subject: [PATCH 87/95] Update base image to pass security scans (#502) * Update base image to pass security scans Update the base image with a dnf update (need to excluse ansible because ansible updates aren't compatible with the current build). This keeps packages up to date to allow the resulting image to pass registry security scans at the expence of image size. 
* Clean up intermediate layer Co-authored-by: Chris Sibbitt * Add comments to help understand Dockerfile readout * Spellcheck fix --------- Co-authored-by: Chris Sibbitt --- build/Dockerfile | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/build/Dockerfile b/build/Dockerfile index 7242e664f..da2b7508f 100644 --- a/build/Dockerfile +++ b/build/Dockerfile @@ -1,14 +1,21 @@ FROM quay.io/openshift/origin-ansible-operator:4.12 +# temporarily switch to root user to adjust image layers USER 0 # Upstream CI builds need the additional EPEL sources for python3-passlib and python3-bcrypt but have no working repos to install epel-release # NO_PROXY is undefined in upstream CI builds, but defined (usually blank) during openshift builds (a possibly brittle hack) RUN bash -c -- 'if [ "${NO_PROXY:-__ZZZZZ}" == "__ZZZZZ" ]; then echo "Applying upstream EPEL hacks" && echo -e "-----BEGIN PGP PUBLIC KEY BLOCK-----\n\nmQINBFz3zvsBEADJOIIWllGudxnpvJnkxQz2CtoWI7godVnoclrdl83kVjqSQp+2\ndgxuG5mUiADUfYHaRQzxKw8efuQnwxzU9kZ70ngCxtmbQWGmUmfSThiapOz00018\n+eo5MFabd2vdiGo1y+51m2sRDpN8qdCaqXko65cyMuLXrojJHIuvRA/x7iqOrRfy\na8x3OxC4PEgl5pgDnP8pVK0lLYncDEQCN76D9ubhZQWhISF/zJI+e806V71hzfyL\n/Mt3mQm/li+lRKU25Usk9dWaf4NH/wZHMIPAkVJ4uD4H/uS49wqWnyiTYGT7hUbi\necF7crhLCmlRzvJR8mkRP6/4T/F3tNDPWZeDNEDVFUkTFHNU6/h2+O398MNY/fOh\nyKaNK3nnE0g6QJ1dOH31lXHARlpFOtWt3VmZU0JnWLeYdvap4Eff9qTWZJhI7Cq0\nWm8DgLUpXgNlkmquvE7P2W5EAr2E5AqKQoDbfw/GiWdRvHWKeNGMRLnGI3QuoX3U\npAlXD7v13VdZxNydvpeypbf/AfRyrHRKhkUj3cU1pYkM3DNZE77C5JUe6/0nxbt4\nETUZBTgLgYJGP8c7PbkVnO6I/KgL1jw+7MW6Az8Ox+RXZLyGMVmbW/TMc8haJfKL\nMoUo3TVk8nPiUhoOC0/kI7j9ilFrBxBU5dUtF4ITAWc8xnG6jJs/IsvRpQARAQAB\ntChGZWRvcmEgRVBFTCAoOCkgPGVwZWxAZmVkb3JhcHJvamVjdC5vcmc+iQI4BBMB\nAgAiBQJc9877AhsPBgsJCAcDAgYVCAIJCgsEFgIDAQIeAQIXgAAKCRAh6kWrL4bW\noWagD/4xnLWws34GByVDQkjprk0fX7Iyhpm/U7BsIHKspHLL+Y46vAAGY/9vMvdE\n0fcr9Ek2Zp7zE1RWmSCzzzUgTG6BFoTG1H4Fho/7Z8BXK/jybowXSZfqXnTOfhSF\nalwDdwlSJvfYNV9MbyvbxN8qZRU1z7PEWZrIzFDDToFRk0R71zHpnPTNIJ5/YXTw\nNqU
9OxII8hMQj4ufF11040AJQZ7br3rzerlyBOB+Jd1zSPVrAPpeMyJppWFHSDAI\nWK6x+am13VIInXtqB/Cz4GBHLFK5d2/IYspVw47Solj8jiFEtnAq6+1Aq5WH3iB4\nbE2e6z00DSF93frwOyWN7WmPIoc2QsNRJhgfJC+isGQAwwq8xAbHEBeuyMG8GZjz\nxohg0H4bOSEujVLTjH1xbAG4DnhWO/1VXLX+LXELycO8ZQTcjj/4AQKuo4wvMPrv\n9A169oETG+VwQlNd74VBPGCvhnzwGXNbTK/KH1+WRH0YSb+41flB3NKhMSU6dGI0\nSGtIxDSHhVVNmx2/6XiT9U/znrZsG5Kw8nIbbFz+9MGUUWgJMsd1Zl9R8gz7V9fp\nn7L7y5LhJ8HOCMsY/Z7/7HUs+t/A1MI4g7Q5g5UuSZdgi0zxukiWuCkLeAiAP4y7\nzKK4OjJ644NDcWCHa36znwVmkz3ixL8Q0auR15Oqq2BjR/fyog==\n=84m8\n-----END PGP PUBLIC KEY BLOCK-----" > /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-8 && echo -e "[epel]\nname=Extra Packages for Enterprise Linux 8 - \$basearch\nmetalink=https://mirrors.fedoraproject.org/metalink?repo=epel-8&arch=\$basearch&infra=\$infra&content=\$contentdir\nenabled=1\ngpgcheck=1\ngpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-8" > /etc/yum.repos.d/epel.repo; fi' -# Required for oauth-proxy -RUN dnf install -y python3-passlib python3-bcrypt +# update the base image to allow forward-looking optimistic updates during the testing phase, with the added benefit of helping move closer to passing security scans. 
+# -- excludes ansible so it remains at 2.9 tag as shipped with the base image +# -- installs python3-passlib and python3-bcrypt for oauth-proxy interface +# -- cleans up the cached data from dnf to keep the image as small as possible +RUN dnf update -y --exclude=ansible* && dnf install -y python3-passlib python3-bcrypt && dnf clean all && rm -rf /var/cache/dnf + +# switch back to user 1001 when running the base image (non-root) USER 1001 +# copy in required artifacts for the operator COPY watches.yaml ${HOME}/watches.yaml COPY roles/ ${HOME}/roles/ COPY collections/ ${HOME}/.ansible/collections/ From 7eb286264cec20f5d078af843ed67746d6c4d51c Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Fri, 20 Oct 2023 10:18:58 -0400 Subject: [PATCH 88/95] Cluster Observability Operator dependency (#504) Add cluster observability operator as the preferred dependency (bottom of list is highest priority) when installing Service Telemetry Operator. The cluster-observability-operator is the name of the downstream (product) bundle in the Red Hat Operators CatalogSource. If installing for upstream, preferred operator will be observability-operator (when the Red Hat Operators CatalogSource is not available or enabled). And then as a fall-back method when neither Observability Operator or Cluster Observability Operator is not available, allow for Prometheus Operator from the Community Operators to satisfy for the Prometheus storage backend. 
--- .../service-telemetry-operator/metadata/properties.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml b/deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml index 63f6bd1cd..2a0d93436 100644 --- a/deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml @@ -39,3 +39,6 @@ properties: - package: packageName: observability-operator versionRange: '>=0.0.1' + - package: + packageName: cluster-observability-operator + versionRange: '>=0.0.1' From cab5a6045f63454629b4659d64a75c0ebcdc449c Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Tue, 24 Oct 2023 11:22:38 +0100 Subject: [PATCH 89/95] [better_logging] Show generate_bundle log when task fails (#501) --- .../tasks/setup_stf_local_build.yml | 38 ++++++++++--------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/build/stf-run-ci/tasks/setup_stf_local_build.yml b/build/stf-run-ci/tasks/setup_stf_local_build.yml index 999c2902a..40774223f 100644 --- a/build/stf-run-ci/tasks/setup_stf_local_build.yml +++ b/build/stf-run-ci/tasks/setup_stf_local_build.yml @@ -19,7 +19,7 @@ OPERATOR_TAG={{ sgo_image_path | parse_tag | quote }} \ ./generate_bundle.sh register: generate_bundle_sgo - always: + rescue: # "|| true" is needed until https://github.com/infrawatch/smart-gateway-operator/pull/143 is merged - name: Show generate bundle log ansible.builtin.shell: @@ -54,23 +54,25 @@ cmd: oc apply -f {{ base_dir }}/working/smart-gateway-operator-bundle/manifests/smart-gateway-operator.clusterserviceversion.yaml -n "{{ namespace }}" # --- Service Telemetry Operator --- -- name: Generate Service Telemetry Operator CSV - ansible.builtin.shell: - chdir: "{{ base_dir }}" - cmd: | - LOGFILE="{{ logfile_dir }}/sto_gen_bundle.log" \ - OPERATOR_SDK="{{ base_dir }}/working/operator-sdk-{{ operator_sdk_v0 }}" \ - WORKING_DIR="{{ base_dir 
}}/working/service-telemetry-operator-bundle" \ - RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP={{ prometheus_webhook_snmp_image_path | parse_image | quote }} \ - RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP_TAG={{ prometheus_webhook_snmp_image_path | parse_tag | quote }} \ - OPERATOR_IMAGE={{ sto_image_path | parse_image | quote }} \ - OPERATOR_TAG={{ sto_image_path | parse_tag | quote }} \ - ./generate_bundle.sh - register: generate_bundle_sto - -- name: Results of STO bundle generation - ansible.builtin.debug: - var: generate_bundle_sto.stdout +- block: + - name: Generate Service Telemetry Operator CSV + ansible.builtin.shell: + chdir: "{{ base_dir }}" + cmd: | + LOGFILE="{{ logfile_dir }}/sto_gen_bundle.log" \ + OPERATOR_SDK="{{ base_dir }}/working/operator-sdk-{{ operator_sdk_v0 }}" \ + WORKING_DIR="{{ base_dir }}/working/service-telemetry-operator-bundle" \ + RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP={{ prometheus_webhook_snmp_image_path | parse_image | quote }} \ + RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP_TAG={{ prometheus_webhook_snmp_image_path | parse_tag | quote }} \ + OPERATOR_IMAGE={{ sto_image_path | parse_image | quote }} \ + OPERATOR_TAG={{ sto_image_path | parse_tag | quote }} \ + ./generate_bundle.sh + register: generate_bundle_sto + rescue: + - name: Results of STO bundle generation + ansible.builtin.shell: + cmd: | + cat {{ logfile_dir }}/sto_gen_bundle.log || true - name: Replace namespace in STO role binding ansible.builtin.replace: From 23eb686f1f3ca53b5182c152fbc0b7ca5c859759 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Tue, 24 Oct 2023 12:37:14 -0400 Subject: [PATCH 90/95] Don't deploy events SGs by default (#506) When deploying from the UI, don't populate the events SGs by default as they are no longer used in a default configuration in RHOSP. 
--- ...fra.watch_v1beta1_servicetelemetry_cr.yaml | 16 ------------- ...emetry-operator.clusterserviceversion.yaml | 24 ------------------- 2 files changed, 40 deletions(-) diff --git a/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml b/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml index e311546ce..c86d7e59f 100644 --- a/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml +++ b/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml @@ -76,22 +76,6 @@ spec: ringBufferSize: 65535 ringBufferCount: 15000 verbose: false - events: - collectors: - - collectorType: collectd - subscriptionAddress: collectd/cloud1-notify - debugEnabled: false - bridge: - ringBufferSize: 16384 - ringBufferCount: 15000 - verbose: false - - collectorType: ceilometer - subscriptionAddress: anycast/ceilometer/cloud1-event.sample - debugEnabled: false - bridge: - ringBufferSize: 16384 - ringBufferCount: 15000 - verbose: false graphing: enabled: false grafana: diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml index 29f5ac370..dffd1cf3e 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml @@ -77,30 +77,6 @@ metadata: }, "clouds": [ { - "events": { - "collectors": [ - { - "bridge": { - "ringBufferCount": 15000, - "ringBufferSize": 16384, - "verbose": false - }, - "collectorType": "collectd", - "debugEnabled": false, - "subscriptionAddress": "collectd/cloud1-notify" - }, - { - "bridge": { - "ringBufferCount": 15000, - "ringBufferSize": 16384, - "verbose": false - }, - "collectorType": "ceilometer", - "debugEnabled": false, - "subscriptionAddress": "anycast/ceilometer/cloud1-event.sample" - } - ] - }, "metrics": 
{ "collectors": [ { From bc245eb7dd0994b281e8a45ac25b143b004a1871 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Tue, 24 Oct 2023 12:56:25 -0400 Subject: [PATCH 91/95] Adjust CR annotation to matchdefault scrape interval --- deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml b/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml index c86d7e59f..8b4cf7142 100644 --- a/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml +++ b/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml @@ -27,7 +27,7 @@ spec: metrics: prometheus: enabled: true - scrapeInterval: 10s + scrapeInterval: 30s storage: strategy: persistent retention: 24h From 5786978b93ba9c86b4b8220942859e963d308302 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Wed, 25 Oct 2023 10:42:45 -0400 Subject: [PATCH 92/95] Fix bundle check test (#507) I forgot to run a generate bundle yesterday in some furious patch work. 
--- .../service-telemetry-operator.clusterserviceversion.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml index dffd1cf3e..1e1fdc092 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml @@ -64,7 +64,7 @@ metadata: "metrics": { "prometheus": { "enabled": true, - "scrapeInterval": "10s", + "scrapeInterval": "30s", "storage": { "persistent": { "pvcStorageRequest": "20G" From 03a158928f81e7dcea441bcfe6a135858fb51f9c Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Wed, 25 Oct 2023 14:17:25 -0400 Subject: [PATCH 93/95] Update path to internal TLS CA (#508) --- tests/infrared/16.2/infrared-openstack.sh | 2 +- tests/infrared/17.1/infrared-openstack.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/infrared/16.2/infrared-openstack.sh b/tests/infrared/16.2/infrared-openstack.sh index cb4641463..c90d433f3 100755 --- a/tests/infrared/16.2/infrared-openstack.sh +++ b/tests/infrared/16.2/infrared-openstack.sh @@ -87,7 +87,7 @@ ir_create_undercloud() { --build "${OSP_BUILD}" \ --images-task rpm \ --images-update no \ - --tls-ca https://password.corp.redhat.com/RH-IT-Root-CA.crt \ + --tls-ca https://certs.corp.redhat.com/certs/2022-IT-Root-CA.pem \ --overcloud-domain "${OVERCLOUD_DOMAIN}" \ --config-options DEFAULT.undercloud_timezone=UTC } diff --git a/tests/infrared/17.1/infrared-openstack.sh b/tests/infrared/17.1/infrared-openstack.sh index cf478fddd..9743a0081 100755 --- a/tests/infrared/17.1/infrared-openstack.sh +++ b/tests/infrared/17.1/infrared-openstack.sh @@ -91,7 +91,7 @@ ir_create_undercloud() { --build "${OSP_BUILD}" 
\ --images-task rpm \ --images-update no \ - --tls-ca https://password.corp.redhat.com/RH-IT-Root-CA.crt \ + --tls-ca https://certs.corp.redhat.com/certs/2022-IT-Root-CA.pem \ --overcloud-domain "${OVERCLOUD_DOMAIN}" \ --config-options DEFAULT.undercloud_timezone=UTC } From 8693f4632e8976d33f8db24dcef32f07b4b5bb68 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Thu, 26 Oct 2023 10:03:53 -0400 Subject: [PATCH 94/95] Use stable-1.5 channel in the stable-1.5 branch --- .github/workflows/main.yml | 2 +- build/stf-run-ci/tasks/main.yml | 2 +- build/stf-run-ci/tasks/setup_stf.yml | 6 +++--- build/update_csv.sh | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 2b04f59a5..f7269fea9 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -58,7 +58,7 @@ jobs: run: sudo mv operator-sdk /usr/local/bin - name: Generate bundle locally - run: operator-sdk generate bundle --manifests --metadata --default-channel unstable --channels unstable + run: operator-sdk generate bundle --manifests --metadata --default-channel stable-1.5 --channels stable-1.5 - name: Check if bundle generation results in local changes run: git diff --exit-code diff --git a/build/stf-run-ci/tasks/main.yml b/build/stf-run-ci/tasks/main.yml index bd0821959..cf2b0a880 100644 --- a/build/stf-run-ci/tasks/main.yml +++ b/build/stf-run-ci/tasks/main.yml @@ -187,7 +187,7 @@ name: service-telemetry-operator namespace: "{{ namespace }}" spec: - channel: unstable + channel: stable-1.5 installPlanApproval: Automatic name: service-telemetry-operator source: service-telemetry-framework-operators diff --git a/build/stf-run-ci/tasks/setup_stf.yml b/build/stf-run-ci/tasks/setup_stf.yml index e76eb1734..ce4713931 100644 --- a/build/stf-run-ci/tasks/setup_stf.yml +++ b/build/stf-run-ci/tasks/setup_stf.yml @@ -9,7 +9,7 @@ namespace: openshift-marketplace spec: displayName: InfraWatch Operators - image: 
quay.io/infrawatch-operators/infrawatch-catalog:unstable + image: quay.io/infrawatch-operators/infrawatch-catalog:stable-1.5 publisher: InfraWatch sourceType: grpc updateStrategy: @@ -26,7 +26,7 @@ name: smart-gateway-operator namespace: "{{ namespace }}" spec: - channel: unstable + channel: stable-1.5 installPlanApproval: Automatic name: smart-gateway-operator source: infrawatch-operators @@ -42,7 +42,7 @@ name: service-telemetry-operator namespace: "{{ namespace }}" spec: - channel: unstable + channel: stable-1.5 installPlanApproval: Automatic name: service-telemetry-operator source: infrawatch-operators diff --git a/build/update_csv.sh b/build/update_csv.sh index 4e15f1fa3..172653dd4 100755 --- a/build/update_csv.sh +++ b/build/update_csv.sh @@ -3,4 +3,4 @@ # Run this script from the root directory to update the CSV whenever changes # are made to /deploy/crds/. Changes are written to # /deploy/olm-manifests/service-telemetry-operator/. -operator-sdk generate bundle --channels unstable --default-channel unstable +operator-sdk generate bundle --channels stable-1.5 --default-channel stable-1.5 From e50dc5fdd023c3fe37d71196d7abdaadf923ea90 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Fri, 27 Oct 2023 09:46:34 -0400 Subject: [PATCH 95/95] Drop .zuul.yaml for the stable-1.5 branch Drop .zuul.yaml for stable-1.5 since it's not setup for non-main testing at this point. In the future we may develop a separate set of tests for the stable-1.5 branch during merge, but not for this initial import. We'll rely on Jenkins testing for our functional validations. 
--- .zuul.yaml | 50 -------------------------------------------------- 1 file changed, 50 deletions(-) delete mode 100644 .zuul.yaml diff --git a/.zuul.yaml b/.zuul.yaml deleted file mode 100644 index 91d848359..000000000 --- a/.zuul.yaml +++ /dev/null @@ -1,50 +0,0 @@ ---- -- job: - name: stf-base - # defined in: https://review.rdoproject.org/cgit/config/tree/zuul.d/_jobs-crc.yaml - parent: base-simple-crc - abstract: true - description: | - Run the stf-run-ci role, and then test stf - roles: # adds in dependent roles i.e. put it in the role path - - zuul: github.com/openstack-k8s-operators/ci-framework - # These are the additional repos that zuul will clone - required-projects: - - name: openstack-k8s-operators/ci-framework - override-checkout: main - pre-run: - - ci/prepare.yml - run: - - ci/deploy_stf.yml - - ci/test_stf.yml - post-run: - - ci/post-collect_logs.yml - nodeset: centos-9-crc-xxl - # The default (~30 minutes) is not enough to run through all the job stages - timeout: 3600 - vars: - # Pass vars to crc cli https://review.rdoproject.org/cgit/config/tree/playbooks/crc/simple-start.yaml#n30 - crc_parameters: '--memory 16000 --disk-size 80 --cpus 6' # Increase from 14336 - -- job: - name: stf-crc-latest-nightly_bundles - parent: stf-base - description: - Deploy STF nightly bundles - vars: - scenario: "nightly_bundles" - -- job: - name: stf-crc-latest-local_build - parent: stf-base - description: | - Build images locally and deploy STF - vars: - scenario: "local_build" - -- project: - name: infrawatch/service-telemetry-operator - github-check: - jobs: - - stf-crc-latest-nightly_bundles - - stf-crc-latest-local_build