diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 5cbce3e9e..f7269fea9 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -2,8 +2,8 @@ name: CI on: push jobs: - linting: - name: Linting + operator-linting: + name: Operator Linting runs-on: ubuntu-20.04 steps: - name: Checkout code @@ -16,11 +16,53 @@ jobs: run: ansible-galaxy collection install operator_sdk.util - name: Install ansible-lint - run: pip install 'ansible-lint < 6.0.0' + run: python -m pip install 'ansible-lint < 6.0.0' - name: Lint Ansible roles/servicetelemetry/ directory run: ${HOME}/.local/bin/ansible-lint roles/servicetelemetry + stf-run-ci-linting: + name: stf-run-ci Linting + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Install Ansible + run: python -m pip install 'ansible' + + - name: Install ansible-lint + run: python -m pip install 'ansible-lint' + + - name: Lint Ansible build/stf-run-ci directory + run: ${HOME}/.local/bin/ansible-lint . + working-directory: ./build/stf-run-ci + + generate-bundle-diff-check: + name: Check if generating the bundle would result in local changes + runs-on: ubuntu-latest + env: + RELEASE_VERSION: v0.19.4 + + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Get operator-sdk binary v0.19.4 + run: curl --output operator-sdk -JL https://github.com/operator-framework/operator-sdk/releases/download/$RELEASE_VERSION/operator-sdk-$RELEASE_VERSION-x86_64-linux-gnu + + - name: Make operator-sdk executable + run: chmod +x operator-sdk + + - name: Move operator-sdk binary + run: sudo mv operator-sdk /usr/local/bin + + - name: Generate bundle locally + run: operator-sdk generate bundle --manifests --metadata --default-channel stable-1.5 --channels stable-1.5 + + - name: Check if bundle generation results in local changes + run: git diff --exit-code + build-operator-check: name: Build Operator check runs-on: ubuntu-20.04 diff --git a/.gitleaks.toml b/.gitleaks.toml new file mode 100644 index 000000000..1c2a4ed0f --- /dev/null +++ b/.gitleaks.toml @@ -0,0 +1,8 @@ +[allowlist] + description = "Global Allowlist" + + # Ignore based on any subset of the file path + paths = [ + # Ignore all example certs + '''roles\/servicetelemetry\/vars\/dummy_user_certs\.yml''' + ] diff --git a/.jenkins/Dockerfile b/.jenkins/Dockerfile index 7d946961d..9930d6c57 100644 --- a/.jenkins/Dockerfile +++ b/.jenkins/Dockerfile @@ -10,6 +10,7 @@ RUN jenkins-plugin-cli -p ant:latest \ email-ext:latest \ git:latest \ github-branch-source:latest \ + github-scm-trait-notification-context:latest \ gradle:latest \ ldap:latest \ mailer:latest \ diff --git a/.jenkins/README.md b/.jenkins/README.md index 64d768251..f39f1b771 100644 --- a/.jenkins/README.md +++ b/.jenkins/README.md @@ -39,6 +39,8 @@ oc apply -f deploy/service-route.yaml export SMEE_CHANNEL= #(just the slug, not the whole URL) export GH_ORG= export JENKINS_URL=$(oc get route jenkins -ojsonpath='{.spec.host}') +# This is for labelling the status that is returned to GitHub +export OCP_VERSION= # e.g. 
4.14 for f in deploy/*; do envsubst < "${f}" | oc apply -f - diff --git a/.jenkins/agent/Dockerfile b/.jenkins/agent/Dockerfile index c41fb9c69..ab7f97cb3 100644 --- a/.jenkins/agent/Dockerfile +++ b/.jenkins/agent/Dockerfile @@ -1,7 +1,7 @@ FROM quay.io/openshift/origin-jenkins-agent-base:latest # pass --build-arg OC_CLIENT_VERSION= to build stage to change client version -ARG OC_CLIENT_VERSION="4.12" +ARG OC_CLIENT_VERSION="4.13" RUN curl -LO "https://github.com/operator-framework/operator-sdk/releases/download/v0.19.4/operator-sdk-v0.19.4-x86_64-linux-gnu" && \ chmod +x operator-sdk-v0.19.4-x86_64-linux-gnu && mv operator-sdk-v0.19.4-x86_64-linux-gnu /usr/local/bin/operator-sdk diff --git a/.jenkins/deploy/casc-configmap.yaml b/.jenkins/deploy/casc-configmap.yaml index cb34dddb9..ab372d230 100644 --- a/.jenkins/deploy/casc-configmap.yaml +++ b/.jenkins/deploy/casc-configmap.yaml @@ -94,6 +94,11 @@ data: // 1 : Forks in the same account // 2 : Nobody } + // Custom GitHub Notification Context; https://github.com/jenkinsci/github-scm-trait-notification-context-plugin + traits << 'org.jenkinsci.plugins.githubScmTraitNotificationContext.NotificationContextTrait' { + contextLabel("continuous-integration/jenkins/ocp-${OCP_VERSION}") + typeSuffix(true) + } } // "Project Recognizers" diff --git a/Jenkinsfile b/Jenkinsfile index 726a4bab4..f3a13d571 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -15,7 +15,7 @@ metadata: name: default namespace: ${namespace} spec: - observabilityStrategy: use_community + observabilityStrategy: use_redhat alerting: alertmanager: storage: @@ -36,6 +36,7 @@ spec: strategy: ephemeral transports: qdr: + auth: none enabled: true deploymentSize: 1 web: @@ -177,7 +178,7 @@ pipeline { openshift.withProject(namespace) { timeout(time: 800, unit: 'SECONDS') { openshift.create(stf_resource) - sh "OCP_PROJECT=${namespace} ./build/validate_deployment.sh" + sh "OCP_PROJECT=${namespace} VALIDATION_SCOPE=use_redhat ./build/validate_deployment.sh" } } } diff --git a/build/Dockerfile b/build/Dockerfile index af0e4c83d..da2b7508f 100644 --- a/build/Dockerfile +++ b/build/Dockerfile @@ -1,5 +1,21 @@ -FROM quay.io/openshift/origin-ansible-operator:4.10 +FROM quay.io/openshift/origin-ansible-operator:4.12 +# temporarily switch to root user to adjust image layers +USER 0 +# Upstream CI builds need the additional EPEL sources for python3-passlib and python3-bcrypt but have no working repos to install epel-release +# NO_PROXY is undefined in upstream CI builds, but defined (usually blank) during openshift builds (a possibly brittle hack) +RUN bash -c -- 'if [ "${NO_PROXY:-__ZZZZZ}" == "__ZZZZZ" ]; then echo "Applying upstream EPEL hacks" && echo -e "-----BEGIN PGP PUBLIC KEY 
BLOCK-----\n\nmQINBFz3zvsBEADJOIIWllGudxnpvJnkxQz2CtoWI7godVnoclrdl83kVjqSQp+2\ndgxuG5mUiADUfYHaRQzxKw8efuQnwxzU9kZ70ngCxtmbQWGmUmfSThiapOz00018\n+eo5MFabd2vdiGo1y+51m2sRDpN8qdCaqXko65cyMuLXrojJHIuvRA/x7iqOrRfy\na8x3OxC4PEgl5pgDnP8pVK0lLYncDEQCN76D9ubhZQWhISF/zJI+e806V71hzfyL\n/Mt3mQm/li+lRKU25Usk9dWaf4NH/wZHMIPAkVJ4uD4H/uS49wqWnyiTYGT7hUbi\necF7crhLCmlRzvJR8mkRP6/4T/F3tNDPWZeDNEDVFUkTFHNU6/h2+O398MNY/fOh\nyKaNK3nnE0g6QJ1dOH31lXHARlpFOtWt3VmZU0JnWLeYdvap4Eff9qTWZJhI7Cq0\nWm8DgLUpXgNlkmquvE7P2W5EAr2E5AqKQoDbfw/GiWdRvHWKeNGMRLnGI3QuoX3U\npAlXD7v13VdZxNydvpeypbf/AfRyrHRKhkUj3cU1pYkM3DNZE77C5JUe6/0nxbt4\nETUZBTgLgYJGP8c7PbkVnO6I/KgL1jw+7MW6Az8Ox+RXZLyGMVmbW/TMc8haJfKL\nMoUo3TVk8nPiUhoOC0/kI7j9ilFrBxBU5dUtF4ITAWc8xnG6jJs/IsvRpQARAQAB\ntChGZWRvcmEgRVBFTCAoOCkgPGVwZWxAZmVkb3JhcHJvamVjdC5vcmc+iQI4BBMB\nAgAiBQJc9877AhsPBgsJCAcDAgYVCAIJCgsEFgIDAQIeAQIXgAAKCRAh6kWrL4bW\noWagD/4xnLWws34GByVDQkjprk0fX7Iyhpm/U7BsIHKspHLL+Y46vAAGY/9vMvdE\n0fcr9Ek2Zp7zE1RWmSCzzzUgTG6BFoTG1H4Fho/7Z8BXK/jybowXSZfqXnTOfhSF\nalwDdwlSJvfYNV9MbyvbxN8qZRU1z7PEWZrIzFDDToFRk0R71zHpnPTNIJ5/YXTw\nNqU9OxII8hMQj4ufF11040AJQZ7br3rzerlyBOB+Jd1zSPVrAPpeMyJppWFHSDAI\nWK6x+am13VIInXtqB/Cz4GBHLFK5d2/IYspVw47Solj8jiFEtnAq6+1Aq5WH3iB4\nbE2e6z00DSF93frwOyWN7WmPIoc2QsNRJhgfJC+isGQAwwq8xAbHEBeuyMG8GZjz\nxohg0H4bOSEujVLTjH1xbAG4DnhWO/1VXLX+LXELycO8ZQTcjj/4AQKuo4wvMPrv\n9A169oETG+VwQlNd74VBPGCvhnzwGXNbTK/KH1+WRH0YSb+41flB3NKhMSU6dGI0\nSGtIxDSHhVVNmx2/6XiT9U/znrZsG5Kw8nIbbFz+9MGUUWgJMsd1Zl9R8gz7V9fp\nn7L7y5LhJ8HOCMsY/Z7/7HUs+t/A1MI4g7Q5g5UuSZdgi0zxukiWuCkLeAiAP4y7\nzKK4OjJ644NDcWCHa36znwVmkz3ixL8Q0auR15Oqq2BjR/fyog==\n=84m8\n-----END PGP PUBLIC KEY BLOCK-----" > /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-8 && echo -e "[epel]\nname=Extra Packages for Enterprise Linux 8 - \$basearch\nmetalink=https://mirrors.fedoraproject.org/metalink?repo=epel-8&arch=\$basearch&infra=\$infra&content=\$contentdir\nenabled=1\ngpgcheck=1\ngpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-8" > /etc/yum.repos.d/epel.repo; fi' + +# update the base image to allow forward-looking optimistic updates during the testing phase, with the added benefit of helping move closer to passing security scans. +# -- excludes ansible so it remains at 2.9 tag as shipped with the base image +# -- installs python3-passlib and python3-bcrypt for oauth-proxy interface +# -- cleans up the cached data from dnf to keep the image as small as possible +RUN dnf update -y --exclude=ansible* && dnf install -y python3-passlib python3-bcrypt && dnf clean all && rm -rf /var/cache/dnf + +# switch back to user 1001 when running the base image (non-root) +USER 1001 + +# copy in required artifacts for the operator COPY watches.yaml ${HOME}/watches.yaml COPY roles/ ${HOME}/roles/ COPY collections/ ${HOME}/.ansible/collections/ diff --git a/build/generate_bundle.sh b/build/generate_bundle.sh index 8c5b13934..e507ef23e 100755 --- a/build/generate_bundle.sh +++ b/build/generate_bundle.sh @@ -1,41 +1,52 @@ #!/usr/bin/env bash set -e -REL=$(dirname "$0") +set -x + +LOGFILE=${LOGFILE:-/dev/null} + +# If LOGFILE is /dev/null, this command fails, so ignore that error +truncate --size=0 ${LOGFILE} || true + +OPERATOR_SDK=${OPERATOR_SDK:-operator-sdk} + +REL=$( readlink -f $(dirname "$0")) # shellcheck source=build/metadata.sh . 
"${REL}/metadata.sh" generate_version() { - echo "-- Generating operator version" UNIXDATE=$(date '+%s') OPERATOR_BUNDLE_VERSION=${OPERATOR_CSV_MAJOR_VERSION}.${UNIXDATE} - echo "---- Operator Version: ${OPERATOR_BUNDLE_VERSION}" } create_working_dir() { - echo "-- Create working directory" WORKING_DIR=${WORKING_DIR:-"/tmp/${OPERATOR_NAME}-bundle-${OPERATOR_BUNDLE_VERSION}"} mkdir -p "${WORKING_DIR}" - echo "---- Created working directory: ${WORKING_DIR}" } generate_dockerfile() { - echo "-- Generate Dockerfile for bundle" sed -E "s#<>#${OPERATOR_BUNDLE_VERSION}#g;s#<>#${BUNDLE_CHANNELS}#g;s#<>#${BUNDLE_DEFAULT_CHANNEL}#g" "${REL}/../${BUNDLE_PATH}/Dockerfile.in" > "${WORKING_DIR}/Dockerfile" - echo "---- Generated Dockerfile complete" } generate_bundle() { - echo "-- Generate bundle" - REPLACE_REGEX="s#<>#${CREATED_DATE}#g;s#<>#${OPERATOR_IMAGE}#g;s#<>#${OPERATOR_TAG}#g;s#<>#${RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP}#g;s#<>#${RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP_TAG}#g;s#<>#${OPERATOR_BUNDLE_VERSION}#g;s#1.99.0#${OPERATOR_BUNDLE_VERSION}#g;s#<>#${OPERATOR_DOCUMENTATION_URL}#g;s#<>#${BUNDLE_OLM_SKIP_RANGE_LOWER_BOUND}#g" + REPLACE_REGEX="s#<>#${CREATED_DATE}#g;s#<>#${OPERATOR_IMAGE}#g;s#<>#${OPERATOR_TAG}#g;s#<>#${RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP}#g;s#<>#${RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP_TAG}#g;s#<>#${RELATED_IMAGE_OAUTH_PROXY}#g;s#<>#${RELATED_IMAGE_OAUTH_PROXY_TAG}#g;s#<>#${OPERATOR_BUNDLE_VERSION}#g;s#1.99.0#${OPERATOR_BUNDLE_VERSION}#g;s#<>#${OPERATOR_DOCUMENTATION_URL}#g;s#<>#${BUNDLE_OLM_SKIP_RANGE_LOWER_BOUND}#g" - pushd "${REL}/../" - ${OPERATOR_SDK} generate bundle --channels ${BUNDLE_CHANNELS} --default-channel ${BUNDLE_DEFAULT_CHANNEL} --manifests --metadata --version "${OPERATOR_BUNDLE_VERSION}" --output-dir "${WORKING_DIR}" - popd + pushd "${REL}/../" > /dev/null 2>&1 + ${OPERATOR_SDK} generate bundle --verbose --channels ${BUNDLE_CHANNELS} --default-channel ${BUNDLE_DEFAULT_CHANNEL} --manifests --metadata --version "${OPERATOR_BUNDLE_VERSION}" --output-dir "${WORKING_DIR}" >> ${LOGFILE} 2>&1 + popd > /dev/null 2>&1 - echo "---- Replacing variables in generated manifest" sed -i -E "${REPLACE_REGEX}" "${WORKING_DIR}/manifests/${OPERATOR_NAME}.clusterserviceversion.yaml" - echo "---- Generated bundle complete at ${WORKING_DIR}/manifests/${OPERATOR_NAME}.clusterserviceversion.yaml" +} + +copy_extra_metadata() { + # We add this because our version of operator-sdk for building doesn't + # understand these files, but newer versions of operator-sdk (for testing + # purposes) does, and newer versions of opm (as used in both downstream and + # upstream index image builds) also understands these files. Just copy them + # into the bundle directory during building. 
+ pushd "${REL}/../" > /dev/null 2>&1 + cp -r ./deploy/olm-catalog/service-telemetry-operator/tests/ "${WORKING_DIR}" + cp ./deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml "${WORKING_DIR}/metadata/" } copy_extra_metadata() { @@ -58,11 +69,15 @@ build_bundle_instructions() { # generate templates -echo "## Begin bundle creation" +#echo "## Begin bundle creation" generate_version create_working_dir generate_dockerfile generate_bundle copy_extra_metadata -build_bundle_instructions -echo "## End Bundle creation" +#build_bundle_instructions +#echo "## End Bundle creation" + +set +x +JSON_OUTPUT='{"operator_bundle_image":"%s","operator_bundle_version":"%s","operator_image":"%s","bundle_channels":"%s","bundle_default_channel":"%s","operator_tag":"%s","working_dir":"%s"}' +printf "$JSON_OUTPUT" "$OPERATOR_BUNDLE_IMAGE" "$OPERATOR_BUNDLE_VERSION" "$OPERATOR_IMAGE" "$BUNDLE_CHANNELS" "$BUNDLE_DEFAULT_CHANNEL" "$OPERATOR_TAG" "$WORKING_DIR" diff --git a/build/get_new_operator_sdk.sh b/build/get_new_operator_sdk.sh deleted file mode 100755 index 124b09117..000000000 --- a/build/get_new_operator_sdk.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -REL=$(dirname "$0") -ARCH=$(case $(uname -m) in x86_64) echo -n amd64 ;; aarch64) echo -n arm64 ;; *) echo -n $(uname -m) ;; esac) -OS=$(uname | awk '{print tolower($0)}') -VERSION="${1:-v1.5.0}" -OPERATOR_SDK_DL_URL=https://github.com/operator-framework/operator-sdk/releases/download/${VERSION} - -if [[ ! -f ${REL}/working/operator-sdk-${VERSION} ]]; then - mkdir ${REL}/working - curl -L ${OPERATOR_SDK_DL_URL}/operator-sdk_${OS}_${ARCH} -o ${REL}/working/operator-sdk-${VERSION} - chmod +x ${REL}/working/operator-sdk-${VERSION} - rm ${REL}/working/operator-sdk - ln -s operator-sdk-${VERSION} ${REL}/working/operator-sdk -fi - diff --git a/build/get_operator_sdk.sh b/build/get_operator_sdk.sh new file mode 100755 index 000000000..d1ada222c --- /dev/null +++ b/build/get_operator_sdk.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +set -x + +REL=$(dirname "$0") +ARCH=$(case $(uname -m) in x86_64) echo -n amd64 ;; aarch64) echo -n arm64 ;; *) echo -n $(uname -m) ;; esac) +OS=$(uname | awk '{print tolower($0)}') +VERSION="${1:-v1.5.0}" +OPERATOR_SDK_DL_URL=https://github.com/operator-framework/operator-sdk/releases/download/${VERSION} + +if [[ ! -f ${REL}/working/operator-sdk-${VERSION} ]]; then + mkdir ${REL}/working + if [[ "${VERSION}" =~ "v0" ]]; then + # naming scheme for v0.x is operator-sdk-$VERSION-$ARCH-$OS e.g. operator-sdk-v0.19.4-x86_64-linux-gnu + curl -L ${OPERATOR_SDK_DL_URL}/operator-sdk-${VERSION}-x86_64-linux-gnu -o ${REL}/working/operator-sdk-${VERSION} + else + # naming scheme for v1.x is operator-sdk_$OS-$ARCH e.g. 
operator-sdk_linux_amd64 + curl -L ${OPERATOR_SDK_DL_URL}/operator-sdk_${OS}_${ARCH} -o ${REL}/working/operator-sdk-${VERSION} + fi + chmod +x ${REL}/working/operator-sdk-${VERSION} + rm -f ${REL}/working/operator-sdk +fi + +set +x \ No newline at end of file diff --git a/build/metadata.sh b/build/metadata.sh index f2020b050..7b120b509 100644 --- a/build/metadata.sh +++ b/build/metadata.sh @@ -19,6 +19,8 @@ BUNDLE_OLM_SKIP_RANGE_LOWER_BOUND=${BUNDLE_OLM_SKIP_RANGE_LOWER_BOUND:-1.3.0} CREATED_DATE=${CREATED_DATE:-$(date +'%Y-%m-%dT%H:%M:%SZ')} RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP=${RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP:-quay.io/infrawatch/prometheus-webhook-snmp} RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP_TAG=${RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP_TAG:-stable-1.5} +RELATED_IMAGE_OAUTH_PROXY=${RELATED_IMAGE_OAUTH_PROXY:-quay.io/openshift/origin-oauth-proxy} +RELATED_IMAGE_OAUTH_PROXY_TAG=${RELATED_IMAGE_OAUTH_PROXY_TAG:-latest} BUNDLE_PATH=${BUNDLE_PATH:-deploy/olm-catalog/service-telemetry-operator} BUNDLE_CHANNELS=${BUNDLE_CHANNELS:-stable-1.5} BUNDLE_DEFAULT_CHANNEL=${BUNDLE_DEFAULT_CHANNEL:-stable-1.5} diff --git a/build/run-ci.yaml b/build/run-ci.yaml index 797957269..bfd07c3cb 100644 --- a/build/run-ci.yaml +++ b/build/run-ci.yaml @@ -1,9 +1,15 @@ --- # run STF CI setup in CRC (already provisioned) - hosts: localhost - gather_facts: no + gather_facts: yes connection: local tasks: - name: Run the STF CI system import_role: - name: stf-run-ci + name: stf-run-ci + + - name: Collect the logs + import_role: + name: stf-collect-logs + vars: + logfile_dir: "{{ playbook_dir }}/" diff --git a/build/stf-collect-logs/README.md b/build/stf-collect-logs/README.md new file mode 100644 index 000000000..66b8739d6 --- /dev/null +++ b/build/stf-collect-logs/README.md @@ -0,0 +1,38 @@ +stf-collect-logs +================ + +This role collects logs that are useful for debugging an STF deployment. + +Once the logs are collected, the user will need to fetch the logs themselves. + +Requirements +------------ + +An authenticated `oc` session logged in to the cluster under test; the tasks in this role wrap `oc` commands. + +Role Variables +-------------- + +* `logfile_dir` - The directory on the remote host(s) in which the log files are created. 
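+
+For instance, `build/run-ci.yaml` in this repository imports the role after the CI run and points `logfile_dir` at the playbook directory:
+
+    - name: Collect the logs
+      import_role:
+        name: stf-collect-logs
+      vars:
+        logfile_dir: "{{ playbook_dir }}/"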
+ +Dependencies +------------ + + +Example Playbook +---------------- + +A minimal play that runs this role against the `service-telemetry` namespace and writes logs to `/tmp/stf-logs` (values are illustrative): + + - hosts: localhost + roles: + - { role: stf-collect-logs, namespace: service-telemetry, logfile_dir: /tmp/stf-logs } + +License +------- + +Apache-2.0 + +Author Information +------------------ + +Red Hat diff --git a/build/stf-collect-logs/defaults/main.yml b/build/stf-collect-logs/defaults/main.yml new file mode 100644 index 000000000..9855190f1 --- /dev/null +++ b/build/stf-collect-logs/defaults/main.yml @@ -0,0 +1,2 @@ +--- +# defaults file for stf-collect-logs diff --git a/build/stf-collect-logs/meta/main.yml b/build/stf-collect-logs/meta/main.yml new file mode 100644 index 000000000..9a11606cd --- /dev/null +++ b/build/stf-collect-logs/meta/main.yml @@ -0,0 +1,15 @@ +galaxy_info: + role_name: stf-collect-logs + namespace: infrawatch + + author: InfraWatch + description: Log collection role for Service Telemetry Framework + company: Red Hat + + license: Apache-2.0 + + min_ansible_version: 2.1 + + galaxy_tags: [] + +dependencies: [] diff --git a/build/stf-collect-logs/tasks/main.yml b/build/stf-collect-logs/tasks/main.yml new file mode 100644 index 000000000..dde52761f --- /dev/null +++ b/build/stf-collect-logs/tasks/main.yml @@ -0,0 +1,61 @@ +--- +- name: "Get builds" + ansible.builtin.shell: + cmd: | + echo "*** [INFO] Showing oc get builds" > {{ logfile_dir }}/post_oc_get_builds.log 2>&1 + oc -n {{ namespace }} get builds >> {{ logfile_dir }}/post_oc_get_builds.log 2>&1 + echo "*** [INFO] Showing oc get builds -oyaml" >> {{ logfile_dir }}/post_oc_get_builds.log 2>&1 + oc -n {{ namespace }} get builds -oyaml >> {{ logfile_dir }}/post_oc_get_builds.log 2>&1 + cat {{ logfile_dir }}/post_oc_get_builds.log + ignore_errors: true + changed_when: false + +- name: "Get subscription details" + ansible.builtin.shell: + cmd: | + oc -n {{ namespace }} get subscriptions > {{ logfile_dir }}/post_oc_get_subscriptions.log 2>&1 + oc -n {{ namespace }} describe subscription service-telemetry-operator >> {{ logfile_dir }}/post_oc_get_subscriptions.log 2>&1 + ignore_errors: true + +- name: "Get image infos" + ansible.builtin.shell: + cmd: | + echo "[INFO] oc get images" > {{ logfile_dir }}/post_oc_get_images.log 2>&1 + oc -n {{ namespace }} get images >> {{ logfile_dir }}/post_oc_get_images.log 2>&1 + echo "[INFO] oc get imagestreams" >> {{ logfile_dir }}/post_oc_get_images.log 2>&1 + oc -n {{ namespace }} get imagestream >> {{ logfile_dir }}/post_oc_get_images.log 2>&1 + echo "[INFO] oc get imagestream -oyaml" >> {{ logfile_dir }}/post_oc_get_images.log 2>&1 + oc -n {{ namespace }} get imagestream -oyaml >> {{ logfile_dir }}/post_oc_get_images.log 2>&1 + retries: 3 + delay: 10 + ignore_errors: true + +- name: "Get STO info" + ansible.builtin.shell: + cmd: | + oc -n {{ namespace }} describe pod $(oc -n {{ namespace }} get pod -l name=service-telemetry-operator -ojsonpath='{ .items[].metadata.name }') >> {{ logfile_dir }}/describe_sto.log 2>&1 + ignore_errors: true + retries: 3 + delay: 10 + +- name: "Question the deployment" + ansible.builtin.shell: + cmd: | + echo "What images were created in the internal registry?" > {{ logfile_dir }}/post_question_deployment.log 2>&1 + oc -n {{ namespace }} get images | grep $(oc -n {{ namespace }} registry info --internal) >> {{ logfile_dir }}/post_question_deployment.log 2>&1 + echo "What state is the STO csv in?" 
>> {{ logfile_dir }}/post_question_deployment.log 2>&1 + oc -n {{ namespace }} get csv | grep service-telemetry-operator >> {{ logfile_dir }}/post_question_deployment.log 2>&1 + oc -n {{ namespace }} get csv $(oc -n {{ namespace }} get csv | grep "service-telemetry-operator" | awk '{ print $1}') -oyaml >> {{ logfile_dir }}/post_question_deployment.log 2>&1 + register: output + retries: 3 + delay: 10 + +- name: "Get pods" + ansible.builtin.shell: + cmd: | + oc -n {{ namespace }} get pods > {{ logfile_dir }}/post_oc_get_pods.log 2>&1 + echo "Additional information" >> {{ logfile_dir }}/post_oc_get_pods.log + oc -n {{ namespace }} describe pods >> {{ logfile_dir }}/post_oc_get_pods.log 2>&1 + ignore_errors: true + retries: 3 + delay: 10 diff --git a/build/stf-collect-logs/vars/main.yml b/build/stf-collect-logs/vars/main.yml new file mode 100644 index 000000000..5197b0284 --- /dev/null +++ b/build/stf-collect-logs/vars/main.yml @@ -0,0 +1,2 @@ +--- +# vars file for stf-collect-logs diff --git a/build/stf-run-ci/.ansible-lint b/build/stf-run-ci/.ansible-lint new file mode 100644 index 000000000..ae660af6f --- /dev/null +++ b/build/stf-run-ci/.ansible-lint @@ -0,0 +1,52 @@ +--- +profile: null +skip_list: + - args + - avoid-implicit + - command-instead-of-module + - command-instead-of-shell + - complexity + - deprecated-bare-vars + - deprecated-local-action + - deprecated-module + - empty-string-compare + - galaxy + - ignore-errors + - inline-env-var + - internal-error + - jinja + - key-order + - latest + - literal-compare + - loop-var-prefix + - meta-incorrect + - meta-no-tags + - meta-runtime + - meta-video-links + - name + - no-changed-when + - no-free-form + - no-handler + - no-jinja-when + - no-log-password + - no-prompting + - no-relative-paths + - no-same-owner + - no-tabs + - only-builtins + - package-latest + - parser-error + - partial-become + - playbook-extension + - risky-file-permissions + - risky-octal + - risky-shell-pipe + - role-name + - run-once + - sanity + - schema + - var-naming + - warning + - yaml + +# vimrc: ft=yaml diff --git a/build/stf-run-ci/README.md b/build/stf-run-ci/README.md index 87ca2f0aa..353a8f81f 100644 --- a/build/stf-run-ci/README.md +++ b/build/stf-run-ci/README.md @@ -1,70 +1,64 @@ -stf-run-ci -========== +# stf-run-ci Run the Service Telemetry Framework CI system. This role is intended to be called from a playbook running locally on a preconfigured test system. Primarily this means a running CodeReady Container system has been provided. -Requirements ------------- +## Requirements - CodeReady Containers - Ansible 2.9 (tested) - `oc` command line tool -Variables --------- +## Variables Not all variables are listed here, but these are the most common ones you might choose to override: -| Parameter name | Values | Default | Description | -| ------------------------------ | ------------ | --------- | ------------------------------------ | -| `__deploy_stf` | {true,false} | true | Whether to deploy an instance of STF | -| `__local_build_enabled` | {true,false} | true | Whether to deploy STF from local built artifacts. 
Also see `working_branch`, `sg_branch`, `sgo_branch` | -| `__deploy_from_bundles_enabled` | {true,false} | false | Whether to deploy STF from OLM bundles (TODO: compat with `__local_build_enabled`) | -| `__service_telemetry_bundle_image_path` | | | Image path to Service Telemetry Operator bundle | -| `__smart_gateway_bundle_image_path` | | | Image path to Smart Gateway Operator bundle | -| `prometheus_webhook_snmp_branch` | | master | Which Prometheus Webhook SNMP git branch to checkout | -| `sgo_branch` | | master | Which Smart Gateway Operator git branch to checkout | -| `sg_core_branch` | | master | Which Smart Gateway Core git branch to checkout | -| `sg_bridge_branch` | | master | Which Smart Gateway Bridge git branch to checkout | -| `prometheus_webhook_snmp_branch` | | master | Which Prometheus webhook snmp branch to checkout | -| `sgo_repository` | | https://github.com/infrawatch/smart-gateway-operator | Which Smart Gateway Operator git repository to clone | -| `sg_core_repository` | | https://github.com/infrawatch/sg-core | Which Smart Gateway Core git repository to clone | -| `sg_bridge_repository` | | https://github.com/infrawatch/sg-bridge | Which Smart Gateway Bridge git repository to clone | -| `prometheus_webhook_snmp_repository` | | https://github.com/infrawatch/prometheus-webhook-snmp | Which Prometheus webhook snmp git repository to clone | -| `loki_operator_repository` | | https://github.com/viaq/loki-operator | Which Loki-operator git repository to clone | -| `__service_telemetry_events_certificates_endpoint_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 2160h | Lifetime of the ElasticSearch endpoint certificate (minimum duration is 1h) | -| `__service_telemetry_events_certificates_ca_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the ElasticSearch CA certificate (minimum duration is 1h) | -| `__service_telemetry_events_enabled` | {true,false} | true | Whether to enable events support in ServiceTelemetry | -| `__service_telemetry_high_availability_enabled` | {true,false} | false | Whether to enable high availability support in ServiceTelemetry | -| `__service_telemetry_metrics_enabled` | {true,false} | true | Whether to enable metrics support in ServiceTelemetry | -| `__service_telemetry_storage_ephemeral_enabled` | {true,false} | false | Whether to enable ephemeral storage support in ServiceTelemetry | -| `__service_telemetry_storage_persistent_storage_class` | | | Set a custom storageClass to override the default provided by OpenShift platform | -| `__service_telemetry_snmptraps_enabled` | {true,false} | true | Whether to enable snmptraps delivery via Alertmanager receiver (prometheus-webhook-snmp) | -| `__service_telemetry_snmptraps_community` | | `public` | Set the SNMP community to send traps to. Defaults to public | -| `__service_telemetry_snmptraps_target` | | `192.168.24.254` | Set the SNMP target to send traps to. Defaults to 192.168.24.254 | -| `__service_telemetry_snmptraps_retries` | | 5 | Set the SNMP retry count for traps. Defaults to 5 | -| `__service_telemetry_snmptraps_port` | | 162 | Set the SNMP target port for traps. Defaults to 162 | -| `__service_telemetry_snmptraps_timeout` | | 1 | Set the SNMP retry timeout (in seconds). Defaults to 1 | -| `__service_telemetry_alert_oid_label` | | oid | The alert label name to look for oid value. Default to oid. | -| `__service_telemetry_trap_oid_prefix` | | 1.3.6.1.4.1.50495.15 | The OID prefix for trap variable bindings. 
| -| `__service_telemetry_trap_default_oid` | | 1.3.6.1.4.1.50495.15.1.2.1 | The trap OID if none is found in the Prometheus alert labels. | -| `__service_telemetry_trap_default_severity` | | | The trap severity if none is found in the Prometheus alert labels. | -| `__service_telemetry_logs_enabled` | {true,false} | false | Whether to enable logs support in ServiceTelemetry | -| `__service_telemetry_observability_strategy` | | `use_community` | Which observability strategy to use for deployment. Default deployment is 'use_community'. Also supported is 'none' | -| `__service_telemetry_transports_certificates_endpoint_cert_duration`| [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 2160h | Lifetime of the QDR endpoint certificate (minimum duration is 1h) | -| `__service_telemetry_transports_certificates_ca_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the QDR CA certificate (minimum duration is 1h) | -| `__internal_registry_path` | | image-registry.openshift-image-registry.svc:5000 | Path to internal registry for image path | -| `__deploy_loki_enabled` | {true,false} | false | Whether to deploy loki-operator and other systems for logging development purposes | -| `__golang_image_path` | | quay.io/infrawatch/golang:1.16 | Golang image path for building the loki-operator image | -| `__loki_image_path` | | quay.io/infrawatch/loki:2.2.1 | Loki image path for Loki microservices | - - - -Example Playbook ----------------- +| Parameter name | Values | Default | Description | +| ------------------------------ | ------------ | --------- | ------------------------------------ | +| `__deploy_stf` | {true,false} | true | Whether to deploy an instance of STF | +| `__local_build_enabled` | {true,false} | true | Whether to deploy STF from local built artifacts. Also see `working_branch`, `sg_branch`, `sgo_branch` | +| `__deploy_from_bundles_enabled` | {true,false} | false | Whether to deploy STF from OLM bundles (TODO: compat with `__local_build_enabled`) | +| `__deploy_from_index_enabled` | {true,false} | false | Whether to deploy STF from locally built bundles and index image. 
| +| `__service_telemetry_bundle_image_path` | | `quay.io/infrawatch-operators/service-telemetry-operator-bundle:nightly-head` | Image path to Service Telemetry Operator bundle | +| `__smart_gateway_bundle_image_path` | | `quay.io/infrawatch-operators/smart-gateway-operator-bundle:nightly-head` | Image path to Smart Gateway Operator bundle | +| `setup_bundle_registry_tls_ca` | {true,false} | true | Whether to set up a TLS CA cert for bundle registry access | +| `setup_bundle_registry_auth` | {true,false} | true | Whether to set up auth for bundle registry access | +| `prometheus_webhook_snmp_branch` | | master | Which Prometheus Webhook SNMP git branch to checkout | +| `sgo_branch` | | master | Which Smart Gateway Operator git branch to checkout | +| `sg_core_branch` | | master | Which Smart Gateway Core git branch to checkout | +| `sg_bridge_branch` | | master | Which Smart Gateway Bridge git branch to checkout | +| `sgo_repository` | | https://github.com/infrawatch/smart-gateway-operator | Which Smart Gateway Operator git repository to clone | +| `sg_core_repository` | | https://github.com/infrawatch/sg-core | Which Smart Gateway Core git repository to clone | +| `sg_bridge_repository` | | https://github.com/infrawatch/sg-bridge | Which Smart Gateway Bridge git repository to clone | +| `prometheus_webhook_snmp_repository` | | https://github.com/infrawatch/prometheus-webhook-snmp | Which Prometheus webhook snmp git repository to clone | +| `__service_telemetry_events_certificates_endpoint_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the ElasticSearch endpoint certificate (minimum duration is 1h) | +| `__service_telemetry_events_certificates_ca_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the ElasticSearch CA certificate (minimum duration is 1h) | +| `__service_telemetry_events_enabled` | {true,false} | true | Whether to enable events support in ServiceTelemetry | +| `__service_telemetry_high_availability_enabled` | {true,false} | false | Whether to enable high availability support in ServiceTelemetry | +| `__service_telemetry_metrics_enabled` | {true,false} | true | Whether to enable metrics support in ServiceTelemetry | +| `__service_telemetry_storage_ephemeral_enabled` | {true,false} | false | Whether to enable ephemeral storage support in ServiceTelemetry | +| `__service_telemetry_storage_persistent_storage_class` | | | Set a custom storageClass to override the default provided by OpenShift platform | +| `__service_telemetry_snmptraps_enabled` | {true,false} | true | Whether to enable snmptraps delivery via Alertmanager receiver (prometheus-webhook-snmp) | +| `__service_telemetry_snmptraps_community` | | `public` | Set the SNMP community to send traps to. Defaults to public | +| `__service_telemetry_snmptraps_target` | | `192.168.24.254` | Set the SNMP target to send traps to. Defaults to 192.168.24.254 | +| `__service_telemetry_snmptraps_retries` | | 5 | Set the SNMP retry count for traps. Defaults to 5 | +| `__service_telemetry_snmptraps_port` | | 162 | Set the SNMP target port for traps. Defaults to 162 | +| `__service_telemetry_snmptraps_timeout` | | 1 | Set the SNMP retry timeout (in seconds). Defaults to 1 | +| `__service_telemetry_alert_oid_label` | | oid | The alert label name to look for oid value. Defaults to oid. 
| +| `__service_telemetry_trap_oid_prefix` | | 1.3.6.1.4.1.50495.15 | The OID prefix for trap variable bindings. | +| `__service_telemetry_trap_default_oid` | | 1.3.6.1.4.1.50495.15.1.2.1 | The trap OID if none is found in the Prometheus alert labels. | +| `__service_telemetry_trap_default_severity` | | | The trap severity if none is found in the Prometheus alert labels. | +| `__service_telemetry_observability_strategy` | | `use_redhat` | Which observability strategy to use for deployment. Default is 'use_redhat'. Also supported are 'use_hybrid', 'use_community', and 'none' | +| `__service_telemetry_transports_qdr_auth` | {'none', 'basic'} | `none` | Which auth method to use for QDR. Can be 'none' or 'basic'. Note: 'basic' is not yet supported in smoketests. | +| `__service_telemetry_transports_certificates_endpoint_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the QDR endpoint certificate (minimum duration is 1h) | +| `__service_telemetry_transports_certificates_ca_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the QDR CA certificate (minimum duration is 1h) | +| `__internal_registry_path` | | image-registry.openshift-image-registry.svc:5000 | Path to internal registry for image path | + + +# Example Playbook ```yaml --- @@ -77,32 +71,39 @@ Example Playbook name: stf-run-ci ``` -Usage ------ +# Usage You can deploy Service Telemetry Framework using this role in a few configuration methods: * local build artifacts from Git repository cloned locally +* local build artifacts, local bundle artifacts, and Subscription via OLM using locally built index image * standard deployment using Subscription and OLM * supporting components but no instance of Service Telemetry Operator +## Basic deployment + You can deploy using the sample `run-ci.yaml` from the _Example Playbook_ section: -``` +```sh ansible-playbook run-ci.yaml ``` +## Standard deployment with existing artifacts + If you want to do a standard deployment (existing remote artifacts) you can use the following command: -``` +```sh ansible-playbook --extra-vars __local_build_enabled=false run-ci.yaml ``` +## Deployment with pre-built bundles + You can deploy directly from pre-built bundles like this: + +```sh ansible-playbook -e __local_build_enabled=false -e __deploy_from_bundles_enabled=true \ -e __service_telemetry_bundle_image_path=<registry>/<project>/stf-service-telemetry-operator-bundle:<tag> \ -e __smart_gateway_bundle_image_path=<registry>/<project>/stf-smart-gateway-operator-bundle:<tag> \ @@ -114,15 +115,22 @@ ansible-playbook -e __local_build_enabled=false -e __deploy_from_bundles_enabled NOTE: When deploying from bundles, you must have a _CA.pem_ for the registry already in place in the build directory, if required. If this is -not required, add `--skip-tags bundle_registry_tls_ca`. If no login is required -to your bundle image registry, add `--skip-tags bundle_registry_auth` +not required, set `setup_bundle_registry_tls_ca` to `false`. If no login is required +to your bundle image registry, set `setup_bundle_registry_auth` to `false`. +By default, those configuration options are set to `true`. 
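+
+For example, if your bundle registry requires neither a custom CA nor a login, both setup steps can be disabled like this (the flag values shown are illustrative; the bundle image paths fall back to their defaults):
+
+```sh
+ansible-playbook -e __local_build_enabled=false -e __deploy_from_bundles_enabled=true \
+    -e setup_bundle_registry_tls_ca=false -e setup_bundle_registry_auth=false \
+    run-ci.yaml
+```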
+ +## Deployment from local artifacts, bundles, and index + +You can perform a deployment using OLM and a Subscription from locally built artifacts, bundles, and index image like this: + +```sh +ansible-playbook -e __local_build_enabled=true -e __deploy_from_index_enabled=true run-ci.yaml +``` -License -------- +# License Apache v2.0 -Author Information ------------------- +# Author Information -Leif Madsen +Red Hat (CloudOps DFG) diff --git a/build/stf-run-ci/defaults/main.yml b/build/stf-run-ci/defaults/main.yml index ab9dab7a6..da9834ecf 100644 --- a/build/stf-run-ci/defaults/main.yml +++ b/build/stf-run-ci/defaults/main.yml @@ -7,6 +7,7 @@ list_of_stf_objects: __local_build_enabled: true __deploy_from_bundles_enabled: false +__deploy_from_index_enabled: false __deploy_stf: true __service_telemetry_events_certificates_endpoint_cert_duration: 70080h @@ -25,20 +26,28 @@ __service_telemetry_snmptraps_alert_oid_label: "oid" __service_telemetry_snmptraps_trap_oid_prefix: "1.3.6.1.4.1.50495.15" __service_telemetry_snmptraps_trap_default_oid: "1.3.6.1.4.1.50495.15.1.2.1" __service_telemetry_snmptraps_trap_default_severity: "" -__service_telemetry_logs_enabled: false -__service_telemetry_observability_strategy: use_community +__service_telemetry_observability_strategy: use_redhat __service_telemetry_transports_certificates_endpoint_cert_duration: 70080h __service_telemetry_transports_certificates_ca_cert_duration: 70080h __internal_registry_path: image-registry.openshift-image-registry.svc:5000 -__service_telemetry_bundle_image_path: -__smart_gateway_bundle_image_path: +__service_telemetry_bundle_image_path: "quay.io/infrawatch-operators/service-telemetry-operator-bundle:nightly-head" +__smart_gateway_bundle_image_path: "quay.io/infrawatch-operators/smart-gateway-operator-bundle:nightly-head" + +default_operator_registry_image_base: registry.redhat.io/openshift4/ose-operator-registry +default_operator_registry_image_tag: v4.13 + +elasticsearch_version: 7.16.1 sgo_image_tag: latest sto_image_tag: latest sg_core_image_tag: latest sg_bridge_image_tag: latest prometheus_webhook_snmp_image_tag: latest -new_operator_sdk_version: v1.11.0 +sgo_bundle_image_tag: latest +sto_bundle_image_tag: latest +stf_index_image_tag: latest +operator_sdk_v0: v0.19.4 +operator_sdk_v1: v1.11.0 namespace: service-telemetry pull_secret_registry: pull_secret_user: @@ -57,3 +66,6 @@ sg_bridge_repository: https://github.com/infrawatch/sg-bridge prometheus_webhook_snmp_repository: https://github.com/infrawatch/prometheus-webhook-snmp base_dir: '' + +setup_bundle_registry_auth: true +setup_bundle_registry_tls_ca: true diff --git a/build/stf-run-ci/meta/main.yml b/build/stf-run-ci/meta/main.yml index 227ad9c34..e79928dd5 100644 --- a/build/stf-run-ci/meta/main.yml +++ b/build/stf-run-ci/meta/main.yml @@ -1,7 +1,10 @@ galaxy_info: - author: your name - description: your role description - company: your company (optional) + role_name: stf_run_ci # if absent directory name hosting role is used instead + namespace: infrawatch + + author: InfraWatch + description: Helper CI role for Service Telemetry Framework + company: Red Hat # If the issue tracker for your role is not on github, uncomment the # next line and provide a value @@ -14,30 +17,9 @@ galaxy_info: # - GPL-3.0-only # - Apache-2.0 # - CC-BY-4.0 - license: license (GPL-2.0-or-later, MIT, etc) - - min_ansible_version: 2.9 - - # If this a Container Enabled role, provide the minimum Ansible Container version. 
- # min_ansible_container_version: + license: Apache-2.0 - # - # Provide a list of supported platforms, and for each platform a list of versions. - # If you don't wish to enumerate all versions for a particular platform, use 'all'. - # To view available platforms and versions (or releases), visit: - # https://galaxy.ansible.com/api/v1/platforms/ - # - # platforms: - # - name: Fedora - # versions: - # - all - # - 25 - # - name: SomePlatform - # versions: - # - all - # - 1.0 - # - 7 - # - 99.99 + min_ansible_version: '2.14' galaxy_tags: [] # List tags for your role here, one per line. A tag is a keyword that describes @@ -50,4 +32,3 @@ galaxy_info: dependencies: [] # List your role dependencies here, one per line. Be sure to remove the '[]' above, # if you add dependencies to this list. - \ No newline at end of file diff --git a/build/stf-run-ci/requirements.txt b/build/stf-run-ci/requirements.txt new file mode 100644 index 000000000..70c742e0e --- /dev/null +++ b/build/stf-run-ci/requirements.txt @@ -0,0 +1,8 @@ +# https://stackoverflow.com/questions/64073422/importerror-cannot-import-name-oauth1session-from-requests-oauthlib +requests==2.31.0 +requests_oauthlib==1.3.0 +# https://github.com/domainaware/parsedmarc/issues/318 +oauthlib==3.2.2 +kubernetes==24.2.0 +openshift==0.13.1 +ansible-core==2.12.10 diff --git a/build/stf-run-ci/tasks/clone_repos.yml b/build/stf-run-ci/tasks/clone_repos.yml index d4f2173d3..2bb2871bf 100644 --- a/build/stf-run-ci/tasks/clone_repos.yml +++ b/build/stf-run-ci/tasks/clone_repos.yml @@ -6,56 +6,57 @@ - name: Get Smart Gateway Operator block: - name: Try cloning same-named branch or override branch from SGO repository - git: + ansible.builtin.git: repo: "{{ sgo_repository }}" - dest: working/smart-gateway-operator + dest: "{{ base_dir }}/working/smart-gateway-operator" version: "{{ sgo_branch | default(branch, true) }}" - force: yes + force: true rescue: - name: "Get {{ version_branches.sgo }} upstream branch because specified branch or repository doesn't exist" - git: + ansible.builtin.git: repo: https://github.com/infrawatch/smart-gateway-operator - dest: working/smart-gateway-operator + dest: "{{ base_dir }}/working/smart-gateway-operator" version: "{{ version_branches.sgo }}" - name: Get sg-core block: - name: Try cloning same-named branch or override branch from sg-core repository - git: + ansible.builtin.git: repo: "{{ sg_core_repository }}" - dest: working/sg-core + dest: "{{ base_dir }}/working/sg-core" version: "{{ sg_core_branch | default(branch, true) }}" rescue: - name: "Get {{ version_branches.sg_core }} upstream branch because specified branch or repository doesn't exist" - git: + ansible.builtin.git: repo: https://github.com/infrawatch/sg-core - dest: working/sg-core + dest: "{{ base_dir }}/working/sg-core" version: "{{ version_branches.sg_core }}" - name: Get sg-bridge block: - name: Try cloning same-named branch or override branch from sg-bridge repository - git: + ansible.builtin.git: repo: "{{ sg_bridge_repository }}" - dest: working/sg-bridge + dest: "{{ base_dir }}/working/sg-bridge" version: "{{ sg_bridge_branch | default(branch, true) }}" rescue: - name: "Get {{ version_branches.sg_bridge }} upstream branch because specified branch or repository doesn't exist" - git: + ansible.builtin.git: repo: https://github.com/infrawatch/sg-bridge - dest: working/sg-bridge + dest: "{{ base_dir }}/working/sg-bridge" version: "{{ version_branches.sg_bridge }}" - name: Get prometheus-webhook-snmp block: - name: Try cloning same-named branch or override 
branch from prometheus-webhook-snmp repository - git: + ansible.builtin.git: repo: "{{ prometheus_webhook_snmp_repository }}" - dest: working/prometheus-webhook-snmp + dest: "{{ base_dir }}/working/prometheus-webhook-snmp" version: "{{ prometheus_webhook_snmp_branch | default(branch, true) }}" rescue: - name: "Get {{ version_branches.prometheus_webhook_snmp }} upstream branch because specified branch or repository doesn't exist" - git: + ansible.builtin.git: repo: https://github.com/infrawatch/prometheus-webhook-snmp - dest: working/prometheus-webhook-snmp + dest: "{{ base_dir }}/working/prometheus-webhook-snmp" version: "{{ version_branches.prometheus_webhook_snmp }}" + diff --git a/build/stf-run-ci/tasks/create_builds.yml b/build/stf-run-ci/tasks/create_builds.yml index 3dadb98b8..e54b77cb9 100644 --- a/build/stf-run-ci/tasks/create_builds.yml +++ b/build/stf-run-ci/tasks/create_builds.yml @@ -1,47 +1,68 @@ --- -- name: Create BuildConfig and ImageStream - shell: oc new-build -n "{{ namespace }}" --name {{ artifact.name }} --dockerfile - < {{ artifact.working_build_dir }}/{{ artifact.dockerfile_path }} +- name: Get current BuildConfig for artifact to check if it exists + kubernetes.core.k8s_info: + api_version: build.openshift.io/v1 + kind: BuildConfig + namespace: "{{ namespace }}" + name: "{{ artifact.name }}" + register: build_config_lookup -- name: Kill first build since it will always fail (triggered on BuildConfig creation) - shell: sleep 10 ; oc delete build {{ artifact.name }}-1 -n "{{ namespace }}" +- name: Get current Builds for artifact to check if it exists + kubernetes.core.k8s_info: + api_version: build.openshift.io/v1 + kind: Build + namespace: "{{ namespace }}" + label_selectors: + - "build={{ artifact.name }}" + register: build_lookup -- name: Kick off build +- when: build_config_lookup.resources | length == 0 block: - - name: Start local image build - command: oc start-build {{ artifact.name }} -n "{{ namespace }}" --wait --from-dir "{{ artifact.working_build_dir }}" - register: build_name - always: - - name: Describe local image build (results) - command: oc describe build {{ artifact.name }} -n "{{ namespace }}" - register: build_describe + - name: Create BuildConfig and ImageStream + ansible.builtin.shell: oc new-build -n "{{ namespace }}" --name {{ artifact.name }} --dockerfile - < {{ artifact.working_build_dir }}/{{ artifact.dockerfile_path }} - - debug: - var: build_describe.stdout_lines + - name: Kill first build since it will always fail (triggered on BuildConfig creation) + ansible.builtin.shell: sleep 10 ; oc delete build {{ artifact.name }}-1 -n "{{ namespace }}" + ignore_errors: true + retries: 3 + delay: 10 + register: kill_build + until: kill_build.rc == 0 -- debug: - var: build_name +- block: + - name: Start local image build + ansible.builtin.command: oc start-build {{ artifact.name }} -n "{{ namespace }}" --follow --wait --from-dir "{{ artifact.working_build_dir }}" + register: build_results + when: build_lookup.resources | length == 0 + ignore_errors: true + retries: 3 + delay: 10 + until: build_results.rc == 0 + always: + - name: "Show build results" + ansible.builtin.debug: + var: build_results -- name: Set current build name - set_fact: - this_build_name: "{{ build_name['stdout'].split(' ')[0].split('/')[1] }}" +- name: Get latest build information for artifact + ansible.builtin.command: oc get build --selector build={{ artifact.name }} -n "{{ namespace }}" -ojsonpath='{.items[-1:]}' + register: build_describe_results -- debug: - var: 
this_build_name +- name: Set build_describe from json results + ansible.builtin.set_fact: + build_describe: "{{ build_describe_results.stdout | from_json }}" -- name: Get artifact path - k8s_info: - api_version: build.openshift.io/v1 - kind: Build - name: "{{ this_build_name }}" - namespace: "{{ namespace }}" - register: image_reference +- name: Get the build results + ansible.builtin.debug: + var: build_describe -- debug: - var: image_reference.resources[0].status.outputDockerImageReference +- name: Show the outputDockerImageReference, which will be used for the image reference name + ansible.builtin.debug: + var: build_describe.status.outputDockerImageReference - name: Set unique image reference for this artifact - set_fact: - "{{ artifact.image_reference_name }}": "{{ image_reference.resources[0].status.outputDockerImageReference }}" + ansible.builtin.set_fact: + "{{ artifact.image_reference_name }}": "{{ build_describe.status.outputDockerImageReference }}" -- debug: +- name: Show the image reference name for the build + ansible.builtin.debug: var: "{{ artifact.image_reference_name }}" diff --git a/build/stf-run-ci/tasks/create_catalog.yml b/build/stf-run-ci/tasks/create_catalog.yml new file mode 100644 index 000000000..6a464afd9 --- /dev/null +++ b/build/stf-run-ci/tasks/create_catalog.yml @@ -0,0 +1,173 @@ +--- +- name: Create service-telemetry-framework-index working directory + ansible.builtin.file: + path: "{{ base_dir }}/working/service-telemetry-framework-index" + state: directory + mode: '0755' + +# Updating to use stdout_lines[-1] so that any additional info that gets added to generate_bundles (e.g. for debug) doesn't break this task +# Adding from_json so that the JSON output is parsed into a dictionary +- name: Create info variables from bundle generation output + ansible.builtin.set_fact: + sto_bundle_info: "{{ generate_bundle_sto.stdout_lines[-1] | from_json }}" + sgo_bundle_info: "{{ generate_bundle_sgo.stdout_lines[-1] | from_json }}" + +- name: Get the builder-dockercfg Secret name + ansible.builtin.command: oc get secret -n {{ namespace }} --field-selector='type==kubernetes.io/dockercfg' -ojsonpath='{.items[?(@.metadata.annotations.kubernetes\.io/service-account\.name=="builder")].metadata.name}' + register: secret_builder_dockercfg_name + +- name: Get contents of builder Secret + kubernetes.core.k8s_info: + api_version: v1 + kind: Secret + name: "{{ secret_builder_dockercfg_name.stdout }}" + namespace: "{{ namespace }}" + register: secret_builder_dockercfg_results + +- name: Get builder-dockercfg authentication contents + ansible.builtin.set_fact: + builder_dockercfg_auth_results: "{{ secret_builder_dockercfg_results.resources[0].data['.dockercfg'] | b64decode }}" + +- name: Set internal registry authentication + ansible.builtin.set_fact: + internal_registry: "{{ builder_dockercfg_auth_results['image-registry.openshift-image-registry.svc:5000'] | to_json }}" + +- name: Get Secrets to check for service-telemetry-framework-index-dockercfg + ansible.builtin.command: + cmd: oc get secret -n {{ namespace }} service-telemetry-framework-index-dockercfg + register: index_dockercfg_secret + ignore_errors: true + +# There's an error when the requested resource doesn't exist, so check the rc +- when: index_dockercfg_secret.rc != 0 + block: + - name: Create config.json to import as Secret + ansible.builtin.template: + variable_start_string: "<<" + variable_end_string: ">>" + src: config-json.j2 + dest: "{{ base_dir }}/working/service-telemetry-framework-index/config.json" + + - 
name: Create a Secret for the dockercfg + ansible.builtin.command: oc create secret generic -n {{ namespace }} service-telemetry-framework-index-dockercfg --from-file=.dockerconfigjson={{ base_dir }}/working/service-telemetry-framework-index/config.json --type=kubernetes.io/dockerconfigjson + ignore_errors: true + +- name: Get the ose-operator-registry ImageStream + ansible.builtin.command: + cmd: oc get -n {{ namespace }} ImageStream ose-operator-registry + register: ose_op_registry_is + ignore_errors: true + +- name: Create ImageStream for ose-operator-registry + ansible.builtin.command: oc import-image -n {{ namespace }} ose-operator-registry:{{ default_operator_registry_image_tag }} --from={{ default_operator_registry_image_base }}:{{ default_operator_registry_image_tag }} --confirm + when: ose_op_registry_is.rc != 0 + +- name: Delete the existing imagestream, if it exists + ansible.builtin.command: oc delete imagestream -n {{ namespace }} service-telemetry-framework-index + ignore_errors: true + +- name: Create ImageStream for service-telemetry-framework-index + ansible.builtin.command: oc create imagestream -n {{ namespace }} service-telemetry-framework-index + +- name: Get STF index image stream + ansible.builtin.command: + cmd: oc get -n {{ namespace }} ImageStream service-telemetry-framework-index + register: stf_index_imagestream + ignore_errors: true + +- when: stf_index_imagestream.rc != 0 + name: Create BuildConfig for service-telemetry-framework-index + kubernetes.core.k8s: + definition: + apiVersion: build.openshift.io/v1 + kind: BuildConfig + metadata: + annotations: + openshift.io/generated-by: stf-run-ci + labels: + build: service-telemetry-framework-index + name: service-telemetry-framework-index + namespace: "{{ namespace }}" + spec: + failedBuildsHistoryLimit: 5 + nodeSelector: null + output: + to: + kind: ImageStreamTag + name: service-telemetry-framework-index:latest + postCommit: {} + resources: {} + runPolicy: Serial + source: + dockerfile: | + # The base image is expected to contain + # /bin/opm (with a serve subcommand) and /bin/grpc_health_probe + FROM {{default_operator_registry_image_base}}:{{default_operator_registry_image_tag}} + + COPY --chmod=666 index.yaml /configs/ + + RUN mkdir /tmp/auth/ + # we need the contents of the mounted build volume from secret placed into config.json + RUN cp /opt/app-root/auth/.dockerconfigjson /tmp/auth/config.json + RUN DOCKER_CONFIG=/tmp/auth /bin/opm --skip-tls-verify render {{ sto_bundle_image_path }} {{ sgo_bundle_image_path }} --output=yaml >> /configs/index.yaml + + ENTRYPOINT ["/bin/opm"] + CMD ["serve", "/configs"] + # Set DC-specific label for the location of the DC root directory + # in the image + LABEL operators.operatorframework.io.index.configs.v1=/configs + type: Dockerfile + strategy: + dockerStrategy: + from: + kind: ImageStreamTag + name: "ose-operator-registry:{{default_operator_registry_image_tag}}" + volumes: + - mounts: + - destinationPath: /opt/app-root/auth + name: pull-secret + source: + secret: + defaultMode: 420 + secretName: service-telemetry-framework-index-dockercfg + type: Secret + type: Docker + successfulBuildsHistoryLimit: 5 + +- name: Get builds of service-telemetry-framework-index + kubernetes.core.k8s_info: + api_version: build.openshift.io/v1 + kind: Build + namespace: "{{ namespace }}" + label_selectors: + - "build=service-telemetry-framework-index" + register: index_builds + +- when: index_builds.resources | length == 0 + block: + - name: Create index.yaml base for index image + 
ansible.builtin.template: + src: index-yaml.j2 + dest: "{{ base_dir }}/working/service-telemetry-framework-index/index.yaml" + + - name: Build service-telemetry-framework-index + ansible.builtin.command: oc start-build -n "{{ namespace }}" service-telemetry-framework-index --wait --from-dir {{ base_dir }}/working/service-telemetry-framework-index + +- name: Create CloudOps CatalogSource + kubernetes.core.k8s: + definition: + apiVersion: operators.coreos.com/v1alpha1 + kind: CatalogSource + metadata: + name: service-telemetry-framework-operators + namespace: "{{ namespace }}" + spec: + displayName: CloudOps Operators + image: "{{ stf_index_image_path }}" + publisher: CloudOps + sourceType: grpc + grpcPodConfig: + securityContextConfig: legacy + updateStrategy: + registryPoll: + interval: 1m diff --git a/build/stf-run-ci/tasks/deploy_stf.yml b/build/stf-run-ci/tasks/deploy_stf.yml index bc49897c0..097906ec3 100644 --- a/build/stf-run-ci/tasks/deploy_stf.yml +++ b/build/stf-run-ci/tasks/deploy_stf.yml @@ -1,7 +1,7 @@ # NOTE: be aware that if the API version changes for the ServiceTelemetry # object that it'll need to be updated here -- name: Create default ServiceTelemetry manifest with observabilityStrategy use_community - set_fact: +- name: Create default ServiceTelemetry manifest with an observabilityStrategy other than none + ansible.builtin.set_fact: service_telemetry_manifest: | apiVersion: infra.watch/v1beta1 kind: ServiceTelemetry @@ -9,7 +9,7 @@ name: default namespace: "{{ namespace }}" spec: - observabilityStrategy: "use_community" + observabilityStrategy: "{{ __service_telemetry_observability_strategy }}" alerting: alertmanager: storage: @@ -34,15 +34,6 @@ events: elasticsearch: enabled: {{ __service_telemetry_events_enabled }} - storage: - strategy: {{ "ephemeral" if __service_telemetry_storage_ephemeral_enabled else "persistent" }} - {% if __service_telemetry_storage_persistent_storage_class is defined %} - persistent: - storageClass: {{ __service_telemetry_storage_persistent_storage_class }} - {% endif %} - certificates: - endpointCertDuration: {{ __service_telemetry_events_certificates_endpoint_cert_duration }} - caCertDuration: {{ __service_telemetry_events_certificates_ca_cert_duration }} metrics: prometheus: enabled: {{ __service_telemetry_metrics_enabled }} @@ -52,19 +43,15 @@ persistent: storageClass: {{ __service_telemetry_storage_persistent_storage_class }} {% endif %} - logs: - loki: - enabled: false - replicationFactor: 1 - flavor: 1x.extra-small - storage: - objectStorageSecret: test {% if __service_telemetry_storage_persistent_storage_class is defined %} storageClass: {{ __service_telemetry_storage_persistent_storage_class }} {% endif %} transports: qdr: enabled: true + {% if __service_telemetry_transports_qdr_auth is defined %} + auth: "{{ __service_telemetry_transports_qdr_auth }}" + {% endif %} certificates: endpointCertDuration: {{ __service_telemetry_transports_certificates_endpoint_cert_duration }} caCertDuration: {{ __service_telemetry_transports_certificates_ca_cert_duration }} @@ -72,10 +59,10 @@ enabled: {{ __service_telemetry_high_availability_enabled }} when: - service_telemetry_manifest is not defined - - __service_telemetry_observability_strategy == "use_community" + - __service_telemetry_observability_strategy != "none" - name: Create default ServiceTelemetry manifest with observabilityStrategy none - set_fact: + ansible.builtin.set_fact: service_telemetry_manifest: | apiVersion: infra.watch/v1beta1 kind: ServiceTelemetry @@ -89,10 +76,10 @@ - 
__service_telemetry_observability_strategy == "none" - name: Show ServiceTelemetry manifest - debug: + ansible.builtin.debug: var: service_telemetry_manifest | from_yaml - name: Create ServiceTelemetry instance - k8s: + kubernetes.core.k8s: definition: '{{ service_telemetry_manifest }}' diff --git a/build/stf-run-ci/tasks/main.yml b/build/stf-run-ci/tasks/main.yml index 3041d22ea..cf2b0a880 100644 --- a/build/stf-run-ci/tasks/main.yml +++ b/build/stf-run-ci/tasks/main.yml @@ -1,70 +1,120 @@ --- # tasks file for stf-run-ci + +# -- initial setup - name: Setup default values - set_fact: + ansible.builtin.set_fact: branch: "{{ working_branch | default('master') }}" - namespace: "{{ working_namespace | default('service-telemetry') }}" + namespace: "{{ namespace if namespace is defined else (working_namespace | default('service-telemetry'))}}" - name: Set default image paths for local builds - set_fact: + ansible.builtin.set_fact: sgo_image_path: "{{ __internal_registry_path }}/{{ namespace }}/smart-gateway-operator:{{ sgo_image_tag }}" sto_image_path: "{{ __internal_registry_path }}/{{ namespace }}/service-telemetry-operator:{{ sto_image_tag }}" sg_core_image_path: "{{ __internal_registry_path }}/{{ namespace }}/sg-core:{{ sg_core_image_tag }}" sg_bridge_image_path: "{{ __internal_registry_path }}/{{ namespace }}/sg-bridge:{{ sg_bridge_image_tag }}" prometheus_webhook_snmp_image_path: "{{ __internal_registry_path }}/{{ namespace }}/prometheus-webhook-snmp:{{ prometheus_webhook_snmp_image_tag }}" +- name: Set default image paths for bundle and index builds + ansible.builtin.set_fact: + sgo_bundle_image_path: "{{ __internal_registry_path }}/{{ namespace }}/smart-gateway-operator-bundle:{{ sgo_bundle_image_tag }}" + sto_bundle_image_path: "{{ __internal_registry_path }}/{{ namespace }}/service-telemetry-operator-bundle:{{ sto_bundle_image_tag }}" + stf_index_image_path: "{{ __internal_registry_path }}/{{ namespace }}/service-telemetry-framework-index:{{ stf_index_image_tag }}" + - name: Fail on mutually exclusive flags - fail: + ansible.builtin.fail: msg: __deploy_from_bundles_enabled not currently supported with __local_build_enabled (but should be) when: __local_build_enabled | bool and __deploy_from_bundles_enabled | bool +- name: Fail when deploying from index image and local build disabled + ansible.builtin.fail: + msg: __deploy_from_index_enabled must also have __local_build_enabled + when: __deploy_from_index_enabled | bool and not __local_build_enabled | bool + +- name: Fail when deploying from index images and deployment from bundles also requested (mutually exclusive methods) + ansible.builtin.fail: + msg: __deploy_from_index_enabled can not be used with __deploy_from_bundles_enabled + when: __deploy_from_index_enabled | bool and __deploy_from_bundles_enabled | bool + - name: Get the list of nodes - k8s_info: + kubernetes.core.k8s_info: kind: Node register: node_info +- name: Get OCP version + ansible.builtin.shell: oc version -o yaml | grep openshiftVersion | awk '{print $2}' + register: ocp_ver + - name: Find out if we are using crc by looking at the node hostnames - set_fact: + ansible.builtin.set_fact: is_crc: "{{ True if 'crc' in node_info.resources[0].metadata.labels[\"kubernetes.io/hostname\"] else False }}" +# -- prepare environment and cleanup - name: Clean up any existing global artifacts - include_tasks: pre-clean.yml + ansible.builtin.include_tasks: pre-clean.yml + tags: + - pre-clean - name: Setup supporting Operator subscriptions - include_tasks: setup_base.yml + 
ansible.builtin.include_tasks: setup_base.yml tags: - deploy +- name: Deploy ES for events testing + ansible.builtin.include_tasks: setup_elasticsearch.yml + - name: Set default base dir if not provided - set_fact: + ansible.builtin.set_fact: base_dir: "{{ playbook_dir }}" when: base_dir | length == 0 -- name: Get new operator sdk - when: __local_build_enabled | bool or __deploy_from_bundles_enabled | bool - command: "{{ base_dir }}/get_new_operator_sdk.sh {{ new_operator_sdk_version }}" - +- name: Get operator_sdk_v0 (build bundles) + ansible.builtin.command: + cmd: "./get_operator_sdk.sh {{ operator_sdk_v0 }}" + creates: "{{ base_dir }}/working/operator-sdk-{{ operator_sdk_v0 }}" + chdir: "{{ base_dir }}" + +- name: Get operator_sdk_v1 (deploy from bundles) + when: __local_build_enabled | bool or __deploy_from_bundles_enabled | bool or __deploy_from_index_enabled | bool + ansible.builtin.command: + cmd: "{{ base_dir }}/get_operator_sdk.sh {{ operator_sdk_v1 }}" + creates: "{{ base_dir }}/working/operator-sdk-{{ operator_sdk_v1 }}" + chdir: "{{ base_dir }}" + +- name: Set logfile_dir + when: not (logfile_dir is defined) + ansible.builtin.set_fact: + logfile_dir: "{{ base_dir }}/working/logs" + +- name: Make sure the logging dir exists + ansible.builtin.command: + cmd: mkdir -p {{ logfile_dir }} + creates: "{{ logfile_dir }}" + +# -- create artifacts - when: __local_build_enabled | bool + tags: + - create_builds block: - name: Setup supporting repositories - include_tasks: clone_repos.yml + ansible.builtin.include_tasks: clone_repos.yml tags: - clone - name: Create base build list - set_fact: + ansible.builtin.set_fact: build_list: - - { name: service-telemetry-operator, dockerfile_path: build/Dockerfile, image_reference_name: sto_image_path, working_build_dir: ../ } - - { name: smart-gateway-operator, dockerfile_path: build/Dockerfile, image_reference_name: sgo_image_path, working_build_dir: ./working/smart-gateway-operator } - - { name: sg-core, dockerfile_path: build/Dockerfile, image_reference_name: sg_core_image_path, working_build_dir: ./working/sg-core } - - { name: sg-bridge, dockerfile_path: build/Dockerfile, image_reference_name: sg_bridge_image_path, working_build_dir: ./working/sg-bridge } - - { name: prometheus-webhook-snmp, dockerfile_path: Dockerfile, image_reference_name: prometheus_webhook_snmp_image_path, working_build_dir: ./working/prometheus-webhook-snmp } + - {name: service-telemetry-operator, dockerfile_path: build/Dockerfile, image_reference_name: sto_image_path, working_build_dir: "{{ base_dir }}/../"} + - {name: smart-gateway-operator, dockerfile_path: build/Dockerfile, image_reference_name: sgo_image_path, working_build_dir: "{{ base_dir }}/working/smart-gateway-operator"} + - {name: sg-core, dockerfile_path: build/Dockerfile, image_reference_name: sg_core_image_path, working_build_dir: "{{ base_dir }}/working/sg-core"} + - {name: sg-bridge, dockerfile_path: build/Dockerfile, image_reference_name: sg_bridge_image_path, working_build_dir: "{{ base_dir }}/working/sg-bridge"} + - {name: prometheus-webhook-snmp, dockerfile_path: Dockerfile, image_reference_name: prometheus_webhook_snmp_image_path, working_build_dir: "{{ base_dir }}/working/prometheus-webhook-snmp"} - - debug: - var: build_list + - ansible.builtin.debug: + var: build_list - name: Create builds and artifacts - include_tasks: create_builds.yml + ansible.builtin.include_tasks: create_builds.yml loop: "{{ build_list }}" loop_control: loop_var: artifact @@ -72,36 +122,95 @@ - build - name: Setup STF using 
local artifacts - include_tasks: setup_stf_local_build.yml + ansible.builtin.include_tasks: setup_stf_local_build.yml tags: - deploy -- block: +- when: __deploy_from_index_enabled | bool or __deploy_from_bundles_enabled | bool + name: Relax the pod security admission controls to allow local catalog index registry pods + kubernetes.core.k8s: + definition: + apiVersion: v1 + kind: Namespace + metadata: + name: "{{ namespace }}" + labels: + security.openshift.io/scc.podSecurityLabelSync: "false" + pod-security.kubernetes.io/enforce: baseline + pod-security.kubernetes.io/audit: restricted + pod-security.kubernetes.io/warn: restricted + +- when: __deploy_from_index_enabled | bool + tags: + - create_bundles + block: + - name: Create base build list + ansible.builtin.set_fact: + bundle_build_list: + - { name: service-telemetry-operator-bundle, dockerfile_path: Dockerfile, image_reference_name: sto_bundle_image_path, working_build_dir: "{{ base_dir }}/working/service-telemetry-operator-bundle" } + - { name: smart-gateway-operator-bundle, dockerfile_path: Dockerfile, image_reference_name: sgo_bundle_image_path, working_build_dir: "{{ base_dir }}/working/smart-gateway-operator-bundle" } + + - ansible.builtin.debug: + var: bundle_build_list + + - name: Create bundle builds and artifacts + ansible.builtin.include_tasks: create_builds.yml + loop: "{{ bundle_build_list }}" + loop_control: + loop_var: artifact + tags: + - build + + - name: Create file-based catalog + ansible.builtin.include_tasks: create_catalog.yml + +# -- deploy +- when: not __local_build_enabled | bool + block: - name: Setup Service Telemetry Framework from supplied bundle URLs - include_tasks: setup_stf_from_bundles.yml + ansible.builtin.include_tasks: setup_stf_from_bundles.yml when: __deploy_from_bundles_enabled | bool - name: Setup Service Telemetry Framework from application registry - include_tasks: setup_stf.yml + ansible.builtin.include_tasks: setup_stf.yml when: not __deploy_from_bundles_enabled | bool - when: not __local_build_enabled | bool - +- when: __deploy_from_index_enabled | bool + name: Subscribe to locally built Service Telemetry Operator + kubernetes.core.k8s: + definition: + apiVersion: operators.coreos.com/v1alpha1 + kind: Subscription + metadata: + labels: + operators.coreos.com/service-telemetry-operator.service-telemetry: "" + name: service-telemetry-operator + namespace: "{{ namespace }}" + spec: + channel: stable-1.5 + installPlanApproval: Automatic + name: service-telemetry-operator + source: service-telemetry-framework-operators + sourceNamespace: "{{ namespace }}" + +# -- check if we're ready to instantiate - name: Pre-flight checks - include_tasks: preflight_checks.yml + ansible.builtin.include_tasks: preflight_checks.yml -- block: +# -- create a ServiceTelemetry object to stand up the STF instance +- when: __deploy_stf | bool + block: - name: Deploy an instance of STF - include_tasks: deploy_stf.yml + ansible.builtin.include_tasks: deploy_stf.yml - name: Validate system is operational - shell: | - OCP_PROJECT="{{ namespace }}" VALIDATION_SCOPE="{{ __service_telemetry_observability_strategy }}" "{{ base_dir }}/validate_deployment.sh" + ansible.builtin.shell: | + OCP_PROJECT="{{ namespace }}" VALIDATION_SCOPE="{{ __service_telemetry_observability_strategy }}" timeout 1200 "{{ base_dir }}/validate_deployment.sh" >> {{ logfile_dir }}/validate_deployment.log 2>&1 args: executable: /bin/bash register: validate_deployment - - debug: - var: validate_deployment.stdout_lines - - when: __deploy_stf | bool + - 
name: Show the result of the validate_deployment script + ansible.builtin.shell: + cmd: | + cat {{ logfile_dir }}/validate_deployment.log diff --git a/build/stf-run-ci/tasks/pre-clean.yml b/build/stf-run-ci/tasks/pre-clean.yml index d86093cce..8e6df8bef 100644 --- a/build/stf-run-ci/tasks/pre-clean.yml +++ b/build/stf-run-ci/tasks/pre-clean.yml @@ -1,6 +1,6 @@ # NOTE: This cleanup step prevents parallel CI jobs - name: Clear out existing CRDs so we don't conflict or fail merge - k8s: + kubernetes.core.k8s: state: absent api_version: apiextensions.k8s.io/v1 kind: CustomResourceDefinition @@ -14,7 +14,7 @@ # The clusterroles and clusterrolebindings are global objects that can be left # behind by failed bundle installs - name: Remove all clusterrolebindings owned by OLM for this namespace - k8s: + kubernetes.core.k8s: state: absent api_version: rbac.authorization.k8s.io/v1 kind: clusterrolebindings @@ -22,7 +22,7 @@ - "olm.owner.namespace = {{ namespace }}" - name: Remove all clusterroles owned by OLM for this namespace - k8s: + kubernetes.core.k8s: state: absent api_version: rbac.authorization.k8s.io/v1 kind: clusterroles @@ -34,7 +34,7 @@ # been enabled. This avoids installing an additional CatalogSource which is no # longer required. - name: Remove OperatorHub.io CatalogSource if it installed - k8s: + kubernetes.core.k8s: state: absent definition: apiVersion: operators.coreos.com/v1alpha1 @@ -48,13 +48,99 @@ displayName: OperatorHub.io Operators publisher: OperatorHub.io +# Upstream Source + Sub from https://github.com/rhobs/observability-operator/tree/main/hack/olm +# Moved to using Community Operators Catalog, so no longer require upstream CatalogSource. Eventually move to Red Hat Operators CatalogSource. +- name: Remove Red Hat Observability Operator CatalogSource if it is installed + kubernetes.core.k8s: + state: absent + definition: + apiVersion: operators.coreos.com/v1alpha1 + kind: CatalogSource + metadata: + annotations: + name: observability-operator + namespace: openshift-marketplace + +- name: Remove CloudOps CatalogSource if it is installed + kubernetes.core.k8s: + state: absent + definition: + apiVersion: operators.coreos.com/v1alpha1 + kind: CatalogSource + metadata: + name: service-telemetry-framework-operators + namespace: "{{ namespace }}" + spec: + displayName: CloudOps Operators + publisher: CloudOps + sourceType: grpc + +- name: Remove Service Telemetry Operator bundle build + kubernetes.core.k8s: + state: absent + api_version: build.openshift.io/v1 + kind: Build + namespace: "{{ namespace }}" + label_selectors: + - "build=service-telemetry-operator-bundle" + +- name: Remove Smart Gateway Operator bundle build + kubernetes.core.k8s: + state: absent + api_version: build.openshift.io/v1 + kind: Build + namespace: "{{ namespace }}" + label_selectors: + - "build=smart-gateway-operator-bundle" + +- name: Remove Service Telemetry Framework index build + kubernetes.core.k8s: + state: absent + api_version: build.openshift.io/v1 + kind: Build + namespace: "{{ namespace }}" + label_selectors: + - "build=service-telemetry-framework-index" + +- name: Remove service-telemetry-operator-bundle CatalogSource (bundle deploy) + kubernetes.core.k8s: + state: absent + definition: + apiVersion: operators.coreos.com/v1alpha1 + kind: CatalogSource + metadata: + name: service-telemetry-operator-catalog + namespace: "{{ namespace }}" + +- name: Remove smart-gateway-operator-bundle CatalogSource (bundle deploy) + kubernetes.core.k8s: + state: absent + definition: + apiVersion: 
operators.coreos.com/v1alpha1
+      kind: CatalogSource
+      metadata:
+        name: smart-gateway-operator-catalog
+        namespace: "{{ namespace }}"
+
 # Remove the cert manager since we install it as part of the CI/documented pre-install process
 - name: Remove openshift-cert-manager-operator namespace
-  k8s:
+  kubernetes.core.k8s:
     state: absent
-    wait: yes
+    wait: true
     definition:
       apiVersion: project.openshift.io/v1
       kind: Project
       metadata:
         name: openshift-cert-manager-operator
+
+- name: Remove Elasticsearch
+  ignore_errors: true
+  kubernetes.core.k8s:
+    state: absent
+    wait: true
+    definition:
+      apiVersion: elasticsearch.k8s.elastic.co/v1
+      kind: Elasticsearch
+      metadata:
+        name: elasticsearch
+        namespace: "{{ namespace }}"
diff --git a/build/stf-run-ci/tasks/preflight_checks.yml b/build/stf-run-ci/tasks/preflight_checks.yml
index 9b9036de6..5c68b5405 100644
--- a/build/stf-run-ci/tasks/preflight_checks.yml
+++ b/build/stf-run-ci/tasks/preflight_checks.yml
@@ -1,4 +1,26 @@
 ---
-- name: Wait for Service Telemetry Operator to be Succeeded
-  shell: |
-    while ! oc get csv -n "{{ namespace }}" | grep service-telemetry-operator | grep Succeeded; do echo "waiting for Service Telemetry Operator..."; sleep 3; done
+# Try for 10 minutes to get an output
+- block:
+    - name: "Wait for up to 10 minutes for Service Telemetry Operator to be Succeeded"
+      ansible.builtin.shell: |
+        oc get csv -n "{{ namespace }}" | grep service-telemetry-operator | grep Succeeded
+      register: output
+      retries: 60
+      delay: 10
+      until: output.stdout | length != 0
+  rescue:
+    - name: "Show CSV statuses"
+      ansible.builtin.command:
+        cmd: |
+          oc get csv -n "{{ namespace }}"
+
+    - name: "Get service-telemetry-operator CSV information"
+      ansible.builtin.shell:
+        cmd: |
+          oc describe csv $(oc get csv | grep "service-telemetry-operator" | awk '{print $1}') > {{ logfile_dir }}/oc_get_csv_sto.log 2>&1
+          cat {{ logfile_dir }}/oc_get_csv_sto.log
+
+    - name: "Show fail message if CSV isn't Succeeded after the allotted time"
+      ansible.builtin.fail:
+        msg: "Service Telemetry Operator CSV not Succeeded after 10 minutes. Check {{ logfile_dir }}/oc_get_csv_sto.log for more information"
+      when: output.rc != 0
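
For comparison, the same wait can be expressed with kubernetes.core.k8s_info (already a dependency of these tasks) instead of shelling out to oc. A minimal sketch, assuming OLM's ClusterServiceVersion status.phase field; the filter chain below is illustrative and not part of the patch:

    - name: Wait for the Service Telemetry Operator CSV to report Succeeded (sketch)
      kubernetes.core.k8s_info:
        api_version: operators.coreos.com/v1alpha1
        kind: ClusterServiceVersion
        namespace: "{{ namespace }}"
      register: csvs
      retries: 60
      delay: 10
      # succeed once at least one service-telemetry-operator CSV reaches phase Succeeded
      until: >-
        csvs.resources
        | selectattr('metadata.name', 'match', 'service-telemetry-operator')
        | selectattr('status.phase', 'defined')
        | selectattr('status.phase', 'eq', 'Succeeded')
        | list | length > 0
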
diff --git a/build/stf-run-ci/tasks/setup_base.yml b/build/stf-run-ci/tasks/setup_base.yml
index 9b0c838f9..cf9c92fdf 100644
--- a/build/stf-run-ci/tasks/setup_base.yml
+++ b/build/stf-run-ci/tasks/setup_base.yml
@@ -1,6 +1,6 @@
 ---
 - name: Setup OperatorHub dependencies
-  k8s:
+  kubernetes.core.k8s:
     definition:
       apiVersion: config.openshift.io/v1
       kind: OperatorHub
@@ -13,11 +13,11 @@
           name: certified-operators
         - disabled: false
           name: redhat-operators
-        - disabled: "{{ false if __service_telemetry_observability_strategy == 'use_community' else true }}"
+        - disabled: false
           name: community-operators

-- name: Create OperatorGroup
-  k8s:
+- name: Create OperatorGroup for service-telemetry
+  kubernetes.core.k8s:
     definition:
       apiVersion: operators.coreos.com/v1
       kind: OperatorGroup
@@ -28,88 +28,139 @@
       targetNamespaces:
       - "{{ namespace }}"

-- block:
-  - name: Create openshift-cert-manager-operator namespace
-    k8s:
-      definition:
-        apiVersion: project.openshift.io/v1
-        kind: Project
-        metadata:
-          name: openshift-cert-manager-operator
-        spec:
-          finalizers:
-          - kubernetes
+# deploy cert-manager from tech-preview when using versions of OCP < 4.12
+- when: not __deploy_from_index_enabled | bool and ocp_ver.stdout is version ('4.12', '<')
+  block:
+  - name: Create openshift-cert-manager-operator namespace
+    kubernetes.core.k8s:
+      definition:
+        apiVersion: project.openshift.io/v1
+        kind: Project
+        metadata:
+          name: openshift-cert-manager-operator
+        spec:
+          finalizers:
+          - kubernetes

-  - name: Create openshift-cert-manager-operator OperatorGroup
-    k8s:
-      definition:
-        apiVersion: operators.coreos.com/v1
-        kind: OperatorGroup
-        metadata:
-          name: openshift-cert-manager-operator
-          namespace: openshift-cert-manager-operator
-        spec: {}
+  - name: Create openshift-cert-manager-operator OperatorGroup
+    kubernetes.core.k8s:
+      definition:
+        apiVersion: operators.coreos.com/v1
+        kind: OperatorGroup
+        metadata:
+          name: openshift-cert-manager-operator
+          namespace: openshift-cert-manager-operator
+        spec: {}

-  - name: Subscribe to Cert Manager for OpenShift Operator
-    k8s:
-      definition:
-        apiVersion: operators.coreos.com/v1alpha1
-        kind: Subscription
-        metadata:
-          name: openshift-cert-manager-operator
-          namespace: openshift-cert-manager-operator
-        spec:
-          channel: tech-preview
-          installPlanApproval: Automatic
-          name: openshift-cert-manager-operator
-          source: redhat-operators
-          sourceNamespace: openshift-marketplace
+  - name: Subscribe to Cert Manager for OpenShift Operator
+    kubernetes.core.k8s:
+      definition:
+        apiVersion: operators.coreos.com/v1alpha1
+        kind: Subscription
+        metadata:
+          name: openshift-cert-manager-operator
+          namespace: openshift-cert-manager-operator
+        spec:
+          channel: "tech-preview"
+          installPlanApproval: Automatic
+          name: openshift-cert-manager-operator
+          source: redhat-operators
+          sourceNamespace: openshift-marketplace

-- name: Subscribe to Elastic Cloud on Kubernetes Operator
-  k8s:
+# deploy cert-manager from stable-v1 in 4.12 and later using namespace scoped operator
+- when: not __deploy_from_index_enabled | bool and ocp_ver.stdout is version ('4.12', '>=')
+  block:
+  - name: Subscribe to Cert Manager for OpenShift Operator
+    kubernetes.core.k8s:
+      definition:
+        apiVersion: operators.coreos.com/v1alpha1
+        kind: Subscription
+        metadata:
+          labels:
+            operators.coreos.com/openshift-cert-manager-operator.service-telemetry: ""
+          name: openshift-cert-manager-operator-stable-v1-redhat-operators-openshift-marketplace
+          namespace: "{{ namespace }}"
+        spec:
+          channel: stable-v1
+          installPlanApproval: Automatic
+          name: openshift-cert-manager-operator
+          source: redhat-operators
+          sourceNamespace: openshift-marketplace
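
The two cert-manager paths above are gated on the ocp_ver fact, which main.yml builds by grep/awk-ing `oc version` output. A sketch of driving the same comparison from the ClusterVersion API object instead; the status.desired.version field path is an assumption, and this is not part of the patch:

    - name: Read the cluster version from the API instead of parsing CLI output (sketch)
      kubernetes.core.k8s_info:
        api_version: config.openshift.io/v1
        kind: ClusterVersion
        name: version
      register: cluster_version

    - name: Select the cert-manager channel the same way the blocks above do
      ansible.builtin.set_fact:
        cert_manager_channel: "{{ 'stable-v1' if cluster_version.resources[0].status.desired.version is version('4.12', '>=') else 'tech-preview' }}"
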
+
+- when: not __deploy_from_index_enabled | bool
+  block:
+  - name: Subscribe to AMQ Interconnect Operator
+    kubernetes.core.k8s:
+      definition:
+        apiVersion: operators.coreos.com/v1alpha1
+        kind: Subscription
+        metadata:
+          name: amq7-interconnect-operator
+          namespace: "{{ namespace }}"
+        spec:
+          channel: 1.10.x
+          installPlanApproval: Automatic
+          name: amq7-interconnect-operator
+          source: redhat-operators
+          sourceNamespace: openshift-marketplace
+
+  - name: Subscribe to Prometheus Operator
+    kubernetes.core.k8s:
+      definition:
+        apiVersion: operators.coreos.com/v1alpha1
+        kind: Subscription
+        metadata:
+          name: prometheus
+          namespace: "{{ namespace }}"
+        spec:
+          channel: beta
+          installPlanApproval: Automatic
+          name: prometheus
+          source: community-operators
+          sourceNamespace: openshift-marketplace
+    when:
+      - __service_telemetry_observability_strategy == "use_community"
+
+- name: Subscribe to Red Hat Observability Operator
+  kubernetes.core.k8s:
     definition:
       apiVersion: operators.coreos.com/v1alpha1
       kind: Subscription
       metadata:
-        name: elasticsearch-eck-operator-certified
-        namespace: "{{ namespace }}"
+        labels:
+          operators.coreos.com/observability-operator.openshift-operators: ""
+        name: observability-operator
+        namespace: openshift-operators
       spec:
         channel: stable
         installPlanApproval: Automatic
-        name: elasticsearch-eck-operator-certified
-        source: certified-operators
+        name: observability-operator
+        source: community-operators
         sourceNamespace: openshift-marketplace
   when:
-    - __service_telemetry_observability_strategy == "use_community"
+    - __service_telemetry_observability_strategy in ['use_redhat', 'use_hybrid']

-- name: Subscribe to AMQ Interconnect Operator
-  k8s:
+- name: Subscribe to Elastic Cloud on Kubernetes Operator
+  kubernetes.core.k8s:
     definition:
       apiVersion: operators.coreos.com/v1alpha1
       kind: Subscription
       metadata:
-        name: amq7-interconnect-operator
+        name: elasticsearch-eck-operator-certified
         namespace: "{{ namespace }}"
       spec:
-        channel: 1.10.x
+        channel: stable
         installPlanApproval: Automatic
-        name: amq7-interconnect-operator
-        source: redhat-operators
+        name: elasticsearch-eck-operator-certified
+        source: certified-operators
         sourceNamespace: openshift-marketplace

-- name: Subscribe to Prometheus Operator
-  k8s:
-    definition:
-      apiVersion: operators.coreos.com/v1alpha1
-      kind: Subscription
-      metadata:
-        name: prometheus
-        namespace: "{{ namespace }}"
-      spec:
-        channel: beta
-        installPlanApproval: Automatic
-        name: prometheus
-        source: community-operators
-        sourceNamespace: openshift-marketplace
-  when:
-    - __service_telemetry_observability_strategy == "use_community"
+- name: Wait for Elasticsearch CRD to appear
+  kubernetes.core.k8s_info:
+    api_version: apiextensions.k8s.io/v1
+    kind: CustomResourceDefinition
+    name: elasticsearches.elasticsearch.k8s.elastic.co
+  register: eckCRD
+  until: eckCRD.resources[0] is defined
+  retries: 5
+  delay: 30
diff --git a/build/stf-run-ci/tasks/setup_elasticsearch.yml b/build/stf-run-ci/tasks/setup_elasticsearch.yml
new file mode 100644
index 000000000..ce227537e
--- /dev/null
+++ b/build/stf-run-ci/tasks/setup_elasticsearch.yml
@@ -0,0 +1,32 @@
+- name: Set default ElasticSearch manifest
+  ansible.builtin.set_fact:
+    elasticsearch_manifest: "{{ lookup('template', './manifest_elasticsearch.j2') | from_yaml }}"
+  when: elasticsearch_manifest is 
not defined + +- name: Create an instance of Elasticsearch + kubernetes.core.k8s: + state: present + definition: + '{{ elasticsearch_manifest }}' + +- name: Look up the newly generated ES Certs + kubernetes.core.k8s_info: + api_version: v1 + kind: Secret + name: elasticsearch-es-http-certs-public + namespace: '{{ namespace }}' + register: elasticsearch_certs + until: elasticsearch_certs.resources[0].data["ca.crt"] is defined + retries: 5 + delay: 30 + +- name: Copy the ES CA cert to our TLS secret + kubernetes.core.k8s: + definition: + apiVersion: v1 + kind: Secret + metadata: + name: elasticsearch-es-cert + namespace: '{{ namespace }}' + data: + ca.crt: '{{ elasticsearch_certs.resources[0].data["ca.crt"] }}' \ No newline at end of file diff --git a/build/stf-run-ci/tasks/setup_stf.yml b/build/stf-run-ci/tasks/setup_stf.yml index 66cc1f201..ce4713931 100644 --- a/build/stf-run-ci/tasks/setup_stf.yml +++ b/build/stf-run-ci/tasks/setup_stf.yml @@ -1,6 +1,6 @@ --- - name: Set default InfraWatch OperatorSource manifest - set_fact: + ansible.builtin.set_fact: infrawatch_catalog_source_manifest: | apiVersion: operators.coreos.com/v1alpha1 kind: CatalogSource @@ -9,7 +9,7 @@ namespace: openshift-marketplace spec: displayName: InfraWatch Operators - image: quay.io/infrawatch-operators/infrawatch-catalog:unstable + image: quay.io/infrawatch-operators/infrawatch-catalog:stable-1.5 publisher: InfraWatch sourceType: grpc updateStrategy: @@ -18,7 +18,7 @@ when: infrawatch_catalog_source_manifest is not defined - name: Set default Smart Gateway Operator Subscription manifest - set_fact: + ansible.builtin.set_fact: smart_gateway_operator_subscription_manifest: | apiVersion: operators.coreos.com/v1alpha1 kind: Subscription @@ -26,7 +26,7 @@ name: smart-gateway-operator namespace: "{{ namespace }}" spec: - channel: unstable + channel: stable-1.5 installPlanApproval: Automatic name: smart-gateway-operator source: infrawatch-operators @@ -34,7 +34,7 @@ when: smart_gateway_operator_subscription_manifest is not defined - name: Set default Service Telemetry Operator Subscription manifest - set_fact: + ansible.builtin.set_fact: service_telemetry_operator_subscription_manifest: | apiVersion: operators.coreos.com/v1alpha1 kind: Subscription @@ -42,7 +42,7 @@ name: service-telemetry-operator namespace: "{{ namespace }}" spec: - channel: unstable + channel: stable-1.5 installPlanApproval: Automatic name: service-telemetry-operator source: infrawatch-operators @@ -50,16 +50,16 @@ when: service_telemetry_operator_subscription_manifest is not defined - name: Subscribe to Smart Gateway Operator - k8s: + kubernetes.core.k8s: definition: '{{ smart_gateway_operator_subscription_manifest }}' - name: Subscribe to Service Telemetry Operator - k8s: + kubernetes.core.k8s: definition: '{{ service_telemetry_operator_subscription_manifest }}' - name: Enable InfraWatch Catalog Source - k8s: + kubernetes.core.k8s: definition: '{{ infrawatch_catalog_source_manifest }}' diff --git a/build/stf-run-ci/tasks/setup_stf_from_bundles.yml b/build/stf-run-ci/tasks/setup_stf_from_bundles.yml index a3305440a..8439dce4a 100644 --- a/build/stf-run-ci/tasks/setup_stf_from_bundles.yml +++ b/build/stf-run-ci/tasks/setup_stf_from_bundles.yml @@ -1,6 +1,7 @@ -- block: +- when: setup_bundle_registry_auth | bool + block: - name: Get existing Pull Secret from openshift config - k8s_info: + kubernetes.core.k8s_info: api_version: v1 kind: Secret namespace: openshift-config @@ -8,11 +9,11 @@ register: pull_secret - name: Decode docker config json - 
set_fact:
+  ansible.builtin.set_fact:
     dockerconfigjson: "{{ pull_secret.resources[0].data['.dockerconfigjson'] | b64decode }}"

 - name: Merge registry creds into auth section of docker config
-  set_fact:
+  ansible.builtin.set_fact:
     new_dockerauths: "{{ dockerconfigjson['auths'] | combine(
       {
         pull_secret_registry:{
           'auth': (pull_secret_user ~ ':' ~ pull_secret_pass) | b64encode
@@ -20,11 +21,11 @@
       })
     }}"

 - name: Create new docker config
-  set_fact:
+  ansible.builtin.set_fact:
     new_dockerconfigjson: "{{ dockerconfigjson | combine({'auths': new_dockerauths}) }}"

 - name: Create Pull Secret for bundle registry access (in the local namespace)
-  k8s:
+  kubernetes.core.k8s:
     state: present
     definition:
       apiVersion: v1
@@ -37,7 +38,7 @@
       .dockerconfigjson: "{{ new_dockerconfigjson | tojson | b64encode }}"

 - name: Create Pull Secret for bundle registry access (in the global namespace)
-  k8s:
+  kubernetes.core.k8s:
     state: present
     definition:
       apiVersion: v1
@@ -49,11 +50,9 @@
     data:
       .dockerconfigjson: "{{ new_dockerconfigjson | tojson | b64encode }}"
-  tags:
-    - bundle_registry_auth

-- name: Create registry CA Cert
-  k8s:
+- when: setup_bundle_registry_tls_ca | bool
+  name: Create registry CA Cert
+  kubernetes.core.k8s:
     state: present
     definition:
       apiVersion: v1
@@ -64,10 +63,9 @@
       namespace: "{{ namespace }}"
     data:
       cert.pem: "{{ lookup('file', 'CA.pem') | b64encode }}"
-  tags:
-    - bundle_registry_tls_ca

-- name: Patch the default service account to use our pull secret
+- when: setup_bundle_registry_tls_ca | bool
+  name: Patch the default service account to use our pull secret
   kubernetes.core.k8s_json_patch:
     kind: ServiceAccount
     namespace: "{{ namespace }}"
@@ -77,13 +75,30 @@
         path: /imagePullSecrets
         value:
           - name: pull-secret
-  tags:
-    - bundle_registry_tls_ca
+
+  # When the task is skipped, pull_secret is still defined. It is set to the task output i.e.
+  # "pull_secret": {
+  #     "changed": false,
+  #     "skip_reason": "Conditional result was False",
+  #     "skipped": true
+  # }
+- name: "Set pull_secret to a zero-length string, if setup_bundle_registry_auth is false"
+  when: not (setup_bundle_registry_auth | bool)
+  ansible.builtin.set_fact:
+    pull_secret: ''
+
+- name: "Ensure that the bundle paths are set."
+  ansible.builtin.assert:
+    that:
+      - '__smart_gateway_bundle_image_path is defined and __smart_gateway_bundle_image_path != None'
+      - '__service_telemetry_bundle_image_path is defined and __service_telemetry_bundle_image_path != None'
+    fail_msg: "Bundle path(s) not set. __smart_gateway_bundle_image_path is '{{ __smart_gateway_bundle_image_path }}' and __service_telemetry_bundle_image_path is '{{ __service_telemetry_bundle_image_path }}'. Both values need to be set."
+    success_msg: "Bundle paths are defined and not None"
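
The assert above guards the two bundle image paths required by the OLM bundle deploy tasks that follow. A vars-file sketch that satisfies it when running build/run-ci.yaml with __deploy_from_bundles_enabled; the image paths are illustrative placeholders, not published tags:

    ---
    # sketch: vars for a bundle-based deploy (values illustrative)
    __local_build_enabled: false
    __deploy_from_bundles_enabled: true
    __smart_gateway_bundle_image_path: quay.io/example/smart-gateway-operator-bundle:latest
    __service_telemetry_bundle_image_path: quay.io/example/service-telemetry-operator-bundle:latest
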

 - name: Deploy SGO via OLM bundle
-  shell:
-    cmd: "{{ base_dir }}/working/operator-sdk run bundle {{__smart_gateway_bundle_image_path}} --pull-secret-name=pull-secret --ca-secret-name=registry-tls-ca --namespace={{ namespace }} --timeout 600s"
+  ansible.builtin.shell:
+    cmd: "{{ base_dir }}/working/operator-sdk-{{ operator_sdk_v1 }} --verbose run bundle {{ __smart_gateway_bundle_image_path }} {% if pull_secret | length > 0 %} --pull-secret-name=pull-secret --ca-secret-name=registry-tls-ca {% endif %} --namespace={{ namespace }} --timeout 600s"

 - name: Deploy STO via OLM bundle
-  shell:
-    cmd: "{{ base_dir }}/working/operator-sdk run bundle {{ __service_telemetry_bundle_image_path}} --pull-secret-name=pull-secret --ca-secret-name=registry-tls-ca --namespace={{ namespace }} --timeout 600s"
+  ansible.builtin.shell:
+    cmd: "{{ base_dir }}/working/operator-sdk-{{ operator_sdk_v1 }} --verbose run bundle {{ __service_telemetry_bundle_image_path }} {% if pull_secret | length > 0 %} --pull-secret-name=pull-secret --ca-secret-name=registry-tls-ca {% endif %} --namespace={{ namespace }} --timeout 600s"
diff --git a/build/stf-run-ci/tasks/setup_stf_local_build.yml b/build/stf-run-ci/tasks/setup_stf_local_build.yml
index a7c3c2578..40774223f 100644
--- a/build/stf-run-ci/tasks/setup_stf_local_build.yml
+++ b/build/stf-run-ci/tasks/setup_stf_local_build.yml
@@ -1,83 +1,109 @@
 ---
-# NOTE: the split filter cuts the image path (quay.io:443/infrawatch/container_image:tag_name) on the colon. Field :-1 (everything but the final field) is the image path, field -1 (final field) is the image tag
-# --- Smart Gateway Operator ---
-- name: Generate Smart Gateway Operator CSV
-  shell:
-    chdir: working/smart-gateway-operator/build
-    cmd: |
-      WORKING_DIR="{{ base_dir }}/working/smart-gateway-operator-bundle" \
-      RELATED_IMAGE_CORE_SMARTGATEWAY={{ sg_core_image_path | parse_image | quote }} \
-      RELATED_IMAGE_BRIDGE_SMARTGATEWAY={{ sg_bridge_image_path | parse_image | quote }} \
-      RELATED_IMAGE_CORE_SMARTGATEWAY_TAG={{ sg_core_image_path | parse_tag | quote }} \
-      RELATED_IMAGE_BRIDGE_SMARTGATEWAY_TAG={{ sg_bridge_image_path | parse_tag | quote }} \
-      OPERATOR_IMAGE={{ sgo_image_path | parse_image | quote }} \
-      OPERATOR_TAG={{ sgo_image_path | parse_tag | quote }} \
-      ./generate_bundle.sh
-  register: generate_bundle_sgo
+# WARNING: bundle generation is not idempotent; the generate_bundle_ output
+# can not be relied upon for reuse in other places

-- name: Results of bundle generation
-  debug:
-    var: generate_bundle_sgo.stdout_lines
+# --- Smart Gateway Operator ---
+- block:
+    - name: Generate Smart Gateway Operator CSV
+      ansible.builtin.shell:
+        chdir: "{{ base_dir }}/working/smart-gateway-operator/build"
+        cmd: |
+          LOGFILE="{{ logfile_dir }}/sgo_gen_bundle.log" \
+          OPERATOR_SDK="{{ base_dir }}/working/operator-sdk-{{ operator_sdk_v0 }}" \
+          WORKING_DIR="{{ base_dir }}/working/smart-gateway-operator-bundle" \
+          RELATED_IMAGE_CORE_SMARTGATEWAY={{ sg_core_image_path | parse_image | quote }} \
+          RELATED_IMAGE_BRIDGE_SMARTGATEWAY={{ sg_bridge_image_path | parse_image | quote }} \
+          RELATED_IMAGE_CORE_SMARTGATEWAY_TAG={{ sg_core_image_path | parse_tag | quote }} \
+          RELATED_IMAGE_BRIDGE_SMARTGATEWAY_TAG={{ sg_bridge_image_path | parse_tag | quote }} \
+          OPERATOR_IMAGE={{ sgo_image_path | parse_image | quote }} \
+          OPERATOR_TAG={{ sgo_image_path | parse_tag | quote }} \
+          ./generate_bundle.sh
+      register: generate_bundle_sgo
+  rescue:
+    # "|| true" is 
needed until https://github.com/infrawatch/smart-gateway-operator/pull/143 is merged + - name: Show generate bundle log + ansible.builtin.shell: + cmd: | + cat {{ logfile_dir }}/sgo_gen_bundle.log || true - name: Replace namespace in SGO role binding - replace: + ansible.builtin.replace: path: "{{ base_dir }}/working/smart-gateway-operator/deploy/role_binding.yaml" regexp: 'placeholder' replace: '{{ namespace }}' -- name: Load Smart Gateway Operator RBAC - command: oc apply -f working/smart-gateway-operator/deploy/{{ item }} -n "{{ namespace }}" - loop: - - service_account.yaml - - role.yaml - - role_binding.yaml - - olm-catalog/smart-gateway-operator/manifests/smartgateway.infra.watch_smartgateways_crd.yaml - - name: Replace namespace in SGO CSV - replace: + ansible.builtin.replace: path: "{{ base_dir }}/working/smart-gateway-operator-bundle/manifests/smart-gateway-operator.clusterserviceversion.yaml" regexp: 'placeholder' replace: '{{ namespace }}' -- name: Load Smart Gateway Operator CSV - shell: oc apply -f working/smart-gateway-operator-bundle/manifests/smart-gateway-operator.clusterserviceversion.yaml -n "{{ namespace }}" +- when: not __deploy_from_index_enabled | bool + block: + - name: Load Smart Gateway Operator RBAC + ansible.builtin.command: + cmd: oc apply -f {{ base_dir }}/working/smart-gateway-operator/deploy/{{ item }} -n "{{ namespace }}" + loop: + - service_account.yaml + - role.yaml + - role_binding.yaml + - olm-catalog/smart-gateway-operator/manifests/smartgateway.infra.watch_smartgateways_crd.yaml + + - name: Load Smart Gateway Operator CSV + ansible.builtin.shell: + cmd: oc apply -f {{ base_dir }}/working/smart-gateway-operator-bundle/manifests/smart-gateway-operator.clusterserviceversion.yaml -n "{{ namespace }}" # --- Service Telemetry Operator --- -- name: Generate Service Telemetry Operator CSV - shell: - chdir: "{{ base_dir }}" - cmd: | - WORKING_DIR="{{ base_dir }}/working/service-telemetry-operator-bundle" \ - RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP={{ prometheus_webhook_snmp_image_path | parse_image | quote }} \ - RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP_TAG={{ prometheus_webhook_snmp_image_path | parse_tag | quote }} \ - OPERATOR_IMAGE={{ sto_image_path | parse_image | quote }} \ - OPERATOR_TAG={{ sto_image_path | parse_tag | quote }} \ - ./generate_bundle.sh +- block: + - name: Generate Service Telemetry Operator CSV + ansible.builtin.shell: + chdir: "{{ base_dir }}" + cmd: | + LOGFILE="{{ logfile_dir }}/sto_gen_bundle.log" \ + OPERATOR_SDK="{{ base_dir }}/working/operator-sdk-{{ operator_sdk_v0 }}" \ + WORKING_DIR="{{ base_dir }}/working/service-telemetry-operator-bundle" \ + RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP={{ prometheus_webhook_snmp_image_path | parse_image | quote }} \ + RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP_TAG={{ prometheus_webhook_snmp_image_path | parse_tag | quote }} \ + OPERATOR_IMAGE={{ sto_image_path | parse_image | quote }} \ + OPERATOR_TAG={{ sto_image_path | parse_tag | quote }} \ + ./generate_bundle.sh + register: generate_bundle_sto + rescue: + - name: Results of STO bundle generation + ansible.builtin.shell: + cmd: | + cat {{ logfile_dir }}/sto_gen_bundle.log || true - name: Replace namespace in STO role binding - replace: + ansible.builtin.replace: path: "{{ base_dir }}/../deploy/role_binding.yaml" regexp: 'placeholder' replace: '{{ namespace }}' -- block: +- name: Replace namespace in STO CSV + ansible.builtin.replace: + path: "{{ base_dir 
}}/working/service-telemetry-operator-bundle/manifests/service-telemetry-operator.clusterserviceversion.yaml" + regexp: 'placeholder' + replace: '{{ namespace }}' + +- when: not __deploy_from_index_enabled | bool + block: - name: Load Service Telemetry Operator RBAC - command: oc apply -f ../deploy/{{ item }} -n "{{ namespace }}" + ansible.builtin.command: + cmd: oc apply -f ../deploy/{{ item }} -n "{{ namespace }}" + chdir: "{{ base_dir }}" loop: - service_account.yaml - role.yaml - role_binding.yaml - olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml - - name: Revert local change to role_binding.yaml - shell: git checkout -- "{{ base_dir }}/../deploy/role_binding.yaml" - -- name: Replace namespace in STO CSV - replace: - path: "{{ base_dir }}/working/service-telemetry-operator-bundle/manifests/service-telemetry-operator.clusterserviceversion.yaml" - regexp: 'placeholder' - replace: '{{ namespace }}' - -- name: Load Service Telemetry Operator CSV - shell: oc apply -f working/service-telemetry-operator-bundle/manifests/service-telemetry-operator.clusterserviceversion.yaml -n "{{ namespace }}" + - name: Load Service Telemetry Operator CSV + ansible.builtin.shell: + cmd: oc apply -f {{ base_dir }}/working/service-telemetry-operator-bundle/manifests/service-telemetry-operator.clusterserviceversion.yaml -n "{{ namespace }}" +# cleanup +- name: Revert local change to role_binding.yaml + ansible.builtin.shell: + cmd: git checkout -- "{{ base_dir }}/../deploy/role_binding.yaml" + chdir: "{{ base_dir }}" diff --git a/build/stf-run-ci/templates/config-json.j2 b/build/stf-run-ci/templates/config-json.j2 new file mode 100644 index 000000000..136015e9b --- /dev/null +++ b/build/stf-run-ci/templates/config-json.j2 @@ -0,0 +1 @@ +{"auths":{"image-registry.openshift-image-registry.svc:5000":<< internal_registry >>}} diff --git a/build/stf-run-ci/templates/index-yaml.j2 b/build/stf-run-ci/templates/index-yaml.j2 new file mode 100644 index 000000000..54731b8f9 --- /dev/null +++ b/build/stf-run-ci/templates/index-yaml.j2 @@ -0,0 +1,20 @@ +--- +defaultChannel: {{ sto_bundle_info.bundle_default_channel }} +name: service-telemetry-operator +schema: olm.package +--- +schema: olm.channel +package: service-telemetry-operator +name: {{ sto_bundle_info.bundle_channels }} +entries: + - name: service-telemetry-operator.v{{ sto_bundle_info.operator_bundle_version }} +--- +defaultChannel: {{ sgo_bundle_info.bundle_default_channel }} +name: smart-gateway-operator +schema: olm.package +--- +schema: olm.channel +package: smart-gateway-operator +name: {{ sgo_bundle_info.bundle_channels }} +entries: + - name: smart-gateway-operator.v{{ sgo_bundle_info.operator_bundle_version }} diff --git a/build/stf-run-ci/templates/manifest_elasticsearch.j2 b/build/stf-run-ci/templates/manifest_elasticsearch.j2 new file mode 100644 index 000000000..e2e50c6f4 --- /dev/null +++ b/build/stf-run-ci/templates/manifest_elasticsearch.j2 @@ -0,0 +1,52 @@ +apiVersion: elasticsearch.k8s.elastic.co/v1 +kind: Elasticsearch +metadata: + name: elasticsearch + namespace: {{ namespace }} +spec: + auth: {} + http: + service: + metadata: {} + spec: {} + tls: + certificate: {} + monitoring: + logs: {} + metrics: {} + nodeSets: + - count: 1 + name: default + config: + node.roles: + - master + - data + - ingest + node.store.allow_mmap: true + podTemplate: + metadata: + labels: + tuned.openshift.io/elasticsearch: elasticsearch + spec: + containers: + - name: elasticsearch + resources: + limits: + cpu: "2" + memory: 4Gi + 
requests: + cpu: "1" + memory: 4Gi + volumes: + - emptyDir: {} + name: elasticsearch-data + transport: + service: + metadata: {} + spec: {} + tls: + certificate: {} + certificateAuthorities: {} + updateStrategy: + changeBudget: {} + version: {{ elasticsearch_version }} \ No newline at end of file diff --git a/build/update_csv.sh b/build/update_csv.sh index 4e15f1fa3..172653dd4 100755 --- a/build/update_csv.sh +++ b/build/update_csv.sh @@ -3,4 +3,4 @@ # Run this script from the root directory to update the CSV whenever changes # are made to /deploy/crds/. Changes are written to # /deploy/olm-manifests/service-telemetry-operator/. -operator-sdk generate bundle --channels unstable --default-channel unstable +operator-sdk generate bundle --channels stable-1.5 --default-channel stable-1.5 diff --git a/build/validate_deployment.sh b/build/validate_deployment.sh index 345ee3d7f..14ea741c7 100755 --- a/build/validate_deployment.sh +++ b/build/validate_deployment.sh @@ -12,7 +12,7 @@ echo -e "\n* [info] Waiting for QDR deployment to complete\n" until timeout 300 oc rollout status deployment.apps/default-interconnect; do sleep 3; done case "${VALIDATION_SCOPE}" in - "use_community") + "use_community" | "use_hybrid") echo -e "\n* [info] Waiting for prometheus deployment to complete\n" until timeout 300 oc rollout status statefulset.apps/prometheus-default; do sleep 3; done echo -e "\n* [info] Waiting for elasticsearch deployment to complete \n" @@ -30,12 +30,25 @@ case "${VALIDATION_SCOPE}" in until timeout 300 oc rollout status deployment.apps/default-cloud1-coll-event-smartgateway; do sleep 3; done until timeout 300 oc rollout status deployment.apps/default-cloud1-ceil-event-smartgateway; do sleep 3; done until timeout 300 oc rollout status deployment.apps/default-cloud1-ceil-meter-smartgateway; do sleep 3; done + until timeout 300 oc rollout status deployment.apps/default-cloud1-sens-meter-smartgateway; do sleep 3; done + ;; + + "use_redhat") + echo -e "\n* [info] Waiting for prometheus deployment to complete\n" + until timeout 300 oc rollout status statefulset.apps/prometheus-default; do sleep 3; done + echo -e "\n* [info] Waiting for alertmanager deployment to complete\n" + until timeout 300 oc rollout status statefulset.apps/alertmanager-default; do sleep 3; done + echo -e "\n* [info] Waiting for smart-gateway deployment to complete\n" + until timeout 300 oc rollout status deployment.apps/default-cloud1-coll-meter-smartgateway; do sleep 3; done + until timeout 300 oc rollout status deployment.apps/default-cloud1-ceil-meter-smartgateway; do sleep 3; done + until timeout 300 oc rollout status deployment.apps/default-cloud1-sens-meter-smartgateway; do sleep 3; done ;; "none") echo -e "\n* [info] Waiting for smart-gateway deployment to complete\n" until timeout 300 oc rollout status deployment.apps/default-cloud1-coll-meter-smartgateway; do sleep 3; done until timeout 300 oc rollout status deployment.apps/default-cloud1-ceil-meter-smartgateway; do sleep 3; done + until timeout 300 oc rollout status deployment.apps/default-cloud1-sens-meter-smartgateway; do sleep 3; done ;; esac diff --git a/ci/deploy_stf.yml b/ci/deploy_stf.yml new file mode 100644 index 000000000..170e8590a --- /dev/null +++ b/ci/deploy_stf.yml @@ -0,0 +1,25 @@ +--- +- name: "Deploy STF" + hosts: controller + tasks: + - name: "Set the sto_dir if it isn't already set" + ansible.builtin.set_fact: + sto_dir: '{{ ansible_env.HOME }}/{{ zuul.project.src_dir }}' + when: sto_dir | default('') | length == 0 + + - name: "Get vars common to 
all jobs" + ansible.builtin.include_vars: + file: "vars-zuul-common.yml" + + - name: "Get scenario-specific vars" + ansible.builtin.include_vars: + file: "vars-{{ scenario }}.yml" + + - name: "Log into the cluster" + ansible.builtin.import_role: + name: rhol_crc + tasks_from: add_crc_creds.yml + + - name: "Deploy STF using stf-run-ci" + ansible.builtin.import_role: + name: '../build/stf-run-ci' diff --git a/ci/post-collect_logs.yml b/ci/post-collect_logs.yml new file mode 100644 index 000000000..58552b618 --- /dev/null +++ b/ci/post-collect_logs.yml @@ -0,0 +1,78 @@ +--- +# Based on https://raw.githubusercontent.com/openstack-k8s-operators/nova-operator/bc10c4f579f8538899ac7bc5f87bfdb62d7042a4/ci/nova-operator-base/playbooks/collect-logs.yaml +- hosts: all + name: Create zuul-output log dir + gather_facts: false + tasks: + - name: Create log dir + ansible.builtin.file: + path: "{{ ansible_user_dir }}/zuul-output/logs" + state: directory + mode: "0755" + +- hosts: controller + name: Collect logs on the controller + gather_facts: false + tasks: + - name: "Set the sto_dir if it isn't already set" + ansible.builtin.set_fact: + sto_dir: '{{ ansible_env.HOME }}/{{ zuul.project.src_dir }}' + when: sto_dir | default('') | length == 0 + + - name: "Get vars common to all jobs" + ansible.builtin.include_vars: + file: "vars-zuul-common.yml" + + - name: "Get scenario-specific vars" + ansible.builtin.include_vars: + file: "vars-{{ scenario }}.yml" + + - name: "Create log dir" + ansible.builtin.file: + path: "{{ logfile_dir }}" + state: directory + mode: "0755" + + - name: "Log into the cluster" + ansible.builtin.import_role: + name: rhol_crc + tasks_from: add_crc_creds.yml + + - name: "Gather logs from stf deployment" + ansible.builtin.import_role: + name: '../build/stf-collect-logs' + + - name: "Get pods and describe non-completed, non-running pods" + ansible.builtin.shell: + cmd: | + echo "*** oc get pods ***" > {{ logfile_dir }}/oc_get_pods.log 2>&1 + oc -n {{ namespace }} get pods >> {{ logfile_dir }}/oc_get_pods.log 2>&1 + + for pod in $(oc get pods | grep -v NAME | grep -v Running | awk '{ print $1 }'); + do + oc -n {{ namespace }} describe pod $pod > {{ logfile_dir }}/post_oc_describe_pod_${pod}.log 2>&1 + done + ignore_errors: true + retries: 3 + delay: 10 + + - name: "Get build details" + ansible.builtin.shell: + cmd: | + for build in $(oc -n {{ namespace }} get builds -o json| jq -r '.items[].metadata.name'); do oc -n {{ namespace }} describe build $build > {{ logfile_dir }}/post_oc_describe_build_${build}.log 2>&1; done + + - name: "Copy generated logs" + ansible.builtin.shell: | + cp {{ ansible_env.HOME }}/*.log . 

+- hosts: all
+  name: Copy files from controller on node
+  gather_facts: false
+  tasks:
+    - name: Copy files from controller on node
+      ansible.builtin.include_role:
+        name: fetch-output
diff --git a/ci/prepare.yml b/ci/prepare.yml
new file mode 100644
index 000000000..7b65362d6
--- /dev/null
+++ b/ci/prepare.yml
@@ -0,0 +1,50 @@
+---
+- name: "Prepare the environment for running stf"
+  hosts: controller
+  tasks:
+    - name: "Set the sto_dir if it isn't already set"
+      ansible.builtin.set_fact:
+        sto_dir: '{{ ansible_env.HOME }}/{{ zuul.project.src_dir }}'
+      when: sto_dir | default('') | length == 0
+
+    - name: "Get vars common to all jobs"
+      ansible.builtin.include_vars:
+        file: "vars-zuul-common.yml"
+
+    - name: "Get scenario-specific vars"
+      ansible.builtin.include_vars:
+        file: "vars-{{ scenario }}.yml"
+
+    - name: "Update pip"
+      ansible.builtin.pip:
+        name: pip
+        state: latest
+        extra_args: "-U"
+
+    - name: "Install pre-reqs from pip"
+      ansible.builtin.pip:
+        requirements: "build/stf-run-ci/requirements.txt"
+        chdir: "{{ sto_dir }}"
+        state: present
+
+    - name: "Install ansible collections"
+      community.general.ansible_galaxy_install:
+        type: collection
+        name: "{{ item }}"
+      with_items:
+        - "kubernetes.core:2.3.2"
+        - "community.general:6.2.0"
+
+    - name: "Log into the cluster"
+      ansible.builtin.import_role:
+        name: rhol_crc
+        tasks_from: add_crc_creds.yml
+
+    - name: "Create the service-telemetry project"
+      kubernetes.core.k8s:
+        api_version: v1
+        kind: Namespace
+        name: "{{ namespace }}"
+        state: present
+      retries: 3
+      delay: 30
diff --git a/ci/test_stf.yml b/ci/test_stf.yml
new file mode 100644
index 000000000..7f196e860
--- /dev/null
+++ b/ci/test_stf.yml
@@ -0,0 +1,28 @@
+---
+- name: "Run tests to verify that STF runs as expected"
+  hosts: controller
+  tasks:
+    - name: "Set the sto_dir if it isn't already set"
+      ansible.builtin.set_fact:
+        sto_dir: '{{ ansible_env.HOME }}/{{ zuul.project.src_dir }}'
+      when: sto_dir | default('') | length == 0
+
+    - name: "Get vars common to all jobs"
+      ansible.builtin.include_vars:
+        file: "vars-zuul-common.yml"
+
+    - name: "Get scenario-specific vars"
+      ansible.builtin.include_vars:
+        file: "vars-{{ scenario }}.yml"
+
+    - name: "Log into the cluster"
+      ansible.builtin.import_role:
+        name: rhol_crc
+        tasks_from: add_crc_creds.yml
+
+    - name: "Run STF smoketests"
+      ansible.builtin.shell:
+        cmd: |
+          OCP_PROJECT={{ namespace }} CLEANUP=false ./tests/smoketest/smoketest.sh > {{ logfile_dir }}/smoketest.log 2>&1
+        chdir: "{{ sto_dir }}"
+      changed_when: false
diff --git a/ci/vars-local_build.yml b/ci/vars-local_build.yml
new file mode 100644
index 000000000..3126605a4
--- /dev/null
+++ b/ci/vars-local_build.yml
@@ -0,0 +1,5 @@
+---
+__deploy_stf: true
+__local_build_enabled: true
+__service_telemetry_snmptraps_enabled: true
+__service_telemetry_storage_ephemeral_enabled: true
diff --git a/ci/vars-nightly_bundles.yml b/ci/vars-nightly_bundles.yml
new file mode 100644
index 000000000..ca49656f3
--- /dev/null
+++ b/ci/vars-nightly_bundles.yml
@@ -0,0 +1,7 @@
+---
+# from: https://github.com/infrawatch/service-telemetry-operator/pull/437
+# ansible-playbook -e __service_telemetry_storage_ephemeral_enabled=true -e __local_build_enabled=false -e __deploy_from_bundles_enabled=true -e __service_telemetry_bundle_image_path=quay.io/infrawatch-operators/service-telemetry-operator-bundle:nightly-head -e 
__smart_gateway_bundle_image_path=quay.io/infrawatch-operators/smart-gateway-operator-bundle:nightly-head --skip-tags bundle_registry_tls_ca --skip-tags bundle_registry_auth build/run-ci.yaml + +__local_build_enabled: false +__deploy_from_bundles_enabled: true +__service_telemetry_storage_ephemeral_enabled: true diff --git a/ci/vars-zuul-common.yml b/ci/vars-zuul-common.yml new file mode 100644 index 000000000..dfd64e7ad --- /dev/null +++ b/ci/vars-zuul-common.yml @@ -0,0 +1,7 @@ +--- +namespace: "service-telemetry" +setup_bundle_registry_tls_ca: false +setup_bundle_registry_auth: false +__service_telemetry_transports_qdr_auth: none +base_dir: "{{ sto_dir }}/build" +logfile_dir: "{{ ansible_user_dir }}/zuul-output/logs/controller" diff --git a/deploy/alerts/alerts.yaml b/deploy/alerts/alerts.yaml index ea96d0be1..15ef94629 100644 --- a/deploy/alerts/alerts.yaml +++ b/deploy/alerts/alerts.yaml @@ -1,4 +1,4 @@ -apiVersion: monitoring.coreos.com/v1 +apiVersion: monitoring.rhobs/v1 kind: PrometheusRule metadata: creationTimestamp: null @@ -55,7 +55,6 @@ spec: severity: warn annotations: summary: IO read (warning) - - alert: disk:time:read critical expr: >- (abs(job:disk:time:read:rate_5m - job:disk:time:read:rate_5m:avg_over_time_1h) / job:disk:time:read:rate_5m:stddev_over_time_1h) >6 @@ -64,6 +63,7 @@ spec: severity: critical annotations: summary: IO read (critical) + - expr: 'rate(collectd_disk_disk_time_write_total[5m])' record: 'job:disk:time:write:rate_5m' - expr: 'stddev_over_time(job:disk:time:write:rate_5m[1h])' @@ -78,7 +78,6 @@ spec: severity: warn annotations: summary: IO write (warning) - - alert: disk:time:write critical expr: >- (abs(job:disk:time:write:rate_5m - job:disk:time:write:rate_5m:avg_over_time_1h) / job:disk:time:write:rate_5m:stddev_over_time_1h) >6 @@ -87,47 +86,47 @@ spec: severity: critical annotations: summary: IO write (critical) + - expr: 'rate(collectd_disk_disk_ops_read_total[5m])' - record: 'job:disk:time:read:rate_5m' - - expr: 'stddev_over_time(job:disk:time:read:rate_5m[1h])' - record: 'job:disk:time:read:rate_5m:stddev_over_time_1h' - - expr: 'avg_over_time(job:disk:time:read:rate_5m[1h])' - record: 'job:disk:time:read:rate_5m:avg_over_time_1h' - - alert: disk:time:read warn + record: 'job:disk:ops:read:rate_5m' + - expr: 'stddev_over_time(job:disk:ops:read:rate_5m[1h])' + record: 'job:disk:ops:read:rate_5m:stddev_over_time_1h' + - expr: 'avg_over_time(job:disk:ops:read:rate_5m[1h])' + record: 'job:disk:ops:read:rate_5m:avg_over_time_1h' + - alert: disk:ops:read warn expr: >- - (abs(job:disk:time:read:rate_5m - job:disk:time:read:rate_5m:avg_over_time_1h) / job:disk:time:read:rate_5m:stddev_over_time_1h) >3 + (abs(job:disk:ops:read:rate_5m - job:disk:ops:read:rate_5m:avg_over_time_1h) / job:disk:ops:read:rate_5m:stddev_over_time_1h) >3 for: 10m labels: severity: warn annotations: summary: disk ops read (warning) - - - alert: disk:time:read critical + - alert: disk:ops:read critical expr: >- - (abs(job:disk:time:read:rate_5m - job:disk:time:read:rate_5m:avg_over_time_1h) / job:disk:time:read:rate_5m:stddev_over_time_1h) >6 + (abs(job:disk:ops:read:rate_5m - job:disk:ops:read:rate_5m:avg_over_time_1h) / job:disk:ops:read:rate_5m:stddev_over_time_1h) >6 for: 10m labels: severity: critical annotations: summary: disk ops read (critical) + - expr: 'rate(collectd_disk_disk_ops_write_total[5m])' - record: 'job:disk:time:write:rate_5m' - - expr: 'stddev_over_time(job:disk:time:write:rate_5m[1h])' - record: 'job:disk:time:write:rate_5m:stddev_over_time_1h' - - 
expr: 'avg_over_time(job:disk:time:write:rate_5m[1h])' - record: 'job:disk:time:write:rate_5m:avg_over_time_1h' - - alert: disk:time:write warn + record: 'job:disk:ops:write:rate_5m' + - expr: 'stddev_over_time(job:disk:ops:write:rate_5m[1h])' + record: 'job:disk:ops:write:rate_5m:stddev_over_time_1h' + - expr: 'avg_over_time(job:disk:ops:write:rate_5m[1h])' + record: 'job:disk:ops:write:rate_5m:avg_over_time_1h' + - alert: disk:ops:write warn expr: >- - (abs(job:disk:time:write:rate_5m - job:disk:time:write:rate_5m:avg_over_time_1h) / job:disk:time:write:rate_5m:stddev_over_time_1h) >3 + (abs(job:disk:ops:write:rate_5m - job:disk:ops:write:rate_5m:avg_over_time_1h) / job:disk:ops:write:rate_5m:stddev_over_time_1h) >3 for: 10m labels: severity: warn annotations: summary: disk ops write (warning) - - - alert: disk:time:write critical + - alert: disk:ops:write critical expr: >- - (abs(job:disk:time:write:rate_5m - job:disk:time:write:rate_5m:avg_over_time_1h) / job:disk:time:write:rate_5m:stddev_over_time_1h) >6 + (abs(job:disk:ops:write:rate_5m - job:disk:ops:write:rate_5m:avg_over_time_1h) / job:disk:ops:write:rate_5m:stddev_over_time_1h) >6 for: 10m labels: severity: critical @@ -183,7 +182,7 @@ spec: labels: severity: critical annotations: - summary: Hugepages (warning) + summary: Hugepages (critical) expr: >- sum without (type_instance) (collectd_hugepages_vmpage_number{type_instance="free"})/ sum without (type_instance) (collectd_hugepages_vmpage_number) < 0.1 for: 10m diff --git a/deploy/crds/infra.watch_servicetelemetrys_crd.yaml b/deploy/crds/infra.watch_servicetelemetrys_crd.yaml index 21e9f8652..286d2c74b 100644 --- a/deploy/crds/infra.watch_servicetelemetrys_crd.yaml +++ b/deploy/crds/infra.watch_servicetelemetrys_crd.yaml @@ -36,10 +36,12 @@ spec: description: ServiceTelemetrySpec holds the specification of an STF instance. properties: observabilityStrategy: - description: 'The strategy to use for observability systems. Options are "none" (do not deploy any observability components), and "use_community" (community operators with administrator managed subscriptions).' + description: 'The strategy to use for observability systems. Options are "none" (do not deploy any observability components), "use_community" (community supported operators), "use_redhat" (Red Hat Observability Operator with no unsupported components), "use_hybrid" (Red Hat Observability Operator + community supported operators).' type: string enum: - use_community + - use_redhat + - use_hybrid - none alerting: properties: @@ -165,123 +167,77 @@ spec: description: Events related backend configuration. 
properties: elasticsearch: - description: Events storage backend ElasticSearch + description: Events storage backend Elasticsearch properties: enabled: - description: Enable ElasticSearch as a storage backend for events + description: Enable Elasticsearch as a storage backend for events type: boolean + forwarding: + description: Configuration for where to forward events + type: object + properties: + hostUrl: + description: URL of Elasticsearch HTTP(S) endpoint + type: string + tlsServerName: + description: (if required) Server Name expected to match the certificate presented by the endpoint + type: string + tlsSecretName: + description: (if required) Name of the secret that stores the CA cert and client cert/key + type: string + userSecretName: + description: (if required) Name of the secret that stores the Basic Auth credentials + type: string + useBasicAuth: + description: Whether to provide HTTP Basic Auth headers + type: boolean + useTls: + description: Whether to enable TLS + type: boolean version: - description: Version of ElasticSearch to deploy. Elasticsearch licensing has changed as of version 7.11. See https://www.elastic.co/pricing/faq/licensing for details. + description: (DEPRECATED - Use forwarding params after STF 1.5.3) Version of Elasticsearch to deploy. Elasticsearch licensing has changed as of version 7.11. See https://www.elastic.co/pricing/faq/licensing for details. type: string nodeCount: - description: Elasticsearch node count + description: (DEPRECATED - Use forwarding params after STF 1.5.3) Elasticsearch node count type: string storage: - description: Events storage configuration for ElasticSearch + description: (DEPRECATED - Use forwarding params after STF 1.5.3) Events storage configuration for Elasticsearch properties: strategy: - description: Storage strategy. One of 'ephemeral' or 'persistent'. Persistent storage must be made available by the platform. + description: (DEPRECATED - Use forwarding params after STF 1.5.3) Storage strategy. One of 'ephemeral' or 'persistent'. Persistent storage must be made available by the platform. type: string enum: - ephemeral - persistent persistent: - description: Persistent storage configuration for ElasticSearch + description: (DEPRECATED - Use forwarding params after STF 1.5.3) Persistent storage configuration for Elasticsearch properties: storageClass: - description: Storage class name used for ElasticSearch PVC + description: (DEPRECATED - Use forwarding params after STF 1.5.3) Storage class name used for Elasticsearch PVC type: string storageSelector: - description: Storage selector definition for ElasticSearch + description: (DEPRECATED - Use forwarding params after STF 1.5.3) Storage selector definition for Elasticsearch type: string pvcStorageRequest: - description: How much storage space to request in the PVC + description: (DEPRECATED - Use forwarding params after STF 1.5.3) How much storage space to request in the PVC type: string type: object type: object certificates: properties: endpointCertDuration: - description: The requested 'duration' (i.e. lifetime) of the ElasticSearch endpoint Certificate. + description: (DEPRECATED - Use forwarding params after STF 1.5.3) The requested 'duration' (i.e. lifetime) of the Elasticsearch endpoint Certificate. Minimum accepted duration is 1 hour. 
Value must be in units accepted by Go time.ParseDuration https://golang.org/pkg/time/#ParseDuration pattern: ^((([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ type: string caCertDuration: - description: The requested 'duration' (i.e. lifetime) of the ElasticSearch CA Certificate. + description: (DEPRECATED - Use forwarding params after STF 1.5.3) The requested 'duration' (i.e. lifetime) of the Elasticsearch CA Certificate. Minimum accepted duration is 1 hour. Value must be in units accepted by Go time.ParseDuration https://golang.org/pkg/time/#ParseDuration pattern: ^((([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ type: string type: object type: object type: object - logs: - description: Logs related backend configuration. - properties: - loki: - description: Logs storage backend Loki - properties: - enabled: - description: '[TESTING ONLY] Enable Loki as a storage backend for logs' - type: boolean - replicationFactor: - description: Loki replication factor - format: int32 - minimum: 1 - type: integer - flavor: - description: Loki flavor - enum: - - 1x.extra-small - - 1x.small - - 1x.medium - type: string - storage: - description: Logs storage configuration for Loki - properties: - objectStorageSecret: - description: Secret containing informaiton required for S3 object storage - type: string - storageClass: - description: Storage class used for temporary log storage before they are forwarded to object storage or when querying. - type: string - type: object - compactor: - description: Template for the compactor microservice - properties: - replicas: - description: Number of replicas for this microservice - type: string - type: object - distributor: - description: Template for the distributor microservice - properties: - replicas: - description: Number of replicas for this microservice - type: string - type: object - ingester: - description: Template for the ingester microservice - properties: - replicas: - description: Number of replicas for this microservice - type: string - type: object - querier: - description: Template for the querier microservice - properties: - replicas: - description: Number of replicas for this microservice - type: string - type: object - queryFrontend: - description: Template for the query frontend microservice - properties: - replicas: - description: Number of replicas for this microservice - type: string - type: object - type: object - type: object type: object transports: description: Data transport configuration @@ -292,6 +248,12 @@ spec: enabled: description: Enable QDR data transort type: boolean + auth: + description: 'Auth type to use for incoming OSP connections. Options are "none", or "basic"' + type: string + enum: + - none + - basic web: description: QDR web configuration properties: @@ -423,42 +385,6 @@ spec: type: object type: array type: object - logs: - description: Logs related configuration for this cloud object. - properties: - collectors: - description: List of available logs collectors for this cloud - object. - items: - properties: - collectorType: - description: Set the collector type, value of 'rsyslog' - enum: - - rsyslog - type: string - debugEnabled: - description: Enable console debugging. Default is 'false'. - type: boolean - subscriptionAddress: - description: Address to subscribe on the data transport - to receive notifications. - type: string - bridge: - description: Bridge configuration and tuning configurations. - properties: - ringBufferCount: - description: sg-bridge ring buffer count. 
This affects the potential number of messages in queue, which can result in increased memory usage within the sg-bridge container. - type: integer - ringBufferSize: - description: sg-bridge ring buffer size. This affects the size of messages that can be passed between sg-bridge and sg-core. - type: integer - verbose: - description: Enable verbosity for debugging purposes. - type: boolean - type: object - type: object - type: array - type: object type: object type: array type: object diff --git a/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml b/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml index 9d324839c..8b4cf7142 100644 --- a/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml +++ b/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml @@ -3,7 +3,7 @@ kind: ServiceTelemetry metadata: name: default spec: - observabilityStrategy: use_community + observabilityStrategy: use_redhat alerting: enabled: true alertmanager: @@ -27,7 +27,7 @@ spec: metrics: prometheus: enabled: true - scrapeInterval: 10s + scrapeInterval: 30s storage: strategy: persistent retention: 24h @@ -36,6 +36,13 @@ spec: events: elasticsearch: enabled: false + forwarding: + hostUrl: https://elasticsearch-es-http:9200 + tlsServerName: "" + tlsSecretName: elasticsearch-es-cert + userSecretName: elasticsearch-es-elastic-user + useBasicAuth: true + useTls: true version: 7.16.1 storage: strategy: persistent @@ -44,14 +51,6 @@ spec: certificates: endpointCertDuration: 70080h caCertDuration: 70080h - logs: - loki: - enabled: false - flavor: 1x.extra-small - replicationFactor: 1 - storage: - objectStorageSecret: test - storageClass: standard clouds: - name: cloud1 metrics: @@ -74,23 +73,7 @@ spec: subscriptionAddress: sensubility/cloud1-telemetry debugEnabled: false bridge: - ringBufferSize: 16384 - ringBufferCount: 15000 - verbose: false - events: - collectors: - - collectorType: collectd - subscriptionAddress: collectd/cloud1-notify - debugEnabled: false - bridge: - ringBufferSize: 16384 - ringBufferCount: 15000 - verbose: false - - collectorType: ceilometer - subscriptionAddress: anycast/ceilometer/cloud1-event.sample - debugEnabled: false - bridge: - ringBufferSize: 16384 + ringBufferSize: 65535 ringBufferCount: 15000 verbose: false graphing: @@ -103,6 +86,7 @@ spec: transports: qdr: enabled: true + auth: basic web: enabled: false certificates: diff --git a/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in b/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in index 182dbf160..cbe2ccbf3 100644 --- a/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in +++ b/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in @@ -13,7 +13,7 @@ LABEL operators.operatorframework.io.metrics.mediatype.v1=metrics+v1 LABEL operators.operatorframework.io.metrics.builder=operator-sdk-v0.19.4 LABEL operators.operatorframework.io.metrics.project_layout=ansible LABEL com.redhat.delivery.operator.bundle=true -LABEL com.redhat.openshift.versions="v4.10-v4.12" +LABEL com.redhat.openshift.versions="v4.11-v4.14" LABEL com.redhat.delivery.backport=false LABEL com.redhat.component="service-telemetry-operator-bundle-container" \ diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml index c275c943b..f26cbc7b9 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml +++ 
b/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml @@ -122,53 +122,88 @@ spec: description: Events related backend configuration. properties: elasticsearch: - description: Events storage backend ElasticSearch + description: Events storage backend Elasticsearch properties: certificates: properties: caCertDuration: - description: The requested 'duration' (i.e. lifetime) - of the ElasticSearch CA Certificate. Minimum accepted + description: (DEPRECATED - Use forwarding params after + STF 1.5.3) The requested 'duration' (i.e. lifetime) + of the Elasticsearch CA Certificate. Minimum accepted duration is 1 hour. Value must be in units accepted by Go time.ParseDuration https://golang.org/pkg/time/#ParseDuration pattern: ^((([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ type: string endpointCertDuration: - description: The requested 'duration' (i.e. lifetime) - of the ElasticSearch endpoint Certificate. Minimum + description: (DEPRECATED - Use forwarding params after + STF 1.5.3) The requested 'duration' (i.e. lifetime) + of the Elasticsearch endpoint Certificate. Minimum accepted duration is 1 hour. Value must be in units accepted by Go time.ParseDuration https://golang.org/pkg/time/#ParseDuration pattern: ^((([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ type: string type: object enabled: - description: Enable ElasticSearch as a storage backend + description: Enable Elasticsearch as a storage backend for events type: boolean + forwarding: + description: Configuration for where to forward events + properties: + hostUrl: + description: URL of Elasticsearch HTTP(S) endpoint + type: string + tlsSecretName: + description: (if required) Name of the secret that + stores the CA cert and client cert/key + type: string + tlsServerName: + description: (if required) Server Name expected to + match the certificate presented by the endpoint + type: string + useBasicAuth: + description: Whether to provide HTTP Basic Auth headers + type: boolean + useTls: + description: Whether to enable TLS + type: boolean + userSecretName: + description: (if required) Name of the secret that + stores the Basic Auth credentials + type: string + type: object nodeCount: - description: Elasticsearch node count + description: (DEPRECATED - Use forwarding params after + STF 1.5.3) Elasticsearch node count type: string storage: - description: Events storage configuration for ElasticSearch + description: (DEPRECATED - Use forwarding params after + STF 1.5.3) Events storage configuration for Elasticsearch properties: persistent: - description: Persistent storage configuration for - ElasticSearch + description: (DEPRECATED - Use forwarding params after + STF 1.5.3) Persistent storage configuration for + Elasticsearch properties: pvcStorageRequest: - description: How much storage space to request + description: (DEPRECATED - Use forwarding params + after STF 1.5.3) How much storage space to request in the PVC type: string storageClass: - description: Storage class name used for ElasticSearch - PVC + description: (DEPRECATED - Use forwarding params + after STF 1.5.3) Storage class name used for + Elasticsearch PVC type: string storageSelector: - description: Storage selector definition for ElasticSearch + description: (DEPRECATED - Use forwarding params + after STF 1.5.3) Storage selector definition + for Elasticsearch type: string type: object strategy: - description: Storage strategy. 
One of 'ephemeral' + description: (DEPRECATED - Use forwarding params after + STF 1.5.3) Storage strategy. One of 'ephemeral' or 'persistent'. Persistent storage must be made available by the platform. enum: @@ -177,84 +212,13 @@ spec: type: string type: object version: - description: Version of ElasticSearch to deploy. Elasticsearch + description: (DEPRECATED - Use forwarding params after + STF 1.5.3) Version of Elasticsearch to deploy. Elasticsearch licensing has changed as of version 7.11. See https://www.elastic.co/pricing/faq/licensing for details. type: string type: object type: object - logs: - description: Logs related backend configuration. - properties: - loki: - description: Logs storage backend Loki - properties: - compactor: - description: Template for the compactor microservice - properties: - replicas: - description: Number of replicas for this microservice - type: string - type: object - distributor: - description: Template for the distributor microservice - properties: - replicas: - description: Number of replicas for this microservice - type: string - type: object - enabled: - description: '[TESTING ONLY] Enable Loki as a storage - backend for logs' - type: boolean - flavor: - description: Loki flavor - enum: - - 1x.extra-small - - 1x.small - - 1x.medium - type: string - ingester: - description: Template for the ingester microservice - properties: - replicas: - description: Number of replicas for this microservice - type: string - type: object - querier: - description: Template for the querier microservice - properties: - replicas: - description: Number of replicas for this microservice - type: string - type: object - queryFrontend: - description: Template for the query frontend microservice - properties: - replicas: - description: Number of replicas for this microservice - type: string - type: object - replicationFactor: - description: Loki replication factor - format: int32 - minimum: 1 - type: integer - storage: - description: Logs storage configuration for Loki - properties: - objectStorageSecret: - description: Secret containing informaiton required - for S3 object storage - type: string - storageClass: - description: Storage class used for temporary log - storage before they are forwarded to object storage - or when querying. - type: string - type: object - type: object - type: object metrics: description: Metrics related backend configuration. properties: @@ -354,48 +318,6 @@ spec: type: object type: array type: object - logs: - description: Logs related configuration for this cloud object. - properties: - collectors: - description: List of available logs collectors for this - cloud object. - items: - properties: - bridge: - description: Bridge configuration and tuning configurations. - properties: - ringBufferCount: - description: sg-bridge ring buffer count. This - affects the potential number of messages in - queue, which can result in increased memory - usage within the sg-bridge container. - type: integer - ringBufferSize: - description: sg-bridge ring buffer size. This - affects the size of messages that can be passed - between sg-bridge and sg-core. - type: integer - verbose: - description: Enable verbosity for debugging purposes. - type: boolean - type: object - collectorType: - description: Set the collector type, value of 'rsyslog' - enum: - - rsyslog - type: string - debugEnabled: - description: Enable console debugging. Default is - 'false'. 
- type: boolean - subscriptionAddress: - description: Address to subscribe on the data transport - to receive notifications. - type: string - type: object - type: array - type: object metrics: description: Metrics related configuration for this cloud object. properties: @@ -491,10 +413,14 @@ spec: type: object observabilityStrategy: description: The strategy to use for observability systems. Options - are "none" (do not deploy any observability components), and "use_community" - (community operators with administrator managed subscriptions). + are "none" (do not deploy any observability components), "use_community" + (community supported operators), "use_redhat" (Red Hat Observability + Operator with no unsupported components), "use_hybrid" (Red Hat + Observability Operator + community supported operators). enum: - use_community + - use_redhat + - use_hybrid - none type: string transports: @@ -503,6 +429,13 @@ spec: qdr: description: QDR configuration for data transport properties: + auth: + description: Auth type to use for incoming OSP connections. + Options are "none", or "basic" + enum: + - none + - basic + type: string certificates: properties: caCertDuration: diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml index a04701500..1e1fdc092 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml @@ -44,6 +44,14 @@ metadata: "endpointCertDuration": "70080h" }, "enabled": false, + "forwarding": { + "hostUrl": "https://elasticsearch-es-http:9200", + "tlsSecretName": "elasticsearch-es-cert", + "tlsServerName": "", + "useBasicAuth": true, + "useTls": true, + "userSecretName": "elasticsearch-es-elastic-user" + }, "storage": { "persistent": { "pvcStorageRequest": "20Gi" @@ -53,21 +61,10 @@ metadata: "version": "7.16.1" } }, - "logs": { - "loki": { - "enabled": false, - "flavor": "1x.extra-small", - "replicationFactor": 1, - "storage": { - "objectStorageSecret": "test", - "storageClass": "standard" - } - } - }, "metrics": { "prometheus": { "enabled": true, - "scrapeInterval": "10s", + "scrapeInterval": "30s", "storage": { "persistent": { "pvcStorageRequest": "20G" @@ -80,30 +77,6 @@ metadata: }, "clouds": [ { - "events": { - "collectors": [ - { - "bridge": { - "ringBufferCount": 15000, - "ringBufferSize": 16384, - "verbose": false - }, - "collectorType": "collectd", - "debugEnabled": false, - "subscriptionAddress": "collectd/cloud1-notify" - }, - { - "bridge": { - "ringBufferCount": 15000, - "ringBufferSize": 16384, - "verbose": false - }, - "collectorType": "ceilometer", - "debugEnabled": false, - "subscriptionAddress": "anycast/ceilometer/cloud1-event.sample" - } - ] - }, "metrics": { "collectors": [ { @@ -129,7 +102,7 @@ metadata: { "bridge": { "ringBufferCount": 15000, - "ringBufferSize": 16384, + "ringBufferSize": 65535, "verbose": false }, "collectorType": "sensubility", @@ -153,9 +126,10 @@ metadata: "highAvailability": { "enabled": false }, - "observabilityStrategy": "use_community", + "observabilityStrategy": "use_redhat", "transports": { "qdr": { + "auth": "basic", "certificates": { "caCertDuration": "70080h", "endpointCertDuration": "70080h" @@ -177,6 +151,16 @@ metadata: description: Service Telemetry Framework. 
Umbrella Operator for instantiating the required dependencies and configuration of various components to build a Service Telemetry platform for telco grade monitoring. + features.operators.openshift.io/cnf: "false" + features.operators.openshift.io/cni: "false" + features.operators.openshift.io/csi: "false" + features.operators.openshift.io/disconnected: "false" + features.operators.openshift.io/fips-compliant: "false" + features.operators.openshift.io/proxy-aware: "false" + features.operators.openshift.io/tls-profiles: "false" + features.operators.openshift.io/token-auth-aws: "false" + features.operators.openshift.io/token-auth-azure: "false" + features.operators.openshift.io/token-auth-gcp: "false" olm.skipRange: '>=<> <<>' operatorframework.io/suggested-namespace: service-telemetry operators.openshift.io/valid-subscription: '["OpenStack Platform", "Cloud Infrastructure", @@ -218,12 +202,6 @@ spec: name: servicemonitors.monitoring.coreos.com version: v1 version: v1beta1 - required: - - description: Creation of Smart Gateways - displayName: Smart Gateway - kind: SmartGateway - name: smartgateways.smartgateway.infra.watch - version: v2 description: Service Telemetry Operator for monitoring clouds displayName: Service Telemetry Operator icon: @@ -251,6 +229,7 @@ spec: - watch - update - patch + - delete - apiGroups: - authorization.k8s.io resources: @@ -261,6 +240,7 @@ spec: - security.openshift.io resourceNames: - nonroot + - nonroot-v2 resources: - securitycontextconstraints verbs: @@ -311,6 +291,8 @@ spec: value: explicit - name: RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP_IMAGE value: <>:<> + - name: RELATED_IMAGE_OAUTH_PROXY_IMAGE + value: <>:<> image: <>:<> imagePullPolicy: Always name: operator @@ -379,9 +361,9 @@ spec: - interconnectedcloud.github.io - smartgateway.infra.watch - monitoring.coreos.com + - monitoring.rhobs - elasticsearch.k8s.elastic.co - integreatly.org - - loki.grafana.com resources: - '*' verbs: @@ -393,6 +375,13 @@ spec: verbs: - get - create + - apiGroups: + - monitoring.rhobs + resources: + - servicemonitors + verbs: + - get + - create - apiGroups: - apps resourceNames: @@ -419,6 +408,27 @@ spec: - '*' verbs: - '*' + - apiGroups: + - rbac.authorization.k8s.io + resources: + - roles + - rolebindings + verbs: + - create + - get + - list + - watch + - update + - patch + - apiGroups: + - extensions + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - list + - watch serviceAccountName: service-telemetry-operator strategy: deployment installModes: diff --git a/deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml b/deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml index 8edfa0da9..2a0d93436 100644 --- a/deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml @@ -1,3 +1,44 @@ properties: - type: olm.maxOpenShiftVersion - value: "4.12" + value: "4.14" + - type: olm.constraint + value: + failureMessage: Require Smart Gateway for Service Telemetry Framework + all: + constraints: + - failureMessage: Package smart-gateway-operator is needed for Service Telemetry Framework + package: + packageName: smart-gateway-operator + versionRange: '>=5.0.0' + - type: olm.constraint + value: + failureMessage: Require data transport for Service Telemetry Framework + all: + constraints: + - failureMessage: Package amq7-interconnect-operator is needed for data transport with STF + package: + packageName: amq7-interconnect-operator + versionRange: 
'>=1.10.0' + - type: olm.constraint + value: + failureMessage: Require certificate management for Service Telemetry Framework + all: + constraints: + - failureMessage: Package openshift-cert-manager-operator is needed for AMQ Interconnect setup + package: + packageName: openshift-cert-manager-operator + versionRange: '>=1.10.0' + - type: olm.constraint + value: + failureMessage: Require Prometheus backend for data storage of metrics for Service Telemetry Framework + any: + constraints: + - package: + packageName: prometheus + versionRange: '>=0.56.0' + - package: + packageName: observability-operator + versionRange: '>=0.0.1' + - package: + packageName: cluster-observability-operator + versionRange: '>=0.0.1' diff --git a/deploy/operator.yaml b/deploy/operator.yaml index 7b6879d4a..c56c11daa 100644 --- a/deploy/operator.yaml +++ b/deploy/operator.yaml @@ -35,6 +35,8 @@ spec: value: explicit - name: RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP_IMAGE value: <>:<> + - name: RELATED_IMAGE_OAUTH_PROXY_IMAGE + value: <>:<> volumes: - name: runner emptyDir: {} diff --git a/deploy/remove_stf.sh b/deploy/remove_stf.sh index 6ebb540e7..5286693e0 100755 --- a/deploy/remove_stf.sh +++ b/deploy/remove_stf.sh @@ -4,6 +4,7 @@ # REL=$(dirname "$0"); . "${REL}/../build/metadata.sh" REMOVE_CERTMANAGER=${REMOVE_CERTMANAGER:-true} +REMOVE_OBO=${REMOVE_OBO:-true} # The whole STF project (start this first since it's slow) oc delete project "${OCP_PROJECT}" @@ -45,6 +46,18 @@ if [ "${REMOVE_CERTMANAGER}" = "true" ]; then oc get crd | grep cert-manager.io | cut -d ' ' -f 1 | xargs oc delete crd fi +if [ "${REMOVE_OBO}" = "true" ]; then + oc delete subscription observability-operator -n openshift-operators + oc delete catalogsource observability-operator -n openshift-marketplace + + # CSV for OBO + OBO_CSV=$(oc get csv | grep observability-operator | cut -d ' ' -f 1) + oc delete csv "${OBO_CSV}" + + # OBO CRDs + oc get crd | grep monitoring.rhobs | cut -d ' ' -f 1 | xargs oc delete crd +fi + # Wait for namespace to actually disappear (this can take awhile) while oc get ns "${OCP_PROJECT}" > /dev/null; do echo "Waiting for ${OCP_PROJECT} to disappear"; sleep 5; done diff --git a/deploy/role.yaml b/deploy/role.yaml index c1c465969..6e22854e4 100644 --- a/deploy/role.yaml +++ b/deploy/role.yaml @@ -22,6 +22,7 @@ rules: - watch - update - patch + - delete - apiGroups: - authorization.k8s.io resources: @@ -32,6 +33,7 @@ rules: - security.openshift.io resourceNames: - nonroot + - nonroot-v2 resources: - securitycontextconstraints verbs: @@ -116,9 +118,9 @@ rules: - interconnectedcloud.github.io - smartgateway.infra.watch - monitoring.coreos.com + - monitoring.rhobs - elasticsearch.k8s.elastic.co - integreatly.org - - loki.grafana.com resources: - '*' verbs: @@ -130,6 +132,13 @@ rules: verbs: - get - create +- apiGroups: + - monitoring.rhobs + resources: + - servicemonitors + verbs: + - get + - create - apiGroups: - apps resourceNames: @@ -156,3 +165,24 @@ rules: - '*' verbs: - '*' +- apiGroups: + - rbac.authorization.k8s.io + resources: + - roles + - rolebindings + verbs: + - create + - get + - list + - watch + - update + - patch +- apiGroups: + - extensions + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - list + - watch diff --git a/docs/loki.md b/docs/loki.md deleted file mode 100644 index abdf8057a..000000000 --- a/docs/loki.md +++ /dev/null @@ -1,75 +0,0 @@ -# How to run SGO with Loki -A few examples about how to deploy with Loki for logging support. 
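With docs/loki.md deleted below, logging support goes away outright rather than being replaced. The events path keeps an external-storage workflow through the new `forwarding` block instead; a minimal sketch of that replacement configuration, reusing the host and secret names from the sample CR in this patch (the external Elasticsearch and its secrets are assumed to already exist in the STF namespace):

```
apiVersion: infra.watch/v1beta1
kind: ServiceTelemetry
metadata:
  name: default
spec:
  backends:
    events:
      elasticsearch:
        enabled: true  # gates deployment of the events Smart Gateways
        forwarding:
          hostUrl: https://elasticsearch-es-http:9200
          useTls: true
          tlsServerName: ""  # empty: server name is parsed from the host part of hostUrl
          tlsSecretName: elasticsearch-es-cert  # CA cert plus optional client cert/key
          useBasicAuth: true
          userSecretName: elasticsearch-es-elastic-user
```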
- -## Deploy SGO + Loki with minio for storage -This is less resource intensive. Useful for development in crc. -``` -ansible-playbook --extra-vars __service_telemetry_logs_enabled=true --extra-vars __deploy_minio_enabled=true run-ci.yaml -``` - -## Deploy SGO + Loki with OCS for storage -This is more a production-like setup. It's more resource demanding and cannot be run in crc. This assumes OCS is already deployed. - -### Create an object bucket claim -``` -oc apply -f - < -stringData: - endpoint: https://: - bucketnames: - access_key_id: - access_key_secret: -type: Opaque -EOF -``` - -### Deploy SGO + Loki -``` -ansible-playbook --extra-vars __service_telemetry_logs_enabled=true --extra-vars __loki_skip_tls_verify=true run-ci.yaml -``` - diff --git a/roles/servicetelemetry/defaults/main.yml b/roles/servicetelemetry/defaults/main.yml index 714d55471..98943a6de 100644 --- a/roles/servicetelemetry/defaults/main.yml +++ b/roles/servicetelemetry/defaults/main.yml @@ -3,8 +3,11 @@ # # remove SmartGateway object when cloud no longer in current `clouds` object list. clouds_remove_on_missing: false -# default observability strategy (compatible with STF 1.3) -observability_strategy: use_community +observability_strategy: use_redhat + +# These get auto-discovered, but the code is simpler if they are never undefined +prometheus_operator_api_string: "" +prometheus_operator_label: "" certificate_duration: 70080h @@ -39,7 +42,7 @@ servicetelemetry_defaults: prometheus: enabled: true deployment_size: 1 - scrape_interval: 10s + scrape_interval: 30s storage: strategy: persistent retention: 24h @@ -49,6 +52,13 @@ servicetelemetry_defaults: events: elasticsearch: enabled: false + forwarding: + host_url: https://elasticsearch-es-http:9200 + tls_server_name: "" + tls_secret_name: elasticsearch-es-cert + user_secret_name: elasticsearch-es-elastic-user + use_basic_auth: true + use_tls: true version: 7.16.1 node_count: 1 storage: @@ -59,24 +69,6 @@ servicetelemetry_defaults: certificates: endpoint_cert_duration: 70080h ca_cert_duration: 70080h - logs: - loki: - enabled: false - replication_factor: 1 - flavor: 1x.extra-small - storage: - object_storage_secret: "" - storage_class: "" - compactor: - replicas: "" - distributor: - replicas: "" - ingester: - replicas: "" - querier: - replicas: "" - query_frontend: - replicas: "" transports: qdr: @@ -84,6 +76,7 @@ servicetelemetry_defaults: deployment_size: 1 web: enabled: false + auth: basic certificates: endpoint_cert_duration: 70080h ca_cert_duration: 70080h @@ -125,7 +118,7 @@ servicetelemetry_defaults: subscription_address: sensubility/cloud1-telemetry debug_enabled: false bridge: - ring_buffer_size: 16384 + ring_buffer_size: 65535 ring_buffer_count: 15000 verbose: false events: @@ -144,16 +137,6 @@ servicetelemetry_defaults: ring_buffer_size: 16384 ring_buffer_count: 15000 verbose: false - logs: - collectors: - - collector_type: rsyslog - subscription_address: rsyslog/cloud1-logs - debug_enabled: false - bridge: - ring_buffer_size: 135048 - ring_buffer_count: 15000 - verbose: false - # These variables are outside of the defaults. Their values will be # auto-detected by the role and are not meant to be set by the user. 
However, @@ -162,19 +145,6 @@ servicetelemetry_defaults: is_k8s: false is_openshift: false -# - This image works on OCP 4.6, 4.7, and 4.8 (v4.4 tag instead of 'latest' as I'd prefer above) -oauth_proxy_image: image-registry.openshift-image-registry.svc:5000/openshift/oauth-proxy:v4.4 - -# - Downstream, this one works, but there is a big red "Red Hat strongly recommends updating to the newest image version..." (4.9) which doesn't work -# - See https://github.com/openshift/oauth-proxy/issues/229 for why 4.9 isn't working for us -# oauth_proxy_image: registry.redhat.io/openshift4/ose-oauth-proxy:v4.8 - -# - This image works (probably upstream and down!), but is pinned back to the version we're strongly recommended not to use: -# oauth_proxy_image: quay.io/openshift/origin-oauth-proxy:4.8 - -# - For reference this is the image line I see in openshift-monitoring for the equivalent image: -# oauth_proxy_image: quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:ca37f5fc57643a353dcdcd6ad3a7bc8bb40fd4276a555fe5de2b8c7167d64020 - _ephemeral_storage_enabled: false # set default smartgateway deployment size. You should not modify this. diff --git a/roles/servicetelemetry/meta/main.yml b/roles/servicetelemetry/meta/main.yml index 2c0d83024..d7c0f7585 100644 --- a/roles/servicetelemetry/meta/main.yml +++ b/roles/servicetelemetry/meta/main.yml @@ -9,10 +9,10 @@ galaxy_info: platforms: - name: RHEL versions: - - 7 + - 8 - name: CentOS versions: - - 7 + - 8 galaxy_tags: - monitoring diff --git a/roles/servicetelemetry/tasks/base_smartgateway.yml b/roles/servicetelemetry/tasks/base_smartgateway.yml index 95c46c6d8..4d0cfdafd 100644 --- a/roles/servicetelemetry/tasks/base_smartgateway.yml +++ b/roles/servicetelemetry/tasks/base_smartgateway.yml @@ -8,4 +8,7 @@ - name: Deploy SG-specific ServiceMonitor for metrics SGs include_tasks: component_servicemonitor.yml - when: data_type == 'metrics' + when: + - data_type == 'metrics' + - has_monitoring_api | bool + - observability_strategy != 'none' diff --git a/roles/servicetelemetry/tasks/component_alertmanager.yml b/roles/servicetelemetry/tasks/component_alertmanager.yml index edace5b1c..1e5551311 100644 --- a/roles/servicetelemetry/tasks/component_alertmanager.yml +++ b/roles/servicetelemetry/tasks/component_alertmanager.yml @@ -28,6 +28,26 @@ definition: '{{ alertmanager_manifest }}' +- name: Ensure no community Alertmanager is installed if not using community operator + k8s: + state: absent + api_version: monitoring.coreos.com/v1 + kind: alertmanager + name: '{{ ansible_operator_meta.name }}' + namespace: '{{ ansible_operator_meta.namespace }}' + when: + - observability_strategy != "use_community" + +- name: Ensure no rhobs Alertmanager is installed if not using it + k8s: + state: absent + api_version: monitoring.rhobs/v1 + kind: alertmanager + name: '{{ ansible_operator_meta.name }}' + namespace: '{{ ansible_operator_meta.namespace }}' + when: + - observability_strategy not in ['use_redhat', 'use_hybrid'] + # TODO: expand the management of alertmanager receivers and move this functionality to a common location # --> SNMP traps - name: Create SNMP traps instance @@ -46,7 +66,7 @@ kind: Route name: '{{ ansible_operator_meta.name }}-alertmanager-proxy' -- name: Add a service account to used by Alertmanager +- name: Create ServiceAccount/alertmanager-stf with oauth redirect annotation k8s: definition: apiVersion: v1 @@ -57,22 +77,121 @@ annotations: serviceaccounts.openshift.io/oauth-redirectreference.alertmanager: '{{ alertmanager_oauth_redir_ref | to_json }}' 
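For reference, the annotation value rendered from `alertmanager_oauth_redir_ref | to_json` is a standard OpenShift OAuth redirect reference; assuming the usual ServiceTelemetry name `default`, the resulting ServiceAccount would look roughly like this sketch:

```
apiVersion: v1
kind: ServiceAccount
metadata:
  name: alertmanager-stf
  annotations:
    # ties OAuth logins for the proxy to the alertmanager-proxy Route looked up above
    serviceaccounts.openshift.io/oauth-redirectreference.alertmanager: >-
      {"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"default-alertmanager-proxy"}}
```

The block/rescue pattern in the RBAC tasks that follow exists because `roleRef` on an existing binding is immutable: moving off the old `alertmanager-main` roleRef cannot be patched in place, so on failure the binding is deleted and recreated.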
-- name: Bind role + name: Create ClusterRole/alertmanager-stf k8s: definition: apiVersion: rbac.authorization.k8s.io/v1 - kind: ClusterRoleBinding + kind: ClusterRole metadata: name: alertmanager-stf - namespace: '{{ ansible_operator_meta.namespace }}' - roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: alertmanager-main - subjects: - - kind: ServiceAccount + rules: + - apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create + - apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create + +- name: Setup ClusterRoleBinding for Alertmanager + block: + - name: Define ClusterRoleBinding/alertmanager-stf + set_fact: + def_alertmanager_stf_crb: | + apiVersion: rbac.authorization.k8s.io/v1 + kind: ClusterRoleBinding + metadata: + name: alertmanager-stf + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: alertmanager-stf + subjects: + - kind: ServiceAccount + name: alertmanager-stf + namespace: '{{ ansible_operator_meta.namespace }}' + + - name: Create ClusterRoleBinding/alertmanager-stf + k8s: + definition: + "{{ def_alertmanager_stf_crb }}" + rescue: + - name: Remove ClusterRoleBinding/alertmanager-stf on failure to update + k8s: + state: absent + definition: + apiVersion: rbac.authorization.k8s.io/v1 + kind: ClusterRoleBinding + metadata: + name: alertmanager-stf + + - name: Create ClusterRoleBinding/alertmanager-stf + k8s: + definition: + "{{ def_alertmanager_stf_crb }}" + +- name: Create Role/alertmanager-stf + k8s: + definition: + apiVersion: rbac.authorization.k8s.io/v1 + kind: Role + metadata: name: alertmanager-stf namespace: '{{ ansible_operator_meta.namespace }}' + rules: + - apiGroups: + - security.openshift.io + resourceNames: + - nonroot + resources: + - securitycontextconstraints + verbs: + - use + +- name: Setup RoleBinding for Alertmanager + block: + - name: Define RoleBinding/alertmanager-stf + set_fact: + def_alertmanager_stf_rb: | + apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + metadata: + name: alertmanager-stf + namespace: '{{ ansible_operator_meta.namespace }}' + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: alertmanager-stf + subjects: + - kind: ServiceAccount + name: alertmanager-stf + namespace: '{{ ansible_operator_meta.namespace }}' + + - name: Create RoleBinding/alertmanager-stf + k8s: + definition: + "{{ def_alertmanager_stf_rb }}" + rescue: + - name: Remove RoleBinding/alertmanager-stf on failure to update + k8s: + state: absent + definition: + apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + metadata: + name: alertmanager-stf + namespace: '{{ ansible_operator_meta.namespace }}' + + - name: Create RoleBinding/alertmanager-stf + k8s: + definition: + "{{ def_alertmanager_stf_rb }}" - name: Set default alertmanager service template set_fact: diff --git a/roles/servicetelemetry/tasks/component_certificates.yml b/roles/servicetelemetry/tasks/component_certificates.yml index e8f034338..bc0178409 100644 --- a/roles/servicetelemetry/tasks/component_certificates.yml +++ b/roles/servicetelemetry/tasks/component_certificates.yml @@ -1,6 +1,3 @@ -- name: Create local signing authority - include_tasks: _local_signing_authority.yml - - name: Create configmap for OAUTH CA certs k8s: definition: diff --git a/roles/servicetelemetry/tasks/component_clouds.yml b/roles/servicetelemetry/tasks/component_clouds.yml index 99de80198..8a745440e 100644 ---
a/roles/servicetelemetry/tasks/component_clouds.yml +++ b/roles/servicetelemetry/tasks/component_clouds.yml @@ -19,25 +19,68 @@ - this_cloud.metrics.collectors is iterable - name: Events Smart Gateway deployment + when: servicetelemetry_vars.backends.events.elasticsearch.enabled block: - - name: Lookup ElasticSearch BasicAuth + - name: Lookup Elasticsearch BasicAuth k8s_info: api_version: v1 kind: Secret namespace: '{{ ansible_operator_meta.namespace }}' - name: 'elasticsearch-es-elastic-user' + name: '{{ servicetelemetry_vars.backends.events.elasticsearch.forwarding.user_secret_name }}' register: elasticsearch_es_elastic_user - - name: Filter out ElasticSearch password for BasicAuth + - name: Set Elasticsearch user for BasicAuth set_fact: - elastic_pass: "{{ elasticsearch_es_elastic_user | json_query('resources[0].data.elastic') | b64decode }}" + elastic_user: "{{ elasticsearch_es_elastic_user.resources[0].data | dict2items | map(attribute='key') | list | first }}" + + - name: Set Elasticsearch password for BasicAuth + set_fact: + elastic_pass: "{{ elasticsearch_es_elastic_user.resources[0].data[elastic_user] | b64decode }}" no_log: true + - name: Set elastic_tls_server_name from forwarding config if set + set_fact: + elastic_tls_server_name: "{{ servicetelemetry_vars.backends.events.elasticsearch.forwarding.tls_server_name }}" + when: servicetelemetry_vars.backends.events.elasticsearch.forwarding.tls_server_name | length > 0 + + # This sets the server name based on the host part of the URL between // and : (https://elasticsearch-host:9200) + - name: Set elastic_tls_server_name by parsing the host_url if it's not set in the config + set_fact: + elastic_tls_server_name: "{{ servicetelemetry_vars.backends.events.elasticsearch.forwarding.host_url.rpartition('//')[-1].partition(':')[0] }}" + when: servicetelemetry_vars.backends.events.elasticsearch.forwarding.tls_server_name | length == 0 + + - name: Get the Elasticsearch TLS materials secret + k8s_info: + api_version: v1 + kind: Secret + namespace: '{{ ansible_operator_meta.namespace }}' + name: '{{ servicetelemetry_vars.backends.events.elasticsearch.forwarding.tls_secret_name }}' + register: es_certs + + - when: es_certs.resources[0].data["user.key"] is not defined or es_certs.resources[0].data["user.crt"] is not defined + block: + - name: Load dummy certs + include_vars: + file: dummy_user_certs.yml + + - name: Augment the secret with dummy TLS cert/key if no TLS user auth material provided + k8s: + definition: + apiVersion: v1 + kind: Secret + metadata: + name: '{{ servicetelemetry_vars.backends.events.elasticsearch.forwarding.tls_secret_name }}' + namespace: '{{ ansible_operator_meta.namespace }}' + stringData: + user.crt: '{{ elastic_user_cert_dummy }}' + user.key: '{{ elastic_user_key_dummy }}' + - name: Deploy Events Smart Gateway instance for each collector vars: data_type: 'events' manifest: './manifest_smartgateway_events.j2' this_smartgateway: "{{ ansible_operator_meta.name }}-{{ this_cloud.name }}-{{ this_collector.collector_type[:4] }}-event" + elasticsearch: "{{ servicetelemetry_vars.backends.events.elasticsearch | from_yaml }}" include_tasks: base_smartgateway.yml loop: "{{ this_cloud.events.collectors }}" loop_control: @@ -47,29 +90,3 @@ - this_cloud.events is defined - this_cloud.events.collectors is defined - this_cloud.events is iterable - # TODO: it should be possible to deploy the eventing SGs when ElasticSearch - # is not available, but currently the template for smartgateway_events - # expects to have information about 
a local ES instance on cluster. - # https://github.com/infrawatch/service-telemetry-operator/issues/274 - when: - - has_elasticsearch_api | bool - - servicetelemetry_vars.backends.events.elasticsearch.enabled - - observability_strategy == 'use_community' - -- name: Deploy Logs Smart Gateway instance - vars: - data_type: 'logs' - manifest: './manifest_smartgateway_logs.j2' - this_smartgateway: "{{ ansible_operator_meta.name }}-{{ this_cloud.name }}-{{ this_collector.collector_type[:4] }}-log" - include_tasks: base_smartgateway.yml - loop: "{{ this_cloud.logs.collectors }}" - loop_control: - loop_var: this_collector - label: "{{ this_collector.collector_type }}" - when: - - has_loki_api | bool - - observability_strategy == 'use_community' - - servicetelemetry_vars.backends.logs.loki.enabled - - this_cloud.logs is defined - - this_cloud.logs.collectors is defined - - this_cloud.logs is iterable diff --git a/roles/servicetelemetry/tasks/component_elasticsearch.yml b/roles/servicetelemetry/tasks/component_elasticsearch.yml index 87b58be02..0a34b64c0 100644 --- a/roles/servicetelemetry/tasks/component_elasticsearch.yml +++ b/roles/servicetelemetry/tasks/component_elasticsearch.yml @@ -1,3 +1,10 @@ +# DEPRECATED +# +# This code in the servicetelemetry role is deprecated as of STF 1.5.3, after +# which only forwarding to an external elasticsearch is supported. +# +# The code lives on in the stf-run-ci role for CI testing of the forwarding +# feature. - name: Lookup template debug: msg: "{{ lookup('template', './manifest_elasticsearch.j2') | from_yaml }}" diff --git a/roles/servicetelemetry/tasks/_local_signing_authority.yml b/roles/servicetelemetry/tasks/component_es_certificates.yml similarity index 100% rename from roles/servicetelemetry/tasks/_local_signing_authority.yml rename to roles/servicetelemetry/tasks/component_es_certificates.yml diff --git a/roles/servicetelemetry/tasks/component_grafana.yml b/roles/servicetelemetry/tasks/component_grafana.yml index 2b0f6eeea..068507610 100644 --- a/roles/servicetelemetry/tasks/component_grafana.yml +++ b/roles/servicetelemetry/tasks/component_grafana.yml @@ -7,8 +7,43 @@ kind: Route name: 'grafana-route' -- name: Create htpasswd secret for grafana admin +- name: Check for existing grafana htpasswd secret no_log: true + k8s_info: + api_version: v1 + kind: Secret + namespace: '{{ ansible_operator_meta.namespace }}' + name: '{{ ansible_operator_meta.name }}-grafana-htpasswd' + register: grafana_htpasswd_secret + +- block: + - name: Parse current Grafana htpasswd salt from secret + no_log: true + set_fact: + grafana_htpasswd_salt: "{{ ((grafana_htpasswd_secret.resources[0].data.auth | b64decode).split('$')[-1])[0:22] }}" + rescue: + - name: Generate initial Grafana htpasswd bcrypt string from grafana.admin_password + no_log: true + set_fact: + init_grafana_htpasswd_bcrypt_string: "{{ (servicetelemetry_vars.graphing.grafana.admin_password | password_hash('bcrypt') | replace('$2b$','$2y$', 1)) }}" + + - name: Read newly generated Grafana htpasswd salt + no_log: true + set_fact: + grafana_htpasswd_salt: "{{ (init_grafana_htpasswd_bcrypt_string.split('$')[-1])[0:22] }}" + always: + - name: Generate Grafana htpasswd bcrypt string from grafana.adminPassword using salt + no_log: true + set_fact: + grafana_htpasswd_bcrypt_string: "{{ (servicetelemetry_vars.graphing.grafana.admin_password | password_hash('bcrypt', grafana_htpasswd_salt) | replace('$2b$','$2y$', 1)) }}" + + - name: Generate Grafana auth string from grafana.adminUser and 
grafana_htpasswd_bcrypt_string + no_log: true + set_fact: + grafana_htpasswd_auth_string: "{{ servicetelemetry_vars.graphing.grafana.admin_user }}:{{ grafana_htpasswd_bcrypt_string }}" + +- name: Create or patch htpasswd secret for grafana admin + no_log: false k8s: definition: api_version: v1 @@ -18,7 +53,7 @@ namespace: '{{ ansible_operator_meta.namespace }}' type: Opaque stringData: - auth: '{{ servicetelemetry_vars.graphing.grafana.admin_user }}:{{ servicetelemetry_vars.graphing.grafana.admin_password | htpasswd_sha1 }}' + auth: '{{ grafana_htpasswd_auth_string }}' - name: Lookup template debug: @@ -34,63 +69,49 @@ state: '{{ "present" if servicetelemetry_vars.graphing.enabled else "absent" }}' definition: '{{ grafana_manifest }}' - when: servicetelemetry_vars.graphing.enabled - when: servicetelemetry_vars.graphing.enabled block: - - when: servicetelemetry_vars.backends.metrics.prometheus.enabled - block: - - name: Retrieve configmap for OAUTH CA certs - k8s_info: - api_version: v1 - kind: ConfigMap - name: serving-certs-ca-bundle - namespace: '{{ ansible_operator_meta.namespace }}' - register: serving_certs_ca + - when: servicetelemetry_vars.backends.metrics.prometheus.enabled + block: + - name: Retrieve configmap for OAUTH CA certs + k8s_info: + api_version: v1 + kind: ConfigMap + name: serving-certs-ca-bundle + namespace: '{{ ansible_operator_meta.namespace }}' + register: serving_certs_ca - - name: Retrieve prometheus secret - k8s_info: - api_version: v1 - kind: Secret - namespace: '{{ ansible_operator_meta.namespace }}' - name: '{{ ansible_operator_meta.name }}-prometheus-htpasswd' - register: prometheus_secret + - name: Retrieve prometheus secret + k8s_info: + api_version: v1 + kind: Secret + namespace: '{{ ansible_operator_meta.namespace }}' + name: '{{ ansible_operator_meta.name }}-prometheus-htpasswd' + register: prometheus_secret - - name: Decode prometheus password - no_log: true - set_fact: - prom_basicauth_passwd: '{{ prometheus_secret.resources[0].data.password | b64decode }}' + - name: Decode prometheus password + no_log: true + set_fact: + prom_basicauth_passwd: '{{ prometheus_secret.resources[0].data.password | b64decode }}' - - when: servicetelemetry_vars.backends.events.elasticsearch.enabled - block: - - name: Retrieve elastic search secret - k8s_info: - api_version: v1 - kind: Secret - name: elasticsearch-es-elastic-user + # Lookup existing datasources + - name: Remove legacy datasources + k8s: + api_version: integreatly.org/v1alpha1 + name: '{{ ansible_operator_meta.name }}-ds-prometheus' + kind: GrafanaDataSource namespace: '{{ ansible_operator_meta.namespace }}' - register: es_secret + state: absent - - name: Decode elasticsearch password + # NOTE: this can fail if you enable grafana without prometheus due to missing resources referenced in the template + - name: Set datasources set_fact: - elasticsearch_pass: '{{ es_secret.resources[0].data.elastic | b64decode }}' - - # Lookup existing datasources - - name: Remove legacy datasources - k8s: - api_version: integreatly.org/v1alpha1 - name: '{{ ansible_operator_meta.name }}-ds-prometheus' - kind: GrafanaDataSource - namespace: '{{ ansible_operator_meta.namespace }}' - state: absent - - - name: Set datasources - set_fact: - ds_manifest: "{{ lookup('template', './manifest_grafana_ds.j2') | from_yaml }}" - when: ds_manifest is not defined + ds_manifest: "{{ lookup('template', './manifest_grafana_ds.j2') | from_yaml }}" + when: ds_manifest is not defined - - name: Create the datasources - k8s: - state: '{{ "present" if 
servicetelemetry_vars.graphing.enabled else "absent" }}' - definition: - '{{ ds_manifest }}' + - name: Create the datasources + k8s: + state: '{{ "present" if servicetelemetry_vars.graphing.enabled else "absent" }}' + definition: + '{{ ds_manifest }}' diff --git a/roles/servicetelemetry/tasks/component_loki.yml b/roles/servicetelemetry/tasks/component_loki.yml deleted file mode 100644 index 53dc2b812..000000000 --- a/roles/servicetelemetry/tasks/component_loki.yml +++ /dev/null @@ -1,14 +0,0 @@ -- name: Lookup template - debug: - msg: "{{ lookup('template', './manifest_loki.j2') | from_yaml }}" - -- name: Set default Loki manifest - set_fact: - loki_manifest: "{{ lookup('template', './manifest_loki.j2') | from_yaml }}" - when: loki_manifest is not defined - -- name: Create an instance of Loki - k8s: - state: '{{ "present" if servicetelemetry_vars.backends.logs.loki.enabled else "absent" }}' - definition: - '{{ loki_manifest }}' diff --git a/roles/servicetelemetry/tasks/component_prometheus.yml b/roles/servicetelemetry/tasks/component_prometheus.yml index 2cb7573ff..2e865abd2 100644 --- a/roles/servicetelemetry/tasks/component_prometheus.yml +++ b/roles/servicetelemetry/tasks/component_prometheus.yml @@ -7,33 +7,171 @@ kind: Route name: '{{ ansible_operator_meta.name }}-prometheus-proxy' -- name: Add oauth redirect annotation to prometheus-k8s service account +- name: Create ServiceAccount/prometheus-stf with oauth redirect annotation k8s: definition: apiVersion: v1 kind: ServiceAccount metadata: - name: prometheus-k8s + name: prometheus-stf namespace: '{{ ansible_operator_meta.namespace }}' annotations: serviceaccounts.openshift.io/oauth-redirectreference.prometheus: '{{ prom_oauth_redir_ref | to_json }}' -- name: Bind the local prometheus SA to prometheus cluster role +- name: Create ClusterRole/prometheus-stf for non-resource URL /metrics access k8s: + definition: + apiVersion: rbac.authorization.k8s.io/v1 + kind: ClusterRole + metadata: + name: prometheus-stf + rules: + - nonResourceURLs: + - /metrics + verbs: + - get + - apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create + - apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get + +- name: Setup ClusterRoleBinding for Prometheus + block: + - name: Define ClusterRoleBinding/prometheus-stf + set_fact: + def_prometheus_stf_crb: | + apiVersion: rbac.authorization.k8s.io/v1 + kind: ClusterRoleBinding + metadata: + name: prometheus-stf + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus-stf + subjects: + - kind: ServiceAccount + name: prometheus-stf + namespace: '{{ ansible_operator_meta.namespace }}' + + - name: Create ClusterRoleBinding/prometheus-stf + k8s: + definition: + "{{ def_prometheus_stf_crb }}" + rescue: + - name: Remove ClusterRoleBinding/prometheus-stf on failure to update + k8s: + state: absent + definition: + apiVersion: rbac.authorization.k8s.io/v1 + kind: ClusterRoleBinding + metadata: + name: prometheus-stf + + - name: Create ClusterRoleBinding/prometheus-stf + k8s: + definition: + "{{ def_prometheus_stf_crb }}" + +- name: Create Role/prometheus-stf for Prometheus operations + k8s: + definition: + apiVersion: rbac.authorization.k8s.io/v1 + kind: Role + metadata: + name: prometheus-stf + namespace: '{{ ansible_operator_meta.namespace }}' + rules: + - apiGroups: + - "" + resources: + - services + - endpoints + - pods + verbs: + - get + - list -
watch + - apiGroups: + - extensions + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - list + - watch + - apiGroups: + - security.openshift.io + resourceNames: + - nonroot + - nonroot-v2 + resources: + - securitycontextconstraints + verbs: + - use + +- name: Setup RoleBinding for Prometheus + block: + - name: Define RoleBinding/prometheus-stf + set_fact: + def_prometheus_stf_rb: | + apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + metadata: + name: prometheus-stf + namespace: '{{ ansible_operator_meta.namespace }}' + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-stf + subjects: + - kind: ServiceAccount + name: prometheus-stf + namespace: '{{ ansible_operator_meta.namespace }}' + + - name: Create RoleBinding/prometheus-stf + k8s: + definition: + "{{ def_prometheus_stf_rb }}" + rescue: + - name: Remove RoleBinding/prometheus-stf on failure to update + k8s: + state: absent + definition: + apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + metadata: + name: prometheus-stf + namespace: '{{ ansible_operator_meta.namespace }}' + + - name: Create RoleBinding/prometheus-stf + k8s: + definition: + "{{ def_prometheus_stf_rb }}" + +- name: Remove old ClusterRoleBinding for prometheus-k8s using CMO roleRef + k8s: + state: absent definition: apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: name: prometheus-k8s-{{ ansible_operator_meta.namespace }} namespace: '{{ ansible_operator_meta.namespace }}' - roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: prometheus-k8s - subjects: - - kind: ServiceAccount - name: prometheus-k8s - namespace: '{{ ansible_operator_meta.namespace }}' - name: Check for existing prometheus htpasswd user secret k8s_info: @@ -62,8 +200,10 @@ namespace: '{{ ansible_operator_meta.namespace }}' type: Opaque stringData: - auth: 'internal:{{ prom_basicauth_passwd | htpasswd_sha1 }}' # SHA1 is deprecated, bcrypt is available in OCP 4.8+ only https://bugzilla.redhat.com/show_bug.cgi?id=1874322 + auth: 'internal:{{ prom_basicauth_passwd | password_hash("bcrypt") | replace("$2b$","$2y$", 1)}}' password: '{{ prom_basicauth_passwd }}' + tags: + - skip_ansible_lint - name: Re-register new object for use in the annotation k8s_info: @@ -88,6 +228,26 @@ definition: '{{ prometheus_manifest }}' +- name: Ensure no community Prometheus is installed if not using community operator + k8s: + state: absent + api_version: monitoring.coreos.com/v1 + kind: prometheus + name: '{{ ansible_operator_meta.name }}' + namespace: '{{ ansible_operator_meta.namespace }}' + when: + - observability_strategy != "use_community" + +- name: Ensure no rhobs Prometheus is installed if not using it + k8s: + state: absent + api_version: monitoring.rhobs/v1 + kind: prometheus + name: '{{ ansible_operator_meta.name }}' + namespace: '{{ ansible_operator_meta.namespace }}' + when: + - observability_strategy not in ['use_redhat', 'use_hybrid'] + - name: Set default prometheus service template set_fact: prometheus_service_manifest: "{{ lookup('template', './manifest_prometheus_service.j2') | from_yaml }}" diff --git a/roles/servicetelemetry/tasks/component_qdr.yml b/roles/servicetelemetry/tasks/component_qdr.yml index de010cafc..26c210cfa 100644 --- a/roles/servicetelemetry/tasks/component_qdr.yml +++ b/roles/servicetelemetry/tasks/component_qdr.yml @@ -152,6 +152,32 @@ sasldb_path: /tmp/qdrouterd.sasldb when: interconnect_manifest is not defined +- when: + - servicetelemetry_vars.transports.qdr.auth == 
"basic" + block: + - name: Get QDR BasicAuth secret + k8s_info: + api_version: interconnectedcloud.github.io/v1alpha1 + kind: Interconnect + name: "{{ ansible_operator_meta.name }}-interconnect" + namespace: "{{ ansible_operator_meta.namespace }}" + register: _qdr_basicauth_object + + # Because https://github.com/interconnectedcloud/qdr-operator/blob/576d2b33dac71437ea2b165caaaf6413220767fe/pkg/controller/interconnect/interconnect_controller.go#L634 + - name: Perform a one-time upgrade to the default generated password for QDR BasicAuth + k8s: + definition: + kind: Secret + apiVersion: v1 + metadata: + name: "{{ ansible_operator_meta.name }}-interconnect-users" + namespace: "{{ ansible_operator_meta.namespace }}" + labels: + stf_one_time_upgrade: "{{ lookup('pipe', 'date +%s') }}" + stringData: + guest: "{{ lookup('password', '/dev/null') }}" + when: + - _qdr_basicauth_object.resources[0] is defined and _qdr_basicauth_object[0].metadata.labels.stf_one_time_upgrade is not defined - name: Set default Interconnect manifest set_fact: @@ -186,7 +212,12 @@ - expose: true host: 0.0.0.0 port: 5671 + {% if servicetelemetry_vars.transports.qdr.auth == "basic" %} + saslMechanisms: PLAIN + authenticatePeer: true + {% elif servicetelemetry_vars.transports.qdr.auth == "none" %} saslMechanisms: ANONYMOUS + {% endif %} sslProfile: openstack - port: 5673 linkCapacity: 25000 diff --git a/roles/servicetelemetry/tasks/component_servicemonitor.yml b/roles/servicetelemetry/tasks/component_servicemonitor.yml index 886d67df7..753116c46 100644 --- a/roles/servicetelemetry/tasks/component_servicemonitor.yml +++ b/roles/servicetelemetry/tasks/component_servicemonitor.yml @@ -1,7 +1,7 @@ - name: Create SG-specific Service Monitor manifest set_fact: sg_specific_servicemonitor_manifest: | - apiVersion: monitoring.coreos.com/v1 + apiVersion: {{ prometheus_operator_api_string }} kind: ServiceMonitor metadata: labels: diff --git a/roles/servicetelemetry/tasks/main.yml b/roles/servicetelemetry/tasks/main.yml index 96f5f2b9e..bc33df647 100644 --- a/roles/servicetelemetry/tasks/main.yml +++ b/roles/servicetelemetry/tasks/main.yml @@ -23,9 +23,26 @@ - has_certmanager_api | bool # --> backends.metrics -- name: Check if we have monitoring.coreos.com API +- name: Setup Certificates for metrics components + include_tasks: component_certificates.yml + when: + - has_certmanager_api | bool + +- name: Set community monitoring API string and labels + set_fact: + prometheus_operator_api_string: monitoring.coreos.com/v1 + prometheus_operator_label: + when: observability_strategy == 'use_community' + +- name: Set Red Hat monitoring API string + set_fact: + prometheus_operator_api_string: monitoring.rhobs/v1 + prometheus_operator_label: 'app.kubernetes.io/managed-by: observability-operator' + when: observability_strategy in ['use_redhat', 'use_hybrid'] + +- name: Check if we have the requested monitoring API set_fact: - has_monitoring_coreos_api: "{{ True if 'monitoring.coreos.com' in api_groups else False }}" + has_monitoring_api: "{{ True if (prometheus_operator_api_string | dirname) in api_groups else False }}" - block: - name: Create Prometheus instance @@ -35,8 +52,8 @@ - name: Create Alertmanager instance include_tasks: component_alertmanager.yml when: - - has_monitoring_coreos_api | bool - - observability_strategy == 'use_community' + - has_monitoring_api | bool + - observability_strategy != 'none' # --> backends.events - name: Check if we have elasticsearch API @@ -46,42 +63,15 @@ - name: Deploy ElasticSearch events backend block: 
- name: Setup Certificates for ElasticSearch - include_tasks: component_certificates.yml + include_tasks: component_es_certificates.yml - name: Setup ElasticSearch include_tasks: component_elasticsearch.yml when: - has_elasticsearch_api | bool - has_certmanager_api | bool - - observability_strategy == 'use_community' - -# --> backends.logs -- name: Check if we have loki API - set_fact: - has_loki_api: "{{ True if 'loki.grafana.com' in api_groups else False }}" - -- name: Create Loki instance - include_tasks: component_loki.yml - when: - - has_loki_api | bool - - observability_strategy == 'use_community' - -# --> graphing -- name: Check if we have integreatly.org API - set_fact: - has_integreatly_api: "{{ True if 'integreatly.org' in api_groups else False }}" - -- name: Deploy graphing - block: - - name: Create Grafana instance - include_tasks: component_grafana.yml - -# TODO -# - name: Create dashboards -# include_tasks: component_dashboards.yml - when: - - has_integreatly_api | bool - - observability_strategy == 'use_community' + - observability_strategy in ['use_community', 'use_hybrid'] + - servicetelemetry_vars.backends.events.elasticsearch.enabled | bool # --> clouds - name: Get data about clouds @@ -94,6 +84,16 @@ loop_control: loop_var: this_cloud +# --> graphing +- name: Check if we have integreatly.org API + set_fact: + has_integreatly_api: "{{ True if 'integreatly.org' in api_groups else False }}" + +- when: + - has_integreatly_api | bool + name: Start graphing component plays + include_tasks: component_grafana.yml + # Post deployment tasks - name: Post-setup include_tasks: post.yml diff --git a/roles/servicetelemetry/tasks/pre.yml b/roles/servicetelemetry/tasks/pre.yml index 6a6c5b494..38477b02b 100644 --- a/roles/servicetelemetry/tasks/pre.yml +++ b/roles/servicetelemetry/tasks/pre.yml @@ -34,6 +34,7 @@ - name: "Set supporting container image paths" set_fact: prometheus_webhook_snmp_container_image_path: "{{ lookup('env', 'RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP_IMAGE') | default('quay.io/infrawatch/prometheus-webhook-snmp:latest', true) }}" # noqa 204 + oauth_proxy_image: "{{ lookup('env', 'RELATED_IMAGE_OAUTH_PROXY_IMAGE') | default('quay.io/openshift/origin-oauth-proxy:latest', true) }}" # noqa 204 - name: Adjust defaults when highAvailability.enabled is true block: @@ -78,6 +79,99 @@ namespace: "{{ ansible_operator_meta.namespace }}" register: smartgateways_loaded +- name: Get current STF object + k8s_info: + api_version: infra.watch/v1beta1 + kind: ServiceTelemetry + name: "{{ ansible_operator_meta.name }}" + namespace: "{{ ansible_operator_meta.namespace }}" + register: _stf_object + +- name: Get community Prometheus objects + k8s_info: + api_version: monitoring.coreos.com/v1 + kind: Prometheus + name: "{{ ansible_operator_meta.name }}" + namespace: "{{ ansible_operator_meta.namespace }}" + register: _community_prom_object + +- block: + - name: Apply community observabilityStrategy if missing on an STF object with an existing community prometheus + k8s: + definition: + apiVersion: infra.watch/v1beta1 + kind: ServiceTelemetry + metadata: + name: "{{ ansible_operator_meta.name }}" + namespace: "{{ ansible_operator_meta.namespace }}" + spec: + observabilityStrategy: use_community + - name: Set non-default community strategy for remainder of this run + set_fact: + observability_strategy: use_community + when: + - _community_prom_object.resources[0] is defined + - _stf_object.resources[0].spec.observabilityStrategy is not defined + +- name: Apply default observabilityStrategy if 
missing on a new STF object with no associated community prometheus + k8s: + definition: + apiVersion: infra.watch/v1beta1 + kind: ServiceTelemetry + metadata: + name: "{{ ansible_operator_meta.name }}" + namespace: "{{ ansible_operator_meta.namespace }}" + spec: + observabilityStrategy: "{{ observability_strategy }}" + when: + - _community_prom_object.resources[0] is not defined + - _stf_object.resources[0].spec.observabilityStrategy is not defined + +- name: Get QDR objects + k8s_info: + api_version: interconnectedcloud.github.io/v1alpha1 + kind: Interconnect + name: "{{ ansible_operator_meta.name }}-interconnect" + namespace: "{{ ansible_operator_meta.namespace }}" + register: _qdr_object + +- block: + - name: Apply legacy auth=none for QDR if missing on the STF object and it's currently deployed that way + k8s: + definition: + apiVersion: infra.watch/v1beta1 + kind: ServiceTelemetry + metadata: + name: "{{ ansible_operator_meta.name }}" + namespace: "{{ ansible_operator_meta.namespace }}" + spec: + transports: + qdr: + auth: none + + - name: Set auth=none for remainder of this run + set_fact: + servicetelemetry_vars: "{{ servicetelemetry_vars|combine({'transports':{'qdr':{'auth': 'none'}}}, recursive=True) }}" # noqa 206 + when: + - _stf_object.resources[0].spec.transports.qdr.auth is not defined + - _qdr_object.resources[0] is defined and _qdr_object.resources[0].spec.edgeListeners[0].saslMechanisms == "ANONYMOUS" + +- name: Apply default auth for QDR if missing on a new STF object with no associated auth=none QDR + k8s: + definition: + apiVersion: infra.watch/v1beta1 + kind: ServiceTelemetry + metadata: + name: "{{ ansible_operator_meta.name }}" + namespace: "{{ ansible_operator_meta.namespace }}" + spec: + transports: + qdr: + auth: "{{ servicetelemetry_defaults.transports.qdr.auth }}" + when: + - _stf_object.resources[0].spec.transports.qdr.auth is not defined + - _qdr_object.resources[0] is defined and _qdr_object.resources[0].spec.edgeListeners[0].saslMechanisms != "ANONYMOUS" + - name: Set ephemeral_storage_enabled to true when storage strategy is ephemeral set_fact: _ephemeral_storage_enabled: true @@ -86,14 +180,6 @@ servicetelemetry_vars.backends.events.elasticsearch.storage.strategy == "ephemeral" or servicetelemetry_vars.alerting.alertmanager.storage.strategy == "ephemeral" -- name: Get current ephemeralStorageEnabled status - k8s_info: - api_version: infra.watch/v1beta1 - kind: ServiceTelemetry - name: "{{ ansible_operator_meta.name }}" - namespace: "{{ ansible_operator_meta.namespace }}" - register: _stf_object - - name: Set ServiceTelemetry object status to have ephemeralStorageEnabled status operator_sdk.util.k8s_status: api_version: infra.watch/v1beta1 diff --git a/roles/servicetelemetry/templates/manifest_alertmanager.j2 b/roles/servicetelemetry/templates/manifest_alertmanager.j2 index 0500772c9..2465ee43f 100644 --- a/roles/servicetelemetry/templates/manifest_alertmanager.j2 +++ b/roles/servicetelemetry/templates/manifest_alertmanager.j2 @@ -1,8 +1,9 @@ -apiVersion: monitoring.coreos.com/v1 +apiVersion: {{ prometheus_operator_api_string }} kind: Alertmanager metadata: labels: alertmanager: '{{ ansible_operator_meta.name }}' + {{ prometheus_operator_label }} name: '{{ ansible_operator_meta.name }}' namespace: '{{ ansible_operator_meta.namespace }}' spec: diff --git a/roles/servicetelemetry/templates/manifest_alertmanager_route.j2 b/roles/servicetelemetry/templates/manifest_alertmanager_route.j2 index 5e00bee57..b25b7811a 100644 --- 
a/roles/servicetelemetry/templates/manifest_alertmanager_route.j2 +++ b/roles/servicetelemetry/templates/manifest_alertmanager_route.j2 @@ -1,4 +1,4 @@ -apiVersion: v1 +apiVersion: route.openshift.io/v1 kind: Route metadata: name: '{{ ansible_operator_meta.name }}-alertmanager-proxy' diff --git a/roles/servicetelemetry/templates/manifest_grafana.j2 b/roles/servicetelemetry/templates/manifest_grafana.j2 index d2b26eb34..792f7065c 100644 --- a/roles/servicetelemetry/templates/manifest_grafana.j2 +++ b/roles/servicetelemetry/templates/manifest_grafana.j2 @@ -9,7 +9,7 @@ spec: serviceaccounts.openshift.io/oauth-redirectreference.primary: '{{ grafana_oauth_redir_ref | to_json }}' deployment: annotations: - hash-of-creds-to-force-restart-if-changed: {{ (servicetelemetry_vars.graphing.grafana.admin_user + servicetelemetry_vars.graphing.grafana.admin_password) | password_hash('sha256', (session_secret | b64encode)[:16] ) }} + hash-of-creds-to-force-restart-if-changed: {{ grafana_htpasswd_auth_string | b64encode }} baseImage: {{ servicetelemetry_vars.graphing.grafana.base_image }} ingress: enabled: {{ servicetelemetry_vars.graphing.grafana.ingress_enabled }} diff --git a/roles/servicetelemetry/templates/manifest_grafana_ds.j2 b/roles/servicetelemetry/templates/manifest_grafana_ds.j2 index 69910918e..d0f0478d1 100644 --- a/roles/servicetelemetry/templates/manifest_grafana_ds.j2 +++ b/roles/servicetelemetry/templates/manifest_grafana_ds.j2 @@ -29,11 +29,11 @@ spec: access: proxy editable: true isDefault: false - url: 'https://elasticsearch-es-http:9200' + url: {{ servicetelemetry_vars.backends.events.elasticsearch.forwarding.host_url }} type: elasticsearch - basicAuth: true - basicAuthUser: elastic - basicAuthPassword: {{ elasticsearch_pass }} + basicAuth: {{ servicetelemetry_vars.backends.events.elasticsearch.forwarding.use_basic_auth }} + basicAuthUser: {{ elastic_user }} + basicAuthPassword: {{ elastic_pass }} database: collectd_* jsonData: tlsSkipVerify: true @@ -44,25 +44,15 @@ spec: access: proxy editable: true isDefault: false - url: 'https://elasticsearch-es-http:9200' + url: {{ servicetelemetry_vars.backends.events.elasticsearch.forwarding.host_url }} type: elasticsearch - basicAuth: true - basicAuthUser: elastic - basicAuthPassword: {{ elasticsearch_pass }} + basicAuth: {{ servicetelemetry_vars.backends.events.elasticsearch.forwarding.use_basic_auth }} + basicAuthUser: {{ elastic_user }} + basicAuthPassword: {{ elastic_pass }} database: ceilometer_* jsonData: tlsSkipVerify: true timeField: generated esVersion: 70 {% endif %} - -{% if servicetelemetry_vars.backends.logs.loki.enabled %} - - access: proxy - editable: true - isDefault: false - name: STFLoki - type: loki - url: 'http://loki-query-frontend-http-lokistack:3100' - version: 1 -{% endif %} name: {{ ansible_operator_meta.name }}-ds-stf.yaml diff --git a/roles/servicetelemetry/templates/manifest_loki.j2 b/roles/servicetelemetry/templates/manifest_loki.j2 deleted file mode 100644 index ee8a0b5d8..000000000 --- a/roles/servicetelemetry/templates/manifest_loki.j2 +++ /dev/null @@ -1,39 +0,0 @@ -apiVersion: loki.grafana.com/v1beta1 -kind: LokiStack -metadata: - name: lokistack - namespace: '{{ ansible_operator_meta.namespace }}' -spec: - size: {{ servicetelemetry_vars.backends.logs.loki.flavor }} - replicationFactor: {{ servicetelemetry_vars.backends.logs.loki.replication_factor }} - storage: - secret: - name: {{ servicetelemetry_vars.backends.logs.loki.storage.object_storage_secret }} - storageClassName: {{ 
servicetelemetry_vars.backends.logs.loki.storage.storage_class }} -{% if servicetelemetry_vars.backends.logs.loki.compactor.replicas | length or - servicetelemetry_vars.backends.logs.loki.distributor.replicas | length or - servicetelemetry_vars.backends.logs.loki.ingester.replicas | length or - servicetelemetry_vars.backends.logs.loki.querier.replicas | length or - servicetelemetry_vars.backends.logs.loki.query_frontend.replicas | length %} - template: -{% if servicetelemetry_vars.backends.logs.loki.compactor.replicas | length %} - compactor: - replicas: {{ servicetelemetry_vars.backends.logs.loki.compactor.replicas }} -{% endif %} -{% if servicetelemetry_vars.backends.logs.loki.distributor.replicas | length %} - distributor: - replicas: {{ servicetelemetry_vars.backends.logs.loki.distributor.replicas }} -{% endif %} -{% if servicetelemetry_vars.backends.logs.loki.ingester.replicas | length %} - ingester: - replicas: {{ servicetelemetry_vars.backends.logs.loki.ingester.replicas }} -{% endif %} -{% if servicetelemetry_vars.backends.logs.loki.querier.replicas | length %} - querier: - replicas: {{ servicetelemetry_vars.backends.logs.loki.querier.replicas }} -{% endif %} -{% if servicetelemetry_vars.backends.logs.loki.query_frontend.replicas | length %} - queryFrontend: - replicas: {{ servicetelemetry_vars.backends.logs.loki.query_frontend.replicas }} -{% endif %} -{% endif %} diff --git a/roles/servicetelemetry/templates/manifest_prometheus.j2 b/roles/servicetelemetry/templates/manifest_prometheus.j2 index 8b594cefa..2bdf408b9 100644 --- a/roles/servicetelemetry/templates/manifest_prometheus.j2 +++ b/roles/servicetelemetry/templates/manifest_prometheus.j2 @@ -1,15 +1,17 @@ -apiVersion: monitoring.coreos.com/v1 +apiVersion: {{ prometheus_operator_api_string }} kind: Prometheus metadata: labels: prometheus: '{{ ansible_operator_meta.name }}' + {{ prometheus_operator_label }} name: '{{ ansible_operator_meta.name }}' namespace: '{{ ansible_operator_meta.namespace }}' spec: + version: v2.43.0 replicas: {{ servicetelemetry_vars.backends.metrics.prometheus.deployment_size }} ruleSelector: {} securityContext: {} - serviceAccountName: prometheus-k8s + serviceAccountName: prometheus-stf serviceMonitorSelector: matchLabels: app: smart-gateway @@ -42,7 +44,7 @@ spec: - -upstream=http://localhost:9090/ - -htpasswd-file=/etc/proxy/htpasswd/auth - -cookie-secret-file=/etc/proxy/secrets/session_secret - - -openshift-service-account=prometheus-k8s + - -openshift-service-account=prometheus-stf - '-openshift-sar={"resource": "namespaces", "verb": "get"}' ports: - containerPort: 9092 diff --git a/roles/servicetelemetry/templates/manifest_prometheus_route.j2 b/roles/servicetelemetry/templates/manifest_prometheus_route.j2 index af1c024ad..85611cb80 100644 --- a/roles/servicetelemetry/templates/manifest_prometheus_route.j2 +++ b/roles/servicetelemetry/templates/manifest_prometheus_route.j2 @@ -1,4 +1,4 @@ -apiVersion: v1 +apiVersion: route.openshift.io/v1 kind: Route metadata: name: '{{ ansible_operator_meta.name }}-prometheus-proxy' diff --git a/roles/servicetelemetry/templates/manifest_smartgateway_events.j2 b/roles/servicetelemetry/templates/manifest_smartgateway_events.j2 index 884b0ae0f..3a80ad92d 100644 --- a/roles/servicetelemetry/templates/manifest_smartgateway_events.j2 +++ b/roles/servicetelemetry/templates/manifest_smartgateway_events.j2 @@ -1,27 +1,28 @@ apiVersion: smartgateway.infra.watch/v2 kind: SmartGateway metadata: - name: '{{ this_smartgateway }}' - namespace: '{{ ansible_operator_meta.namespace 
}}' + name: {{ this_smartgateway }} + namespace: {{ ansible_operator_meta.namespace }} spec: {% if this_collector.debug_enabled is defined and this_collector.debug_enabled %} - logLevel: "debug" + logLevel: debug {% else %} - logLevel: "info" + logLevel: info {% endif %} handleErrors: true size: {{ smartgateway_deployment_size }} applications: - config: | - hostURL: https://elasticsearch-es-http.{{ ansible_operator_meta.namespace }}.svc.cluster.local:9200 - useTLS: true - tlsClientCert: /config/certs/tls.crt - tlsClientKey: /config/certs/tls.key + hostURL: {{ elasticsearch.forwarding.host_url }} + useTLS: {{ elasticsearch.forwarding.use_tls }} + tlsClientCert: /config/certs/user.crt + tlsClientKey: /config/certs/user.key tlsCaCert: /config/certs/ca.crt - tlsServerName: 'elasticsearch-es-http.{{ ansible_operator_meta.namespace }}.svc.cluster.local' - user: '{{ elastic_user | default('elastic') }}' - password: '{{ elastic_pass | default('') }}' - useBasicAuth: true + tlsServerName: {{ elastic_tls_server_name }} + tlsSecretName: {{ elasticsearch.forwarding.tls_secret_name }} + user: {{ elastic_user | default('elastic') }} + password: {{ elastic_pass | default('') }} + useBasicAuth: {{ elasticsearch.forwarding.use_basic_auth }} name: elasticsearch bridge: amqpUrl: amqp://{{ ansible_operator_meta.name }}-interconnect.{{ ansible_operator_meta.namespace }}.svc.cluster.local:5673/{{ this_collector.subscription_address }} diff --git a/roles/servicetelemetry/templates/manifest_smartgateway_logs.j2 b/roles/servicetelemetry/templates/manifest_smartgateway_logs.j2 deleted file mode 100644 index ccacb002f..000000000 --- a/roles/servicetelemetry/templates/manifest_smartgateway_logs.j2 +++ /dev/null @@ -1,38 +0,0 @@ -apiVersion: smartgateway.infra.watch/v2 -kind: SmartGateway -metadata: - name: '{{ this_smartgateway }}' - namespace: '{{ ansible_operator_meta.namespace }}' -spec: -{% if this_collector.debug_enabled is defined and this_collector.debug_enabled %} - logLevel: "debug" -{% else %} - logLevel: "info" -{% endif %} - handleErrors: true - blockEventBus: true - size: {{ smartgateway_deployment_size }} - applications: - - config: | - connection: http://loki-distributor-http-lokistack.{{ ansible_operator_meta.namespace }}.svc.cluster.local:3100 - batchSize: {{ loki_batch_size | default('2000') }} - maxWaitTime: {{ loki_max_wait_time | default('1s') }} - name: loki - bridge: - amqpUrl: amqp://{{ ansible_operator_meta.name }}-interconnect.{{ ansible_operator_meta.namespace }}.svc.cluster.local:5673/{{ this_collector.subscription_address }} - amqpBlock: true - socketBlock: true - ringBufferSize: {{ this_collector.bridge.ring_buffer_size | default(135048) }} - ringBufferCount: {{ this_collector.bridge.ring_buffer_count | default(15000) }} - verbose: {{ this_collector.bridge.verbose | default(false) }} - transports: - - config: | - path: /tmp/smartgateway - handlers: - - name: logs - config: | - timestampField: "@timestamp" - messageField: "message" - severityField: "severity" - hostnameField: "host" - name: socket diff --git a/roles/servicetelemetry/templates/manifest_snmp_traps.j2 b/roles/servicetelemetry/templates/manifest_snmp_traps.j2 index b4a48445b..cee12c01b 100644 --- a/roles/servicetelemetry/templates/manifest_snmp_traps.j2 +++ b/roles/servicetelemetry/templates/manifest_snmp_traps.j2 @@ -28,7 +28,7 @@ spec: - name: SNMP_PORT value: "{{ servicetelemetry_vars.alerting.alertmanager.receivers.snmp_traps.port }}" - name: SNMP_TIMEOUT - value: "{{ 
servicetelemetry_vars.alerting.alertmanager.receivers.snmp_traps.port }}" + value: "{{ servicetelemetry_vars.alerting.alertmanager.receivers.snmp_traps.timeout }}" - name: ALERT_OID_LABEL value: "{{ servicetelemetry_vars.alerting.alertmanager.receivers.snmp_traps.alert_oid_label }}" - name: TRAP_OID_PREFIX diff --git a/roles/servicetelemetry/vars/dummy_user_certs.yml b/roles/servicetelemetry/vars/dummy_user_certs.yml new file mode 100644 index 000000000..e352309d1 --- /dev/null +++ b/roles/servicetelemetry/vars/dummy_user_certs.yml @@ -0,0 +1,56 @@ +# These are required because sg-core always expects there to be a user cert + key, whether we need it for auth or not +# CN = STF DUMMY USER CERT - DO NOT USE +elastic_user_cert_dummy: | + -----BEGIN CERTIFICATE----- + MIIEAzCCAuugAwIBAgIUVwi6wEIGgmyQfZ8s1+oqaf+yTpcwDQYJKoZIhvcNAQEL + BQAwgZAxCzAJBgNVBAYTAlVTMRcwFQYDVQQIDA5Ob3J0aCBDYXJvbGluYTEQMA4G + A1UEBwwHUmFsZWlnaDEUMBIGA1UECgwLUmVkIEhhdCBJbmMxFTATBgNVBAsMDE9T + UCBDbG91ZG9wczEpMCcGA1UEAwwgU1RGIERVTU1ZIFVTRVIgQ0VSVCAtIERPIE5P + VCBVU0UwHhcNMjMwNjEyMTgxODQ1WhcNMjMwNjEzMTgxODQ1WjCBkDELMAkGA1UE + BhMCVVMxFzAVBgNVBAgMDk5vcnRoIENhcm9saW5hMRAwDgYDVQQHDAdSYWxlaWdo + MRQwEgYDVQQKDAtSZWQgSGF0IEluYzEVMBMGA1UECwwMT1NQIENsb3Vkb3BzMSkw + JwYDVQQDDCBTVEYgRFVNTVkgVVNFUiBDRVJUIC0gRE8gTk9UIFVTRTCCASIwDQYJ + KoZIhvcNAQEBBQADggEPADCCAQoCggEBANQU/9/BEJbuX2xJUozSbUvG7qlk6yEi + KcFjkUwnXT+131ho+UWUn29yuqXI60E+8trWsL3uFlMbGh9t2VRfbfNNZiqon197 + CfzqS596AP8HtTZZx0Qy4sZrPRs8ffR/3wMjp8kMj+2jPpMq0zngJ1efHK7Z6GSR + IveXbCCfPQU4tvT3aQ5JQkIWvIo7kuS/u9K6LvOspYP04YNLUZdMCJDNE8hSpEkv + KfG7ZL2cfWF1nsX5+qyU5aIrUS7RYd/HGMKvpA0/Lvzl5FBMZ0BCF00LmY1tjUzK + DhHR62g/IkRaq8rrjdE+H2isVgSAIPAvnC039ePE4OOsoqO+aYYWqEsCAwEAAaNT + MFEwHQYDVR0OBBYEFMKfKoCQcbkb9BBDxXAQjYLSUWtoMB8GA1UdIwQYMBaAFMKf + KoCQcbkb9BBDxXAQjYLSUWtoMA8GA1UdEwEB/wQFMAMBAf8wDQYJKoZIhvcNAQEL + BQADggEBAI9q3AhqodLtsbET7yoZ2vAj8fzJyu5FXyyEf9wVgsyoJ56H77s2yp+b + iixc/MI9jsLX9wXa8monxdbHu01rlB8c9OwbcAyAhlnBWYcPqVFz4bjqNw5sH2Jg + vqIaGncn62IZv3mvN1VpyKZp2MbZGJdbgU8X3h1C6DCLf4fToFlqiiZ/XVvbk824 + j/OZ9l6Eu8VVekIQXVp2m9PPndOuEsBIMBkiB/xf32RTiOYWSG5mp70fxD7n2of/ + yb7hY+fL/wlucqS4ryT+2307ouEcTmpDSjHwKZRUYUDBZ4TmxCx5LlkuTO9MRnRy + 9hCGFF1rI+K33F952hxjkNaSSZvt3lQ= + -----END CERTIFICATE----- + +elastic_user_key_dummy: | + -----BEGIN RSA PRIVATE KEY----- + MIIEpAIBAAKCAQEA1BT/38EQlu5fbElSjNJtS8buqWTrISIpwWORTCddP7XfWGj5 + RZSfb3K6pcjrQT7y2tawve4WUxsaH23ZVF9t801mKqifX3sJ/OpLn3oA/we1NlnH + RDLixms9Gzx99H/fAyOnyQyP7aM+kyrTOeAnV58crtnoZJEi95dsIJ89BTi29Pdp + DklCQha8ijuS5L+70rou86ylg/Thg0tRl0wIkM0TyFKkSS8p8btkvZx9YXWexfn6 + rJTloitRLtFh38cYwq+kDT8u/OXkUExnQEIXTQuZjW2NTMoOEdHraD8iRFqryuuN + 0T4faKxWBIAg8C+cLTf148Tg46yio75phhaoSwIDAQABAoIBABXMUsBcx6e7uHMY + 1jNDLZisSbt/c+tj54bJBRYetabmup4LrBNKw1hhIm4HyKZcIfn8Nw5OelzwXC7+ + y2ewp0xqmCWqTzcxHkWwjzVFBPUxhZ6ge6q20Dg0rYMvJIMM4Y8hCw3PDLwQG05l + CHDaaTDIWdpe61Pq1v07wxFXTJ5MlgjoIfDN3xCFhHOEpbNCl6yVie4irjmxItS9 + Xp1/tdqtq8xSAAo9wWGb9SjsOn/C/AMtxerdHFjv8QErrA/ta/5qXa3KdEnElHqc + 2HkGt5w5FcRXCwrUW1MwnBzwbK5kEZth3D/i41y/F4vjwYwPfHRh3AeOpDpul0XW + qH+8qQECgYEA9iiUvbepX4mnj7CIQGKlDRCvvhdCUBnIAgA9/L8WWZIDvAEl1Rka + avAIvLMCTzoAO+TframNef0dNJWOAo/WQ/ViiLaqg7gbGE2DPjLitk0XaeKl+XAv + ip0K1Qouzxv2FFJR4h9iDCWjRIeClKIhE1sEMyJk45qyR4bMx0jQZJMCgYEA3I+l + wOO0kLD2lk/t9JBiBSLUrr6/mkPkCT7wn9U7owwHuoPDJHYX8+7y2u8vow6fgQyD + Jvud8wQOV4owBOBNafBT8a3Vp3W1lLTm1r0jJ7qbVNuMAnXcj1S0Q3VNX/jvO6wn + q6Ddxqh9p9+tYSNzwnD5XqxLeZiHXWCE2fB1+GkCgYEAillwj9iD52BUvtu3GIjY + vykbvTkRWjfDQ+yi6kTz6M+6LZZvjv+W63eRUY1CxQiSTRdr6A0dqOxr17wenq38 + 
/SETikcwOuvkvpoCI5kx9sgJWse6BSHadouhJO+eM2VBv1YtE2wUDUOyKbgH2kXt + VRWYnKy+C3ZMsQrAWVlBVuUCgYBI6LNCMANgUR8yUPm3/oJocDseCLANrqOS6ttf + +nzcSP3FCglX5DHG0RY2iRqWLB9N6XTxTfvIeW7EQUneUsdEXc1h9rTJxn9fyO0F + zz/vwh/WzTxbE9r1BmsQYZZSQ1fRwfbbJTIqmUfwVmBZ2/5IKFBGm23XpDQbCezg + njxhAQKBgQD0lOpKtL8qz9gmqtkhDRe+EPHSX8rfirqqRrPUiwK7kAJeW2vtU8aa + hFT7lEDjb7ERyZfybIkTVVBipKx2yse9nE+1dPGIgZop3E1guDuF9aOAzIUd/+/s + CI7s/lIBZsPD3PyxXXRtsvN7iUv5tLvNFhfomB7miTYHE+MC5QHJVQ== + -----END RSA PRIVATE KEY----- diff --git a/tests/infrared/13/stf-connectors.yaml.template b/tests/infrared/13/stf-connectors.yaml.template index 98fca0847..c6cf36182 100644 --- a/tests/infrared/13/stf-connectors.yaml.template +++ b/tests/infrared/13/stf-connectors.yaml.template @@ -5,7 +5,7 @@ tripleo_heat_templates: custom_templates: # don't load collectd-write-qdr.yaml when using multi-cloud and instead load collectd service directly resource_registry: - OS::TripleO::Services::Collectd: /usr/share/openstack-tripleo-heat-templates/deployment/metrics/collectd-container-puppet.yaml + OS::TripleO::Services::Collectd: /usr/share/openstack-tripleo-heat-templates/docker/services/metrics/collectd.yaml # set parameter defaults to match stable-1.3 documentation parameter_defaults: diff --git a/tests/infrared/16.2/infrared-openstack.sh b/tests/infrared/16.2/infrared-openstack.sh index cb4641463..c90d433f3 100755 --- a/tests/infrared/16.2/infrared-openstack.sh +++ b/tests/infrared/16.2/infrared-openstack.sh @@ -87,7 +87,7 @@ ir_create_undercloud() { --build "${OSP_BUILD}" \ --images-task rpm \ --images-update no \ - --tls-ca https://password.corp.redhat.com/RH-IT-Root-CA.crt \ + --tls-ca https://certs.corp.redhat.com/certs/2022-IT-Root-CA.pem \ --overcloud-domain "${OVERCLOUD_DOMAIN}" \ --config-options DEFAULT.undercloud_timezone=UTC } diff --git a/tests/infrared/17.1/.gitignore b/tests/infrared/17.1/.gitignore new file mode 100644 index 000000000..7c466baa0 --- /dev/null +++ b/tests/infrared/17.1/.gitignore @@ -0,0 +1,3 @@ +outputs/** +!outputs/.KEEPIT + diff --git a/tests/infrared/17.1/README.md b/tests/infrared/17.1/README.md new file mode 100644 index 000000000..15bcf37a9 --- /dev/null +++ b/tests/infrared/17.1/README.md @@ -0,0 +1,16 @@ +# Deployments + +## Basic deployment + +```bash +OCP_ROUTE_IP="10.0.100.50" \ +CA_CERT_FILE_CONTENT="$(oc get secret/default-interconnect-selfsigned -o jsonpath='{.data.ca\.crt}' | base64 -d)" \ +AMQP_HOST="$(oc get route default-interconnect-5671 -ojsonpath='{.spec.host}')" \ +ENABLE_STF_CONNECTORS=true \ +ENABLE_GNOCCHI_CONNECTORS=false \ +CONTROLLER_MEMORY="24000" \ +COMPUTE_CPU="6" \ +COMPUTE_MEMORY="24000" \ +LIBVIRT_DISKPOOL="/home/libvirt/images" \ +./infrared-openstack.sh +``` diff --git a/tests/infrared/17.1/enable-stf.yaml.template b/tests/infrared/17.1/enable-stf.yaml.template new file mode 100644 index 000000000..c1c5215da --- /dev/null +++ b/tests/infrared/17.1/enable-stf.yaml.template @@ -0,0 +1,66 @@ +--- +tripleo_heat_templates: + [] + +custom_templates: + # matches the documentation for enable-stf.yaml in stable-1.3 documentation + parameter_defaults: + # only send to STF, not other publishers + PipelinePublishers: [] + + # manage the polling and pipeline configuration files for Ceilometer agents + ManagePolling: true + ManagePipeline: true + ManageEventPipeline: false + + # enable Ceilometer metrics + CeilometerQdrPublishMetrics: true + + # enable collection of API status + CollectdEnableSensubility: true + CollectdSensubilityTransport: amqp1 + + # enable collection of containerized service 
metrics + CollectdEnableLibpodstats: true + + # set collectd overrides for higher telemetry resolution and extra plugins + # to load + CollectdConnectionType: amqp1 + CollectdAmqpInterval: 30 + CollectdDefaultPollingInterval: 30 + CollectdExtraPlugins: + - vmem + + # set standard prefixes for where metrics are published to QDR + MetricsQdrAddresses: + - prefix: 'collectd' + distribution: multicast + - prefix: 'anycast/ceilometer' + distribution: multicast + + ExtraConfig: + ceilometer::agent::polling::polling_interval: 30 + ceilometer::agent::polling::polling_meters: + - cpu + - memory.usage + + # to avoid filling the memory buffers if disconnected from the message bus + # note: this may need an adjustment if there are many metrics to be sent. + collectd::plugin::amqp1::send_queue_limit: 5000 + + # receive extra information about virtual memory + collectd::plugin::vmem::verbose: true + + # provide name and uuid in addition to hostname for better correlation + # to ceilometer data + collectd::plugin::virt::hostname_format: "name uuid hostname" + + # provide the human-friendly name of the virtual instance + collectd::plugin::virt::plugin_instance_format: metadata + + # set memcached collectd plugin to report its metrics by hostname + # rather than host IP, ensuring metrics in the dashboard remain uniform + collectd::plugin::memcached::instances: + local: + host: "%{hiera('fqdn_canonical')}" + port: 11211 diff --git a/tests/infrared/17.1/extra-hosts.yaml.template b/tests/infrared/17.1/extra-hosts.yaml.template new file mode 100644 index 000000000..3129c35ac --- /dev/null +++ b/tests/infrared/17.1/extra-hosts.yaml.template @@ -0,0 +1,9 @@ +--- +tripleo_heat_templates: + [] + +custom_templates: + parameter_defaults: + ExtraHostFileEntries: + - '<>' + diff --git a/tests/infrared/17.1/gnocchi-connectors.yaml.template b/tests/infrared/17.1/gnocchi-connectors.yaml.template new file mode 100644 index 000000000..1a5b729a2 --- /dev/null +++ b/tests/infrared/17.1/gnocchi-connectors.yaml.template @@ -0,0 +1,24 @@ +--- +tripleo_heat_templates: + [] + +custom_templates: + resource_registry: + OS::TripleO::Services::GnocchiApi: /usr/share/openstack-tripleo-heat-templates/deployment/gnocchi/gnocchi-api-container-puppet.yaml + OS::TripleO::Services::GnocchiMetricd: /usr/share/openstack-tripleo-heat-templates/deployment/gnocchi/gnocchi-metricd-container-puppet.yaml + OS::TripleO::Services::GnocchiStatsd: /usr/share/openstack-tripleo-heat-templates/deployment/gnocchi/gnocchi-statsd-container-puppet.yaml + OS::TripleO::Services::AodhApi: /usr/share/openstack-tripleo-heat-templates/deployment/aodh/aodh-api-container-puppet.yaml + OS::TripleO::Services::AodhEvaluator: /usr/share/openstack-tripleo-heat-templates/deployment/aodh/aodh-evaluator-container-puppet.yaml + OS::TripleO::Services::AodhNotifier: /usr/share/openstack-tripleo-heat-templates/deployment/aodh/aodh-notifier-container-puppet.yaml + OS::TripleO::Services::AodhListener: /usr/share/openstack-tripleo-heat-templates/deployment/aodh/aodh-listener-container-puppet.yaml + + parameter_defaults: + CeilometerEnableGnocchi: true + CeilometerEnablePanko: false + GnocchiArchivePolicy: 'high' + GnocchiBackend: 'rbd' + GnocchiRbdPoolName: 'metrics' + + EventPipelinePublishers: ['gnocchi://?filter_project=service'] + PipelinePublishers: ['gnocchi://?filter_project=service'] + diff --git a/tests/infrared/17.1/infrared-openstack.sh b/tests/infrared/17.1/infrared-openstack.sh new file mode 100755 index 000000000..9743a0081 --- /dev/null +++ 
b/tests/infrared/17.1/infrared-openstack.sh @@ -0,0 +1,223 @@ +#!/usr/bin/env bash +set -e + +# Usage: +# VIRTHOST=my.big.hypervisor.net +# ./infrared-openstack.sh +VIRTHOST=${VIRTHOST:-localhost} +AMQP_HOST=${AMQP_HOST:-stf-default-interconnect-5671-service-telemetry.apps-crc.testing} +AMQP_PORT=${AMQP_PORT:-443} +SSH_KEY="${SSH_KEY:-${HOME}/.ssh/id_rsa}" +NTP_SERVER="${NTP_SERVER:-clock.redhat.com,10.5.27.10,10.11.160.238}" +CLOUD_NAME="${CLOUD_NAME:-cloud1}" +OCP_ROUTE_IP=${OCP_ROUTE_IP:-} + +VM_IMAGE_URL_PATH="${VM_IMAGE_URL_PATH:-http://download.devel.redhat.com/rhel-9/rel-eng/RHEL-9/latest-RHEL-9.2/compose/BaseOS/x86_64/images/}" +# Recommend these default to tested, immutable identifiers where possible; pass "latest"-style ids via environment if you want them +VM_IMAGE="${VM_IMAGE:-rhel-guest-image-9.2-20230414.17.x86_64.qcow2}" +VM_IMAGE_LOCATION="${VM_IMAGE_URL_PATH}/${VM_IMAGE}" + +OSP_BUILD="${OSP_BUILD:-passed_phase2}" +OSP_VERSION="${OSP_VERSION:-17.1}" +OSP_TOPOLOGY="${OSP_TOPOLOGY:-undercloud:1,controller:3,compute:2,ceph:0}" +OSP_MIRROR="${OSP_MIRROR:-rdu2}" +LIBVIRT_DISKPOOL="${LIBVIRT_DISKPOOL:-/var/lib/libvirt/images}" +STF_ENVIRONMENT_TEMPLATE="${STF_ENVIRONMENT_TEMPLATE:-stf-connectors.yaml.template}" +GNOCCHI_ENVIRONMENT_TEMPLATE="${GNOCCHI_ENVIRONMENT_TEMPLATE:-gnocchi-connectors.yaml.template}" +ENABLE_STF_ENVIRONMENT_TEMPLATE="${ENABLE_STF_ENVIRONMENT_TEMPLATE:-enable-stf.yaml.template}" +EXTRA_HOST_FILE_TEMPLATE="${EXTRA_HOST_FILE_TEMPLATE:-extra-hosts.yaml.template}" +OVERCLOUD_DOMAIN="${OVERCLOUD_DOMAIN:-`hostname -s`}" + +UNDERCLOUD_CPU="${UNDERCLOUD_CPU:-4}" +UNDERCLOUD_MEMORY="${UNDERCLOUD_MEMORY:-16384}" +CONTROLLER_CPU="${CONTROLLER_CPU:-2}" +CONTROLLER_MEMORY="${CONTROLLER_MEMORY:-12228}" +COMPUTE_CPU="${COMPUTE_CPU:-4}" +COMPUTE_MEMORY="${COMPUTE_MEMORY:-12228}" +CEPH_CPU="${CEPH_CPU:-2}" +CEPH_MEMORY="${CEPH_MEMORY:-4096}" + +TEMPEST_ONLY="${TEMPEST_ONLY:-false}" +RUN_WORKLOAD="${RUN_WORKLOAD:-false}" +CA_CERT_FILE_CONTENT="${CA_CERT_FILE_CONTENT:-}" +ENABLE_STF_CONNECTORS="${ENABLE_STF_CONNECTORS:-true}" +ENABLE_GNOCCHI_CONNECTORS="${ENABLE_GNOCCHI_CONNECTORS:-true}" + +ir_run_cleanup() { + infrared virsh \ + -vv \ + -o outputs/cleanup.yml \ + --disk-pool "${LIBVIRT_DISKPOOL}" \ + --host-address "${VIRTHOST}" \ + --host-key "${SSH_KEY}" \ + --cleanup yes + + echo "*** If you just want to clean up the environment, now is your chance to Ctrl+C ***" + sleep 10 +} + +ir_run_provision() { + infrared virsh \ + -vvv \ + -o outputs/provision.yml \ + --disk-pool "${LIBVIRT_DISKPOOL}" \ + --topology-nodes "${OSP_TOPOLOGY}" \ + --host-address "${VIRTHOST}" \ + --host-key "${SSH_KEY}" \ + --image-url "${VM_IMAGE_LOCATION}" \ + --host-memory-overcommit True \ + --topology-network 3_nets \ + -e override.undercloud.cpu="${UNDERCLOUD_CPU}" \ + -e override.undercloud.memory="${UNDERCLOUD_MEMORY}" \ + -e override.controller.cpu="${CONTROLLER_CPU}" \ + -e override.controller.memory="${CONTROLLER_MEMORY}" \ + -e override.compute.cpu="${COMPUTE_CPU}" \ + -e override.compute.memory="${COMPUTE_MEMORY}" \ + -e override.ceph.cpu="${CEPH_CPU}" \ + -e override.ceph.memory="${CEPH_MEMORY}" \ + --serial-files True \ + --bootmode uefi +} + +ir_create_undercloud() { + infrared tripleo-undercloud \ + -vv \ + -o outputs/undercloud-install.yml \ + --mirror "${OSP_MIRROR}" \ + --version "${OSP_VERSION}" \ + --splitstack no \ + --shade-host undercloud-0 \ + --ssl yes \ + --build "${OSP_BUILD}" \ + --images-task rpm \ + --images-update no \ + --tls-ca 
https://certs.corp.redhat.com/certs/2022-IT-Root-CA.pem \ + --overcloud-domain "${OVERCLOUD_DOMAIN}" \ + --config-options DEFAULT.undercloud_timezone=UTC +} + +stf_create_config() { + sed -r "s/<>/${AMQP_HOST}/;s/<>/${AMQP_PORT}/;s/<>/${CLOUD_NAME}/;s%<>%${CA_CERT_FILE_CONTENT//$'\n'/<@@@>}%;s/<@@@>/\n /g" ${STF_ENVIRONMENT_TEMPLATE} > outputs/stf-connectors.yaml +} + +gnocchi_create_config() { + cat ${GNOCCHI_ENVIRONMENT_TEMPLATE} > outputs/gnocchi-connectors.yaml +} + +enable_stf_create_config() { + cat ${ENABLE_STF_ENVIRONMENT_TEMPLATE} > outputs/enable-stf.yaml +} + +enable_extra_host_file_create_config() { + sed -r "s/<>/${OCP_ROUTE_IP} ${AMQP_HOST}/g" ${EXTRA_HOST_FILE_TEMPLATE} > outputs/extra-hosts.yaml +} + +ir_create_overcloud() { + infrared tripleo-overcloud \ + -vv \ + -o outputs/overcloud-install.yml \ + --version "${OSP_VERSION}" \ + --deployment-files virt \ + --overcloud-debug yes \ + --network-backend geneve \ + --network-protocol ipv4 \ + --network-bgpvpn no \ + --network-dvr no \ + --network-l2gw no \ + --storage-backend lvm \ + --overcloud-ssl no \ + --introspect yes \ + --tagging yes \ + --deploy yes \ + --overcloud-templates ceilometer-write-qdr-edge-only,outputs/enable-stf.yaml,outputs/stf-connectors.yaml,outputs/gnocchi-connectors.yaml,outputs/extra-hosts.yaml \ + --overcloud-domain "${OVERCLOUD_DOMAIN}" \ + --containers yes \ + --vbmc-force False \ + --vbmc-host undercloud \ + --config-heat ComputeParameters.NeutronBridgeMappings='tenant:br-isolated' \ + --extra-vars osp_version="${OSP_VERSION}" +} + +ir_run_tempest() { + infrared tempest \ + -vv \ + -o outputs/test.yml \ + --openstack-installer tripleo \ + --openstack-version "${OSP_VERSION}" \ + --tests smoke \ + --setup rpm \ + --revision=HEAD \ + --image http://download.cirros-cloud.net/0.4.0/cirros-0.4.0-x86_64-disk.img +} + +ir_expose_ui() { + infrared cloud-config --external-dhcp True \ + --external-shared True \ + --deployment-files virt \ + --tasks create_external_network,forward_overcloud_dashboard +} + +ir_run_workload() { + infrared cloud-config --deployment-files virt --tasks launch_workload +} + + +if [ -z "${CA_CERT_FILE_CONTENT}" ]; then + echo "CA_CERT_FILE_CONTENT must be set and passed to the deployment, or QDR will fail to connect." 
+ exit 1 +fi + +time if ${TEMPEST_ONLY}; then + echo "-- Running tempest tests" + ir_run_tempest +else + echo "-- full cloud deployment" + echo ">> Cloud name: ${CLOUD_NAME}" + echo ">> Overcloud domain: ${OVERCLOUD_DOMAIN}" + echo ">> STF enabled: ${ENABLE_STF_CONNECTORS}" + echo ">> Gnocchi enabled: ${ENABLE_GNOCCHI_CONNECTORS}" + echo ">> OSP version: ${OSP_VERSION}" + echo ">> OSP build: ${OSP_BUILD}" + echo ">> OSP topology: ${OSP_TOPOLOGY}" + + ir_run_cleanup + if ${ENABLE_STF_CONNECTORS}; then + stf_create_config + enable_stf_create_config + if [ -z "${OCP_ROUTE_IP}" ]; then + touch outputs/extra-hosts.yaml + truncate --size 0 outputs/extra-hosts.yaml + else + enable_extra_host_file_create_config + fi + else + touch outputs/stf-connectors.yaml + truncate --size 0 outputs/stf-connectors.yaml + touch outputs/enable-stf.yaml + truncate --size 0 outputs/enable-stf.yaml + touch outputs/extra-hosts.yaml + truncate --size 0 outputs/extra-hosts.yaml + fi + if ${ENABLE_GNOCCHI_CONNECTORS}; then + gnocchi_create_config + else + touch outputs/gnocchi-connectors.yaml + truncate --size 0 outputs/gnocchi-connectors.yaml + fi + ir_run_provision + ir_create_undercloud + ir_create_overcloud + ir_expose_ui + if ${RUN_WORKLOAD}; then + ir_run_workload + fi + + echo "-- deployment completed" + echo ">> Cloud name: ${CLOUD_NAME}" + echo ">> Overcloud domain: ${OVERCLOUD_DOMAIN}" + echo ">> STF enabled: ${ENABLE_STF_CONNECTORS}" + echo ">> Gnocchi enabled: ${ENABLE_GNOCCHI_CONNECTORS}" + echo ">> OSP version: ${OSP_VERSION}" + echo ">> OSP build: ${OSP_BUILD}" + echo ">> OSP topology: ${OSP_TOPOLOGY}" +fi diff --git a/tests/infrared/17.1/outputs/.KEEPIT b/tests/infrared/17.1/outputs/.KEEPIT new file mode 100644 index 000000000..e69de29bb diff --git a/tests/infrared/17.1/stf-connectors.yaml.template b/tests/infrared/17.1/stf-connectors.yaml.template new file mode 100644 index 000000000..1031e097b --- /dev/null +++ b/tests/infrared/17.1/stf-connectors.yaml.template @@ -0,0 +1,37 @@ +--- +tripleo_heat_templates: + [] + +custom_templates: + resource_registry: + OS::TripleO::Services::Collectd: /usr/share/openstack-tripleo-heat-templates/deployment/metrics/collectd-container-puppet.yaml + + parameter_defaults: + ExtraConfig: + qdr::router_id: "%{::hostname}.<>" + + MetricsQdrConnectors: + - host: <> + port: <> + role: edge + verifyHostname: false + sslProfile: sslProfile + + MetricsQdrSSLProfiles: + - name: sslProfile + caCertFileContent: | + <> + + CeilometerQdrMetricsConfig: + driver: amqp + topic: <>-metering + + CollectdAmqpInstances: + <>-telemetry: + format: JSON + presettle: false + + CollectdSensubilityResultsChannel: sensubility/<>-telemetry + + # --- below here, extended configuration for environment beyond what is documented in stable-1.3 + CollectdSensubilityLogLevel: DEBUG diff --git a/tests/infrared/README.md b/tests/infrared/README.md index 9b15471dc..ce497e1a7 100644 --- a/tests/infrared/README.md +++ b/tests/infrared/README.md @@ -17,7 +17,7 @@ to an STF instance all on one (large) baremetal machine. 
Once the deployment is complete, you can check Prometheus for data, like so: ```shell -$ PROM_HOST=$(oc get route prometheus -o jsonpath='{.spec.host}') +$ PROM_HOST=$(oc get route default-prometheus-proxy -o jsonpath='{.spec.host}') $ curl "http://${PROM_HOST}/api/v1/query?query=collectd_uptime\[10s\]" {"status":"success","data":{"resultType":"matrix","result":[{"metric":{"__name__":"collectd_uptime","endpoint":"prom-http","host":"compute-0.localdomain","service":"white-smartgateway","type":"base","uptime":"base"},"values":[[1566500715.207,"88719"],[1566500716.214,"88720"],[1566500717.207,"88721"],[1566500718.207,"88722"],[1566500720.207,"88724"],[1566500721.207,"88725"],[1566500722.207,"88726"],[1566500723.207,"88727"]]},{"metric":{"__name__":"collectd_uptime","endpoint":"prom-http","host":"controller-0.localdomain","service":"white-smartgateway","type":"base","uptime":"base"},"values":[[1566500715.207,"88700"],[1566500717.207,"88701"],[1566500718.207,"88702"],[1566500719.209,"88703"],[1566500721.207,"88704"],[1566500723.207,"88705"]]}]}} ``` diff --git a/tests/smoketest/smoketest.sh b/tests/smoketest/smoketest.sh index 8a801c004..2909e694f 100755 --- a/tests/smoketest/smoketest.sh +++ b/tests/smoketest/smoketest.sh @@ -27,7 +27,15 @@ if [ "${OC_CLIENT_VERSION_Y}" -lt "${OC_CLIENT_VERSION_Y_REQUIRED}" ] || [ "${OC exit 1 fi +if [ "$(oc get stf default -o=jsonpath='{.spec.transports.qdr.auth}')" != "none" ]; then + echo "*** QDR authentication is currently not supported in smoketests." + echo "To disable it, use: oc patch stf default --patch '{\"spec\":{\"transports\":{\"qdr\":{\"auth\":\"none\"}}}}' --type=merge" + echo "For more info: https://github.com/infrawatch/service-telemetry-operator/pull/492" + exit 1 +fi + CLEANUP=${CLEANUP:-true} +SMOKETEST_VERBOSE=${SMOKETEST_VERBOSE:-true} for ((i=1; i<=NUMCLOUDS; i++)); do NAME="smoke${i}" @@ -71,33 +79,18 @@ echo "*** [INFO] Triggering an alertmanager notification..." # check if the oc client version is less than 4.11 and adjust the token command to match available commands if [ 0${OC_CLIENT_VERSION_Y} -lt 011 ]; then - PROMETHEUS_K8S_TOKEN=$(oc serviceaccounts get-token prometheus-k8s) + PROMETHEUS_K8S_TOKEN=$(oc serviceaccounts get-token prometheus-stf) else - PROMETHEUS_K8S_TOKEN=$(oc create token prometheus-k8s) + PROMETHEUS_K8S_TOKEN=$(oc create token prometheus-stf) fi -oc run curl --restart='Never' --image=quay.io/infrawatch/busyboxplus:curl -- sh -c "curl -k -H \"Content-Type: application/json\" -H \"Authorization: Bearer ${PROMETHEUS_K8S_TOKEN}\" -d '[{\"labels\":{\"alertname\":\"Testalert1\"}}]' https://default-alertmanager-proxy:9095/api/v1/alerts" -# it takes some time to get the alert delivered, continuing with other tests - - -# Trying to find a less brittle test than a timeout -JOB_TIMEOUT=300s -for NAME in "${CLOUDNAMES[@]}"; do - echo "*** [INFO] Waiting on job/stf-smoketest-${NAME}..." 
- oc wait --for=condition=complete --timeout=${JOB_TIMEOUT} "job/stf-smoketest-${NAME}" - RET=$((RET || $?)) # Accumulate exit codes -done - -echo "*** [INFO] Checking that the qdr certificate has a long expiry" -EXPIRETIME=$(oc get secret default-interconnect-openstack-ca -o json | grep \"tls.crt\"\: | awk -F '": "' '{print $2}' | rev | cut -c3- | rev | base64 -d | openssl x509 -in - -text | grep "Not After" | awk -F " : " '{print $2}') -EXPIRETIME_UNIX=$(date -d "${EXPIRETIME}" "+%s") -TARGET_UNIX=$(date -d "now + 7 years" "+%s") -if [ ${EXPIRETIME_UNIX} -lt ${TARGET_UNIX} ]; then - echo "[FAILURE] Certificate expire time (${EXPIRETIME}) less than 7 years from now" -fi +# create the alert using startsAt which in theory may cause trigger to be faster +echo "*** [INFO] Create alert" +oc delete pod -l run=curl ; oc run curl --wait --restart='Never' --image=quay.io/infrawatch/busyboxplus:curl -- sh -c "curl -v -k -H \"Content-Type: application/json\" -H \"Authorization: Bearer ${PROMETHEUS_K8S_TOKEN}\" -d '[{\"status\":\"firing\",\"labels\":{\"alertname\":\"smoketest\",\"severity\":\"warning\"},\"startsAt\":\"$(date --rfc-3339=seconds | sed 's/ /T/')\"}]' https://default-alertmanager-proxy:9095/api/v1/alerts" +oc wait --for=jsonpath='{.status.phase}'=Succeeded pod/curl +oc logs curl echo "*** [INFO] Waiting to see SNMP trap message in webhook pod" -oc delete pod curl SNMP_WEBHOOK_POD=$(oc get pod -l "app=default-snmp-webhook" -ojsonpath='{.items[0].metadata.name}') SNMP_WEBHOOK_CHECK_MAX_TRIES=5 SNMP_WEBHOOK_CHECK_TIMEOUT=30 @@ -112,74 +105,88 @@ while [ $SNMP_WEBHOOK_CHECK_COUNT -lt $SNMP_WEBHOOK_CHECK_MAX_TRIES ]; do sleep $SNMP_WEBHOOK_CHECK_TIMEOUT done -echo "*** [INFO] Showing oc get all..." -oc get all -echo - -echo "*** [INFO] Showing servicemonitors..." -oc get servicemonitor -o yaml -echo - -echo "*** [INFO] Logs from smoketest containers..." +# Trying to find a less brittle test than a timeout +JOB_TIMEOUT=300s for NAME in "${CLOUDNAMES[@]}"; do - oc logs "$(oc get pod -l "job-name=stf-smoketest-${NAME}" -o jsonpath='{.items[0].metadata.name}')" -c smoketest-collectd - oc logs "$(oc get pod -l "job-name=stf-smoketest-${NAME}" -o jsonpath='{.items[0].metadata.name}')" -c smoketest-ceilometer + echo "*** [INFO] Waiting on job/stf-smoketest-${NAME}..." + oc wait --for=condition=complete --timeout=${JOB_TIMEOUT} "job/stf-smoketest-${NAME}" + RET=$((RET || $?)) # Accumulate exit codes done -echo - -echo "*** [INFO] Logs from qdr..." -oc logs "$(oc get pod -l application=default-interconnect -o jsonpath='{.items[0].metadata.name}')" -echo - -echo "*** [INFO] Logs from smart gateways..." 
-oc logs "$(oc get pod -l "smart-gateway=default-cloud1-coll-meter" -o jsonpath='{.items[0].metadata.name}')" -c bridge -oc logs "$(oc get pod -l "smart-gateway=default-cloud1-coll-meter" -o jsonpath='{.items[0].metadata.name}')" -c sg-core -oc logs "$(oc get pod -l "smart-gateway=default-cloud1-coll-event" -o jsonpath='{.items[0].metadata.name}')" -c bridge -oc logs "$(oc get pod -l "smart-gateway=default-cloud1-coll-event" -o jsonpath='{.items[0].metadata.name}')" -c sg-core -oc logs "$(oc get pod -l "smart-gateway=default-cloud1-ceil-meter" -o jsonpath='{.items[0].metadata.name}')" -c bridge -oc logs "$(oc get pod -l "smart-gateway=default-cloud1-ceil-meter" -o jsonpath='{.items[0].metadata.name}')" -c sg-core -oc logs "$(oc get pod -l "smart-gateway=default-cloud1-ceil-event" -o jsonpath='{.items[0].metadata.name}')" -c bridge -oc logs "$(oc get pod -l "smart-gateway=default-cloud1-ceil-event" -o jsonpath='{.items[0].metadata.name}')" -c sg-core -oc logs "$(oc get pod -l "smart-gateway=default-cloud1-sens-meter" -o jsonpath='{.items[0].metadata.name}')" -c bridge -oc logs "$(oc get pod -l "smart-gateway=default-cloud1-sens-meter" -o jsonpath='{.items[0].metadata.name}')" -c sg-core -echo - -echo "*** [INFO] Logs from smart gateway operator..." -oc logs "$(oc get pod -l app=smart-gateway-operator -o jsonpath='{.items[0].metadata.name}')" -echo -echo "*** [INFO] Logs from prometheus..." -oc logs "$(oc get pod -l prometheus=default -o jsonpath='{.items[0].metadata.name}')" -c prometheus -echo +echo "*** [INFO] Checking that the qdr certificate has a long expiry" +EXPIRETIME=$(oc get secret default-interconnect-openstack-ca -o json | grep \"tls.crt\"\: | awk -F '": "' '{print $2}' | rev | cut -c3- | rev | base64 -d | openssl x509 -text | grep "Not After" | awk -F " : " '{print $2}') +EXPIRETIME_UNIX=$(date -d "${EXPIRETIME}" "+%s") +TARGET_UNIX=$(date -d "now + 7 years" "+%s") +if [ ${EXPIRETIME_UNIX} -lt ${TARGET_UNIX} ]; then + echo "[FAILURE] Certificate expire time (${EXPIRETIME}) less than 7 years from now" +fi -echo "*** [INFO] Logs from elasticsearch..." -oc logs "$(oc get pod -l common.k8s.elastic.co/type=elasticsearch -o jsonpath='{.items[0].metadata.name}')" +echo "*** [INFO] Showing oc get all..." +oc get all echo -echo "*** [INFO] Logs from snmp webhook..." -oc logs "$(oc get pod -l app=default-snmp-webhook -o jsonpath='{.items[0].metadata.name}')" +echo "*** [INFO] Showing servicemonitors..." +oc get servicemonitors.monitoring.rhobs -o yaml echo -echo "*** [INFO] Logs from alertmanager..." -oc logs "$(oc get pod -l app.kubernetes.io/name=alertmanager -o jsonpath='{.items[0].metadata.name}')" -c alertmanager -echo +if [ "$SMOKETEST_VERBOSE" = "true" ]; then + echo "*** [INFO] Logs from smoketest containers..." + for NAME in "${CLOUDNAMES[@]}"; do + oc logs "$(oc get pod -l "job-name=stf-smoketest-${NAME}" -o jsonpath='{.items[0].metadata.name}')" -c smoketest-collectd + oc logs "$(oc get pod -l "job-name=stf-smoketest-${NAME}" -o jsonpath='{.items[0].metadata.name}')" -c smoketest-ceilometer + done + echo + + echo "*** [INFO] Logs from qdr..." + oc logs "$(oc get pod -l application=default-interconnect -o jsonpath='{.items[0].metadata.name}')" + echo + + echo "*** [INFO] Logs from smart gateways..." 
+ oc logs "$(oc get pod -l "smart-gateway=default-cloud1-coll-meter" -o jsonpath='{.items[0].metadata.name}')" -c bridge + oc logs "$(oc get pod -l "smart-gateway=default-cloud1-coll-meter" -o jsonpath='{.items[0].metadata.name}')" -c sg-core + oc logs "$(oc get pod -l "smart-gateway=default-cloud1-coll-event" -o jsonpath='{.items[0].metadata.name}')" -c bridge + oc logs "$(oc get pod -l "smart-gateway=default-cloud1-coll-event" -o jsonpath='{.items[0].metadata.name}')" -c sg-core + oc logs "$(oc get pod -l "smart-gateway=default-cloud1-ceil-meter" -o jsonpath='{.items[0].metadata.name}')" -c bridge + oc logs "$(oc get pod -l "smart-gateway=default-cloud1-ceil-meter" -o jsonpath='{.items[0].metadata.name}')" -c sg-core + oc logs "$(oc get pod -l "smart-gateway=default-cloud1-ceil-event" -o jsonpath='{.items[0].metadata.name}')" -c bridge + oc logs "$(oc get pod -l "smart-gateway=default-cloud1-ceil-event" -o jsonpath='{.items[0].metadata.name}')" -c sg-core + oc logs "$(oc get pod -l "smart-gateway=default-cloud1-sens-meter" -o jsonpath='{.items[0].metadata.name}')" -c bridge + oc logs "$(oc get pod -l "smart-gateway=default-cloud1-sens-meter" -o jsonpath='{.items[0].metadata.name}')" -c sg-core + echo + + echo "*** [INFO] Logs from smart gateway operator..." + oc logs "$(oc get pod -l app=smart-gateway-operator -o jsonpath='{.items[0].metadata.name}')" + echo + + echo "*** [INFO] Logs from prometheus..." + oc logs "$(oc get pod -l prometheus=default -o jsonpath='{.items[0].metadata.name}')" -c prometheus + echo + + echo "*** [INFO] Logs from elasticsearch..." + oc logs "$(oc get pod -l common.k8s.elastic.co/type=elasticsearch -o jsonpath='{.items[0].metadata.name}')" + echo + + echo "*** [INFO] Logs from snmp webhook..." + oc logs "$(oc get pod -l app=default-snmp-webhook -o jsonpath='{.items[0].metadata.name}')" + echo + + echo "*** [INFO] Logs from alertmanager..." + oc logs "$(oc get pod -l app.kubernetes.io/name=alertmanager -o jsonpath='{.items[0].metadata.name}')" -c alertmanager + echo +fi echo "*** [INFO] Cleanup resources..." 
if $CLEANUP; then oc delete "job/stf-smoketest-${NAME}" + # resolve the alert to clean up the system, otherwise this expires in 5 minutes + oc delete pod -l run=curl ; oc run curl --restart='Never' --image=quay.io/infrawatch/busyboxplus:curl -- sh -c "curl -v -k -H \"Content-Type: application/json\" -H \"Authorization: Bearer ${PROMETHEUS_K8S_TOKEN}\" -d '[{\"status\":\"firing\",\"labels\":{\"alertname\":\"smoketest\",\"severity\":\"warning\"},\"startsAt\":\"$(date --rfc-3339=seconds | sed 's/ /T/')\",\"endsAt\":\"$(date --rfc-3339=seconds | sed 's/ /T/')\"}]' https://default-alertmanager-proxy:9095/api/v1/alerts" fi echo -if [ $SNMP_WEBHOOK_STATUS -ne 0 ]; then - echo "*** [FAILURE] SNMP Webhook failed" - exit 1 -fi - -if [ $RET -eq 0 ]; then +if [ $RET -eq 0 ] && [ $SNMP_WEBHOOK_STATUS -eq 0 ]; then echo "*** [SUCCESS] Smoke test job completed successfully" + exit 0 else echo "*** [FAILURE] Smoke test job still not succeeded after ${JOB_TIMEOUT}" + exit 1 fi -echo - -exit $RET diff --git a/tests/smoketest/smoketest_ceilometer_entrypoint.sh b/tests/smoketest/smoketest_ceilometer_entrypoint.sh index 8101b00a5..8e2ac7f6f 100644 --- a/tests/smoketest/smoketest_ceilometer_entrypoint.sh +++ b/tests/smoketest/smoketest_ceilometer_entrypoint.sh @@ -1,5 +1,5 @@ #!/bin/sh -set -e +set +e # Executes inside the test harness container to start collectd and look for resulting metrics in prometheus PROMETHEUS=${PROMETHEUS:-"https://default-prometheus-proxy:9092"} @@ -28,42 +28,45 @@ echo "*** [INFO] Checking for recent image metrics..." echo "[DEBUG] Running the curl command to return a query" curl -k -u "internal:${PROMETHEUS_AUTH_PASS}" -g "${PROMETHEUS}/api/v1/query?" --data-urlencode 'query=ceilometer_image_size' 2>&1 | grep '"result":\[{"metric":{"__name__":"ceilometer_image_size"' -echo "[DEBUG] Query returned" metrics_result=$? echo "[DEBUG] Set metrics_result to $metrics_result" -echo "*** [INFO] Get documents for this test from ElasticSearch..." -DOCUMENT_HITS=$(curl -sk -u "elastic:${ELASTICSEARCH_AUTH_PASS}" -X GET "${ELASTICSEARCH}/_search" -H 'Content-Type: application/json' -d'{ - "query": { - "bool": { - "filter": [ - { "term" : { "labels.instance" : { "value" : "'${CLOUDNAME}'", "boost" : 1.0 } } }, - { "range" : { "startsAt" : { "gte" : "now-1m", "lt" : "now" } } } - ] +if [ "$OBSERVABILITY_STRATEGY" != "use_redhat" ]; then + echo "*** [INFO] Get documents for this test from ElasticSearch..." + DOCUMENT_HITS=$(curl -sk -u "elastic:${ELASTICSEARCH_AUTH_PASS}" -X GET "${ELASTICSEARCH}/_search" -H 'Content-Type: application/json' -d'{ + "query": { + "bool": { + "filter": [ + { "term" : { "labels.instance" : { "value" : "'${CLOUDNAME}'", "boost" : 1.0 } } }, + { "range" : { "startsAt" : { "gte" : "now-1m", "lt" : "now" } } } + ] + } } - } -}' | python3 -c "import sys, json; parsed = json.load(sys.stdin); print(parsed['hits']['total']['value'])") + }' | python3 -c "import sys, json; parsed = json.load(sys.stdin); print(parsed['hits']['total']['value'])") -echo "*** [INFO] List of indices for debugging..." -curl -sk -u "elastic:${ELASTICSEARCH_AUTH_PASS}" -X GET "${ELASTICSEARCH}/_cat/indices/ceilometer_*?s=index" -echo + echo "*** [INFO] List of indices for debugging..." + curl -sk -u "elastic:${ELASTICSEARCH_AUTH_PASS}" -X GET "${ELASTICSEARCH}/_cat/indices/ceilometer_*?s=index" + echo -echo "*** [INFO] Get documents for this test from ElasticSearch..." 
-ES_INDEX=ceilometer_image -DOCUMENT_HITS=$(curl -sk -u "elastic:${ELASTICSEARCH_AUTH_PASS}" -X GET "${ELASTICSEARCH}/${ES_INDEX}/_search" -H 'Content-Type: application/json' -d'{ - "query": { - "match_all": {} - } -}'| python3 -c "import sys, json; parsed = json.load(sys.stdin); print(parsed['hits']['total']['value'])") + echo "*** [INFO] Get documents for this test from ElasticSearch..." + ES_INDEX=ceilometer_image + DOCUMENT_HITS=$(curl -sk -u "elastic:${ELASTICSEARCH_AUTH_PASS}" -X GET "${ELASTICSEARCH}/${ES_INDEX}/_search" -H 'Content-Type: application/json' -d'{ + "query": { + "match_all": {} + } + }'| python3 -c "import sys, json; parsed = json.load(sys.stdin); print(parsed['hits']['total']['value'])") -echo "*** [INFO] Found ${DOCUMENT_HITS} documents" -echo; echo + echo "*** [INFO] Found ${DOCUMENT_HITS} documents" + echo; echo -# check if we got documents back for this test -events_result=1 -if [ "$DOCUMENT_HITS" -gt "0" ]; then - events_result=0 + # check if we got documents back for this test + events_result=1 + if [ "$DOCUMENT_HITS" -gt "0" ]; then + events_result=0 + fi +else + events_result=0 fi echo "[INFO] Verification exit codes (0 is passing, non-zero is a failure): events=${events_result} metrics=${metrics_result}" diff --git a/tests/smoketest/smoketest_collectd_entrypoint.sh b/tests/smoketest/smoketest_collectd_entrypoint.sh index 81c12d9fe..d7f5132e8 100755 --- a/tests/smoketest/smoketest_collectd_entrypoint.sh +++ b/tests/smoketest/smoketest_collectd_entrypoint.sh @@ -1,5 +1,5 @@ #!/bin/sh -set -e +set +e # Executes inside the test harness container to start collectd and look for resulting metrics in prometheus PROMETHEUS=${PROMETHEUS:-"https://default-prometheus-proxy:9092"} @@ -62,7 +62,7 @@ grep -E '"result":\[{"metric":{"__name__":"sensubility_container_health_status", metrics_result=$((metrics_result || $?)) echo; echo -echo "*** [INFO] Get documents for this test from ElasticSearch..." +echo "*** [INFO] Get documents for this test from Elasticsearch..." DOCUMENT_HITS=$(curl -sk -u "elastic:${ELASTICSEARCH_AUTH_PASS}" -X GET "${ELASTICSEARCH}/_search" -H 'Content-Type: application/json' -d'{ "query": { "bool": { @@ -83,6 +83,7 @@ if [ "$DOCUMENT_HITS" -gt "0" ]; then events_result=0 fi + echo "[INFO] Verification exit codes (0 is passing, non-zero is a failure): events=${events_result} metrics=${metrics_result}" echo; echo diff --git a/tests/smoketest/smoketest_job.yaml.template b/tests/smoketest/smoketest_job.yaml.template index 50735b6a5..4a9c20cc9 100644 --- a/tests/smoketest/smoketest_job.yaml.template +++ b/tests/smoketest/smoketest_job.yaml.template @@ -24,6 +24,8 @@ spec: value: "<>" - name: PROMETHEUS_AUTH_PASS value: "<>" + - name: OBSERVABILITY_STRATEGY + value: "<>" volumeMounts: - name: collectd-config mountPath: /etc/minimal-collectd.conf.template @@ -51,6 +53,8 @@ spec: value: "<>" - name: PROMETHEUS_AUTH_PASS value: "<>" + - name: OBSERVABILITY_STRATEGY + value: "<>" volumeMounts: - name: ceilometer-publisher mountPath: /ceilometer_publish.py
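
As a usage note on the smoketest changes in this patch: `tests/smoketest/smoketest.sh` now exits early unless QDR authentication is disabled on the ServiceTelemetry object. A minimal pre-flight sketch, assuming the `default` object name used throughout this patch; the guard and the patch command are the ones the script itself prints:

```bash
#!/usr/bin/env bash
# Pre-flight for smoketest.sh: QDR auth must be "none" or the test exits early.
# Assumes the ServiceTelemetry object is named "default", as elsewhere in this patch.
if [ "$(oc get stf default -o=jsonpath='{.spec.transports.qdr.auth}')" != "none" ]; then
  # Same patch the smoketest suggests in its error message (see PR #492 above)
  oc patch stf default --type=merge \
    --patch '{"spec":{"transports":{"qdr":{"auth":"none"}}}}'
fi
```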
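Similarly, the `roles/servicetelemetry/tasks/pre.yml` changes only backfill `observabilityStrategy: use_community` when a community Prometheus object already exists and the field was previously unset; new objects receive the operator default. A quick way to confirm which strategy a given deployment settled on, again assuming the `default` object name (the field and values are those referenced in this patch):

```bash
# Prints one of the strategy values referenced in this patch:
# use_redhat, use_community, or use_hybrid (empty if the field is still unset).
oc get stf default -o=jsonpath='{.spec.observabilityStrategy}'; echo
```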