From b2149e596e25832a3e5a356195fa46d00087a989 Mon Sep 17 00:00:00 2001 From: Trevor Vardeman <34040350+tvardema@users.noreply.github.com> Date: Thu, 18 Jul 2024 16:46:18 -0500 Subject: [PATCH] MULTIARCH-4785 Multiarch Compute Day 0 arm x86 GCP (#54324) The multiarch team is working on an installer PR to enable day-0 multiarch compute deployments. This PR enables this on GCP. --- .../installer/openshift-installer-master.yaml | 16 ++++ ...nshift-multiarch-master__nightly-4.17.yaml | 22 +++++ ...openshift-installer-master-presubmits.yaml | 81 +++++++++++++++++++ .../openshift-multiarch-master-periodics.yaml | 81 +++++++++++++++++++ .../ipi/conf/gcp/ipi-conf-gcp-commands.sh | 55 ++++++++----- .../ipi/conf/gcp/ipi-conf-gcp-ref.yaml | 10 ++- 6 files changed, 245 insertions(+), 20 deletions(-) diff --git a/ci-operator/config/openshift/installer/openshift-installer-master.yaml b/ci-operator/config/openshift/installer/openshift-installer-master.yaml index 3cffb65510ddc..a3248292e1427 100644 --- a/ci-operator/config/openshift/installer/openshift-installer-master.yaml +++ b/ci-operator/config/openshift/installer/openshift-installer-master.yaml @@ -887,8 +887,24 @@ tests: CONTROL_ARCH: arm64 FEATURE_GATES: '["MultiArchInstallAWS=true"]' FEATURE_SET: CustomNoUpgrade + TEST_SKIPS: oc new-app should succeed\| build can reference a cluster service workflow: openshift-e2e-aws-heterogeneous-day-0 timeout: 6h0m0s +- always_run: false + as: e2e-gcp-ovn-heterogeneous + optional: true + steps: + cluster_profile: gcp-arm64 + dependencies: + OPENSHIFT_INSTALL_RELEASE_IMAGE_OVERRIDE: release:multi-latest + env: + COMPUTE_ARCH: amd64 + CONTROL_ARCH: arm64 + FEATURE_GATES: '["MultiArchInstallGCP=true"]' + FEATURE_SET: CustomNoUpgrade + TEST_SKIPS: oc new-app should succeed\| build can reference a cluster service + workflow: openshift-e2e-gcp-ovn + timeout: 6h0m0s zz_generated_metadata: branch: master org: openshift diff --git a/ci-operator/config/openshift/multiarch/openshift-multiarch-master__nightly-4.17.yaml b/ci-operator/config/openshift/multiarch/openshift-multiarch-master__nightly-4.17.yaml index 66056cb977839..9efb399602e24 100644 --- a/ci-operator/config/openshift/multiarch/openshift-multiarch-master__nightly-4.17.yaml +++ b/ci-operator/config/openshift/multiarch/openshift-multiarch-master__nightly-4.17.yaml @@ -718,6 +718,28 @@ tests: - ref: ipi-install-heterogeneous - ref: openshift-e2e-test workflow: openshift-e2e-gcp-ovn +- as: ocp-e2e-gcp-ovn-heterogeneous-day-0 + cron: 0 11 * * 0 + steps: + cluster_profile: gcp-arm64 + dependencies: + OPENSHIFT_INSTALL_RELEASE_IMAGE_OVERRIDE: release:multi-latest + env: + COMPUTE_ARCH: amd64 + CONTROL_ARCH: arm64 + OCP_ARCH: multi + TEST_SKIPS: deploymentconfigs\| should expose cluster services outside the cluster\| + FIPS TestFIPS\| Multi-stage image builds should succeed\| Optimized image + builds should succeed\| build can reference a cluster service\| custom build + with buildah\| oc new-app should succeed\| prune builds based on settings\| + s2i build with a root\| verify /run filesystem contents\| oc can run\| oc + debug\| oc idle\| Pods cannot access\| Image append should create\| Image + extract should extract\| Image info should display\| Image layer subresource\| + oc tag should change image\| when installed on the cluster should\| OpenShift + alerting rules\| The HAProxy router should\| egressrouter cni resources\| + pod should start\| pod sysctls\| build volumes should mount given secrets + and configmaps into the build pod + workflow: openshift-e2e-gcp-ovn - as: ocp-e2e-upgrade-gcp-ovn-heterogeneous interval: 72h steps: diff --git a/ci-operator/jobs/openshift/installer/openshift-installer-master-presubmits.yaml b/ci-operator/jobs/openshift/installer/openshift-installer-master-presubmits.yaml index 10bee9a920e18..ba5eb0b290610 100644 --- a/ci-operator/jobs/openshift/installer/openshift-installer-master-presubmits.yaml +++ b/ci-operator/jobs/openshift/installer/openshift-installer-master-presubmits.yaml @@ -5797,6 +5797,87 @@ presubmits: secret: secretName: result-aggregator trigger: (?m)^/test( | .* )e2e-gcp-ovn-byo-vpc,?($|\s.*) + - agent: kubernetes + always_run: false + branches: + - ^master$ + - ^master- + cluster: build02 + context: ci/prow/e2e-gcp-ovn-heterogeneous + decorate: true + decoration_config: + timeout: 6h0m0s + labels: + ci-operator.openshift.io/cloud: gcp + ci-operator.openshift.io/cloud-cluster-profile: gcp-arm64 + ci.openshift.io/generator: prowgen + pj-rehearse.openshift.io/can-be-rehearsed: "true" + name: pull-ci-openshift-installer-master-e2e-gcp-ovn-heterogeneous + optional: true + rerun_command: /test e2e-gcp-ovn-heterogeneous + spec: + containers: + - args: + - --gcs-upload-secret=/secrets/gcs/service-account.json + - --image-import-pull-secret=/etc/pull-secret/.dockerconfigjson + - --lease-server-credentials-file=/etc/boskos/credentials + - --report-credentials-file=/etc/report/credentials + - --secret-dir=/secrets/ci-pull-credentials + - --secret-dir=/usr/local/e2e-gcp-ovn-heterogeneous-cluster-profile + - --target=e2e-gcp-ovn-heterogeneous + command: + - ci-operator + image: ci-operator:latest + imagePullPolicy: Always + name: "" + resources: + requests: + cpu: 10m + volumeMounts: + - mountPath: /etc/boskos + name: boskos + readOnly: true + - mountPath: /secrets/ci-pull-credentials + name: ci-pull-credentials + readOnly: true + - mountPath: /usr/local/e2e-gcp-ovn-heterogeneous-cluster-profile + name: cluster-profile + - mountPath: /secrets/gcs + name: gcs-credentials + readOnly: true + - mountPath: /secrets/manifest-tool + name: manifest-tool-local-pusher + readOnly: true + - mountPath: /etc/pull-secret + name: pull-secret + readOnly: true + - mountPath: /etc/report + name: result-aggregator + readOnly: true + serviceAccountName: ci-operator + volumes: + - name: boskos + secret: + items: + - key: credentials + path: credentials + secretName: boskos-credentials + - name: ci-pull-credentials + secret: + secretName: ci-pull-credentials + - name: cluster-profile + secret: + secretName: cluster-secrets-gcp-arm64 + - name: manifest-tool-local-pusher + secret: + secretName: manifest-tool-local-pusher + - name: pull-secret + secret: + secretName: registry-pull-credentials + - name: result-aggregator + secret: + secretName: result-aggregator + trigger: (?m)^/test( | .* )e2e-gcp-ovn-heterogeneous,?($|\s.*) - agent: kubernetes always_run: false branches: diff --git a/ci-operator/jobs/openshift/multiarch/openshift-multiarch-master-periodics.yaml b/ci-operator/jobs/openshift/multiarch/openshift-multiarch-master-periodics.yaml index 7e857171bf431..be0fa6409edc1 100644 --- a/ci-operator/jobs/openshift/multiarch/openshift-multiarch-master-periodics.yaml +++ b/ci-operator/jobs/openshift/multiarch/openshift-multiarch-master-periodics.yaml @@ -24664,6 +24664,87 @@ periodics: - name: result-aggregator secret: secretName: result-aggregator +- agent: kubernetes + cluster: build02 + cron: 0 11 * * 0 + decorate: true + decoration_config: + skip_cloning: true + extra_refs: + - base_ref: master + org: openshift + repo: multiarch + labels: + ci-operator.openshift.io/cloud: gcp + ci-operator.openshift.io/cloud-cluster-profile: gcp-arm64 + ci-operator.openshift.io/variant: nightly-4.17 + ci.openshift.io/generator: prowgen + job-release: "4.17" + pj-rehearse.openshift.io/can-be-rehearsed: "true" + name: periodic-ci-openshift-multiarch-master-nightly-4.17-ocp-e2e-gcp-ovn-heterogeneous-day-0 + spec: + containers: + - args: + - --gcs-upload-secret=/secrets/gcs/service-account.json + - --image-import-pull-secret=/etc/pull-secret/.dockerconfigjson + - --lease-server-credentials-file=/etc/boskos/credentials + - --report-credentials-file=/etc/report/credentials + - --secret-dir=/secrets/ci-pull-credentials + - --secret-dir=/usr/local/ocp-e2e-gcp-ovn-heterogeneous-day-0-cluster-profile + - --target=ocp-e2e-gcp-ovn-heterogeneous-day-0 + - --variant=nightly-4.17 + command: + - ci-operator + image: ci-operator:latest + imagePullPolicy: Always + name: "" + resources: + requests: + cpu: 10m + volumeMounts: + - mountPath: /etc/boskos + name: boskos + readOnly: true + - mountPath: /secrets/ci-pull-credentials + name: ci-pull-credentials + readOnly: true + - mountPath: /usr/local/ocp-e2e-gcp-ovn-heterogeneous-day-0-cluster-profile + name: cluster-profile + - mountPath: /secrets/gcs + name: gcs-credentials + readOnly: true + - mountPath: /secrets/manifest-tool + name: manifest-tool-local-pusher + readOnly: true + - mountPath: /etc/pull-secret + name: pull-secret + readOnly: true + - mountPath: /etc/report + name: result-aggregator + readOnly: true + serviceAccountName: ci-operator + volumes: + - name: boskos + secret: + items: + - key: credentials + path: credentials + secretName: boskos-credentials + - name: ci-pull-credentials + secret: + secretName: ci-pull-credentials + - name: cluster-profile + secret: + secretName: cluster-secrets-gcp-arm64 + - name: manifest-tool-local-pusher + secret: + secretName: manifest-tool-local-pusher + - name: pull-secret + secret: + secretName: registry-pull-credentials + - name: result-aggregator + secret: + secretName: result-aggregator - agent: kubernetes cluster: build02 cron: 20 13 * * * diff --git a/ci-operator/step-registry/ipi/conf/gcp/ipi-conf-gcp-commands.sh b/ci-operator/step-registry/ipi/conf/gcp/ipi-conf-gcp-commands.sh index 4e290367c6e04..6cbb55976d82e 100755 --- a/ci-operator/step-registry/ipi/conf/gcp/ipi-conf-gcp-commands.sh +++ b/ci-operator/step-registry/ipi/conf/gcp/ipi-conf-gcp-commands.sh @@ -20,24 +20,41 @@ fi # Do not change the default family type without consulting with cloud financial operations as their may # be active savings plans targeting this machine class. master_type="" -# Temporary test to see if this helps the consistent high CPU alerts and random test failures -master_type_suffix="-custom-6-16384" -# TODO: remove if block and revert master_type_suffix back to standard if/when we switch back to standard -# custom sizes are not supported by arm64 VMs -if [ "${OCP_ARCH}" = "arm64" ]; then - master_type_suffix="-standard-4" -fi -if [[ "${SIZE_VARIANT}" == "xlarge" ]]; then - master_type_suffix="-standard-32" -elif [[ "${SIZE_VARIANT}" == "large" ]]; then - master_type_suffix="-standard-16" -elif [[ "${SIZE_VARIANT}" == "compact" ]]; then - master_type_suffix="-standard-8" + +case "${SIZE_VARIANT}" in + "xlarge") + master_type_suffix="standard-32" + ;; + "large") + master_type_suffix="standard-16" + ;; + "compact") + master_type_suffix="standard-8" + ;; + *) + if [[ "${CONTROL_ARCH}" == "arm64" ]]; then + master_type_suffix="standard-4" + else + # Temporary test to see if this helps the consistent high CPU alerts and random test failures + master_type_suffix="custom-6-16384" + # TODO: remove if block and revert master_type_suffix back to standard if/when we switch back to standard + # custom sizes are not supported by arm64 VMs + fi + ;; +esac + +if [[ "${CONTROL_ARCH}" == "amd64" ]]; then + master_type="e2-${master_type_suffix}" +elif [[ "${CONTROL_ARCH}" == "arm64" ]]; then + master_type="t2a-${master_type_suffix}" fi -if [ "${OCP_ARCH}" = "amd64" ]; then - master_type="e2${master_type_suffix}" -elif [ "${OCP_ARCH}" = "arm64" ]; then - master_type="t2a${master_type_suffix}" + +if [[ -z "${COMPUTE_NODE_TYPE}" ]]; then + if [[ "${COMPUTE_ARCH}" == "arm64" ]]; then + COMPUTE_NODE_TYPE="t2a-standard-4" + else + COMPUTE_NODE_TYPE="e2-standard-4" + fi fi cat >> "${CONFIG}" << EOF @@ -47,7 +64,7 @@ platform: projectID: ${GCP_PROJECT} region: ${GCP_REGION} controlPlane: - architecture: ${OCP_ARCH} + architecture: ${CONTROL_ARCH:-${OCP_ARCH}} name: master platform: gcp: @@ -57,7 +74,7 @@ controlPlane: diskSizeGB: 200 replicas: ${masters} compute: -- architecture: ${OCP_ARCH} +- architecture: ${COMPUTE_ARCH:-${OCP_ARCH}} name: worker replicas: ${workers} platform: diff --git a/ci-operator/step-registry/ipi/conf/gcp/ipi-conf-gcp-ref.yaml b/ci-operator/step-registry/ipi/conf/gcp/ipi-conf-gcp-ref.yaml index aa9ba362c7e53..e369ee49577e4 100644 --- a/ci-operator/step-registry/ipi/conf/gcp/ipi-conf-gcp-ref.yaml +++ b/ci-operator/step-registry/ipi/conf/gcp/ipi-conf-gcp-ref.yaml @@ -47,11 +47,19 @@ ref: If SIZE_VARIANT is set to "compact", this is ignored and no workers are created. (default: 3) - name: COMPUTE_NODE_TYPE - default: "e2-standard-4" + default: "" documentation: |- The instance type to use for compute nodes (e.g. GCP https://cloud.google.com/compute/docs/machine-types). We use a 4 core worker to match the median configuration of the fleet. Do not change the default family type without consulting with cloud financial operations as their may be active savings plans targeting this machine class. + - name: COMPUTE_ARCH + default: "" + documentation: |- + Compute node architecture specification. Used for multiarch compute clusters. + - name: CONTROL_ARCH + default: "" + documentation: |- + Control plane node architecture specification. Used for multiarch compute clusters. - name: OCP_ARCH default: "amd64" documentation: |-