From 5ac0da52cb1b524cfed79a4287bae27b9c352826 Mon Sep 17 00:00:00 2001 From: Julius von Kohout <45896133+juliusvonkohout@users.noreply.github.com> Date: Fri, 26 Jul 2024 15:56:55 +0200 Subject: [PATCH] Training operator CICD improvements (#2779) * Add the networkpolicies Signed-off-by: juliusvonkohout <45896133+juliusvonkohout@users.noreply.github.com> * rework the training operator tests Signed-off-by: juliusvonkohout <45896133+juliusvonkohout@users.noreply.github.com> * fix the comments Signed-off-by: juliusvonkohout <45896133+juliusvonkohout@users.noreply.github.com> * fix filename Signed-off-by: juliusvonkohout <45896133+juliusvonkohout@users.noreply.github.com> * try to fix the permissions Signed-off-by: juliusvonkohout <45896133+juliusvonkohout@users.noreply.github.com> * try to fix the permissions Signed-off-by: juliusvonkohout <45896133+juliusvonkohout@users.noreply.github.com> * change to the user namespace Signed-off-by: juliusvonkohout <45896133+juliusvonkohout@users.noreply.github.com> * update the image to rc.1 Signed-off-by: juliusvonkohout <45896133+juliusvonkohout@users.noreply.github.com> * fixes Signed-off-by: juliusvonkohout <45896133+juliusvonkohout@users.noreply.github.com> * fixes Signed-off-by: juliusvonkohout <45896133+juliusvonkohout@users.noreply.github.com> * fixes Signed-off-by: juliusvonkohout <45896133+juliusvonkohout@users.noreply.github.com> * fixes Signed-off-by: juliusvonkohout <45896133+juliusvonkohout@users.noreply.github.com> * fixes Signed-off-by: juliusvonkohout <45896133+juliusvonkohout@users.noreply.github.com> --------- Signed-off-by: juliusvonkohout <45896133+juliusvonkohout@users.noreply.github.com> --- .../linting_bash_python_yaml_files.yaml | 2 +- .github/workflows/model_registry_test.yaml | 2 +- .github/workflows/train_operator_test.yaml | 43 -------------- .github/workflows/training_operator_test.yaml | 57 +++++++++++++++++++ .../base/training-operator-webhook.yaml | 8 +-- tests/gh-actions/install_multi_tenancy.sh | 3 
+ tests/gh-actions/install_training_operator.sh | 9 +++ ...tebook.test.kubeflow-user-example.com.yaml | 2 +- tests/gh-actions/kf-objects/tfjob.yaml | 21 ------- .../kf-objects/training_operator_job.yaml | 40 +++++++++++++ 10 files changed, 116 insertions(+), 71 deletions(-) delete mode 100644 .github/workflows/train_operator_test.yaml create mode 100644 .github/workflows/training_operator_test.yaml create mode 100755 tests/gh-actions/install_training_operator.sh delete mode 100644 tests/gh-actions/kf-objects/tfjob.yaml create mode 100644 tests/gh-actions/kf-objects/training_operator_job.yaml diff --git a/.github/workflows/linting_bash_python_yaml_files.yaml b/.github/workflows/linting_bash_python_yaml_files.yaml index 33e5e6914c..2a7e1c6921 100644 --- a/.github/workflows/linting_bash_python_yaml_files.yaml +++ b/.github/workflows/linting_bash_python_yaml_files.yaml @@ -1,6 +1,6 @@ name: Proper linting on Bash, Python, and YAML files -on: [push, pull_request] +on: [pull_request] jobs: format_python_files: diff --git a/.github/workflows/model_registry_test.yaml b/.github/workflows/model_registry_test.yaml index 18345afb5f..082d9b540b 100644 --- a/.github/workflows/model_registry_test.yaml +++ b/.github/workflows/model_registry_test.yaml @@ -61,7 +61,7 @@ jobs: 'http://localhost:8081/api/model_registry/v1alpha3/registered_models?pageSize=100&orderBy=ID&sortOrder=DESC' \ -H 'accept: application/json' - # for these steps below ensure same steps as kserve (ie: Istio with ext external authentication, cert-manager, knative) so to achieve same setup + # for these steps below ensure same steps as kserve (ie: Istio with external authentication, cert-manager, knative) so to achieve same setup - name: Port forward Istio gateway run: | INGRESS_GATEWAY_SERVICE=$(kubectl get svc --namespace istio-system --selector="app=istio-ingressgateway" --output jsonpath='{.items[0].metadata.name}') diff --git a/.github/workflows/train_operator_test.yaml 
b/.github/workflows/train_operator_test.yaml deleted file mode 100644 index 23d891f627..0000000000 --- a/.github/workflows/train_operator_test.yaml +++ /dev/null @@ -1,43 +0,0 @@ -name: Build & Apply Training Operator manifests in KinD -on: - pull_request: - paths: - - .github/workflows/train_operator_test.yaml - - apps/training-operator/upstream/** - - tests/gh-actions/kind-cluster.yaml - - tests/gh-actions/install_kind.sh - - tests/gh-actions/install_kustomize.sh - - tests/gh-actions/install_istio.sh - - common/istio*/** - - tests/gh-actions/kf-objects/tfjob.yaml - -jobs: - build: - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Install KinD - run: ./tests/gh-actions/install_kind.sh - - - name: Create KinD Cluster - run: kind create cluster --config tests/gh-actions/kind-cluster.yaml - - - name: Install kustomize - run: ./tests/gh-actions/install_kustomize.sh - - - name: Install Istio - run: ./tests/gh-actions/install_istio.sh - - - name: Build & Apply manifests - run: | - cd apps/training-operator/upstream - kubectl create ns kubeflow - kustomize build overlays/kubeflow | kubectl apply -f - - kubectl wait --for=condition=Ready pods --all --all-namespaces --timeout 180s - - - name: Create a TFJob - run: | - kubectl create -f tests/gh-actions/kf-objects/tfjob.yaml - kubectl wait --for=condition=Succeeded tfjobs tfjob-simple -n kubeflow --timeout 600s diff --git a/.github/workflows/training_operator_test.yaml b/.github/workflows/training_operator_test.yaml new file mode 100644 index 0000000000..5be7d5dffe --- /dev/null +++ b/.github/workflows/training_operator_test.yaml @@ -0,0 +1,57 @@ +name: Build & Apply Training Operator manifests in KinD +on: + pull_request: + paths: + - .github/workflows/training_operator_test.yaml + - apps/training-operator/upstream/** + - tests/gh-actions/kind-cluster.yaml + - tests/gh-actions/install_kind.sh + - tests/gh-actions/install_kustomize.sh + - tests/gh-actions/install_istio.sh + - 
common/istio*/** + - tests/gh-actions/kf-objects/training_operator_job.yaml + +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install KinD + run: ./tests/gh-actions/install_kind.sh + + - name: Create KinD Cluster + run: kind create cluster --config tests/gh-actions/kind-cluster.yaml + + - name: Install kustomize + run: ./tests/gh-actions/install_kustomize.sh + + - name: Install kubectl + run: ./tests/gh-actions/install_kubectl.sh + + - name: Install Istio with external authentication + run: ./tests/gh-actions/install_istio_with_ext_auth.sh + + - name: Install cert-manager + run: ./tests/gh-actions/install_cert_manager.sh + + - name: Create kubeflow namespace + run: kustomize build common/kubeflow-namespace/base | kubectl apply -f - + + - name: Install KF Multi Tenancy + run: ./tests/gh-actions/install_multi_tenancy.sh + + - name: Install kubeflow-istio-resources + run: kustomize build common/istio-1-22/kubeflow-istio-resources/base | kubectl apply -f - + + - name: Create KF Profile + run: kustomize build common/user-namespace/base | kubectl apply -f - + + - name: Install training operator + run: ./tests/gh-actions/install_training_operator.sh + + - name: Create a PyTorchJob + run: | + kubectl create -f tests/gh-actions/kf-objects/training_operator_job.yaml -n kubeflow-user-example-com + kubectl wait --for=condition=Succeeded PyTorchJob pytorch-simple -n kubeflow-user-example-com --timeout 600s diff --git a/common/networkpolicies/base/training-operator-webhook.yaml b/common/networkpolicies/base/training-operator-webhook.yaml index bbf6e373a3..8f9a6a7ea2 100644 --- a/common/networkpolicies/base/training-operator-webhook.yaml +++ b/common/networkpolicies/base/training-operator-webhook.yaml @@ -13,8 +13,8 @@ spec: # https://www.elastic.co/guide/en/cloud-on-k8s/1.1/k8s-webhook-network-policies.html # The kubernetes api server must reach the webhook ingress: - - ports: - - protocol: TCP - port: 9443 + - ports: + - protocol:
TCP + port: 9443 policyTypes: - - Ingress \ No newline at end of file + - Ingress diff --git a/tests/gh-actions/install_multi_tenancy.sh b/tests/gh-actions/install_multi_tenancy.sh index ac9048ff68..29cc4d67a4 100755 --- a/tests/gh-actions/install_multi_tenancy.sh +++ b/tests/gh-actions/install_multi_tenancy.sh @@ -7,3 +7,6 @@ kubectl -n kubeflow wait --for=condition=Ready pods -l kustomize.component=profi echo "Installing Multitenancy Kubeflow Roles" kustomize build common/kubeflow-roles/base | kubectl apply -f - + +echo "Installing Multitenancy Network policies" +kustomize build common/networkpolicies/base | kubectl apply -f - diff --git a/tests/gh-actions/install_training_operator.sh b/tests/gh-actions/install_training_operator.sh new file mode 100755 index 0000000000..a0d50c1b1c --- /dev/null +++ b/tests/gh-actions/install_training_operator.sh @@ -0,0 +1,9 @@ +#!/bin/bash +set -euo pipefail +echo "Installing training operator ..." + +cd apps/training-operator/upstream +kustomize build overlays/kubeflow | kubectl apply -f - +kubectl wait --for=condition=Ready pods --all --all-namespaces --timeout=600s \ + --field-selector=status.phase!=Succeeded +cd - diff --git a/tests/gh-actions/kf-objects/notebook.test.kubeflow-user-example.com.yaml b/tests/gh-actions/kf-objects/notebook.test.kubeflow-user-example.com.yaml index 8e5ee2bb81..fdc8e53e0c 100644 --- a/tests/gh-actions/kf-objects/notebook.test.kubeflow-user-example.com.yaml +++ b/tests/gh-actions/kf-objects/notebook.test.kubeflow-user-example.com.yaml @@ -15,7 +15,7 @@ spec: spec: containers: - name: test - image: kubeflownotebookswg/jupyter-scipy:v1.9.0-rc.1 + image: kubeflownotebookswg/jupyter-scipy:v1.9.0 imagePullPolicy: IfNotPresent resources: limits: diff --git a/tests/gh-actions/kf-objects/tfjob.yaml b/tests/gh-actions/kf-objects/tfjob.yaml deleted file mode 100644 index 313c3312c0..0000000000 --- a/tests/gh-actions/kf-objects/tfjob.yaml +++ /dev/null @@ -1,21 +0,0 @@ -apiVersion: "kubeflow.org/v1" -kind: 
TFJob -metadata: - name: tfjob-simple - namespace: kubeflow -spec: - tfReplicaSpecs: - Worker: - replicas: 2 - restartPolicy: OnFailure - template: - spec: - containers: - - name: tensorflow - image: gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0 - command: - - "python" - - "/var/tf_mnist/mnist_with_summaries.py" - - "--log_dir=/train/logs" - - "--learning_rate=0.01" - - "--batch_size=150" \ No newline at end of file diff --git a/tests/gh-actions/kf-objects/training_operator_job.yaml b/tests/gh-actions/kf-objects/training_operator_job.yaml new file mode 100644 index 0000000000..68ee31c4be --- /dev/null +++ b/tests/gh-actions/kf-objects/training_operator_job.yaml @@ -0,0 +1,40 @@ +# from https://github.com/kubeflow/training-operator/blob/master/examples/pytorch/simple.yaml +# and disabled istio as stated in the documentation https://www.kubeflow.org/docs/components/training/user-guides/pytorch/ +apiVersion: "kubeflow.org/v1" +kind: PyTorchJob +metadata: + name: pytorch-simple +spec: + pytorchReplicaSpecs: + Master: + replicas: 1 + restartPolicy: OnFailure + template: + metadata: + annotations: + sidecar.istio.io/inject: "false" + spec: + containers: + - name: pytorch + image: docker.io/kubeflowkatib/pytorch-mnist:v1beta1-45c5727 + imagePullPolicy: Always + command: + - "python3" + - "/opt/pytorch-mnist/mnist.py" + - "--epochs=1" + Worker: + replicas: 1 + restartPolicy: OnFailure + template: + metadata: + annotations: + sidecar.istio.io/inject: "false" + spec: + containers: + - name: pytorch + image: docker.io/kubeflowkatib/pytorch-mnist:v1beta1-45c5727 + imagePullPolicy: Always + command: + - "python3" + - "/opt/pytorch-mnist/mnist.py" + - "--epochs=1"