From 5ac0da52cb1b524cfed79a4287bae27b9c352826 Mon Sep 17 00:00:00 2001
From: Julius von Kohout <45896133+juliusvonkohout@users.noreply.github.com>
Date: Fri, 26 Jul 2024 15:56:55 +0200
Subject: [PATCH] Training operator CI/CD improvements (#2779)

* Add the networkpolicies

Signed-off-by: juliusvonkohout <45896133+juliusvonkohout@users.noreply.github.com>

* rework the training operator tests

Signed-off-by: juliusvonkohout <45896133+juliusvonkohout@users.noreply.github.com>

* fix the comments

Signed-off-by: juliusvonkohout <45896133+juliusvonkohout@users.noreply.github.com>

* fix filename

Signed-off-by: juliusvonkohout <45896133+juliusvonkohout@users.noreply.github.com>

* try to fix the permissions

Signed-off-by: juliusvonkohout <45896133+juliusvonkohout@users.noreply.github.com>

* try to fix the permissions

Signed-off-by: juliusvonkohout <45896133+juliusvonkohout@users.noreply.github.com>

* change to the user namespace

Signed-off-by: juliusvonkohout <45896133+juliusvonkohout@users.noreply.github.com>

* update the image to rc.1

Signed-off-by: juliusvonkohout <45896133+juliusvonkohout@users.noreply.github.com>

* fixes

Signed-off-by: juliusvonkohout <45896133+juliusvonkohout@users.noreply.github.com>

* fixes

Signed-off-by: juliusvonkohout <45896133+juliusvonkohout@users.noreply.github.com>

* fixes

Signed-off-by: juliusvonkohout <45896133+juliusvonkohout@users.noreply.github.com>

* fixes

Signed-off-by: juliusvonkohout <45896133+juliusvonkohout@users.noreply.github.com>

* fixes

Signed-off-by: juliusvonkohout <45896133+juliusvonkohout@users.noreply.github.com>

---------

Signed-off-by: juliusvonkohout <45896133+juliusvonkohout@users.noreply.github.com>
---
 .../linting_bash_python_yaml_files.yaml       |  2 +-
 .github/workflows/model_registry_test.yaml    |  2 +-
 .github/workflows/train_operator_test.yaml    | 43 --------------
 .github/workflows/training_operator_test.yaml | 57 +++++++++++++++++++
 .../base/training-operator-webhook.yaml       |  8 +--
 tests/gh-actions/install_multi_tenancy.sh     |  3 +
 tests/gh-actions/install_training_operator.sh |  9 +++
 ...tebook.test.kubeflow-user-example.com.yaml |  2 +-
 tests/gh-actions/kf-objects/tfjob.yaml        | 21 -------
 .../kf-objects/training_operator_job.yaml     | 40 +++++++++++++
 10 files changed, 116 insertions(+), 71 deletions(-)
 delete mode 100644 .github/workflows/train_operator_test.yaml
 create mode 100644 .github/workflows/training_operator_test.yaml
 create mode 100755 tests/gh-actions/install_training_operator.sh
 delete mode 100644 tests/gh-actions/kf-objects/tfjob.yaml
 create mode 100644 tests/gh-actions/kf-objects/training_operator_job.yaml

diff --git a/.github/workflows/linting_bash_python_yaml_files.yaml b/.github/workflows/linting_bash_python_yaml_files.yaml
index 33e5e6914c..2a7e1c6921 100644
--- a/.github/workflows/linting_bash_python_yaml_files.yaml
+++ b/.github/workflows/linting_bash_python_yaml_files.yaml
@@ -1,6 +1,6 @@
 name: Proper linting on Bash, Python, and YAML files
 
-on: [push, pull_request]
+on: [pull_request]
 
 jobs:
   format_python_files:
diff --git a/.github/workflows/model_registry_test.yaml b/.github/workflows/model_registry_test.yaml
index 18345afb5f..082d9b540b 100644
--- a/.github/workflows/model_registry_test.yaml
+++ b/.github/workflows/model_registry_test.yaml
@@ -61,7 +61,7 @@ jobs:
           'http://localhost:8081/api/model_registry/v1alpha3/registered_models?pageSize=100&orderBy=ID&sortOrder=DESC' \
           -H 'accept: application/json'
 
-    # for these steps below ensure same steps as kserve (ie: Istio with ext external authentication, cert-manager, knative) so to achieve same setup 
+    # for the steps below, use the same setup steps as kserve (i.e. Istio with external authentication, cert-manager, knative) to achieve the same setup
     - name: Port forward Istio gateway
       run: |
         INGRESS_GATEWAY_SERVICE=$(kubectl get svc --namespace istio-system --selector="app=istio-ingressgateway" --output jsonpath='{.items[0].metadata.name}')
diff --git a/.github/workflows/train_operator_test.yaml b/.github/workflows/train_operator_test.yaml
deleted file mode 100644
index 23d891f627..0000000000
--- a/.github/workflows/train_operator_test.yaml
+++ /dev/null
@@ -1,43 +0,0 @@
-name: Build & Apply Training Operator manifests in KinD
-on:
-  pull_request:
-    paths:
-      - .github/workflows/train_operator_test.yaml
-      - apps/training-operator/upstream/**
-      - tests/gh-actions/kind-cluster.yaml
-      - tests/gh-actions/install_kind.sh
-      - tests/gh-actions/install_kustomize.sh
-      - tests/gh-actions/install_istio.sh
-      - common/istio*/**
-      - tests/gh-actions/kf-objects/tfjob.yaml
-
-jobs:
-  build:
-    runs-on: ubuntu-latest
-    steps:
-    - name: Checkout
-      uses: actions/checkout@v4
-
-    - name: Install KinD
-      run: ./tests/gh-actions/install_kind.sh
-
-    - name: Create KinD Cluster
-      run: kind create cluster --config tests/gh-actions/kind-cluster.yaml
-
-    - name: Install kustomize
-      run: ./tests/gh-actions/install_kustomize.sh
-
-    - name: Install Istio
-      run: ./tests/gh-actions/install_istio.sh
-
-    - name: Build & Apply manifests
-      run: |
-        cd apps/training-operator/upstream
-        kubectl create ns kubeflow
-        kustomize build overlays/kubeflow | kubectl apply -f -
-        kubectl wait --for=condition=Ready pods --all --all-namespaces --timeout 180s
-
-    - name: Create a TFJob
-      run: |
-        kubectl create -f tests/gh-actions/kf-objects/tfjob.yaml
-        kubectl wait --for=condition=Succeeded tfjobs tfjob-simple -n kubeflow --timeout 600s
diff --git a/.github/workflows/training_operator_test.yaml b/.github/workflows/training_operator_test.yaml
new file mode 100644
index 0000000000..5be7d5dffe
--- /dev/null
+++ b/.github/workflows/training_operator_test.yaml
@@ -0,0 +1,57 @@
+name: Build & Apply Training Operator manifests in KinD
+on:
+  pull_request:  # run only for pull requests that touch one of the paths listed below
+    paths:
+      - .github/workflows/training_operator_test.yaml
+      - apps/training-operator/upstream/**
+      - tests/gh-actions/kind-cluster.yaml
+      - tests/gh-actions/install_kind.sh
+      - tests/gh-actions/install_kustomize.sh
+      - tests/gh-actions/install_istio_with_ext_auth.sh  # the Istio script this workflow actually runs
+      - common/istio*/**
+      - tests/gh-actions/kf-objects/training_operator_job.yaml  # PyTorchJob manifest created in the last step
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v4
+
+    - name: Install KinD
+      run: ./tests/gh-actions/install_kind.sh
+
+    - name: Create KinD Cluster
+      run: kind create cluster --config tests/gh-actions/kind-cluster.yaml
+
+    - name: Install kustomize
+      run: ./tests/gh-actions/install_kustomize.sh
+
+    - name: Install kubectl
+      run: ./tests/gh-actions/install_kubectl.sh
+
+    - name: Install Istio with external authentication
+      run: ./tests/gh-actions/install_istio_with_ext_auth.sh
+
+    - name: Install cert-manager
+      run: ./tests/gh-actions/install_cert_manager.sh
+
+    - name: Create kubeflow namespace
+      run: kustomize build common/kubeflow-namespace/base | kubectl apply -f -
+
+    - name: Install KF Multi Tenancy
+      run: ./tests/gh-actions/install_multi_tenancy.sh
+
+    - name: Install kubeflow-istio-resources
+      run: kustomize build common/istio-1-22/kubeflow-istio-resources/base | kubectl apply -f -
+
+    - name: Create KF Profile
+      run: kustomize build common/user-namespace/base | kubectl apply -f -
+
+    - name: Install training operator
+      run: ./tests/gh-actions/install_training_operator.sh
+
+    - name: Create a PyTorchJob
+      run: |
+        kubectl create -f tests/gh-actions/kf-objects/training_operator_job.yaml -n kubeflow-user-example-com
+        kubectl wait --for=condition=Succeeded PyTorchJob pytorch-simple -n kubeflow-user-example-com --timeout 600s
diff --git a/common/networkpolicies/base/training-operator-webhook.yaml b/common/networkpolicies/base/training-operator-webhook.yaml
index bbf6e373a3..8f9a6a7ea2 100644
--- a/common/networkpolicies/base/training-operator-webhook.yaml
+++ b/common/networkpolicies/base/training-operator-webhook.yaml
@@ -13,8 +13,8 @@ spec:
   # https://www.elastic.co/guide/en/cloud-on-k8s/1.1/k8s-webhook-network-policies.html
   # The kubernetes api server must reach the webhook
   ingress:
-    - ports:
-        - protocol: TCP
-          port: 9443
+  - ports:
+    - protocol: TCP
+      port: 9443
   policyTypes:
-    - Ingress
\ No newline at end of file
+  - Ingress
diff --git a/tests/gh-actions/install_multi_tenancy.sh b/tests/gh-actions/install_multi_tenancy.sh
index ac9048ff68..29cc4d67a4 100755
--- a/tests/gh-actions/install_multi_tenancy.sh
+++ b/tests/gh-actions/install_multi_tenancy.sh
@@ -7,3 +7,6 @@ kubectl -n kubeflow wait --for=condition=Ready pods -l kustomize.component=profi
 
 echo "Installing Multitenancy Kubeflow Roles"
 kustomize build common/kubeflow-roles/base | kubectl apply -f -
+
+echo "Installing Multitenancy Network policies"
+kustomize build common/networkpolicies/base | kubectl apply -f -
diff --git a/tests/gh-actions/install_training_operator.sh b/tests/gh-actions/install_training_operator.sh
new file mode 100755
index 0000000000..a0d50c1b1c
--- /dev/null
+++ b/tests/gh-actions/install_training_operator.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+set -euo pipefail  # abort on a failed command, an unset variable, or a failed pipeline stage
+echo "Installing training operator ..."
+
+cd apps/training-operator/upstream  # relative path: the caller's working directory matters (the workflow invokes this from the checkout root)
+kustomize build overlays/kubeflow | kubectl apply -f -  # render the kubeflow overlay and apply it to the current cluster
+kubectl wait --for=condition=Ready pods --all --all-namespaces --timeout=600s \
+  --field-selector=status.phase!=Succeeded  # pods already in phase Succeeded are excluded from the wait
+cd -  # switch back to the previous working directory (also prints it)
diff --git a/tests/gh-actions/kf-objects/notebook.test.kubeflow-user-example.com.yaml b/tests/gh-actions/kf-objects/notebook.test.kubeflow-user-example.com.yaml
index 8e5ee2bb81..fdc8e53e0c 100644
--- a/tests/gh-actions/kf-objects/notebook.test.kubeflow-user-example.com.yaml
+++ b/tests/gh-actions/kf-objects/notebook.test.kubeflow-user-example.com.yaml
@@ -15,7 +15,7 @@ spec:
     spec:
       containers:
       - name: test
-        image: kubeflownotebookswg/jupyter-scipy:v1.9.0-rc.1
+        image: kubeflownotebookswg/jupyter-scipy:v1.9.0
         imagePullPolicy: IfNotPresent
         resources:
           limits:
diff --git a/tests/gh-actions/kf-objects/tfjob.yaml b/tests/gh-actions/kf-objects/tfjob.yaml
deleted file mode 100644
index 313c3312c0..0000000000
--- a/tests/gh-actions/kf-objects/tfjob.yaml
+++ /dev/null
@@ -1,21 +0,0 @@
-apiVersion: "kubeflow.org/v1"
-kind: TFJob
-metadata:
-  name: tfjob-simple
-  namespace: kubeflow
-spec:
-   tfReplicaSpecs:
-     Worker:
-       replicas: 2
-       restartPolicy: OnFailure
-       template:
-         spec:
-           containers:
-             - name: tensorflow
-               image: gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0
-               command:
-                 - "python"
-                 - "/var/tf_mnist/mnist_with_summaries.py"
-                 - "--log_dir=/train/logs"
-                 - "--learning_rate=0.01"
-                 - "--batch_size=150"
\ No newline at end of file
diff --git a/tests/gh-actions/kf-objects/training_operator_job.yaml b/tests/gh-actions/kf-objects/training_operator_job.yaml
new file mode 100644
index 0000000000..68ee31c4be
--- /dev/null
+++ b/tests/gh-actions/kf-objects/training_operator_job.yaml
@@ -0,0 +1,40 @@
+# from https://github.com/kubeflow/training-operator/blob/master/examples/pytorch/simple.yaml
+# and disabled istio as stated in the documentation https://www.kubeflow.org/docs/components/training/user-guides/pytorch/
+apiVersion: "kubeflow.org/v1"
+kind: PyTorchJob
+metadata:
+  name: pytorch-simple  # no namespace set here; the workflow supplies it via kubectl -n
+spec:
+  pytorchReplicaSpecs:
+    Master:
+      replicas: 1
+      restartPolicy: OnFailure
+      template:
+        metadata:
+          annotations:
+            sidecar.istio.io/inject: "false"
+        spec:
+          containers:
+            - name: pytorch
+              image: docker.io/kubeflowkatib/pytorch-mnist:v1beta1-45c5727
+              imagePullPolicy: Always
+              command:
+                - "python3"
+                - "/opt/pytorch-mnist/mnist.py"
+                - "--epochs=1"
+    Worker:
+      replicas: 1
+      restartPolicy: OnFailure
+      template:
+        metadata:
+          annotations:
+            sidecar.istio.io/inject: "false"
+        spec:
+          containers:
+            - name: pytorch
+              image: docker.io/kubeflowkatib/pytorch-mnist:v1beta1-45c5727
+              imagePullPolicy: Always
+              command:
+                - "python3"
+                - "/opt/pytorch-mnist/mnist.py"
+                - "--epochs=1"