From e18cc4d93b18e5f6255c8fd628a3538ea29e6f97 Mon Sep 17 00:00:00 2001 From: Richard Liu Date: Tue, 20 Nov 2018 20:24:36 -0800 Subject: [PATCH 1/5] downgrade to 1.10.1 --- Gopkg.lock | 112 ++++-------------- Gopkg.toml | 8 +- .../studyjob/studyjob_controller.go | 4 +- 3 files changed, 30 insertions(+), 94 deletions(-) diff --git a/Gopkg.lock b/Gopkg.lock index 476b2866b66..0d515407d5d 100644 --- a/Gopkg.lock +++ b/Gopkg.lock @@ -148,14 +148,6 @@ revision = "aa810b61a9c79d51363740d207bb46cf8e620ed5" version = "v1.2.0" -[[projects]] - branch = "master" - digest = "1:0bfbe13936953a98ae3cfe8ed6670d396ad81edf069a806d2f6515d7bb6950df" - name = "github.com/google/btree" - packages = ["."] - pruneopts = "T" - revision = "4030bb1f1f0c35b30ca7009e9ebd06849dd45306" - [[projects]] branch = "master" digest = "1:3ee90c0d94da31b442dde97c99635aaafec68d0b8a3c12ee2075c6bdabeec6bb" @@ -164,14 +156,6 @@ pruneopts = "T" revision = "24818f796faf91cd76ec7bddd72458fbced7a6c1" -[[projects]] - digest = "1:3a26588bc48b96825977c1b3df964f8fd842cd6860cc26370588d3563433cf11" - name = "github.com/google/uuid" - packages = ["."] - pruneopts = "T" - revision = "d460ce9f8df2e77fb1ba55ca87fafed96c607494" - version = "v1.0.0" - [[projects]] digest = "1:35735e2255fa34521c2a1355fb2a3a2300bc9949f487be1c1ce8ee8efcfa2d04" name = "github.com/googleapis/gnostic" @@ -184,17 +168,6 @@ revision = "7c663266750e7d82587642f65e60bc4083f1f84e" version = "v0.2.0" -[[projects]] - branch = "master" - digest = "1:4607fd19c69c3deee61840fca759fedb3908e4fcb09385e2f3783b93e0035c73" - name = "github.com/gregjones/httpcache" - packages = [ - ".", - "diskcache", - ] - pruneopts = "T" - revision = "9cad4c3443a7200dd6400aef47183728de563a38" - [[projects]] digest = "1:69cd81163a00bb8405194d47b8be19283744779b6104f2d6b3735e2a01cdb6fa" name = "github.com/grpc-ecosystem/grpc-gateway" @@ -246,6 +219,14 @@ revision = "8cb6e5b959231cc1119e43259c4a608f9c51a241" version = "v1.0.0" +[[projects]] + branch = "master" + digest = "1:0778dc7fce1b4669a8bfa7ae506ec1f595b6ab0f8989c1c0d22a8ca1144e9972" + name = "github.com/howeyc/gopass" + packages = ["."] + pruneopts = "T" + revision = "bf9dde6d0d2c004a008c27aaee91170c786f6db8" + [[projects]] digest = "1:8f20c8dd713564fa97299fbcb77d729c6de9c33f3222812a76e6ecfaef80fd61" name = "github.com/hpcloud/tail" @@ -308,14 +289,6 @@ revision = "dd7de90c06bca70f18136e59dec2270c19a401e7" version = "v1.0.0" -[[projects]] - branch = "master" - digest = "1:fc2b04b0069d6b10bdef96d278fe20c345794009685ed3c8c7f1a6dc023eefec" - name = "github.com/mattbaird/jsonpatch" - packages = ["."] - pruneopts = "T" - revision = "81af80346b1a01caae0cbc27fd3c1ba5b11e189f" - [[projects]] digest = "1:645110e089152bd0f4a011a2648fbb0e4df5977be73ca605781157ac297f50c4" name = "github.com/mitchellh/mapstructure" @@ -390,14 +363,6 @@ revision = "7615b9433f86a8bdf29709bf288bc4fd0636a369" version = "v1.4.2" -[[projects]] - digest = "1:e5d0bd87abc2781d14e274807a470acd180f0499f8bf5bb18606e9ec22ad9de9" - name = "github.com/pborman/uuid" - packages = ["."] - pruneopts = "T" - revision = "adf5a7427709b9deb95d29d3fa8a2bf9cfd388f1" - version = "v1.2" - [[projects]] digest = "1:ccf9949c9c53e85dcb7e2905fc620571422567040925381e6baa62f0b7b850fe" name = "github.com/pelletier/go-toml" @@ -406,22 +371,6 @@ revision = "c01d1270ff3e442a8a57cddc1c92dc1138598194" version = "v1.2.0" -[[projects]] - branch = "master" - digest = "1:0c29d499ffc3b9f33e7136444575527d0c3a9463a89b3cbeda0523b737f910b3" - name = "github.com/petar/GoLLRB" - packages = ["llrb"] - pruneopts = "T" - revision = "53be0d36a84c2a886ca057d34b6aa4468df9ccb4" - -[[projects]] - digest = "1:598241bd36d3a5f6d9102a306bd9bf78f3bc253672460d92ac70566157eae648" - name = "github.com/peterbourgon/diskv" - packages = ["."] - pruneopts = "T" - revision = "5f041e8faa004a95c88a202771f4cc3e991971e6" - version = "v2.0.1" - [[projects]] digest = "1:40e195917a951a8bf867cd05de2a46aaf1806c50cf92eebf4c16f78cd196f747" name = "github.com/pkg/errors" @@ -749,10 +698,9 @@ version = "v2.2.1" [[projects]] - digest = "1:75905606c2db8d9526ccff0acf5655759afe3d901d1d64868bd87985b4a46e36" + digest = "1:7b4520bed4eee8895eeb2c30611178146a30b70e9a4d2b07068e3795c6a3bcaf" name = "k8s.io/api" packages = [ - "admission/v1beta1", "admissionregistration/v1alpha1", "admissionregistration/v1beta1", "apps/v1", @@ -777,15 +725,14 @@ "rbac/v1alpha1", "rbac/v1beta1", "scheduling/v1alpha1", - "scheduling/v1beta1", "settings/v1alpha1", "storage/v1", "storage/v1alpha1", "storage/v1beta1", ] pruneopts = "T" - revision = "2d6f90ab1293a1fb871cf149423ebb72aa7423aa" - version = "kubernetes-1.11.2" + revision = "73d903622b7391f3312dcbac6483fed484e185f8" + version = "kubernetes-1.10.1" [[projects]] digest = "1:93787b85c4d0e2ee4ebd1ea1eeb97a4c074536928c045d94333e0654eb3f4428" @@ -799,7 +746,7 @@ version = "kubernetes-1.10.1" [[projects]] - digest = "1:e2035d60919705a125feeb8b13926383aff8817634b2c4550743c19ca21f3b08" + digest = "1:40b0f5d488cfde8efb3953dbcff84d2211fd51e560044c64151273ec54683dc6" name = "k8s.io/apimachinery" packages = [ "pkg/api/errors", @@ -835,7 +782,6 @@ "pkg/util/runtime", "pkg/util/sets", "pkg/util/strategicpatch", - "pkg/util/uuid", "pkg/util/validation", "pkg/util/validation/field", "pkg/util/wait", @@ -846,14 +792,15 @@ "third_party/forked/golang/reflect", ] pruneopts = "T" - revision = "103fd098999dc9c0c88536f5c9ad2e5da39373ae" - version = "kubernetes-1.11.2" + revision = "302974c03f7e50f16561ba237db776ab93594ef6" + version = "kubernetes-1.10.1" [[projects]] - digest = "1:0a19f9bf56ecc683569299c8c2282ee3ac70d4e32eeb170d3265761bff8d5e12" + digest = "1:b324d81a2e103152a81439ec0889b2193c0a14a154bf68f7a24a5b880b7f3ffa" name = "k8s.io/client-go" packages = [ "discovery", + "dynamic", "kubernetes", "kubernetes/scheme", "kubernetes/typed/admissionregistration/v1alpha1", @@ -880,20 +827,17 @@ "kubernetes/typed/rbac/v1alpha1", "kubernetes/typed/rbac/v1beta1", "kubernetes/typed/scheduling/v1alpha1", - "kubernetes/typed/scheduling/v1beta1", "kubernetes/typed/settings/v1alpha1", "kubernetes/typed/storage/v1", "kubernetes/typed/storage/v1alpha1", "kubernetes/typed/storage/v1beta1", "pkg/apis/clientauthentication", "pkg/apis/clientauthentication/v1alpha1", - "pkg/apis/clientauthentication/v1beta1", "pkg/version", "plugin/pkg/client/auth/exec", "plugin/pkg/client/auth/gcp", "rest", "rest/watch", - "restmapper", "third_party/forked/golang/template", "tools/auth", "tools/cache", @@ -901,8 +845,6 @@ "tools/clientcmd/api", "tools/clientcmd/api/latest", "tools/clientcmd/api/v1", - "tools/leaderelection", - "tools/leaderelection/resourcelock", "tools/metrics", "tools/pager", "tools/record", @@ -910,7 +852,6 @@ "transport", "util/buffer", "util/cert", - "util/connrotation", "util/flowcontrol", "util/homedir", "util/integer", @@ -919,8 +860,8 @@ "util/workqueue", ] pruneopts = "T" - revision = "1f13a808da65775f22cbf47862c4e5898d8f4ca1" - version = "kubernetes-1.11.2" + revision = "989be4278f353e42f26c416c53757d16fcff77db" + version = "kubernetes-1.10.1" [[projects]] branch = "master" @@ -959,7 +900,7 @@ revision = "e3762e86a74c878ffed47484592986685639c2cd" [[projects]] - digest = "1:0d252ff794aef84f642866acbbd8044e1e062183629750832f486611294e5ec7" + digest = "1:355ef996107f9975903d75f1f091fca168603d3b5c8eb88d858ee6b90e80d1f5" name = "sigs.k8s.io/controller-runtime" packages = [ "pkg/cache", @@ -973,9 +914,7 @@ "pkg/handler", "pkg/internal/controller", "pkg/internal/recorder", - "pkg/leaderelection", "pkg/manager", - "pkg/patch", "pkg/predicate", "pkg/reconcile", "pkg/recorder", @@ -985,16 +924,13 @@ "pkg/runtime/signals", "pkg/source", "pkg/source/internal", - "pkg/webhook/admission", - "pkg/webhook/admission/types", - "pkg/webhook/types", ] pruneopts = "T" - revision = "79f06014d49b565e2d980fcd9519b33874ddb8d6" - version = "v0.1.3" + revision = "67e28744a963c78a34108c3b52aadbc5e02f4384" + version = "v0.1.1" [[projects]] - digest = "1:6dfe024674229af3c6e3791718837798eeada8ea41628e61e31f0774ffea41b3" + digest = "1:58b878c2275390dcf5160957c3121ebc61e97eb632ae02726d11a4d1988da41f" name = "sigs.k8s.io/controller-tools" packages = [ "cmd/controller-gen", @@ -1006,8 +942,8 @@ "pkg/util", ] pruneopts = "T" - revision = "bfd3eefb7f22ed714b2acd38730c36992ee6af70" - version = "v0.1.5" + revision = "62b5a17481a76b7a1feea0b16188d52a6f61ec64" + version = "v0.1.3" [[projects]] branch = "master" diff --git a/Gopkg.toml b/Gopkg.toml index 0bb4635094b..d0297f92d87 100644 --- a/Gopkg.toml +++ b/Gopkg.toml @@ -59,19 +59,19 @@ required = [ [[constraint]] name = "k8s.io/api" - version = "kubernetes-1.11.2" + version = "kubernetes-1.10.1" [[constraint]] name = "k8s.io/apimachinery" - version = "kubernetes-1.11.2" + version = "kubernetes-1.10.1" [[constraint]] name = "k8s.io/client-go" - version = "kubernetes-1.11.2" + version = "kubernetes-1.10.1" [[constraint]] name = "sigs.k8s.io/controller-runtime" - version = "0.1.3" + version = "0.1.1" [[override]] name="k8s.io/apiextensions-apiserver" diff --git a/pkg/controller/studyjob/studyjob_controller.go b/pkg/controller/studyjob/studyjob_controller.go index ef6991ac98d..2344330ef7c 100644 --- a/pkg/controller/studyjob/studyjob_controller.go +++ b/pkg/controller/studyjob/studyjob_controller.go @@ -289,7 +289,7 @@ func (r *ReconcileStudyJobController) checkStatus(instance *katibv1alpha1.StudyJ job := &batchv1.Job{} joberr := r.Client.Get(context.TODO(), nname, job) if joberr == nil { - if err := r.Delete(context.TODO(), job, client.PropagationPolicy(metav1.DeletePropagationForeground)); err != nil { + if err := r.Delete(context.TODO(), job); err != nil { return false, err } } @@ -301,7 +301,7 @@ func (r *ReconcileStudyJobController) checkStatus(instance *katibv1alpha1.StudyJ cjob := &batchv1beta.CronJob{} cjoberr := r.Client.Get(context.TODO(), nname, cjob) if cjoberr == nil { - if err := r.Delete(context.TODO(), cjob, client.PropagationPolicy(metav1.DeletePropagationForeground)); err != nil { + if err := r.Delete(context.TODO(), cjob); err != nil { return false, err } } From 635320458ee70d0c7236ade3705eb284dd64ad2f Mon Sep 17 00:00:00 2001 From: Richard Liu Date: Mon, 26 Nov 2018 16:29:24 -0800 Subject: [PATCH 2/5] Delete pods --- pkg/controller/studyjob/pod_control.go | 61 +++++++++++++++++++ .../studyjob/studyjob_controller.go | 22 ++++++- 2 files changed, 80 insertions(+), 3 deletions(-) create mode 100644 pkg/controller/studyjob/pod_control.go diff --git a/pkg/controller/studyjob/pod_control.go b/pkg/controller/studyjob/pod_control.go new file mode 100644 index 00000000000..1ed75752dd7 --- /dev/null +++ b/pkg/controller/studyjob/pod_control.go @@ -0,0 +1,61 @@ +/* + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package studyjob + +import ( + "log" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + clientset "k8s.io/client-go/kubernetes" + restclient "k8s.io/client-go/rest" +) + +type PodControl struct { + kubeClient clientset.Interface +} + +func NewPodControl() (*PodControl, error) { + config, err := restclient.InClusterConfig() + if err != nil { + return nil, err + } + kc, err := clientset.NewForConfig(config) + if err != nil { + return nil, err + } + return &PodControl{ + kubeClient: kc, + }, nil +} + +func (c PodControl) DeletePodsForWorker(namespace string, wid string) error { + selector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{ + MatchLabels: map[string]string{"job-name":wid}, + }) + if err != nil { + return err + } + log.Printf("Listing pods with selector %v", selector.String()) + listOptions := metav1.ListOptions{ + LabelSelector: selector.String(), + } + err = c.kubeClient.CoreV1().Pods(namespace).DeleteCollection(&metav1.DeleteOptions{}, listOptions) + if err != nil { + return err + } + log.Printf("Deleted pods with selector %v.", selector.String()) + return nil +} diff --git a/pkg/controller/studyjob/studyjob_controller.go b/pkg/controller/studyjob/studyjob_controller.go index 2344330ef7c..200d4910cd4 100644 --- a/pkg/controller/studyjob/studyjob_controller.go +++ b/pkg/controller/studyjob/studyjob_controller.go @@ -55,12 +55,20 @@ const maxMsgSize = 1<<31 - 1 // and Start it when the Manager is Started. // USER ACTION REQUIRED: update cmd/manager/main.go to call this studyjob.Add(mgr) to install this Controller func Add(mgr manager.Manager) error { - return add(mgr, newReconciler(mgr)) + r, err := newReconciler(mgr) + if err != nil { + return err + } + return add(mgr, r) } // newReconciler returns a new reconcile.Reconciler -func newReconciler(mgr manager.Manager) reconcile.Reconciler { - return &ReconcileStudyJobController{Client: mgr.GetClient(), scheme: mgr.GetScheme(), muxMap: sync.Map{}} +func newReconciler(mgr manager.Manager) (reconcile.Reconciler, error) { + pc, err := NewPodControl() + if err != nil { + return nil, err + } + return &ReconcileStudyJobController{Client: mgr.GetClient(), scheme: mgr.GetScheme(), muxMap: sync.Map{}, podControl: pc}, nil } // add adds a new Controller to mgr with r as the reconcile.Reconciler @@ -106,6 +114,7 @@ type ReconcileStudyJobController struct { client.Client scheme *runtime.Scheme muxMap sync.Map + podControl *PodControl } // Reconcile reads that state of the cluster for a StudyJob object and makes changes based on the state read @@ -292,6 +301,9 @@ func (r *ReconcileStudyJobController) checkStatus(instance *katibv1alpha1.StudyJ if err := r.Delete(context.TODO(), job); err != nil { return false, err } + if err := r.podControl.DeletePodsForWorker(ns, w.WorkerID); err != nil { + return false, err + } } } if instance.Spec.MetricsCollectorSpec != nil { @@ -304,6 +316,10 @@ func (r *ReconcileStudyJobController) checkStatus(instance *katibv1alpha1.StudyJ if err := r.Delete(context.TODO(), cjob); err != nil { return false, err } + if err := r.podControl.DeletePodsForWorker(ns, w.WorkerID); err != nil { + return false, err + } + } } } From 447def796afab11ebd9d1fd4f7e49e844e69ca91 Mon Sep 17 00:00:00 2001 From: Richard Liu Date: Mon, 26 Nov 2018 18:36:07 -0800 Subject: [PATCH 3/5] Fix job-name --- pkg/controller/studyjob/pod_control.go | 2 +- pkg/controller/studyjob/studyjob_controller.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/controller/studyjob/pod_control.go b/pkg/controller/studyjob/pod_control.go index 1ed75752dd7..aab23e73293 100644 --- a/pkg/controller/studyjob/pod_control.go +++ b/pkg/controller/studyjob/pod_control.go @@ -48,7 +48,7 @@ func (c PodControl) DeletePodsForWorker(namespace string, wid string) error { if err != nil { return err } - log.Printf("Listing pods with selector %v", selector.String()) + log.Printf("Deleting pods with selector %v", selector.String()) listOptions := metav1.ListOptions{ LabelSelector: selector.String(), } diff --git a/pkg/controller/studyjob/studyjob_controller.go b/pkg/controller/studyjob/studyjob_controller.go index 200d4910cd4..a04b71cea1b 100644 --- a/pkg/controller/studyjob/studyjob_controller.go +++ b/pkg/controller/studyjob/studyjob_controller.go @@ -301,7 +301,7 @@ func (r *ReconcileStudyJobController) checkStatus(instance *katibv1alpha1.StudyJ if err := r.Delete(context.TODO(), job); err != nil { return false, err } - if err := r.podControl.DeletePodsForWorker(ns, w.WorkerID); err != nil { + if err := r.podControl.DeletePodsForWorker(ns, job.GetName()); err != nil { return false, err } } @@ -316,7 +316,7 @@ func (r *ReconcileStudyJobController) checkStatus(instance *katibv1alpha1.StudyJ if err := r.Delete(context.TODO(), cjob); err != nil { return false, err } - if err := r.podControl.DeletePodsForWorker(ns, w.WorkerID); err != nil { + if err := r.podControl.DeletePodsForWorker(ns, cjob.GetName()); err != nil { return false, err } From 478a38d53aa74c812f35782dfdb4b6de4effe8e5 Mon Sep 17 00:00:00 2001 From: Richard Liu Date: Tue, 27 Nov 2018 20:25:58 -0800 Subject: [PATCH 4/5] Set successfulJobsHistoryLimit to 0 --- manifests/studyjobcontroller/metricsControllerConfigMap.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/studyjobcontroller/metricsControllerConfigMap.yaml b/manifests/studyjobcontroller/metricsControllerConfigMap.yaml index b01dedb259e..02b750c725e 100644 --- a/manifests/studyjobcontroller/metricsControllerConfigMap.yaml +++ b/manifests/studyjobcontroller/metricsControllerConfigMap.yaml @@ -12,7 +12,7 @@ data: namespace: kubeflow spec: schedule: "*/1 * * * *" - successfulJobsHistoryLimit: 1 + successfulJobsHistoryLimit: 0 failedJobsHistoryLimit: 1 jobTemplate: spec: From e3cdda2af7579987cecfac2d3d8089bbfda5a029 Mon Sep 17 00:00:00 2001 From: Richard Liu Date: Tue, 27 Nov 2018 20:33:51 -0800 Subject: [PATCH 5/5] Add comments --- pkg/controller/studyjob/studyjob_controller.go | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pkg/controller/studyjob/studyjob_controller.go b/pkg/controller/studyjob/studyjob_controller.go index a04b71cea1b..b00a70fb07c 100644 --- a/pkg/controller/studyjob/studyjob_controller.go +++ b/pkg/controller/studyjob/studyjob_controller.go @@ -301,6 +301,10 @@ func (r *ReconcileStudyJobController) checkStatus(instance *katibv1alpha1.StudyJ if err := r.Delete(context.TODO(), job); err != nil { return false, err } + // In order to integrate with tf-operator and pytorch-operator, we need to + // downgrade the k8s dependency for katib from 1.11.2 to 1.10.1, and + // controller-runtime from 0.1.3 to 0.1.1. This means that we cannot use + // DeletePropagationForeground to clean up pods, and must do this manually. if err := r.podControl.DeletePodsForWorker(ns, job.GetName()); err != nil { return false, err } @@ -316,10 +320,9 @@ func (r *ReconcileStudyJobController) checkStatus(instance *katibv1alpha1.StudyJ if err := r.Delete(context.TODO(), cjob); err != nil { return false, err } - if err := r.podControl.DeletePodsForWorker(ns, cjob.GetName()); err != nil { - return false, err - } - + // Depending on the successfulJobsHistoryLimit setting, cronjob controller + // will delete the metrics collector pods accordingly, so we do not need + // to manually delete them here. } } }