diff --git a/main.go b/main.go index 7a71edba..a48d6981 100644 --- a/main.go +++ b/main.go @@ -17,7 +17,9 @@ limitations under the License. package main import ( + "errors" "flag" + "net/http" "os" // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) @@ -136,7 +138,7 @@ func main() { // Controllers who register after manager starts will start directly. go setupControllers(mgr, certsReady) - setupHealthzAndReadyzCheck(mgr) + setupHealthzAndReadyzCheck(mgr, certsReady) setupLog.Info("starting manager") if err := mgr.Start(ctx); err != nil { @@ -186,14 +188,29 @@ func setupControllers(mgr ctrl.Manager, certsReady chan struct{}) { //+kubebuilder:scaffold:builder } -func setupHealthzAndReadyzCheck(mgr ctrl.Manager) { +func setupHealthzAndReadyzCheck(mgr ctrl.Manager, certsReady <-chan struct{}) { defer setupLog.Info("both healthz and readyz check are finished and configured") if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { setupLog.Error(err, "unable to set up health check") os.Exit(1) } - if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil { + + // Wait for the webhook server to be listening before advertising the + // Jobset replica as ready. This allows users to wait with sending the first + // requests, requiring webhooks, until the Jobset deployment is available, so + // that the early requests are not rejected during the Jobset's startup. + // We wrap the call to GetWebhookServer in a closure to delay calling + // the function, otherwise a not fully-initialized webhook server (without + // ready certs) fails the start of the manager. + if err := mgr.AddReadyzCheck("readyz", func(req *http.Request) error { + select { + case <-certsReady: + return mgr.GetWebhookServer().StartedChecker()(req) + default: + return errors.New("certificates are not ready") + } + }); err != nil { setupLog.Error(err, "unable to set up ready check") os.Exit(1) } diff --git a/test/e2e/suite_test.go b/test/e2e/suite_test.go index 00116a46..6f625ebb 100644 --- a/test/e2e/suite_test.go +++ b/test/e2e/suite_test.go @@ -15,18 +15,22 @@ package e2e import ( "context" + "fmt" "testing" "time" + "github.com/google/go-cmp/cmp/cmpopts" "github.com/onsi/ginkgo/v2" "github.com/onsi/gomega" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/kubernetes/scheme" + "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/config" jobset "sigs.k8s.io/jobset/api/jobset/v1alpha2" - testutils "sigs.k8s.io/jobset/pkg/util/testing" //+kubebuilder:scaffold:imports ) @@ -59,27 +63,30 @@ var _ = ginkgo.BeforeSuite(func() { gomega.Expect(err).NotTo(gomega.HaveOccurred()) gomega.Expect(k8sClient).NotTo(gomega.BeNil()) - JobSetReadyForTesting(k8sClient) + jobSetReadyForTesting(k8sClient) }) -func JobSetReadyForTesting(client client.Client) { +func jobSetReadyForTesting(k8sClient client.Client) { ginkgo.By("waiting for resources to be ready for testing") - // To verify that webhooks are ready, let's create a simple jobset. - js := testutils.MakeJobSet("js", "default"). - ReplicatedJob(testutils.MakeReplicatedJob("rjob"). - Job(testutils.MakeJobTemplate("job", "default"). - PodSpec(testutils.TestPodSpec).Obj()). - Obj()).Obj() - - // Once the creation succeeds, that means the webhooks are ready - // and we can begin testing. - gomega.Eventually(func() error { - return client.Create(context.Background(), js) + deploymentKey := types.NamespacedName{Namespace: "jobset-system", Name: "jobset-controller-manager"} + deployment := &appsv1.Deployment{} + pods := &corev1.PodList{} + gomega.EventuallyWithOffset(1, func(g gomega.Gomega) error { + g.Expect(k8sClient.Get(ctx, deploymentKey, deployment)).To(gomega.Succeed()) + g.Expect(deployment.Status.Conditions).To(gomega.ContainElement(gomega.BeComparableTo( + appsv1.DeploymentCondition{Type: appsv1.DeploymentAvailable, Status: corev1.ConditionTrue}, + cmpopts.IgnoreFields(appsv1.DeploymentCondition{}, "Reason", "Message", "LastUpdateTime", "LastTransitionTime")), + )) + g.Expect(k8sClient.List(ctx, pods, client.InNamespace(deploymentKey.Namespace), client.MatchingLabels(deployment.Spec.Selector.MatchLabels))).To(gomega.Succeed()) + for _, pod := range pods.Items { + for _, cs := range pod.Status.ContainerStatuses { + g.Expect(ptr.Deref(cs.Started, false)).To(gomega.BeTrue()) + g.Expect(cs.Ready).To(gomega.BeTrue()) + if cs.RestartCount > 0 { + return gomega.StopTrying(fmt.Sprintf("%q in %q has restarted %d times", cs.Name, pod.Name, cs.RestartCount)) + } + } + } + return nil }, timeout, interval).Should(gomega.Succeed()) - - // Delete this jobset before beginning tests. - gomega.Expect(client.Delete(ctx, js)) - gomega.Eventually(func() error { - return client.Get(ctx, types.NamespacedName{Name: js.Name, Namespace: js.Namespace}, &jobset.JobSet{}) - }).ShouldNot(gomega.Succeed()) }