From 32f04ca3cbeaa616106d79c35e4946e0c31e468d Mon Sep 17 00:00:00 2001
From: Andrew Obuchowicz
Date: Mon, 30 May 2022 16:06:27 -0400
Subject: [PATCH] feat: report error when common PVC cleanup job hangs

Fix devfile#551

Signed-off-by: Andrew Obuchowicz
---
 pkg/provision/storage/cleanup.go | 65 ++++++++++++++++++++++++++++++--
 1 file changed, 64 insertions(+), 1 deletion(-)

diff --git a/pkg/provision/storage/cleanup.go b/pkg/provision/storage/cleanup.go
index 6aa9f0463..262d80491 100644
--- a/pkg/provision/storage/cleanup.go
+++ b/pkg/provision/storage/cleanup.go
@@ -17,10 +17,11 @@ package storage
 
 import (
 	"fmt"
 	"path"
 	"time"
 
 	dw "github.com/devfile/api/v2/pkg/apis/workspaces/v1alpha2"
+	check "github.com/devfile/devworkspace-operator/pkg/library/status"
 	nsconfig "github.com/devfile/devworkspace-operator/pkg/provision/config"
 	"github.com/devfile/devworkspace-operator/pkg/provision/sync"
 	batchv1 "k8s.io/api/batch/v1"
@@ -29,6 +30,7 @@ import (
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"
+	k8sclient "sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
 
 	"github.com/devfile/devworkspace-operator/internal/images"
@@ -91,6 +93,21 @@ func runCommonPVCCleanupJob(workspace *dw.DevWorkspace, clusterAPI sync.ClusterA
 			}
 		}
 	}
+
+	msg, err := checkCleanupPodsState(clusterJob, workspace.Status.DevWorkspaceId, clusterAPI)
+	if err != nil {
+		return &ProvisioningError{
+			Err: err,
+		}
+	}
+
+	if msg != "" {
+		errMsg := fmt.Sprintf("DevWorkspace PVC cleanup job failed: see logs for job %q for details. Additional information: %s", clusterJob.Name, msg)
+		return &ProvisioningError{
+			Message: errMsg,
+		}
+	}
+
 	// Requeue at least each 10 seconds to check if PVC is not removed by someone else
 	return &NotReadyError{
 		Message: "Cleanup job is not in completed state",
@@ -110,7 +128,9 @@ func getSpecCommonPVCCleanupJob(workspace *dw.DevWorkspace, clusterAPI sync.Clus
 	}
 
 	jobLabels := map[string]string{
-		constants.DevWorkspaceIDLabel: workspaceId,
+		constants.DevWorkspaceIDLabel:      workspaceId,
+		constants.DevWorkspaceNameLabel:    workspace.Name,
+		constants.DevWorkspaceCreatorLabel: workspace.Labels[constants.DevWorkspaceCreatorLabel],
 	}
 	if restrictedAccess, needsRestrictedAccess := workspace.Annotations[constants.DevWorkspaceRestrictedAccessAnnotation]; needsRestrictedAccess {
 		jobLabels[constants.DevWorkspaceRestrictedAccessAnnotation] = restrictedAccess
@@ -126,6 +146,9 @@ func getSpecCommonPVCCleanupJob(workspace *dw.DevWorkspace, clusterAPI sync.Clus
 			Completions:  &cleanupJobCompletions,
 			BackoffLimit: &cleanupJobBackoffLimit,
 			Template: corev1.PodTemplateSpec{
+				ObjectMeta: metav1.ObjectMeta{
+					Labels: jobLabels,
+				},
 				Spec: corev1.PodSpec{
 					RestartPolicy:   "Never",
 					SecurityContext: wsprovision.GetDevWorkspaceSecurityContext(),
@@ -203,3 +225,44 @@ func commonPVCExists(workspace *dw.DevWorkspace, clusterAPI sync.ClusterAPI) (bo
 	}
 	return true, nil
 }
+
+func checkCleanupPodsState(job *batchv1.Job, workspaceID string, clusterAPI sync.ClusterAPI) (msg string, err error) {
+	pods, err := check.GetPods(job.Namespace, k8sclient.MatchingLabels{"job-name": common.PVCCleanupJobName(workspaceID)}, clusterAPI.Client)
+	if err != nil {
+		return "", err
+	}
+
+	for _, pod := range pods.Items {
+
+		for _, containerStatus := range pod.Status.ContainerStatuses {
+			if check.CheckContainerStatusForFailure(&containerStatus) {
+				// TODO: Maybe move this logic into CheckContainerStatusForFailure and return bool, reason ?
+				reason := ""
+				if containerStatus.State.Waiting != nil {
+					reason = containerStatus.State.Waiting.Reason
+				} else if containerStatus.State.Terminated != nil {
+					reason = containerStatus.State.Terminated.Reason
+				}
+				return fmt.Sprintf("Common PVC Cleanup related container %s has state %s.", containerStatus.Name, reason), nil
+			}
+		}
+
+		for _, initContainerStatus := range pod.Status.InitContainerStatuses {
+			if check.CheckContainerStatusForFailure(&initContainerStatus) {
+				reason := ""
+				if initContainerStatus.State.Waiting != nil {
+					reason = initContainerStatus.State.Waiting.Reason
+				} else if initContainerStatus.State.Terminated != nil {
+					reason = initContainerStatus.State.Terminated.Reason
+				}
+				return fmt.Sprintf("Common PVC Cleanup related init container %s has state %s.", initContainerStatus.Name, reason), nil
+			}
+		}
+
+		if msg, err := check.CheckPodEvents(&pod, workspaceID, clusterAPI); err != nil || msg != "" {
+			return msg, err
+		}
+	}
+
+	return "", nil
+}