Skip to content

Commit

Permalink
feat: report error when common PVC cleanup job hangs
Browse files Browse the repository at this point in the history
Fix devfile#551

Signed-off-by: Andrew Obuchowicz <aobuchow@redhat.com>
  • Loading branch information
AObuchow committed May 26, 2022
1 parent d7b3273 commit 5058883
Show file tree
Hide file tree
Showing 3 changed files with 234 additions and 145 deletions.
170 changes: 170 additions & 0 deletions pkg/clusterstatus/check.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
//
// Copyright (c) 2019-2022 Red Hat, Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

package clusterstatus

import (
"context"
"fmt"
"strings"

dw "github.com/devfile/api/v2/pkg/apis/workspaces/v1alpha2"
"github.com/devfile/devworkspace-operator/pkg/common"
"github.com/devfile/devworkspace-operator/pkg/config"
"github.com/devfile/devworkspace-operator/pkg/constants"
"github.com/devfile/devworkspace-operator/pkg/infrastructure"
"github.com/devfile/devworkspace-operator/pkg/provision/sync"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/fields"
k8sclient "sigs.k8s.io/controller-runtime/pkg/client"
runtimeClient "sigs.k8s.io/controller-runtime/pkg/client"
)

var containerFailureStateReasons = []string{
"CrashLoopBackOff",
"ImagePullBackOff",
"CreateContainerError",
"RunContainerError",
}

// unrecoverablePodEventReasons contains Kubernetes events that should fail workspace startup
// if they occur related to a workspace pod. Events are stored as a map with event names as keys
// and values representing the threshold of how many times we can see an event before it is considered
// unrecoverable.
var unrecoverablePodEventReasons = map[string]int32{
"FailedPostStartHook": 1,
"FailedMount": 3,
"FailedScheduling": 1,
"FailedCreate": 1,
"ReplicaSetCreateError": 1,
}

var unrecoverableDeploymentConditionReasons = []string{
"FailedCreate",
}

func CheckDeploymentStatus(deployment *appsv1.Deployment) (ready bool) {
return deployment.Status.ReadyReplicas > 0
}

func CheckDeploymentConditions(deployment *appsv1.Deployment) (healthy bool, errorMsg string) {
conditions := deployment.Status.Conditions
for _, condition := range conditions {
for _, unrecoverableReason := range unrecoverableDeploymentConditionReasons {
if condition.Reason == unrecoverableReason {
return false, fmt.Sprintf("Detected unrecoverable deployment condition: %s %s", condition.Reason, condition.Message)
}
}
}
return true, ""
}

func getPods(workspace *dw.DevWorkspace, client runtimeClient.Client) (*corev1.PodList, error) {
pods := &corev1.PodList{}
if err := client.List(context.TODO(), pods, k8sclient.InNamespace(workspace.Namespace), k8sclient.MatchingLabels{
constants.DevWorkspaceIDLabel: workspace.Status.DevWorkspaceId,
}); err != nil {
return nil, err
}
return pods, nil
}

// checkPodsState checks if workspace-related pods are in an unrecoverable state. A pod is considered to be unrecoverable
// if it has a container with one of the containerFailureStateReasons states, or if an unrecoverable event (with reason
// matching unrecoverablePodEventReasons) has the pod as the involved object.
// Returns optional message with detected unrecoverable state details
// error if any happens during check
func CheckPodsState(workspace *dw.DevWorkspace,
clusterAPI sync.ClusterAPI) (stateMsg string, checkFailure error) {
podList, err := getPods(workspace, clusterAPI.Client)
if err != nil {
return "", err
}

for _, pod := range podList.Items {
for _, containerStatus := range pod.Status.ContainerStatuses {
if !CheckContainerStatusForFailure(&containerStatus) {
return fmt.Sprintf("Container %s has state %s", containerStatus.Name, containerStatus.State.Waiting.Reason), nil
}
}
for _, initContainerStatus := range pod.Status.InitContainerStatuses {
if !CheckContainerStatusForFailure(&initContainerStatus) {
return fmt.Sprintf("Init Container %s has state %s", initContainerStatus.Name, initContainerStatus.State.Waiting.Reason), nil
}
}
if msg, err := CheckPodEvents(&pod, workspace.Status.DevWorkspaceId, clusterAPI); err != nil || msg != "" {
return msg, err
}
}
return "", nil
}

func CheckPodEvents(pod *corev1.Pod, workspaceID string, clusterAPI sync.ClusterAPI) (msg string, err error) {
evs := &corev1.EventList{}
selector, err := fields.ParseSelector(fmt.Sprintf("involvedObject.name=%s", pod.Name))
if err != nil {
return "", fmt.Errorf("failed to parse field selector: %s", err)
}
if err := clusterAPI.Client.List(clusterAPI.Ctx, evs, k8sclient.InNamespace(pod.Namespace), k8sclient.MatchingFieldsSelector{Selector: selector}); err != nil {
return "", fmt.Errorf("failed to list events in namespace %s: %w", pod.Namespace, err)
}
for _, ev := range evs.Items {
if ev.InvolvedObject.Kind != "Pod" {
continue
}

// On OpenShift, it's possible see "FailedMount" events when using a routingClass that depends on the service-ca
// operator. To avoid this, we always ignore FailedMount events if the message refers to the DWO-provisioned volume
if infrastructure.IsOpenShift() &&
ev.Reason == "FailedMount" &&
strings.Contains(ev.Message, common.ServingCertVolumeName(common.ServiceName(workspaceID))) {
continue
}

if maxCount, isUnrecoverableEvent := unrecoverablePodEventReasons[ev.Reason]; isUnrecoverableEvent {
if !checkIfUnrecoverableEventIgnored(ev.Reason) && ev.Count >= maxCount {
var msg string
if ev.Count > 1 {
msg = fmt.Sprintf("Detected unrecoverable event %s %d times: %s", ev.Reason, ev.Count, ev.Message)
} else {
msg = fmt.Sprintf("Detected unrecoverable event %s: %s", ev.Reason, ev.Message)
}
return msg, nil
}
}
}
return "", nil
}

func CheckContainerStatusForFailure(containerStatus *corev1.ContainerStatus) (ok bool) {
if containerStatus.State.Waiting != nil {
for _, failureReason := range containerFailureStateReasons {
if containerStatus.State.Waiting.Reason == failureReason {
return checkIfUnrecoverableEventIgnored(containerStatus.State.Waiting.Reason)
}
}
}
return true
}

func checkIfUnrecoverableEventIgnored(reason string) (ignored bool) {
for _, ignoredReason := range config.Workspace.IgnoredUnrecoverableEvents {
if ignoredReason == reason {
return true
}
}
return false
}
63 changes: 60 additions & 3 deletions pkg/provision/storage/cleanup.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,18 +17,21 @@ package storage

import (
"fmt"
"path"
"time"

dw "github.com/devfile/api/v2/pkg/apis/workspaces/v1alpha2"
"github.com/devfile/devworkspace-operator/pkg/clusterstatus"
nsconfig "github.com/devfile/devworkspace-operator/pkg/provision/config"
"github.com/devfile/devworkspace-operator/pkg/provision/sync"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
k8sErrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/selection"
"k8s.io/apimachinery/pkg/types"
k8sclient "sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"

"github.com/devfile/devworkspace-operator/internal/images"
Expand Down Expand Up @@ -91,6 +94,14 @@ func runCommonPVCCleanupJob(workspace *dw.DevWorkspace, clusterAPI sync.ClusterA
}
}
}

if msg, err := checkCleanupPodsState(clusterJob, workspace.Status.DevWorkspaceId, clusterAPI); err != nil || msg != "" {
return &ProvisioningError{
Err: err,
Message: fmt.Sprintf("DevWorkspace PVC cleanup job failed: see logs for job %q for details. Additional information: %s", clusterJob.Name, msg),
}
}

// Requeue at least each 10 seconds to check if PVC is not removed by someone else
return &NotReadyError{
Message: "Cleanup job is not in completed state",
Expand All @@ -110,7 +121,9 @@ func getSpecCommonPVCCleanupJob(workspace *dw.DevWorkspace, clusterAPI sync.Clus
}

jobLabels := map[string]string{
constants.DevWorkspaceIDLabel: workspaceId,
constants.DevWorkspaceIDLabel: workspaceId,
constants.DevWorkspaceNameLabel: workspace.Name,
constants.DevWorkspaceCreatorLabel: workspace.Labels[constants.DevWorkspaceCreatorLabel],
}
if restrictedAccess, needsRestrictedAccess := workspace.Annotations[constants.DevWorkspaceRestrictedAccessAnnotation]; needsRestrictedAccess {
jobLabels[constants.DevWorkspaceRestrictedAccessAnnotation] = restrictedAccess
Expand All @@ -126,6 +139,9 @@ func getSpecCommonPVCCleanupJob(workspace *dw.DevWorkspace, clusterAPI sync.Clus
Completions: &cleanupJobCompletions,
BackoffLimit: &cleanupJobBackoffLimit,
Template: corev1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Labels: jobLabels,
},
Spec: corev1.PodSpec{
RestartPolicy: "Never",
SecurityContext: wsprovision.GetDevWorkspaceSecurityContext(),
Expand All @@ -146,7 +162,8 @@ func getSpecCommonPVCCleanupJob(workspace *dw.DevWorkspace, clusterAPI sync.Clus
Command: []string{"/bin/sh"},
Args: []string{
"-c",
fmt.Sprintf(cleanupCommandFmt, path.Join(pvcClaimMountPath, workspaceId)),
//fmt.Sprintf(cleanupCommandFmt, path.Join(pvcClaimMountPath, workspaceId)),
"exit 1",
},
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
Expand Down Expand Up @@ -203,3 +220,43 @@ func commonPVCExists(workspace *dw.DevWorkspace, clusterAPI sync.ClusterAPI) (bo
}
return true, nil
}

func checkCleanupPodsState(job *batchv1.Job, workspaceID string, clusterAPI sync.ClusterAPI) (msg string, err error) {

// TODO: Ensure we are actually finding the pods related to the job? Maybe add some extra labels to ensure unique matches/we don't find other workspace pods
pods := &corev1.PodList{}

jobNameReq, err := labels.NewRequirement("job-name", selection.Equals, []string{common.PVCCleanupJobName(workspaceID)})
if err != nil {
fmt.Printf("bad requirement: %v\n\n", err)
}

// Init and add to selector.
selector := labels.NewSelector()
selector = selector.Add(*jobNameReq)

if err := clusterAPI.Client.List(clusterAPI.Ctx, pods, k8sclient.InNamespace(job.Namespace), k8sclient.MatchingLabelsSelector{Selector: selector}); err != nil {
return "", err
}

for _, pod := range pods.Items {

for _, containerStatus := range pod.Status.ContainerStatuses {
if clusterstatus.CheckContainerStatusForFailure(&containerStatus) {
return fmt.Sprintf("Common PVC Cleanup related container %s has state %s", containerStatus.Name, containerStatus.State.Waiting.Reason), nil
}
}

for _, initContainerStatus := range pod.Status.InitContainerStatuses {
if clusterstatus.CheckContainerStatusForFailure(&initContainerStatus) {
return fmt.Sprintf("Common PVC Cleanup related init container %s has state %s", initContainerStatus.Name, initContainerStatus.State.Waiting.Reason), nil
}
}

if msg, err := clusterstatus.CheckPodEvents(&pod, workspaceID, clusterAPI); err != nil || msg != "" {
return msg, err
}
}

return "", nil
}
Loading

0 comments on commit 5058883

Please sign in to comment.