Skip to content

Commit

Permalink
Consumed capacity info in SpaceProvisionerConfig (#1109)
Browse files Browse the repository at this point in the history
Make the SPC ready status reflect the ability to place spaces to
the corresponding cluster. 

---------

Co-authored-by: Francisc Munteanu <fmuntean@redhat.com>
Co-authored-by: Matous Jobanek <mjobanek@redhat.com>
Co-authored-by: Feny Mehta <fbm3307@gmail.com>
  • Loading branch information
4 people authored Jan 7, 2025
1 parent 1798976 commit 2b06cc8
Show file tree
Hide file tree
Showing 7 changed files with 563 additions and 186 deletions.
44 changes: 41 additions & 3 deletions controllers/spaceprovisionerconfig/mapper.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,40 @@ package spaceprovisionerconfig

import (
"context"

toolchainv1alpha1 "github.com/codeready-toolchain/api/api/v1alpha1"
"k8s.io/apimachinery/pkg/types"
runtimeclient "sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
)

func MapToolchainClusterToSpaceProvisionerConfigs(ctx context.Context, cl runtimeclient.Client) func(context.Context, runtimeclient.Object) []reconcile.Request {
return func(context context.Context, obj runtimeclient.Object) []reconcile.Request {
func MapToolchainClusterToSpaceProvisionerConfigs(cl runtimeclient.Client) func(context.Context, runtimeclient.Object) []reconcile.Request {
return func(ctx context.Context, obj runtimeclient.Object) []reconcile.Request {
if _, ok := obj.(*toolchainv1alpha1.ToolchainCluster); !ok {
return nil
}

ret, err := findReferencingProvisionerConfigs(ctx, cl, runtimeclient.ObjectKeyFromObject(obj))
if err != nil {
log.FromContext(ctx).Error(err, "failed to list SpaceProvisionerConfig objects while determining what objects to reconcile",
"toolchainClusterCause", runtimeclient.ObjectKeyFromObject(obj))
"causeObj", runtimeclient.ObjectKeyFromObject(obj), "causeKind", "ToolchainCluster")
return []reconcile.Request{}
}
return ret
}
}

func MapToolchainStatusToSpaceProvisionerConfigs(cl runtimeclient.Client) func(context.Context, runtimeclient.Object) []reconcile.Request {
return func(ctx context.Context, obj runtimeclient.Object) []reconcile.Request {
if _, ok := obj.(*toolchainv1alpha1.ToolchainStatus); !ok {
return nil
}

ret, err := findAllSpaceProvisionerConfigsInNamespace(ctx, cl, obj.GetNamespace())
if err != nil {
log.FromContext(ctx).Error(err, "failed to list SpaceProvisionerConfig objects while determining what objects to reconcile",
"causeObj", runtimeclient.ObjectKeyFromObject(obj), "causeKind", "ToolchainStatus")
return []reconcile.Request{}
}
return ret
Expand All @@ -39,3 +60,20 @@ func findReferencingProvisionerConfigs(ctx context.Context, cl runtimeclient.Cli
}
return ret, nil
}

func findAllSpaceProvisionerConfigsInNamespace(ctx context.Context, cl runtimeclient.Client, ns string) ([]reconcile.Request, error) {
configs := &toolchainv1alpha1.SpaceProvisionerConfigList{}
if err := cl.List(ctx, configs, runtimeclient.InNamespace(ns)); err != nil {
return nil, err
}
ret := make([]reconcile.Request, 0, len(configs.Items))
for _, cfg := range configs.Items {
ret = append(ret, reconcile.Request{
NamespacedName: types.NamespacedName{
Namespace: cfg.Namespace,
Name: cfg.Name,
},
})
}
return ret, nil
}
51 changes: 49 additions & 2 deletions controllers/spaceprovisionerconfig/mapper_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ func TestMapToolchainClusterToSpaceProvisionerConfigs(t *testing.T) {
cl := test.NewFakeClient(t, spc0, spc1, spc2)

// when
reqs := MapToolchainClusterToSpaceProvisionerConfigs(context.TODO(), cl)(context.TODO(), &toolchainv1alpha1.ToolchainCluster{
reqs := MapToolchainClusterToSpaceProvisionerConfigs(cl)(context.TODO(), &toolchainv1alpha1.ToolchainCluster{
ObjectMeta: metav1.ObjectMeta{
Name: "cluster1",
Namespace: test.HostOperatorNs,
Expand All @@ -123,7 +123,7 @@ func TestMapToolchainClusterToSpaceProvisionerConfigs(t *testing.T) {
}

// when
reqs := MapToolchainClusterToSpaceProvisionerConfigs(context.TODO(), cl)(context.TODO(), &toolchainv1alpha1.ToolchainCluster{
reqs := MapToolchainClusterToSpaceProvisionerConfigs(cl)(context.TODO(), &toolchainv1alpha1.ToolchainCluster{
ObjectMeta: metav1.ObjectMeta{
Name: "cluster1",
Namespace: test.HostOperatorNs,
Expand All @@ -134,3 +134,50 @@ func TestMapToolchainClusterToSpaceProvisionerConfigs(t *testing.T) {
require.Empty(t, reqs)
})
}

func TestMapToolchainStatusToSpaceProvisionerConfig(t *testing.T) {
t.Run("finds all SPCs in namespace", func(t *testing.T) {
// given
spc0 := NewSpaceProvisionerConfig("spc0", test.HostOperatorNs, ReferencingToolchainCluster("cluster1"))
spc1 := NewSpaceProvisionerConfig("spc1", test.HostOperatorNs, ReferencingToolchainCluster("cluster2"))
spc2 := NewSpaceProvisionerConfig("spc2", test.HostOperatorNs, ReferencingToolchainCluster("cluster1"))
cl := test.NewFakeClient(t, spc0, spc1, spc2)

// when
reqs := MapToolchainStatusToSpaceProvisionerConfigs(cl)(context.TODO(), &toolchainv1alpha1.ToolchainStatus{
ObjectMeta: metav1.ObjectMeta{
Name: "toolchain-status",
Namespace: test.HostOperatorNs,
},
})

// then
require.Equal(t, []reconcile.Request{
requestFromObject(spc0),
requestFromObject(spc1),
requestFromObject(spc2),
}, reqs)
})
t.Run("interprets erors as empty result", func(t *testing.T) {
// given
cl := test.NewFakeClient(t, NewSpaceProvisionerConfig("spc0", test.HostOperatorNs, ReferencingToolchainCluster("cluster1")))
expectedErr := errors.New("expected list error")
cl.MockList = func(ctx context.Context, list runtimeclient.ObjectList, opts ...runtimeclient.ListOption) error {
if _, ok := list.(*toolchainv1alpha1.SpaceProvisionerConfigList); ok {
return expectedErr
}
return cl.Client.List(ctx, list, opts...)
}

// when
reqs := MapToolchainStatusToSpaceProvisionerConfigs(cl)(context.TODO(), &toolchainv1alpha1.ToolchainStatus{
ObjectMeta: metav1.ObjectMeta{
Name: "toolchain-status",
Namespace: test.HostOperatorNs,
},
})

// then
require.Empty(t, reqs)
})
}
194 changes: 148 additions & 46 deletions controllers/spaceprovisionerconfig/spaceprovisionerconfig_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@ import (
"fmt"

toolchainv1alpha1 "github.com/codeready-toolchain/api/api/v1alpha1"
"github.com/codeready-toolchain/host-operator/controllers/toolchainconfig"
"github.com/codeready-toolchain/toolchain-common/pkg/condition"
"github.com/redhat-cop/operator-utils/pkg/util"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/builder"
runtimeclient "sigs.k8s.io/controller-runtime/pkg/client"
Expand All @@ -25,19 +27,28 @@ type Reconciler struct {

var _ reconcile.Reconciler = (*Reconciler)(nil)

func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager) error {
func (r *Reconciler) SetupWithManager(mgr ctrl.Manager) error {
return ctrl.NewControllerManagedBy(mgr).
For(&toolchainv1alpha1.SpaceProvisionerConfig{}, builder.WithPredicates(predicate.GenerationChangedPredicate{})).
Watches(
&toolchainv1alpha1.ToolchainCluster{},
handler.EnqueueRequestsFromMapFunc(MapToolchainClusterToSpaceProvisionerConfigs(ctx, r.Client)),
handler.EnqueueRequestsFromMapFunc(MapToolchainClusterToSpaceProvisionerConfigs(r.Client)),
).
// we use the same information as the ToolchainStatus specific for the SPCs. Because memory consumption is
// read directly out of the member clusters using remote connections, let's look for it only once
// in ToolchainStatus and just read it out "locally" here without needing to reach out to the member clusters
// again.
Watches(
&toolchainv1alpha1.ToolchainStatus{},
handler.EnqueueRequestsFromMapFunc(MapToolchainStatusToSpaceProvisionerConfigs(r.Client)),
).
Complete(r)
}

//+kubebuilder:rbac:groups=toolchain.dev.openshift.com,resources=spaceprovisionerconfigs,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=toolchain.dev.openshift.com,resources=spaceprovisionerconfigs/status,verbs=get;update;patch
//+kubebuilder:rbac:groups=toolchain.dev.openshift.com,resources=toolchainclusters,verbs=get;list;watch
//+kubebuilder:rbac:groups=toolchain.dev.openshift.com,resources=toolchainstatuses,verbs=get;list;watch

// Reconcile ensures that SpaceProvisionerConfig is valid and points to an existing ToolchainCluster.
func (r *Reconciler) Reconcile(ctx context.Context, request ctrl.Request) (ctrl.Result, error) {
Expand All @@ -58,62 +69,153 @@ func (r *Reconciler) Reconcile(ctx context.Context, request ctrl.Request) (ctrl.
return ctrl.Result{}, nil
}

readyCondition, reportedError := r.determineReadyState(ctx, spaceProvisionerConfig)
reportedErr := r.refreshStatus(ctx, spaceProvisionerConfig)

var updated bool
spaceProvisionerConfig.Status.Conditions, updated = condition.AddOrUpdateStatusConditions(spaceProvisionerConfig.Status.Conditions,
readyCondition)
if !updated {
return ctrl.Result{}, reportedError
if err := r.Client.Status().Update(ctx, spaceProvisionerConfig); err != nil {
return ctrl.Result{}, err
}

logger.Info("updating SpaceProvisionerConfig", "readyCondition", readyCondition)
if err := r.Client.Status().Update(ctx, spaceProvisionerConfig); err != nil {
if reportedError != nil {
logger.Info("failed to update the status (reported as failed reconciliation) with a previous unreported error during reconciliation", "unreportedError", reportedError)
return ctrl.Result{}, reportedErr
}

func (r *Reconciler) refreshStatus(ctx context.Context, spc *toolchainv1alpha1.SpaceProvisionerConfig) error {
// clear out the consumed capacity - this will advertise to the user that we either failed before it made sense
// to collect it (and therefore we don't know it) or it was not available (and therefore we again don't know it)
spc.Status.ConsumedCapacity = nil

if !spc.Spec.Enabled {
updateReadyCondition(spc, corev1.ConditionFalse, toolchainv1alpha1.SpaceProvisionerConfigDisabledReason, "")
return nil
}

clusterCondition, err := r.determineClusterReadyState(ctx, spc)
if err != nil {
updateReadyCondition(spc, clusterCondition, toolchainv1alpha1.SpaceProvisionerConfigToolchainClusterNotFoundReason, err.Error())

// the reconciler reacts on ToolchainCluster changes so it will be triggered once a new TC appears
// we therefore don't need to return error from the reconciler in the case the TC is not found.
if errors.IsNotFound(err) {
return nil
}
reportedError = fmt.Errorf("failed to update the SpaceProvisionerConfig status: %w", err)
return err
}

if clusterCondition != corev1.ConditionTrue {
updateReadyCondition(spc, clusterCondition, toolchainv1alpha1.SpaceProvisionerConfigToolchainClusterNotReadyReason, "")
return nil
}

return ctrl.Result{}, reportedError
spc.Status.ConsumedCapacity, err = collectConsumedCapacity(ctx, r.Client, spc.Spec.ToolchainCluster, spc.Namespace)
if err != nil {
updateReadyCondition(spc, corev1.ConditionUnknown, toolchainv1alpha1.SpaceProvisionerConfigFailedToDetermineCapacityReason, err.Error())
return err
}

capacityCondition := r.determineCapacityReadyState(spc)

reason := toolchainv1alpha1.SpaceProvisionerConfigValidReason
if capacityCondition != corev1.ConditionTrue {
reason = toolchainv1alpha1.SpaceProvisionerConfigInsufficientCapacityReason
}

updateReadyCondition(spc, capacityCondition, reason, "")

return nil
}

func (r *Reconciler) determineReadyState(ctx context.Context, spc *toolchainv1alpha1.SpaceProvisionerConfig) (toolchainv1alpha1.Condition, error) {
toolchainCluster := &toolchainv1alpha1.ToolchainCluster{}
toolchainClusterKey := runtimeclient.ObjectKey{Name: spc.Spec.ToolchainCluster, Namespace: spc.Namespace}
var toolchainPresent corev1.ConditionStatus
toolchainPresenceReason := toolchainv1alpha1.SpaceProvisionerConfigValidReason
var reportedError error
toolchainPresenceMessage := ""
if err := r.Client.Get(ctx, toolchainClusterKey, toolchainCluster); err != nil {
if !errors.IsNotFound(err) {
// we need to requeue the reconciliation in this case because we cannot be sure whether the ToolchainCluster
// is really present in the cluster or not. If we did not do that and instead just reported the error in
// the status, we could eventually leave the SPC in an incorrect state once the error condition in the cluster,
// that prevents us from reading the ToolchainCluster, clears. I.e. we need the requeue to keep the promise
// of eventual consistency.

reportedError = fmt.Errorf("failed to get the referenced ToolchainCluster: %w", err)
toolchainPresenceMessage = reportedError.Error()
// Note that this function merely mirrors the usage information found in the ToolchainStatus. This means that it actually may work
// with slightly stale data because the counter.Counts cache might not have been synced yet. This is ok though because the capacity manager
// doesn't completely rely on the readiness status of the SPC and will re-evaluate the decision taking into the account the contents of
// the counter cache and therefore completely "fresh" data.
func collectConsumedCapacity(ctx context.Context, cl runtimeclient.Client, clusterName string, toolchainStatusNs string) (*toolchainv1alpha1.ConsumedCapacity, error) {
status := &toolchainv1alpha1.ToolchainStatus{}
if err := cl.Get(ctx, types.NamespacedName{Namespace: toolchainStatusNs, Name: toolchainconfig.ToolchainStatusName}, status); err != nil {
return nil, fmt.Errorf("unable to read ToolchainStatus resource: %w", err)
}

for _, m := range status.Status.Members {
if m.ClusterName == clusterName {
cc := toolchainv1alpha1.ConsumedCapacity{}
cc.MemoryUsagePercentPerNodeRole = m.MemberStatus.ResourceUsage.MemoryUsagePerNodeRole
cc.SpaceCount = m.SpaceCount

return &cc, nil
}
toolchainPresenceReason = toolchainv1alpha1.SpaceProvisionerConfigToolchainClusterNotFoundReason
toolchainPresent = corev1.ConditionFalse
} else {
readyCond, found := condition.FindConditionByType(toolchainCluster.Status.Conditions, toolchainv1alpha1.ConditionReady)
if !found {
toolchainPresent = corev1.ConditionFalse
} else {
toolchainPresent = readyCond.Status
}

return nil, nil
}

func (r *Reconciler) determineClusterReadyState(ctx context.Context, spc *toolchainv1alpha1.SpaceProvisionerConfig) (corev1.ConditionStatus, error) {
toolchainCluster := &toolchainv1alpha1.ToolchainCluster{}
if err := r.Client.Get(ctx, runtimeclient.ObjectKey{Name: spc.Spec.ToolchainCluster, Namespace: spc.Namespace}, toolchainCluster); err != nil {
if errors.IsNotFound(err) {
return corev1.ConditionFalse, err
}
if toolchainPresent != corev1.ConditionTrue {
toolchainPresenceReason = toolchainv1alpha1.SpaceProvisionerConfigToolchainClusterNotReadyReason
// IsNotFound is self-explanatory but let's add a little bit of context to the error in other cases
return corev1.ConditionFalse, fmt.Errorf("failed to get the referenced ToolchainCluster: %w", err)
}

readyCond, found := condition.FindConditionByType(toolchainCluster.Status.Conditions, toolchainv1alpha1.ConditionReady)
if !found {
return corev1.ConditionFalse, nil
}

return readyCond.Status, nil
}

func (r *Reconciler) determineCapacityReadyState(spc *toolchainv1alpha1.SpaceProvisionerConfig) corev1.ConditionStatus {
if spc.Status.ConsumedCapacity == nil {
// we don't know anything about the resource consumption in the member
return corev1.ConditionUnknown
}

// the cluster capacity is ok if it has room for additional spaces and enough free memory

roomForAdditionalSpaces := determineSpaceCountReadyState(spc)
if !roomForAdditionalSpaces {
return corev1.ConditionFalse
}

return determineMemoryUtilizationReadyState(spc)
}

// determineSpaceCountReadyState checks that there is room for additional spaces in the cluster.
// It always knows this fact so returning a bool is ok, in contrast to determinMemoryUtilizationReadyState.
func determineSpaceCountReadyState(spc *toolchainv1alpha1.SpaceProvisionerConfig) bool {
max := spc.Spec.CapacityThresholds.MaxNumberOfSpaces
return max == 0 || max > uint(spc.Status.ConsumedCapacity.SpaceCount)
}

// determineMemoryUtilizationReadyState checks that the cluster has enough free memory. It may not be able to tell the fact
// if the SPC doesn't contain memory usage information in the status. It therefore can return true, false or
// unknown condition values.
func determineMemoryUtilizationReadyState(spc *toolchainv1alpha1.SpaceProvisionerConfig) corev1.ConditionStatus {
if spc.Spec.CapacityThresholds.MaxMemoryUtilizationPercent == 0 {
// 0 max memory utilization means no limit
return corev1.ConditionTrue
}

if len(spc.Status.ConsumedCapacity.MemoryUsagePercentPerNodeRole) == 0 {
// we don't know the memory utilization in the member
return corev1.ConditionUnknown
}

// the memory utilitzation is ok if it is below the threshold in all node types
for _, val := range spc.Status.ConsumedCapacity.MemoryUsagePercentPerNodeRole {
if uint(val) >= spc.Spec.CapacityThresholds.MaxMemoryUtilizationPercent {
return corev1.ConditionFalse
}
}
return corev1.ConditionTrue
}

return toolchainv1alpha1.Condition{
func updateReadyCondition(spc *toolchainv1alpha1.SpaceProvisionerConfig, status corev1.ConditionStatus, reason, message string) {
readyCondition := toolchainv1alpha1.Condition{
Type: toolchainv1alpha1.ConditionReady,
Status: toolchainPresent,
Message: toolchainPresenceMessage,
Reason: toolchainPresenceReason,
}, reportedError
Status: status,
Reason: reason,
Message: message,
}
spc.Status.Conditions, _ = condition.AddOrUpdateStatusConditions(spc.Status.Conditions, readyCondition)
}
Loading

0 comments on commit 2b06cc8

Please sign in to comment.