Skip to content

Commit 6229eef

Browse files
committed
fix: wait for /var to be mounted in kubelet service controller
This is a cosmetic fix: when `KubeletServiceController` tries to write files to `/etc/kubernetes` before `/var` is mounted, it fails. The controller is restarted, but every restart incurs a backoff delay which gets longer with each subsequent restart. On the first boot, or when EPHEMERAL is encrypted, mounting might take considerable time (seconds), so during that time the controller might enter such a long backoff timeout that it delays the whole boot sequence, as the sequence won't finish before `kubelet` is started. By waiting for `EPHEMERAL` to be mounted before starting the controller we eliminate long backoff cycles. Also fix a bug where the `StartAllServices` task might start `kubelet` early (before `KubeletServiceController` is actually going to start it). Signed-off-by: Andrey Smirnov <andrey.smirnov@talos-systems.com> (cherry picked from commit c1aed62)
1 parent 4712e73 commit 6229eef

File tree

3 files changed

+34
-6
lines changed

3 files changed

+34
-6
lines changed

internal/app/machined/pkg/controllers/k8s/kubelet_service.go

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,13 @@ import (
2323
"k8s.io/apimachinery/pkg/runtime/serializer/json"
2424
kubeletconfig "k8s.io/kubelet/config/v1beta1"
2525

26+
runtimetalos "github.com/talos-systems/talos/internal/app/machined/pkg/runtime"
2627
"github.com/talos-systems/talos/internal/app/machined/pkg/system"
2728
"github.com/talos-systems/talos/internal/app/machined/pkg/system/services"
2829
"github.com/talos-systems/talos/pkg/machinery/constants"
2930
"github.com/talos-systems/talos/pkg/machinery/resources/files"
3031
"github.com/talos-systems/talos/pkg/machinery/resources/k8s"
32+
runtimeres "github.com/talos-systems/talos/pkg/machinery/resources/runtime"
3133
"github.com/talos-systems/talos/pkg/machinery/resources/secrets"
3234
)
3335

@@ -42,6 +44,7 @@ type ServiceManager interface {
4244
// KubeletServiceController renders kubelet configuration files and controls kubelet service lifecycle.
4345
type KubeletServiceController struct {
4446
V1Alpha1Services ServiceManager
47+
V1Alpha1Mode runtimetalos.Mode
4548
}
4649

4750
// Name implements controller.Controller interface.
@@ -63,14 +66,20 @@ func (ctrl *KubeletServiceController) Outputs() []controller.Output {
6366
//
6467
//nolint:gocyclo,cyclop
6568
func (ctrl *KubeletServiceController) Run(ctx context.Context, r controller.Runtime, logger *zap.Logger) error {
66-
// initially, wait for the machine-id to be generated
69+
// initially, wait for the machine-id to be generated and /var to be mounted
6770
if err := r.UpdateInputs([]controller.Input{
6871
{
6972
Namespace: files.NamespaceName,
7073
Type: files.EtcFileStatusType,
7174
ID: pointer.To("machine-id"),
7275
Kind: controller.InputWeak,
7376
},
77+
{
78+
Namespace: runtimeres.NamespaceName,
79+
Type: runtimeres.MountStatusType,
80+
ID: pointer.To(constants.EphemeralPartitionLabel),
81+
Kind: controller.InputWeak,
82+
},
7483
}); err != nil {
7584
return err
7685
}
@@ -91,10 +100,23 @@ func (ctrl *KubeletServiceController) Run(ctx context.Context, r controller.Runt
91100
return fmt.Errorf("error getting etc file status: %w", err)
92101
}
93102

103+
_, err = r.Get(ctx, resource.NewMetadata(runtimeres.NamespaceName, runtimeres.MountStatusType, constants.EphemeralPartitionLabel, resource.VersionUndefined))
104+
if err != nil {
105+
if state.IsNotFoundError(err) {
106+
// in container mode EPHEMERAL is always mounted
107+
if ctrl.V1Alpha1Mode != runtimetalos.ModeContainer {
108+
// wait for the EPHEMERAL to be mounted
109+
continue
110+
}
111+
} else {
112+
return fmt.Errorf("error getting ephemeral mount status: %w", err)
113+
}
114+
}
115+
94116
break
95117
}
96118

97-
// normal reconcile loop, ignore cri state
119+
// normal reconcile loop
98120
if err := r.UpdateInputs([]controller.Input{
99121
{
100122
Namespace: k8s.NamespaceName,

internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_sequencer_tasks.go

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -731,19 +731,24 @@ func StartAllServices(seq runtime.Sequence, data interface{}) (runtime.TaskExecu
731731

732732
svcs := system.Services(r)
733733

734+
// load the kubelet service, but don't start it;
735+
// KubeletServiceController will start it once it's ready.
734736
svcs.Load(
735-
&services.CRI{},
736737
&services.Kubelet{},
737738
)
738739

740+
serviceList := []system.Service{
741+
&services.CRI{},
742+
}
743+
739744
switch t := r.Config().Machine().Type(); t {
740745
case machine.TypeInit:
741-
svcs.Load(
746+
serviceList = append(serviceList,
742747
&services.Trustd{},
743748
&services.Etcd{Bootstrap: true},
744749
)
745750
case machine.TypeControlPlane:
746-
svcs.Load(
751+
serviceList = append(serviceList,
747752
&services.Trustd{},
748753
&services.Etcd{},
749754
)
@@ -755,7 +760,7 @@ func StartAllServices(seq runtime.Sequence, data interface{}) (runtime.TaskExecu
755760
panic(fmt.Sprintf("unexpected machine type %v", t))
756761
}
757762

758-
system.Services(r).StartAll()
763+
svcs.LoadAndStart(serviceList...)
759764

760765
all := []conditions.Condition{}
761766

internal/app/machined/pkg/runtime/v1alpha2/v1alpha2_controller.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ func (ctrl *Controller) Run(ctx context.Context, drainer *runtime.Drainer) error
115115
&k8s.KubeletConfigController{},
116116
&k8s.KubeletServiceController{
117117
V1Alpha1Services: system.Services(ctrl.v1alpha1Runtime),
118+
V1Alpha1Mode: ctrl.v1alpha1Runtime.State().Platform().Mode(),
118119
},
119120
&k8s.KubeletSpecController{
120121
V1Alpha1Mode: ctrl.v1alpha1Runtime.State().Platform().Mode(),

0 commit comments

Comments
 (0)