Skip to content

Commit

Permalink
feat: support hardware watchdog timers
Browse files Browse the repository at this point in the history
Only enabled when activated by config, disabled on shutdown/reboot

Fixes #8284

Signed-off-by: Dmitry Sharshakov <d3dx12.xx@gmail.com>
  • Loading branch information
dsseng committed Feb 13, 2024
1 parent 5324d39 commit 75a0807
Show file tree
Hide file tree
Showing 8 changed files with 178 additions and 0 deletions.
135 changes: 135 additions & 0 deletions internal/app/machined/pkg/controllers/runtime/watchdog_timer.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

package runtime

import (
"context"
"fmt"
"os"
"syscall"
"time"
"unsafe"

"github.com/cosi-project/runtime/pkg/controller"
"github.com/cosi-project/runtime/pkg/safe"
"github.com/cosi-project/runtime/pkg/state"
"github.com/siderolabs/gen/optional"
"go.uber.org/zap"
"golang.org/x/sys/unix"
"sigs.k8s.io/kustomize/kyaml/errors"

"github.com/siderolabs/talos/pkg/machinery/resources/config"
)

// WatchdogTimerController watches v1alpha1.Config and opens, configures and periodically feeds the hardware watchdog timer.
type WatchdogTimerController struct{}

// Name implements controller.Controller interface.
//
// It returns the fixed identifier of this controller.
func (ctrl *WatchdogTimerController) Name() string {
	const controllerName = "runtime.WatchdogTimerController"

	return controllerName
}

// Inputs implements controller.Controller interface.
//
// The controller has a single input: the v1alpha1 machine configuration resource.
func (ctrl *WatchdogTimerController) Inputs() []controller.Input {
	machineConfig := controller.Input{
		Namespace: config.NamespaceName,
		Type:      config.MachineConfigType,
		ID:        optional.Some(config.V1Alpha1ID),
	}

	return []controller.Input{machineConfig}
}

// Outputs implements controller.Controller interface.
//
// The controller declares no outputs, so a nil slice is returned.
func (ctrl *WatchdogTimerController) Outputs() []controller.Output {
	var none []controller.Output

	return none
}

// Run implements controller.Controller interface.
//
// It reacts to machine configuration changes: when a watchdog device and a
// non-zero timeout are configured, the device is opened, the timeout is
// programmed and the watchdog is fed three times per timeout period. The
// device is disarmed (magic close) and closed when a different device is
// requested, when the configuration no longer enables the watchdog, and when
// Run returns for any reason.
//
// It returns nil when ctx is done and an error when the configuration cannot
// be read or the watchdog device cannot be opened, configured or fed.
//
//nolint:gocyclo,cyclop
func (ctrl *WatchdogTimerController) Run(ctx context.Context, r controller.Runtime, logger *zap.Logger) error {
	// The initial period is a placeholder: ticks are ignored while no watchdog is open,
	// and the ticker is reset to the real feed interval once a timeout is applied.
	ticker := time.NewTicker(3 * time.Second)
	defer ticker.Stop()

	var wd *os.File

	// wdClose disarms and closes the currently opened watchdog; it is a no-op when none is open.
	wdClose := func() {
		if wd == nil {
			return
		}

		logger.Info("Closing hardware watchdog", zap.String("filename", wd.Name()))
		// Magic close: make sure old watchdog won't trip after we close it
		if _, err := wd.WriteString("V"); err != nil {
			logger.Error("Failed to send magic close to watchdog", zap.String("filename", wd.Name()), zap.Error(err))
		}

		if err := wd.Close(); err != nil {
			logger.Error("Failed to close watchdog", zap.String("filename", wd.Name()), zap.Error(err))
		}

		wd = nil
	}

	// Registered exactly once: deferring inside the loop would queue one call per
	// (re)opened device and run the close logic repeatedly on return.
	// TODO: support reboot/shutdown watchdogs
	defer wdClose()

	// wdFeed sends a keepalive ping to the opened watchdog; wd must not be nil.
	wdFeed := func() error {
		if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, wd.Fd(), uintptr(unix.WDIOC_KEEPALIVE), uintptr(0)); errno != 0 {
			return fmt.Errorf("failed to feed watchdog: %w", errno)
		}

		return nil
	}

	for {
		select {
		case <-ctx.Done():
			return nil
		case <-ticker.C:
			if wd != nil {
				if err := wdFeed(); err != nil {
					return err
				}
			}

			continue
		case <-r.EventCh():
		}

		r.ResetRestartBackoff()

		cfg, err := safe.ReaderGetByID[*config.MachineConfig](ctx, r, config.V1Alpha1ID)
		if err != nil && !state.IsNotFoundError(err) {
			return fmt.Errorf("error getting watchdog config: %w", err)
		}

		var (
			dev     string
			timeout int
		)

		if cfg != nil && cfg.Config().Machine() != nil {
			dev = cfg.Config().Machine().Kernel().WatchdogDevice()
			timeout = cfg.Config().Machine().Kernel().WatchdogTimeout()
		}

		if dev == "" || timeout == 0 {
			// The watchdog is not (or no longer) enabled by the configuration:
			// disarm a previously opened device instead of feeding it forever.
			wdClose()

			continue
		}

		// Close the watchdog if requested to use new one
		if wd != nil && wd.Name() != dev {
			wdClose()
		}

		if wd == nil {
			wd, err = os.OpenFile(dev, syscall.O_RDWR, 0o600)
			if err != nil {
				// NOTE(review): errors is sigs.k8s.io/kustomize/kyaml/errors here; the call is
				// kept unchanged because it is the only use of that import in this file.
				return errors.Errorf("failed to open watchdog device: %s", err)
			}

			logger.Info("Opened hardware watchdog", zap.String("filename", dev))
		}

		// WDIOC_SETTIMEOUT takes a pointer to a C int, which is 32 bits wide; the value
		// is read back afterwards because the driver may adjust the requested timeout.
		wdTimeout := int32(timeout)

		if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, wd.Fd(), uintptr(unix.WDIOC_SETTIMEOUT), uintptr(unsafe.Pointer(&wdTimeout))); errno != 0 {
			return fmt.Errorf("failed to set watchdog timeout: %w", errno)
		}

		timeout = int(wdTimeout)

		// 3 pings per timeout should suffice in any case. The interval is computed as a
		// duration so that timeouts below 3 seconds do not round down to zero, which
		// would make the ticker panic.
		feedInterval := time.Duration(timeout) * time.Second / 3
		if feedInterval <= 0 {
			return fmt.Errorf("invalid watchdog timeout: %d", timeout)
		}

		// Reset the existing ticker rather than replacing it, so the deferred Stop
		// still applies and no ticker is leaked on reconfiguration.
		ticker.Reset(feedInterval)

		if err := wdFeed(); err != nil {
			return err
		}

		logger.Info("Set hardware watchdog timeout", zap.Int("timeout", timeout), zap.Duration("feed_interval", feedInterval))
	}
}
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,7 @@ func (ctrl *Controller) Run(ctx context.Context, drainer *runtime.Drainer) error
V1Alpha1Mode: ctrl.v1alpha1Runtime.State().Platform().Mode(),
},
runtimecontrollers.NewUniqueMachineTokenController(),
&runtimecontrollers.WatchdogTimerController{},
&secrets.APICertSANsController{},
&secrets.APIController{},
&secrets.EtcdController{},
Expand Down
2 changes: 2 additions & 0 deletions pkg/machinery/config/config/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -450,6 +450,8 @@ type LoggingDestination interface {
// Kernel describes Talos Linux kernel configuration.
type Kernel interface {
// Modules returns the kernel modules to load.
Modules() []KernelModule
// WatchdogDevice returns the watchdog device node to use.
WatchdogDevice() string
// WatchdogTimeout returns the watchdog timeout value.
WatchdogTimeout() int
}

// KernelModule describes Linux module to load.
Expand Down
14 changes: 14 additions & 0 deletions pkg/machinery/config/types/v1alpha1/v1alpha1_kernel.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,20 @@ func (kc *KernelConfig) Modules() []config.KernelModule {
return xslices.Map(kc.KernelModules, func(kmc *KernelModuleConfig) config.KernelModule { return kmc })
}

// WatchdogDevice implements config.Kernel interface.
//
// It returns the configured watchdog device node, falling back to
// /dev/watchdog0 when none is set.
func (kc *KernelConfig) WatchdogDevice() string {
	if node := kc.WatchdogNode; node != "" {
		return node
	}

	return "/dev/watchdog0"
}

// WatchdogTimeout implements config.Kernel interface.
//
// It returns the configured watchdog timeout value unchanged.
func (kc *KernelConfig) WatchdogTimeout() int {
	timeout := kc.WatchdogTime

	return timeout
}

// Name implements config.KernelModule interface.
func (kmc *KernelModuleConfig) Name() string {
return kmc.ModuleName
Expand Down
6 changes: 6 additions & 0 deletions pkg/machinery/config/types/v1alpha1/v1alpha1_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -2377,6 +2377,12 @@ type KernelConfig struct {
// description: |
// Kernel modules to load.
KernelModules []*KernelModuleConfig `yaml:"modules,omitempty"`
// description: |
// Watchdog device node to use. Defaults to `/dev/watchdog0` when empty.
WatchdogNode string `yaml:"watchdogDevice,omitempty"`
// description: |
// Watchdog timeout value in seconds. The watchdog is only enabled when this is non-zero.
WatchdogTime int `yaml:"watchdogTimeout,omitempty"`
}

// KernelModuleConfig struct configures Linux kernel modules to load.
Expand Down
14 changes: 14 additions & 0 deletions pkg/machinery/config/types/v1alpha1/v1alpha1_types_doc.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions pkg/provision/providers/qemu/launch.go
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,10 @@ func launchVM(config *LaunchConfig) error {
"virtio-serial",
"-device",
"virtserialport,chardev=qga0,name=org.qemu.guest_agent.0",
"-device",
"i6300esb,id=watchdog0",
"-watchdog-action",
"pause",
}

for i, disk := range config.DiskPaths {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2735,6 +2735,8 @@ machine:
| Field | Type | Description | Value(s) |
|-------|------|-------------|----------|
|`modules` |<a href="#Config.machine.kernel.modules.">[]KernelModuleConfig</a> |Kernel modules to load. | |
|`watchdogDevice` |string |Watchdog device node to use. Defaults to `/dev/watchdog0` when empty. | |
|`watchdogTimeout` |int |Watchdog timeout value in seconds. The watchdog is only enabled when this is non-zero. | |



Expand Down

0 comments on commit 75a0807

Please sign in to comment.