Skip to content

Commit

Permalink
feat: support hardware watchdog timers
Browse files Browse the repository at this point in the history
Only enabled when activated by config, disabled on shutdown/reboot

Fixes #8284

Signed-off-by: Dmitry Sharshakov <dmitry.sharshakov@siderolabs.com>
Signed-off-by: Dmitry Sharshakov <d3dx12.xx@gmail.com>
Signed-off-by: Andrey Smirnov <andrey.smirnov@siderolabs.com>
  • Loading branch information
dsseng authored and smira committed Mar 21, 2024
1 parent 84ec8c1 commit 60f47ab
Show file tree
Hide file tree
Showing 28 changed files with 1,156 additions and 26 deletions.
7 changes: 7 additions & 0 deletions api/resource/definitions/runtime/runtime.proto
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package talos.resource.definitions.runtime;
option go_package = "github.com/siderolabs/talos/pkg/machinery/api/resource/definitions/runtime";

import "common/common.proto";
import "google/protobuf/duration.proto";
import "resource/definitions/enums/enums.proto";

// DevicesStatusSpec is the spec for devices status.
Expand Down Expand Up @@ -126,3 +127,9 @@ message UnmetCondition {
string reason = 2;
}

// WatchdogTimerConfigSpec describes configuration of the hardware watchdog timer.
message WatchdogTimerConfigSpec {
// device is the watchdog device to use (the release notes example uses /dev/watchdog0).
string device = 1;
// timeout is the watchdog timeout (the release notes example uses 3m0s).
google.protobuf.Duration timeout = 2;
}

10 changes: 10 additions & 0 deletions hack/modules-amd64.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ kernel/drivers/ata/pata_marvell.ko
kernel/drivers/ata/pata_oldpiix.ko
kernel/drivers/ata/pata_sch.ko
kernel/drivers/block/nbd.ko
kernel/drivers/char/ipmi/ipmi_watchdog.ko
kernel/drivers/gpu/drm/amd/amdgpu/amdgpu.ko
kernel/drivers/gpu/drm/amd/amdxcp/amdxcp.ko
kernel/drivers/gpu/drm/display/drm_display_helper.ko
Expand Down Expand Up @@ -62,6 +63,8 @@ kernel/drivers/message/fusion/mptbase.ko
kernel/drivers/message/fusion/mptsas.ko
kernel/drivers/message/fusion/mptscsih.ko
kernel/drivers/message/fusion/mptspi.ko
kernel/drivers/mfd/lpc_ich.ko
kernel/drivers/mfd/mfd-core.ko
kernel/drivers/misc/hpilo.ko
kernel/drivers/mmc/host/sdhci_f_sdh30.ko
kernel/drivers/mmc/host/sdhci-acpi.ko
Expand Down Expand Up @@ -134,6 +137,13 @@ kernel/drivers/virtio/virtio_mmio.ko
kernel/drivers/virtio/virtio_pci_legacy_dev.ko
kernel/drivers/virtio/virtio_pci_modern_dev.ko
kernel/drivers/virtio/virtio_pci.ko
kernel/drivers/watchdog/i6300esb.ko
kernel/drivers/watchdog/iTCO_vendor_support.ko
kernel/drivers/watchdog/iTCO_wdt.ko
kernel/drivers/watchdog/sp5100_tco.ko
kernel/drivers/watchdog/watchdog.ko
kernel/drivers/watchdog/wdat_wdt.ko
kernel/drivers/watchdog/xen_wdt.ko
kernel/lib/objagg.ko
kernel/lib/parman.ko
kernel/lib/raid6/raid6_pq.ko
Expand Down
16 changes: 16 additions & 0 deletions hack/release.toml
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,22 @@ machine:
servers:
- /dev/ptp0
```
"""

[notes.watchdog]
title = "Hardware Watchdog Timers"
description = """\
Talos Linux now supports configuring hardware watchdog timers.
If a watchdog is enabled and the machine becomes unresponsive, the hardware watchdog will reset the machine.
The watchdog can be enabled with the following configuration document:
```yaml
apiVersion: v1alpha1
kind: WatchdogTimerConfig
device: /dev/watchdog0
timeout: 3m0s
```
"""

[make_deps]
Expand Down
155 changes: 155 additions & 0 deletions internal/app/machined/pkg/controllers/runtime/watchdog_timer.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

package runtime

import (
"context"
"fmt"
"os"
"syscall"
"time"
"unsafe"

"github.com/cosi-project/runtime/pkg/controller"
"github.com/cosi-project/runtime/pkg/safe"
"github.com/cosi-project/runtime/pkg/state"
"github.com/siderolabs/gen/optional"
"go.uber.org/zap"
"golang.org/x/sys/unix"

"github.com/siderolabs/talos/pkg/machinery/resources/runtime"
)

// WatchdogTimerController watches the WatchdogTimerConfig resource and, based on it,
// opens, configures, periodically feeds and closes the hardware watchdog device.
type WatchdogTimerController struct{}

// Name implements controller.Controller interface.
func (ctrl *WatchdogTimerController) Name() string {
	const controllerName = "runtime.WatchdogTimerController"

	return controllerName
}

// Inputs implements controller.Controller interface.
//
// The only input is the WatchdogTimerConfig resource with the well-known ID.
func (ctrl *WatchdogTimerController) Inputs() []controller.Input {
	watchdogConfig := controller.Input{
		Namespace: runtime.NamespaceName,
		Type:      runtime.WatchdogTimerConfigType,
		ID:        optional.Some(runtime.WatchdogTimerConfigID),
	}

	return []controller.Input{watchdogConfig}
}

// Outputs implements controller.Controller interface.
//
// The controller declares no output resources.
func (ctrl *WatchdogTimerController) Outputs() []controller.Output {
	var none []controller.Output // deliberately left nil

	return none
}

// Run implements controller.Controller interface.
//
// On every WatchdogTimerConfig event the controller:
//   - closes the watchdog device (with a magic close) and stops feeding it when the
//     config resource is gone;
//   - otherwise opens the configured device (re-opening it if the device path changed),
//     programs the timeout and starts feeding the watchdog three times per timeout.
//
// It returns nil when ctx is done, and an error when the config cannot be read, the
// timeout is out of range, or any operation on the watchdog device fails. On every
// return path the device (if open) is closed with a magic close.
//
//nolint:gocyclo
func (ctrl *WatchdogTimerController) Run(ctx context.Context, r controller.Runtime, logger *zap.Logger) error {
	// WDIOC_SETTIMEOUT takes the timeout as a C int number of seconds.
	const maxWatchdogTimeout = time.Duration(1<<31-1) * time.Second

	var (
		ticker  *time.Ticker
		tickerC <-chan time.Time
	)

	tickerStop := func() {
		if ticker == nil {
			return
		}

		ticker.Stop()

		ticker = nil
		// a nil channel never fires, so the feed case in the select below is disabled
		tickerC = nil
	}

	defer tickerStop()

	var wd *os.File

	wdClose := func() {
		if wd == nil {
			return
		}

		logger.Info("closing hardware watchdog", zap.String("path", wd.Name()))

		// Magic close: make sure old watchdog won't trip after we close it
		if _, err := wd.WriteString("V"); err != nil {
			logger.Error("failed to send magic close to watchdog", zap.String("path", wd.Name()), zap.Error(err))
		}

		if err := wd.Close(); err != nil {
			logger.Error("failed to close watchdog", zap.String("path", wd.Name()), zap.Error(err))
		}

		wd = nil
	}

	defer wdClose()

	for {
		select {
		case <-ctx.Done():
			return nil
		case <-tickerC:
			// tickerC is only non-nil while a watchdog device is open
			if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, wd.Fd(), uintptr(unix.WDIOC_KEEPALIVE), 0); errno != 0 {
				return fmt.Errorf("failed to feed watchdog: %w", errno)
			}

			continue
		case <-r.EventCh():
		}

		cfg, err := safe.ReaderGetByID[*runtime.WatchdogTimerConfig](ctx, r, runtime.WatchdogTimerConfigID)
		if err != nil && !state.IsNotFoundError(err) {
			return fmt.Errorf("error getting watchdog config: %w", err)
		}

		if cfg == nil {
			// no config: stop feeding and disarm the watchdog
			tickerStop()
			wdClose()

			r.ResetRestartBackoff()

			continue
		}

		device := cfg.TypedSpec().Device
		timeout := cfg.TypedSpec().Timeout

		// Reject timeouts which would truncate to zero seconds (and would make the
		// feed ticker interval non-positive, which panics) or overflow a C int.
		if timeout < time.Second || timeout > maxWatchdogTimeout {
			return fmt.Errorf("invalid watchdog timeout %s: must be between %s and %s", timeout, time.Second, maxWatchdogTimeout)
		}

		// close the watchdog if requested to use new one
		if wd != nil && wd.Name() != device {
			wdClose()
		}

		if wd == nil {
			wd, err = os.OpenFile(device, syscall.O_RDWR, 0o600)
			if err != nil {
				return fmt.Errorf("failed to open watchdog device: %w", err)
			}

			logger.Info("opened hardware watchdog", zap.String("path", device))
		}

		// the ioctl argument is a pointer to a 32-bit C int
		timeoutSeconds := int32(timeout.Seconds())

		if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, wd.Fd(), uintptr(unix.WDIOC_SETTIMEOUT), uintptr(unsafe.Pointer(&timeoutSeconds))); errno != 0 {
			return fmt.Errorf("failed to set watchdog timeout: %w", errno)
		}

		tickerStop()

		// 3 pings per timeout should suffice in any case
		feedInterval := timeout / 3

		ticker = time.NewTicker(feedInterval)
		tickerC = ticker.C

		// feed immediately, the first tick only arrives after feedInterval
		if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, wd.Fd(), uintptr(unix.WDIOC_KEEPALIVE), 0); errno != 0 {
			return fmt.Errorf("failed to feed watchdog: %w", errno)
		}

		logger.Info("set hardware watchdog timeout", zap.Duration("timeout", timeout), zap.Duration("feed_interval", feedInterval))

		r.ResetRestartBackoff()
	}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

package runtime

import (
"context"
"fmt"

"github.com/cosi-project/runtime/pkg/controller"
"github.com/cosi-project/runtime/pkg/safe"
"github.com/cosi-project/runtime/pkg/state"
"github.com/siderolabs/gen/optional"
"go.uber.org/zap"

"github.com/siderolabs/talos/pkg/machinery/resources/config"
"github.com/siderolabs/talos/pkg/machinery/resources/runtime"
)

// WatchdogTimerConfigController produces the WatchdogTimerConfig resource
// from the watchdog timer section of the machine configuration.
type WatchdogTimerConfigController struct{}

// Name implements controller.Controller interface.
func (ctrl *WatchdogTimerConfigController) Name() string {
	const controllerName = "runtime.WatchdogTimerConfigController"

	return controllerName
}

// Inputs implements controller.Controller interface.
//
// The only input is the v1alpha1 machine config, consumed as a weak input.
func (ctrl *WatchdogTimerConfigController) Inputs() []controller.Input {
	machineConfig := controller.Input{
		Namespace: config.NamespaceName,
		Type:      config.MachineConfigType,
		ID:        optional.Some(config.V1Alpha1ID),
		Kind:      controller.InputWeak,
	}

	return []controller.Input{machineConfig}
}

// Outputs implements controller.Controller interface.
//
// The controller exclusively owns the WatchdogTimerConfig resource type.
func (ctrl *WatchdogTimerConfigController) Outputs() []controller.Output {
	watchdogConfig := controller.Output{
		Type: runtime.WatchdogTimerConfigType,
		Kind: controller.OutputExclusive,
	}

	return []controller.Output{watchdogConfig}
}

// Run implements controller.Controller interface.
//
// On every machine config event it copies the watchdog timer device and timeout from
// the machine config into the WatchdogTimerConfig resource. When the machine config or
// its watchdog timer section is absent, nothing is written and the output cleanup
// removes the resource left over from an earlier run.
//
// It returns nil when ctx is done, and an error when the machine config cannot be read
// or the output resource cannot be updated or cleaned up.
func (ctrl *WatchdogTimerConfigController) Run(ctx context.Context, r controller.Runtime, logger *zap.Logger) error {
	for {
		select {
		case <-ctx.Done():
			return nil
		case <-r.EventCh():
		}

		cfg, err := safe.ReaderGetByID[*config.MachineConfig](ctx, r, config.V1Alpha1ID)
		if err != nil && !state.IsNotFoundError(err) {
			return fmt.Errorf("error getting machine config: %w", err)
		}

		// outputs not touched after this point are removed by CleanupOutputs below
		r.StartTrackingOutputs()

		if cfg != nil {
			if watchdogConfig := cfg.Config().Runtime().WatchdogTimer(); watchdogConfig != nil {
				if err = safe.WriterModify(ctx, r, runtime.NewWatchdogTimerConfig(), func(res *runtime.WatchdogTimerConfig) error {
					res.TypedSpec().Device = watchdogConfig.Device()
					res.TypedSpec().Timeout = watchdogConfig.Timeout()

					return nil
				}); err != nil {
					return fmt.Errorf("error updating watchdog timer config: %w", err)
				}
			}
		}

		if err = safe.CleanupOutputs[*runtime.WatchdogTimerConfig](ctx, r); err != nil {
			return fmt.Errorf("error cleaning up watchdog timer config: %w", err)
		}
	}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

package runtime_test

import (
"testing"

"github.com/cosi-project/runtime/pkg/resource"
"github.com/cosi-project/runtime/pkg/resource/rtestutils"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/suite"

"github.com/siderolabs/talos/internal/app/machined/pkg/controllers/ctest"
runtimectrls "github.com/siderolabs/talos/internal/app/machined/pkg/controllers/runtime"
"github.com/siderolabs/talos/pkg/machinery/config/container"
runtimecfg "github.com/siderolabs/talos/pkg/machinery/config/types/runtime"
"github.com/siderolabs/talos/pkg/machinery/resources/config"
"github.com/siderolabs/talos/pkg/machinery/resources/runtime"
)

// WatchdogTimerConfigSuite groups the tests of WatchdogTimerConfigController;
// it embeds ctest.DefaultSuite.
type WatchdogTimerConfigSuite struct {
	ctest.DefaultSuite
}

// TestWatchdogTimerConfigSuite runs WatchdogTimerConfigSuite under go test.
func TestWatchdogTimerConfigSuite(t *testing.T) {
	s := new(WatchdogTimerConfigSuite)

	suite.Run(t, s)
}

// TestWatchdogTimerConfigNone checks that no WatchdogTimerConfig resource exists
// when no machine config has been created.
func (suite *WatchdogTimerConfigSuite) TestWatchdogTimerConfigNone() {
	registerErr := suite.Runtime().RegisterController(&runtimectrls.WatchdogTimerConfigController{})
	suite.Require().NoError(registerErr)

	rtestutils.AssertNoResource[*runtime.WatchdogTimerConfig](
		suite.Ctx(),
		suite.T(),
		suite.State(),
		runtime.WatchdogTimerConfigID,
	)
}

// TestWatchdogTimerConfigMachineConfig checks that a machine config carrying a watchdog
// timer document with only the device set yields a WatchdogTimerConfig resource with
// that device and runtimecfg.DefaultWatchdogTimeout as the timeout.
func (suite *WatchdogTimerConfigSuite) TestWatchdogTimerConfigMachineConfig() {
	registerErr := suite.Runtime().RegisterController(&runtimectrls.WatchdogTimerConfigController{})
	suite.Require().NoError(registerErr)

	doc := &runtimecfg.WatchdogTimerV1Alpha1{
		WatchdogDevice: "/dev/watchdog0",
	}

	machineCfg, err := container.New(doc)
	suite.Require().NoError(err)

	createErr := suite.State().Create(suite.Ctx(), config.NewMachineConfig(machineCfg))
	suite.Require().NoError(createErr)

	expectedIDs := []resource.ID{runtime.WatchdogTimerConfigID}

	rtestutils.AssertResources[*runtime.WatchdogTimerConfig](suite.Ctx(), suite.T(), suite.State(), expectedIDs,
		func(res *runtime.WatchdogTimerConfig, asrt *assert.Assertions) {
			spec := res.TypedSpec()

			asrt.Equal("/dev/watchdog0", spec.Device)
			asrt.Equal(runtimecfg.DefaultWatchdogTimeout, spec.Timeout)
		})
}
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,8 @@ func (ctrl *Controller) Run(ctx context.Context, drainer *runtime.Drainer) error
V1Alpha1Mode: ctrl.v1alpha1Runtime.State().Platform().Mode(),
},
runtimecontrollers.NewUniqueMachineTokenController(),
&runtimecontrollers.WatchdogTimerConfigController{},
&runtimecontrollers.WatchdogTimerController{},
&secrets.APICertSANsController{},
&secrets.APIController{},
&secrets.EtcdController{},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ func NewState() (*State, error) {
&runtime.PlatformMetadata{},
&runtime.SecurityState{},
&runtime.UniqueMachineToken{},
&runtime.WatchdogTimerConfig{},
&secrets.API{},
&secrets.CertSAN{},
&secrets.Etcd{},
Expand Down
Loading

0 comments on commit 60f47ab

Please sign in to comment.