Skip to content

Commit

Permalink
fix: talosctl support and race tests
Browse files Browse the repository at this point in the history
1. Don't set the max cgroup memory limit when race mode is enabled (test
   builds only). When e.g. apid/trustd are built with the race detector on,
   they consume 10x the memory.
2. Fix a data race in `talosctl support` when showing UI progress.
3. Fix an issue pulling `kubeconfig` in `talosctl support` - pull from
   endpoints (controlplanes) without setting any nodes.

Fixes #10036

Signed-off-by: Andrey Smirnov <andrey.smirnov@siderolabs.com>
(cherry picked from commit b72bda0)
  • Loading branch information
smira committed Dec 26, 2024
1 parent edd7844 commit 73c25ee
Show file tree
Hide file tree
Showing 8 changed files with 53 additions and 22 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT.
#
# Generated on 2024-12-26T12:25:23Z by kres fcff05e.
# Generated on 2024-12-26T12:26:49Z by kres fcff05e.

name: default
concurrency:
Expand Down Expand Up @@ -3360,6 +3360,8 @@ jobs:
QEMU_EXTRA_DISKS: "3"
QEMU_EXTRA_DISKS_DRIVERS: ide,nvme
QEMU_EXTRA_DISKS_SIZE: "10240"
QEMU_MEMORY_CONTROLPLANES: "4096"
QEMU_MEMORY_WORKERS: "4096"
TAG_SUFFIX: -race
WITH_CONFIG_PATCH_WORKER: '@hack/test/patches/ephemeral-nvme.yaml:@hack/test/patches/dm-raid-module.yaml'
run: |
Expand Down
4 changes: 3 additions & 1 deletion .github/workflows/integration-qemu-race-cron.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT.
#
# Generated on 2024-11-28T13:53:18Z by kres 232fe63.
# Generated on 2024-12-25T15:13:54Z by kres fcff05e.

name: integration-qemu-race-cron
concurrency:
Expand Down Expand Up @@ -94,6 +94,8 @@ jobs:
QEMU_EXTRA_DISKS: "3"
QEMU_EXTRA_DISKS_DRIVERS: ide,nvme
QEMU_EXTRA_DISKS_SIZE: "10240"
QEMU_MEMORY_CONTROLPLANES: "4096"
QEMU_MEMORY_WORKERS: "4096"
TAG_SUFFIX: -race
WITH_CONFIG_PATCH_WORKER: '@hack/test/patches/ephemeral-nvme.yaml:@hack/test/patches/dm-raid-module.yaml'
run: |
Expand Down
2 changes: 2 additions & 0 deletions .kres.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1277,6 +1277,8 @@ spec:
QEMU_EXTRA_DISKS_SIZE: "10240"
QEMU_EXTRA_DISKS_DRIVERS: "ide,nvme"
WITH_CONFIG_PATCH_WORKER: "@hack/test/patches/ephemeral-nvme.yaml:@hack/test/patches/dm-raid-module.yaml"
QEMU_MEMORY_CONTROLPLANES: 4096 # race-enabled Talos consumes lots of RAM
QEMU_MEMORY_WORKERS: 4096
TAG_SUFFIX: -race
IMAGE_REGISTRY: registry.dev.siderolabs.io
- name: save-talos-logs
Expand Down
34 changes: 21 additions & 13 deletions cmd/talosctl/cmd/talos/support.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"io"
"os"
"strings"
"sync"
"text/tabwriter"

"github.com/cosi-project/runtime/pkg/resource"
Expand Down Expand Up @@ -111,7 +112,7 @@ var supportCmd = &cobra.Command{
}

func collectData(dest *os.File, progress chan bundle.Progress) error {
return WithClient(func(ctx context.Context, c *client.Client) error {
return WithClientNoNodes(func(ctx context.Context, c *client.Client) error {
clientset, err := getKubernetesClient(ctx, c)
if err != nil {
fmt.Fprintf(os.Stderr, "Failed to create kubernetes client %s\n", err)
Expand Down Expand Up @@ -142,11 +143,7 @@ func collectData(dest *os.File, progress chan bundle.Progress) error {
}

func getKubernetesClient(ctx context.Context, c *client.Client) (*k8s.Clientset, error) {
if len(GlobalArgs.Endpoints) == 0 {
fmt.Fprintln(os.Stderr, "No endpoints set for the cluster, the command might not be able to get kubeconfig")
}

kubeconfig, err := c.Kubeconfig(client.WithNodes(ctx, GlobalArgs.Endpoints...))
kubeconfig, err := c.Kubeconfig(ctx)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -284,6 +281,7 @@ func showProgress(progress <-chan bundle.Progress, errors *supportBundleErrors)
uiprogress.Start()

type nodeProgress struct {
mu sync.Mutex
state string
bar *uiprogress.Bar
}
Expand All @@ -298,29 +296,39 @@ func showProgress(progress <-chan bundle.Progress, errors *supportBundleErrors)
ok bool
)

if np, ok = nodes[p.Source]; !ok {
src := p.Source

if _, ok = nodes[p.Source]; !ok {
bar := uiprogress.AddBar(p.Total)
bar = bar.AppendCompleted().PrependElapsed()

src := p.Source

np = &nodeProgress{
state: "initializing...",
bar: bar,
}

bar.AppendFunc(func(b *uiprogress.Bar) string {
return fmt.Sprintf("%s: %s", src, np.state)
})
bar.AppendFunc(
func(src string, np *nodeProgress) func(b *uiprogress.Bar) string {
return func(b *uiprogress.Bar) string {
np.mu.Lock()
defer np.mu.Unlock()

return fmt.Sprintf("%s: %s", src, np.state)
}
}(src, np),
)

bar.Width = 20

nodes[src] = np
} else {
np = nodes[p.Source]
np = nodes[src]
}

np.mu.Lock()
np.state = p.State
np.mu.Unlock()

np.bar.Incr()
}

Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ require (
github.com/siderolabs/go-retry v0.3.3
github.com/siderolabs/go-smbios v0.3.3
github.com/siderolabs/go-tail v0.1.1
github.com/siderolabs/go-talos-support v0.1.1
github.com/siderolabs/go-talos-support v0.1.2
github.com/siderolabs/grpc-proxy v0.5.1
github.com/siderolabs/kms-client v0.1.0
github.com/siderolabs/net v0.4.0
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -675,8 +675,8 @@ github.com/siderolabs/go-smbios v0.3.3 h1:rM3UKHQ8in1mqNRkpV75Ls3Wnk6rAhQJVYKUsK
github.com/siderolabs/go-smbios v0.3.3/go.mod h1:kScnr0XSyzLfkRo/ChjITgI0rPRQnIi6PdgbxVCwA9U=
github.com/siderolabs/go-tail v0.1.1 h1:3XeJgd97OHyFAIE7nQEMcRhOfnv7DvXbu0BRKbtT6u8=
github.com/siderolabs/go-tail v0.1.1/go.mod h1:IihAL39acadXHfb5fEAOKK2DaDFIrG2+VD3b2H/ziZ0=
github.com/siderolabs/go-talos-support v0.1.1 h1:g51J0WQssQAycU/0cDliC2l4uX2H02yUs2+fa5pCvHg=
github.com/siderolabs/go-talos-support v0.1.1/go.mod h1:o4woiYS+2J3djCQgyHZRVZQm8XpazQr+XPcTXAZvamo=
github.com/siderolabs/go-talos-support v0.1.2 h1:xKFwT8emzxpmamIe3W35QlmadC54OaPNO9/Y+fL7WwM=
github.com/siderolabs/go-talos-support v0.1.2/go.mod h1:o9zRfWJQhW5j3PQxs7v0jmG4igD4peDatqbAGQFe4oo=
github.com/siderolabs/grpc-proxy v0.5.1 h1:WTZYLMPTZPt43BzEJ02LT9kYA9qAfquWwCezc6NPPYE=
github.com/siderolabs/grpc-proxy v0.5.1/go.mod h1:EQwE87LiWxhiIUPBeWmpjJb9DIWxWID8R6ARtdTC+8A=
github.com/siderolabs/kms-client v0.1.0 h1:rCDWzcDDsNlp6zdyLngOuuhchVILn+vwUQy3tk6rQps=
Expand Down
17 changes: 14 additions & 3 deletions internal/app/machined/pkg/startup/cgroups.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"github.com/containerd/cgroups/v3/cgroup1"
"github.com/containerd/cgroups/v3/cgroup2"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/siderolabs/go-debug"
"github.com/siderolabs/go-pointer"
"go.uber.org/zap"

Expand All @@ -23,6 +24,16 @@ import (
"github.com/siderolabs/talos/pkg/machinery/constants"
)

// zeroIfRace passes v through unchanged in regular builds and substitutes the
// zero value of T when debug.RaceEnabled is set.
//
// It is used below to wrap the memory Max pointers of system cgroups: in a
// race-enabled build the wrapped pointer becomes nil (presumably meaning that
// no max limit is applied), as race-instrumented services need far more memory.
func zeroIfRace[T any](v T) T {
	if !debug.RaceEnabled {
		return v
	}

	var zero T

	return zero
}

// CreateSystemCgroups creates system cgroups.
//
//nolint:gocyclo
Expand Down Expand Up @@ -131,7 +142,7 @@ func CreateSystemCgroups(ctx context.Context, log *zap.Logger, rt runtime.Runtim
name: constants.CgroupDashboard,
resources: &cgroup2.Resources{
Memory: &cgroup2.Memory{
Max: pointer.To[int64](constants.CgroupDashboardMaxMemory),
Max: zeroIfRace(pointer.To[int64](constants.CgroupDashboardMaxMemory)),
},
CPU: &cgroup2.CPU{
Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupDashboardMillicores))),
Expand All @@ -144,7 +155,7 @@ func CreateSystemCgroups(ctx context.Context, log *zap.Logger, rt runtime.Runtim
Memory: &cgroup2.Memory{
Min: pointer.To[int64](constants.CgroupApidReservedMemory),
Low: pointer.To[int64](constants.CgroupApidReservedMemory * 2),
Max: pointer.To[int64](constants.CgroupApidMaxMemory),
Max: zeroIfRace(pointer.To[int64](constants.CgroupApidMaxMemory)),
},
CPU: &cgroup2.CPU{
Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupApidMillicores))),
Expand All @@ -157,7 +168,7 @@ func CreateSystemCgroups(ctx context.Context, log *zap.Logger, rt runtime.Runtim
Memory: &cgroup2.Memory{
Min: pointer.To[int64](constants.CgroupTrustdReservedMemory),
Low: pointer.To[int64](constants.CgroupTrustdReservedMemory * 2),
Max: pointer.To[int64](constants.CgroupTrustdMaxMemory),
Max: zeroIfRace(pointer.To[int64](constants.CgroupTrustdMaxMemory)),
},
CPU: &cgroup2.CPU{
Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupTrustdMillicores))),
Expand Down
8 changes: 7 additions & 1 deletion pkg/cluster/crashdump.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"fmt"
"io"
"os"
"time"

"github.com/siderolabs/gen/xslices"
"github.com/siderolabs/go-talos-support/support"
Expand All @@ -33,6 +34,10 @@ func Crashdump(ctx context.Context, cluster provision.Cluster, logWriter io.Writ

defer supportFile.Close() //nolint:errcheck

// limit support bundle generation time
ctx, cancel := context.WithTimeout(ctx, 5*time.Minute)
defer cancel()

c, err := client.New(ctx, client.WithDefaultConfig())
if err != nil {
fmt.Fprintf(logWriter, "error creating crashdump: %s\n", err)
Expand All @@ -50,7 +55,8 @@ func Crashdump(ctx context.Context, cluster provision.Cluster, logWriter io.Writ
bundle.WithArchiveOutput(supportFile),
bundle.WithTalosClient(c),
bundle.WithNodes(nodes...),
bundle.WithNumWorkers(1),
bundle.WithNumWorkers(4),
bundle.WithLogOutput(io.Discard),
}

kubeclient, err := getKubernetesClient(ctx, c, controlplane)
Expand Down

0 comments on commit 73c25ee

Please sign in to comment.