Skip to content

Commit dcbab71

Browse files
committed
Fix container_oom_events_total always returning 0.
In a Kubernetes pod, when a container is OOM-killed it is deleted and a new container is created in its place. Because the killed container's metrics are deleted along with it, the `container_oom_events_total` metric is always reported as 0. It would be useful to preserve the metrics of OOM-killed containers instead of deleting them. Signed-off-by: joey <zchengjoey@gmail.com>
1 parent b4c955f commit dcbab71

File tree

1 file changed

+12
-8
lines changed

1 file changed

+12
-8
lines changed

manager/manager.go

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -297,7 +297,7 @@ func (m *manager) Start() error {
297297
return err
298298
}
299299
klog.V(2).Infof("Starting recovery of all containers")
300-
err = m.detectSubcontainers("/")
300+
err = m.detectSubContainers("/")
301301
if err != nil {
302302
return err
303303
}
@@ -385,7 +385,7 @@ func (m *manager) globalHousekeeping(quit chan error) {
385385
start := time.Now()
386386

387387
// Check for new containers.
388-
err := m.detectSubcontainers("/")
388+
err := m.detectSubContainers("/")
389389
if err != nil {
390390
klog.Errorf("Failed to detect containers: %s", err)
391391
}
@@ -1009,6 +1009,10 @@ func (m *manager) destroyContainerLocked(containerName string) error {
10091009
// Already destroyed, done.
10101010
return nil
10111011
}
1012+
// If the container is OOM-killed, keep monitoring metrics for the container.
1013+
if cont.oomEvents != 0 {
1014+
return nil
1015+
}
10121016

10131017
// Tell the container to stop.
10141018
err := cont.Stop()
@@ -1045,7 +1049,7 @@ func (m *manager) destroyContainerLocked(containerName string) error {
10451049

10461050
// Detect all containers that have been added or deleted from the specified container.
10471051
func (m *manager) getContainersDiff(containerName string) (added []info.ContainerReference, removed []info.ContainerReference, err error) {
1048-
// Get all subcontainers recursively.
1052+
// Get all subContainers recursively.
10491053
m.containersLock.RLock()
10501054
cont, ok := m.containers[namespacedContainerName{
10511055
Name: containerName,
@@ -1067,8 +1071,8 @@ func (m *manager) getContainersDiff(containerName string) (added []info.Containe
10671071
// Determine which were added and which were removed.
10681072
allContainersSet := make(map[string]*containerData)
10691073
for name, d := range m.containers {
1070-
// Only add the canonical name.
1071-
if d.info.Name == name.Name {
1074+
// Only add the canonical name. Preserve containers that were deleted due to OOMKilled.
1075+
if d.info.Name == name.Name && d.oomEvents != 0 {
10721076
allContainersSet[name.Name] = d
10731077
}
10741078
}
@@ -1092,8 +1096,8 @@ func (m *manager) getContainersDiff(containerName string) (added []info.Containe
10921096
return
10931097
}
10941098

1095-
// Detect the existing subcontainers and reflect the setup here.
1096-
func (m *manager) detectSubcontainers(containerName string) error {
1099+
// Detect the existing subContainers and reflect the setup here.
1100+
func (m *manager) detectSubContainers(containerName string) error {
10971101
added, removed, err := m.getContainersDiff(containerName)
10981102
if err != nil {
10991103
return err
@@ -1136,7 +1140,7 @@ func (m *manager) watchForNewContainers(quit chan error) error {
11361140
}
11371141

11381142
// There is a race between starting the watch and new container creation so we do a detection before we read new containers.
1139-
err := m.detectSubcontainers("/")
1143+
err := m.detectSubContainers("/")
11401144
if err != nil {
11411145
return err
11421146
}

0 commit comments

Comments
 (0)