From 68756c6d8d34fbe9afd4d472b0aac84f9b3fa67a Mon Sep 17 00:00:00 2001 From: Adam Azarchs Date: Fri, 8 Dec 2023 10:13:09 -0800 Subject: [PATCH] mrjob: restore vmem limit after launching job When --monitor is enabled, we've been seeing issues with errors like runtime/cgo: out of memory in thread_start This appears to be due to setting the vmem limit too low in some cases. Once we've launched the subprocess, restore the vmem limit to what it was originally, to avoid this sort of problem; monitor is supposed to be for the stage code, not mrjob itself. --- cmd/mrjob/mrjob.go | 49 +++++++++++++++++++++++++++--------------- martian/core/rlimit.go | 12 +++++++---- 2 files changed, 40 insertions(+), 21 deletions(-) diff --git a/cmd/mrjob/mrjob.go b/cmd/mrjob/mrjob.go index a60798d0..e3f537e3 100644 --- a/cmd/mrjob/mrjob.go +++ b/cmd/mrjob/mrjob.go @@ -318,25 +318,40 @@ func (self *runner) StartJob(args []string) error { self.metadata.MetadataFilePath(core.PerfData), self.metadata.MetadataFilePath(core.ProfileOut)) } - if self.monitoring && self.jobInfo.VMemGB > 0 { - // Exclude mrjob's vmem usage from the rlimit. - mem, _ := core.GetProcessTreeMemory(self.jobInfo.Pid, true, nil) - amount := int64(self.jobInfo.VMemGB)*1024*1024*1024 - mem.Vmem - if amount < mem.Vmem+1024*1024 { - amount = mem.Vmem + 1024*1024 + if err := func(cmd *exec.Cmd) error { + if self.monitoring && self.jobInfo.VMemGB > 0 { + // Exclude mrjob's vmem usage from the rlimit. + mem, _ := core.GetProcessTreeMemory(self.jobInfo.Pid, true, nil) + amount := int64(self.jobInfo.VMemGB)*1024*1024*1024 - mem.Vmem + if amount < mem.Vmem+1024*1024 { + amount = mem.Vmem + 1024*1024 + } + if oldAmount, err := core.SetVMemRLimit(uint64(amount)); err != nil { + util.LogError(err, "monitor", + "Could not set VM rlimit.") + } else { + // After launching the subprocess, restore the vmem + // limit for this process. Otherwise the go runtime can run + // into various kinds of trouble. + defer func(amt uint64) { + if _, err := core.SetVMemRLimit(amt); err != nil { + util.LogError(err, "monitor", + "Could not restore VM rlimit.") + } + }(oldAmount) + } } - if err := core.SetVMemRLimit(uint64(amount)); err != nil { - util.LogError(err, "monitor", - "Could not set VM rlimit.") + if err := func() error { + util.EnterCriticalSection() + defer util.ExitCriticalSection() + self.job = cmd + return self.job.Start() + }(); err != nil { + self.errorReader.Close() + return err } - } - if err := func() error { - util.EnterCriticalSection() - defer util.ExitCriticalSection() - self.job = cmd - return self.job.Start() - }(); err != nil { - self.errorReader.Close() + return nil + }(cmd); err != nil { return err } if err := self.startProfile(); err != nil { diff --git a/martian/core/rlimit.go b/martian/core/rlimit.go index 5e967b41..b2361b7e 100644 --- a/martian/core/rlimit.go +++ b/martian/core/rlimit.go @@ -125,14 +125,18 @@ func CheckMaxVmem(amount uint64) uint64 { return min } -func SetVMemRLimit(amount uint64) error { +func SetVMemRLimit(amount uint64) (uint64, error) { var rlim unix.Rlimit if err := unix.Getrlimit(unix.RLIMIT_AS, &rlim); err != nil { - return err + return rlim.Cur, err } else if rlim.Max != unix.RLIM_INFINITY && rlim.Max < amount { - return fmt.Errorf("could not set RLIMIT_AS %d > %d", + return rlim.Cur, fmt.Errorf("could not set RLIMIT_AS %d > %d", amount, rlim.Max) + } else if rlim.Cur == amount { + // Nothing to do. + return amount, nil } + oldAmount := rlim.Cur rlim.Cur = amount - return unix.Setrlimit(unix.RLIMIT_AS, &rlim) + return oldAmount, unix.Setrlimit(unix.RLIMIT_AS, &rlim) }