Skip to content

Commit

Permalink
Add --experimental_worker_memory_limit_mb flag that kills (-9) work…
Browse files Browse the repository at this point in the history
…ers using more than a given amount of memory.

This flag works without cgroups and on all OSes, with or without sandboxing, but relies on polling memory info.

Note: This can't be reliably integration-tested without more control over the timing.
PiperOrigin-RevId: 529347598
Change-Id: Iabeb8a45850a619dcb6ef8be9369fdc221f952ef
  • Loading branch information
larsrc-google authored and copybara-github committed May 4, 2023
1 parent 4073bcd commit 8d1db5b
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
// limitations under the License.
package com.google.devtools.build.lib.worker;

import static com.google.common.collect.ImmutableList.toImmutableList;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
Expand All @@ -22,6 +24,7 @@
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.pool2.PooledObject;
Expand Down Expand Up @@ -53,7 +56,7 @@ public void setReporter(Reporter reporter) {

@Override
public void run() {
if (options.totalWorkerMemoryLimitMb == 0) {
if (options.totalWorkerMemoryLimitMb == 0 && options.workerMemoryLimitMb == 0) {
return;
}

Expand All @@ -69,10 +72,17 @@ public void run() {

ImmutableList<WorkerMetric> workerMetrics =
WorkerMetricsCollector.instance().collectMetrics();
try {
evictWorkers(workerMetrics);
} catch (InterruptedException e) {
break;

if (options.totalWorkerMemoryLimitMb > 0) {
try {
evictWorkers(workerMetrics);
} catch (InterruptedException e) {
break;
}
}

if (options.workerMemoryLimitMb > 0) {
killLargeWorkers(workerMetrics, options.workerMemoryLimitMb);
}
}

Expand All @@ -83,6 +93,35 @@ void stopProcessing() {
isWorking = false;
}

/**
 * Forcibly terminates every worker whose used memory is above {@code limitMb} megabytes.
 *
 * <p>Usage is read from each metric's worker stat in KB and converted with an integer division
 * by 1000. A worker whose process handle cannot be looked up is skipped without any message.
 *
 * @param workerMetrics the collected worker metrics to inspect
 * @param limitMb the threshold in MB; only strictly larger usage triggers a kill
 */
void killLargeWorkers(ImmutableList<WorkerMetric> workerMetrics, int limitMb) {
  for (WorkerMetric metric : workerMetrics) {
    long usedMb = metric.getWorkerStat().getUsedMemoryInKB() / 1000;
    if (usedMb <= limitMb) {
      // Within the limit: nothing to do for this worker.
      continue;
    }

    ImmutableList<Integer> workerIds = metric.getWorkerProperties().getWorkerIds();
    long pid = metric.getWorkerProperties().getProcessId();
    Optional<ProcessHandle> handle = ProcessHandle.of(pid);
    if (!handle.isPresent()) {
      // No process handle could be obtained for this pid, so there is nothing to kill.
      continue;
    }

    // Show a lone worker id as a plain number, otherwise print the whole id list.
    Object idLabel;
    if (workerIds.size() == 1) {
      idLabel = workerIds.get(0);
    } else {
      idLabel = workerIds;
    }

    String killMessage =
        String.format(
            "Killing %s worker %s (pid %d) taking %dMB",
            metric.getWorkerProperties().getMnemonic(), idLabel, pid, usedMb);

    handle.get().destroyForcibly();
    logger.atInfo().log("%s", killMessage);
    if (reporter != null) {
      reporter.handle(Event.info(killMessage));
    }
  }
}

@VisibleForTesting // productionVisibility = Visibility.PRIVATE
void evictWorkers(ImmutableList<WorkerMetric> workerMetrics) throws InterruptedException {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -225,4 +225,16 @@ public String getTypeDescription() {
"The interval between collecting worker metrics and possibly attempting evictions. "
+ "Cannot effectively be less than 1s for performance reasons.")
public Duration workerMetricsPollInterval;

/**
 * Memory limit, in MB, applied to each individual worker. A value of 0 (the default) leaves the
 * per-worker check off; a positive value lets the metrics polling loop kill workers whose
 * memory usage exceeds it.
 */
@Option(
name = "experimental_worker_memory_limit_mb",
converter = RamResourceConverter.class,
defaultValue = "0",
documentationCategory = OptionDocumentationCategory.EXECUTION_STRATEGY,
effectTags = {OptionEffectTag.EXECUTION, OptionEffectTag.HOST_MACHINE_RESOURCE_OPTIMIZATIONS},
help =
"If this limit is greater than zero, workers might be killed if the memory usage of the "
+ "worker exceeds the limit. If not used together with dynamic execution and "
+ "`--experimental_dynamic_ignore_local_signals=9`, this may crash your build.")
public int workerMemoryLimitMb;
}
29 changes: 29 additions & 0 deletions src/test/shell/integration/bazel_worker_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -728,6 +728,35 @@ EOF
expect_log "^---8<---8<--- End of log ---8<---8<---"
}

# Integration test for --experimental_worker_memory_limit_mb: a build with a generous limit
# must succeed, and a build with a 1 MB limit must fail with the worker reported as killed.
function test_worker_memory_limit() {
# Helper defined elsewhere in this test file; presumably sets up the `work` rule and the
# :worker target used below -- confirm against its definition.
prepare_example_worker
# Append a target that passes --work_time=3s to the worker (presumably so the request lasts
# longer than the 1s metrics poll interval used below -- confirm against the example worker).
cat >>BUILD <<EOF
work(
name = "hello_world",
worker = ":worker",
worker_args = [
"--worker_protocol=${WORKER_PROTOCOL}",
],
args = [
"--work_time=3s",
]
)
EOF

# With a 1000 MB limit the build is expected to succeed.
bazel build --experimental_worker_memory_limit_mb=1000 \
--experimental_worker_metrics_poll_interval=1s :hello_world &> "$TEST_log" \
|| fail "build failed"
# Remove the outputs of the first build before building the same target again.
bazel clean
# With a 1 MB limit the build is expected to fail. `|| true` keeps the expected non-zero exit
# status of the build from becoming the status of this command list. This invocation
# overwrites $TEST_log, so the checks below only see output from this second build.
bazel build --experimental_worker_memory_limit_mb=1 \
--experimental_worker_metrics_poll_interval=1s :hello_world &> "$TEST_log" \
&& fail "expected build to fail" || true

# The failure output must include the worker log excerpt, the missing-response error, and the
# "Killing ..." info message naming a single numeric worker id, its pid and its memory in MB.
expect_log "^---8<---8<--- Start of log, file at /"
expect_log "Worker process did not return a WorkResponse:"
expect_log "Killing [a-zA-Z]\+ worker [0-9]\+ (pid [0-9]\+) taking [0-9]\+MB"
expect_log "^---8<---8<--- End of log ---8<---8<---"
}

function test_worker_metrics_collection() {
prepare_example_worker
cat >>BUILD <<EOF
Expand Down

0 comments on commit 8d1db5b

Please sign in to comment.