From 8f5745b27ecd42519a8c00e3257dd5e304568f75 Mon Sep 17 00:00:00 2001 From: yuhao <72971170+howin98@users.noreply.github.com> Date: Sat, 20 Aug 2022 11:42:40 +0800 Subject: [PATCH] support throttle to fix ninja c1 oom (#8953) * throttle * use throttle and fix ninja c1 * add deps * add server and with-cuda flag * enhance * portalocker * timeout 10 * update deps * rm * fix * fix * fix * auto format by CI * Update python/oneflow/test_utils/throttle.py Co-authored-by: Shenghang Tsai * auto format by CI * fix * fix * fix * rename and stop using shell * fix * fix * minor refactor * minor refactor * fix * fix Co-authored-by: jackalcooper Co-authored-by: oneflow-ci-bot --- dev-requirements.txt | 1 + .../auto_nhwc/test_nhwc_batchnorm_relu.py | 2 +- .../OneFlow/auto_nhwc/test_nhwc_bias_add.py | 2 +- .../test/OneFlow/auto_nhwc/test_nhwc_conv.py | 2 +- .../auto_nhwc/test_nhwc_conv2d_maxpool2d.py | 2 +- .../auto_nhwc/test_nhwc_conv_relu_add.py | 2 +- .../test/OneFlow/auto_nhwc/test_nhwc_lenet.py | 2 +- .../OneFlow/auto_nhwc/test_nhwc_maxpool_2d.py | 2 +- .../OneFlow/auto_nhwc/test_nhwc_resnet.py | 2 +- .../test_nhwc_transpose_eliminate.py | 2 +- .../auto_nhwc/test_resnet101_benchmark.py | 2 +- .../cuda_code_gen/test_fuser_cast_scale.py | 2 +- .../OneFlow/folding/test_simple_multiply.py | 2 +- oneflow/ir/test/OneFlow/test_fuse_pad_conv.py | 2 +- .../with_cuda/test_conv_bn_auto_nhwc.py | 2 +- oneflow/ir/test/lit.cfg.py | 1 + python/oneflow/test_utils/throttle.py | 60 +++++++++++++++++++ 17 files changed, 76 insertions(+), 14 deletions(-) create mode 100644 python/oneflow/test_utils/throttle.py diff --git a/dev-requirements.txt b/dev-requirements.txt index b6e64ecf514..16f3fb0cc3c 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -14,3 +14,4 @@ dataclasses; python_version<"3.7" cmakelang==0.6.13 pytest-xdist rich +portalocker diff --git a/oneflow/ir/test/OneFlow/auto_nhwc/test_nhwc_batchnorm_relu.py b/oneflow/ir/test/OneFlow/auto_nhwc/test_nhwc_batchnorm_relu.py 
index 7c103238f4a..588553abf65 100644 --- a/oneflow/ir/test/OneFlow/auto_nhwc/test_nhwc_batchnorm_relu.py +++ b/oneflow/ir/test/OneFlow/auto_nhwc/test_nhwc_batchnorm_relu.py @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. """ -# RUN: python3 %s | FileCheck %s +# RUN: python3 -m oneflow.test_utils.throttle --with-cuda=%with_cuda python3 %s | FileCheck %s # CHECK: oneflow.transpose import unittest diff --git a/oneflow/ir/test/OneFlow/auto_nhwc/test_nhwc_bias_add.py b/oneflow/ir/test/OneFlow/auto_nhwc/test_nhwc_bias_add.py index 21201e5fdda..991c0d20b32 100644 --- a/oneflow/ir/test/OneFlow/auto_nhwc/test_nhwc_bias_add.py +++ b/oneflow/ir/test/OneFlow/auto_nhwc/test_nhwc_bias_add.py @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. """ -# RUN: python3 %s | FileCheck %s +# RUN: python3 -m oneflow.test_utils.throttle --with-cuda=%with_cuda python3 %s | FileCheck %s # CHECK: oneflow.transpose import unittest diff --git a/oneflow/ir/test/OneFlow/auto_nhwc/test_nhwc_conv.py b/oneflow/ir/test/OneFlow/auto_nhwc/test_nhwc_conv.py index 81118c94893..833d08add2b 100644 --- a/oneflow/ir/test/OneFlow/auto_nhwc/test_nhwc_conv.py +++ b/oneflow/ir/test/OneFlow/auto_nhwc/test_nhwc_conv.py @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
""" -# RUN: python3 %s | FileCheck %s +# RUN: python3 -m oneflow.test_utils.throttle --with-cuda=%with_cuda python3 %s | FileCheck %s # CHECK: oneflow.transpose import unittest diff --git a/oneflow/ir/test/OneFlow/auto_nhwc/test_nhwc_conv2d_maxpool2d.py b/oneflow/ir/test/OneFlow/auto_nhwc/test_nhwc_conv2d_maxpool2d.py index 9a9333414ca..98d9aed9f56 100644 --- a/oneflow/ir/test/OneFlow/auto_nhwc/test_nhwc_conv2d_maxpool2d.py +++ b/oneflow/ir/test/OneFlow/auto_nhwc/test_nhwc_conv2d_maxpool2d.py @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. """ -# RUN: python3 %s | FileCheck %s +# RUN: python3 -m oneflow.test_utils.throttle --with-cuda=%with_cuda python3 %s | FileCheck %s # CHECK: oneflow.transpose import unittest diff --git a/oneflow/ir/test/OneFlow/auto_nhwc/test_nhwc_conv_relu_add.py b/oneflow/ir/test/OneFlow/auto_nhwc/test_nhwc_conv_relu_add.py index 36796d301f0..42ba1c18860 100644 --- a/oneflow/ir/test/OneFlow/auto_nhwc/test_nhwc_conv_relu_add.py +++ b/oneflow/ir/test/OneFlow/auto_nhwc/test_nhwc_conv_relu_add.py @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. """ -# RUN: python3 %s | FileCheck %s +# RUN: python3 -m oneflow.test_utils.throttle --with-cuda=%with_cuda python3 %s | FileCheck %s # CHECK: oneflow.transpose import unittest diff --git a/oneflow/ir/test/OneFlow/auto_nhwc/test_nhwc_lenet.py b/oneflow/ir/test/OneFlow/auto_nhwc/test_nhwc_lenet.py index c4ca285d85f..4c1eb2ae762 100644 --- a/oneflow/ir/test/OneFlow/auto_nhwc/test_nhwc_lenet.py +++ b/oneflow/ir/test/OneFlow/auto_nhwc/test_nhwc_lenet.py @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
""" -# RUN: python3 %s | FileCheck %s +# RUN: python3 -m oneflow.test_utils.throttle --with-cuda=%with_cuda python3 %s | FileCheck %s # CHECK: oneflow.transpose import unittest diff --git a/oneflow/ir/test/OneFlow/auto_nhwc/test_nhwc_maxpool_2d.py b/oneflow/ir/test/OneFlow/auto_nhwc/test_nhwc_maxpool_2d.py index c7d74cec828..e8b87995dd1 100644 --- a/oneflow/ir/test/OneFlow/auto_nhwc/test_nhwc_maxpool_2d.py +++ b/oneflow/ir/test/OneFlow/auto_nhwc/test_nhwc_maxpool_2d.py @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. """ -# RUN: python3 %s | FileCheck %s +# RUN: python3 -m oneflow.test_utils.throttle --with-cuda=%with_cuda python3 %s | FileCheck %s # CHECK: oneflow.transpose import unittest diff --git a/oneflow/ir/test/OneFlow/auto_nhwc/test_nhwc_resnet.py b/oneflow/ir/test/OneFlow/auto_nhwc/test_nhwc_resnet.py index 4f4770a86d1..a52e13bf4bd 100644 --- a/oneflow/ir/test/OneFlow/auto_nhwc/test_nhwc_resnet.py +++ b/oneflow/ir/test/OneFlow/auto_nhwc/test_nhwc_resnet.py @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. """ -# RUN: python3 %s | FileCheck %s +# RUN: python3 -m oneflow.test_utils.throttle --with-cuda=%with_cuda python3 %s | FileCheck %s # CHECK: oneflow.transpose import unittest import numpy as np diff --git a/oneflow/ir/test/OneFlow/auto_nhwc/test_nhwc_transpose_eliminate.py b/oneflow/ir/test/OneFlow/auto_nhwc/test_nhwc_transpose_eliminate.py index 859f4f8ea0d..1dc875e62ca 100644 --- a/oneflow/ir/test/OneFlow/auto_nhwc/test_nhwc_transpose_eliminate.py +++ b/oneflow/ir/test/OneFlow/auto_nhwc/test_nhwc_transpose_eliminate.py @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
""" -# RUN: python3 %s | FileCheck %s +# RUN: python3 -m oneflow.test_utils.throttle --with-cuda=%with_cuda python3 %s | FileCheck %s # CHECK-NOT: oneflow.transpose import unittest diff --git a/oneflow/ir/test/OneFlow/auto_nhwc/test_resnet101_benchmark.py b/oneflow/ir/test/OneFlow/auto_nhwc/test_resnet101_benchmark.py index 82598b1421d..c81a6fbbc4b 100644 --- a/oneflow/ir/test/OneFlow/auto_nhwc/test_resnet101_benchmark.py +++ b/oneflow/ir/test/OneFlow/auto_nhwc/test_resnet101_benchmark.py @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. """ -# RUN: python3 %s | FileCheck %s +# RUN: python3 -m oneflow.test_utils.throttle --with-cuda=%with_cuda python3 %s | FileCheck %s # CHECK: oneflow.transpose import unittest import numpy as np diff --git a/oneflow/ir/test/OneFlow/cuda_code_gen/test_fuser_cast_scale.py b/oneflow/ir/test/OneFlow/cuda_code_gen/test_fuser_cast_scale.py index 8d961630363..191dc810e9c 100644 --- a/oneflow/ir/test/OneFlow/cuda_code_gen/test_fuser_cast_scale.py +++ b/oneflow/ir/test/OneFlow/cuda_code_gen/test_fuser_cast_scale.py @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. """ -# RUN: python3 %s | FileCheck %s +# RUN: python3 -m oneflow.test_utils.throttle --with-cuda=%with_cuda python3 %s | FileCheck %s # CHECK: jit import unittest diff --git a/oneflow/ir/test/OneFlow/folding/test_simple_multiply.py b/oneflow/ir/test/OneFlow/folding/test_simple_multiply.py index c07e307f822..4e1b42e990b 100644 --- a/oneflow/ir/test/OneFlow/folding/test_simple_multiply.py +++ b/oneflow/ir/test/OneFlow/folding/test_simple_multiply.py @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
""" -# RUN: python3 %s | FileCheck %s +# RUN: python3 -m oneflow.test_utils.throttle --with-cuda=%with_cuda python3 %s | FileCheck %s # CHECK-NOT: oneflow.broadcast_mul import os diff --git a/oneflow/ir/test/OneFlow/test_fuse_pad_conv.py b/oneflow/ir/test/OneFlow/test_fuse_pad_conv.py index 8914dd62857..e5728617214 100644 --- a/oneflow/ir/test/OneFlow/test_fuse_pad_conv.py +++ b/oneflow/ir/test/OneFlow/test_fuse_pad_conv.py @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. """ -# RUN: python3 %s | FileCheck %s +# RUN: python3 -m oneflow.test_utils.throttle --with-cuda=%with_cuda python3 %s | FileCheck %s # CHECK-NOT: oneflow.pad import unittest diff --git a/oneflow/ir/test/OneFlow/with_cuda/test_conv_bn_auto_nhwc.py b/oneflow/ir/test/OneFlow/with_cuda/test_conv_bn_auto_nhwc.py index 8202c49ae89..8750cb4310d 100644 --- a/oneflow/ir/test/OneFlow/with_cuda/test_conv_bn_auto_nhwc.py +++ b/oneflow/ir/test/OneFlow/with_cuda/test_conv_bn_auto_nhwc.py @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
"""
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import argparse
import hashlib
import os
import subprocess
import sys

# Non-integer spellings accepted by --with-cuda. The flag is filled in from a
# build-system setting, which may be spelled as a word rather than 0/1.
_TRUE_FLAGS = frozenset({"on", "yes", "true", "y"})
_FALSE_FLAGS = frozenset({"off", "no", "false", "n"})


def _cuda_flag(value):
    """Convert a ``--with-cuda`` value to an int (0 means "no CUDA").

    Integers are returned unchanged, exactly as ``type=int`` would do.
    Common boolean words (case-insensitive) map to 1 or 0.

    Raises:
        argparse.ArgumentTypeError: if the value is neither an integer nor a
            recognised boolean word.
    """
    try:
        return int(value)
    except ValueError:
        pass
    word = value.strip().lower()
    if word in _TRUE_FLAGS:
        return 1
    if word in _FALSE_FLAGS:
        return 0
    raise argparse.ArgumentTypeError(
        "expected an integer or ON/OFF style value, got %r" % (value,)
    )


def parse_args(argv=None):
    """Parse the throttle command line.

    Args:
        argv: Argument list to parse. ``None`` (the default) parses
            ``sys.argv[1:]``.

    Returns:
        A namespace with ``with_cuda`` (int, default 1) and ``cmd`` (the
        command to run, as a list of strings).

    Exits with a usage error when no command is given.
    """
    parser = argparse.ArgumentParser(
        description="Control when the script runs through special variables."
    )
    parser.add_argument(
        "--with-cuda", type=_cuda_flag, default=1, help="whether has cuda device."
    )
    # REMAINDER ("...") collects everything after the throttle options, so the
    # wrapped command keeps its own flags.
    parser.add_argument(
        "cmd", type=str, nargs=argparse.REMAINDER, help="command to run"
    )
    args = parser.parse_args(argv)
    if not args.cmd:
        # subprocess.call([]) would otherwise fail with an opaque traceback.
        parser.error("a command to run is required")
    return args


def hash_cli2gpu(cmd: list, device_count=None):
    """Map a command line to a GPU index, deterministically.

    The command words are joined with spaces and hashed with SHA-1; the
    digest modulo the device count selects the GPU. The same command always
    maps to the same device.

    Args:
        cmd: Command line as a list of strings.
        device_count: Number of GPUs to spread over. ``None`` (the default)
            asks NVML through ``pynvml``.

    Returns:
        A one-element list holding the chosen GPU index.

    Raises:
        RuntimeError: if there is no device to choose from.
    """
    if device_count is None:
        # Imported lazily so CPU-only use of this module does not need pynvml.
        import pynvml

        pynvml.nvmlInit()
        try:
            device_count = pynvml.nvmlDeviceGetCount()
        finally:
            pynvml.nvmlShutdown()
    if device_count < 1:
        raise RuntimeError("no CUDA device available to run: " + " ".join(cmd))
    digest = hashlib.sha1(" ".join(cmd).encode("utf-8")).hexdigest()
    return [int(digest, 16) % device_count]


def main(argv=None):
    """Run the wrapped command, serialising GPU use through a file lock.

    Without CUDA the command runs directly. With CUDA, a GPU is chosen by
    ``hash_cli2gpu``, a per-GPU lock file is held while the command runs, and
    the command sees only that GPU through ``CUDA_VISIBLE_DEVICES``.

    Args:
        argv: Argument list to parse. ``None`` (the default) parses
            ``sys.argv[1:]``.

    Returns:
        The exit code of the wrapped command.
    """
    args = parse_args(argv)
    if not args.with_cuda:
        return subprocess.call(args.cmd)

    # Imported lazily so that --with-cuda=0 runs do not require portalocker.
    import portalocker

    # NOTE(review): NVML counts physical devices; if the caller already
    # restricts CUDA_VISIBLE_DEVICES, the index chosen here may fall outside
    # that set -- confirm against the CI setup.
    cuda_visible_devices = [str(i) for i in hash_cli2gpu(args.cmd)]
    # NOTE(review): the lock path is relative to the current working
    # directory, so processes started from different directories do not
    # share a lock -- confirm this is intended.
    lock_path = ".oneflow-throttle-gpu-" + "-".join(cuda_visible_devices) + ".lock"
    with portalocker.Lock(lock_path, timeout=400):
        env = dict(os.environ, CUDA_VISIBLE_DEVICES=",".join(cuda_visible_devices))
        return subprocess.call(args.cmd, env=env)


if __name__ == "__main__":
    # sys.exit rather than the site-provided exit(), which is not guaranteed
    # to exist (e.g. under ``python -S``).
    sys.exit(main())