This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

[FEATURE] Add g5 instance to CI #20876

Merged: 21 commits, merged Mar 8, 2022
Commits
73431dd  Add g5 instance to jenkinsfiles where both p3 and g4 are mentioned (DickJC123, Feb 5, 2022)
1433911  Remove reference to non-existent restricted-mxnetlinux-gpu-g5 (DickJC123, Feb 7, 2022)
dd3b58a  Enable unittest job on g5 (DickJC123, Feb 8, 2022)
48b6173  Fix Jenkinsfile_unix_gpu syntax (DickJC123, Feb 8, 2022)
1c11634  Include A10G arch 86 in build for g5 (DickJC123, Feb 9, 2022)
3d8f5cc  Merge remote-tracking branch 'mxnet/master' into add_g5_instance_to_CI (DickJC123, Feb 9, 2022)
ec30697  Update is_TF32_enabled() for SM arch > 80 (DickJC123, Feb 15, 2022)
1c96ef9  Remove gpu arch 86 from centos builds on cuda 10 (DickJC123, Feb 16, 2022)
4b93cdb  Fix test_convolution_{grouping,dilated_impulse_response}, test_np_lin… (DickJC123, Jan 12, 2022)
5583ae6  Fix test_convolution_grouping on A100 (DickJC123, Jan 18, 2022)
00cfc89  Fix test_rnn_unroll_variant_length (DickJC123, Jan 18, 2022)
9161495  Fix test_convolution_dilated_impulse_response (DickJC123, Aug 23, 2021)
f4e2f40  Skip test_np_standard_binary_funcs test of 0-dim array broadcast (DickJC123, Feb 17, 2022)
4a9056a  Temporarily add '-s' to pytest cpu tests (DickJC123, Feb 18, 2022)
6c316fc  Revert "Temporarily add '-s' to pytest cpu tests" (DickJC123, Feb 21, 2022)
02a0944  Improve test_rnn_layers_fp{16,32} invocation (DickJC123, Feb 21, 2022)
ae17b1f  Pin MarkupSafe==2.0.1 to avoid soft_unicode import failure (DickJC123, Feb 21, 2022)
223d9bf  Run test_rnn_layers_fp32 only when cuDNN is present (DickJC123, Feb 21, 2022)
9114332  Fix potential out-of-bounds write in count_sketch.cu (DickJC123, Feb 22, 2022)
3b9a520  Revert "Pin MarkupSafe==2.0.1 to avoid soft_unicode import failure" (DickJC123, Feb 24, 2022)
26734de  Merge remote-tracking branch 'mxnet/master' into add_g5_instance_to_CI (DickJC123, Feb 24, 2022)
Changes from all commits
1 change: 1 addition & 0 deletions ci/Jenkinsfile_utils.groovy
@@ -250,6 +250,7 @@ def assign_node_labels(args) {
NODE_LINUX_CPU = args.linux_cpu
NODE_LINUX_GPU = args.linux_gpu
NODE_LINUX_GPU_G4 = args.linux_gpu_g4
NODE_LINUX_GPU_G5 = args.linux_gpu_g5
NODE_LINUX_GPU_P3 = args.linux_gpu_p3
NODE_WINDOWS_CPU = args.windows_cpu
NODE_WINDOWS_GPU = args.windows_gpu
9 changes: 6 additions & 3 deletions ci/docker/runtime_functions.sh
@@ -22,8 +22,11 @@

set -ex

CI_CUDA_COMPUTE_CAPABILITIES="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_70,code=sm_70"
CI_CMAKE_CUDA_ARCH="5.2 7.0"
# compute capabilities for CI instances supported by CUDA 10.x (i.e. p3, g4)
CI_CMAKE_CUDA10_ARCH="5.2 7.5"

# compute capabilities for CI instances supported by CUDA >= 11.1 (i.e. p3, g4, g5)
CI_CMAKE_CUDA_ARCH="5.2 7.5 8.6"

clean_repo() {
set -ex
@@ -298,7 +301,7 @@ build_centos7_gpu() {
-DUSE_BLAS=Open \
-DUSE_ONEDNN=ON \
-DUSE_CUDA=ON \
-DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
-DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA10_ARCH" \
-DUSE_DIST_KVSTORE=ON \
-DBUILD_EXTENSION_PATH=/work/mxnet/example/extensions/lib_external_ops \
-DUSE_INT64_TENSOR_SIZE=OFF \
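A note on the two architecture lists above (not part of the diff): CUDA 10.x toolchains cannot generate code for the Ampere A10G (SM 8.6) in the new g5 instances, since sm_86 support arrived in CUDA 11.1, so the CentOS 7 CUDA 10 build keeps the shorter list. A minimal sketch of the relationship, assuming the usual instance-to-GPU mapping (p3 = V100, g4 = T4, g5 = A10G; only the A10G/arch 86 pairing is stated in this PR's commit messages):

```python
# Sketch only: how a build script might pick the -DMXNET_CUDA_ARCH value.
# The version threshold and the instance/GPU mapping are assumptions, not PR code.

def cmake_cuda_arch(cuda_version: tuple) -> str:
    """Return the CMake CUDA arch list for a given CUDA toolkit version."""
    if cuda_version >= (11, 1):
        # sm_86 (A10G, g5 instances) is only understood by CUDA >= 11.1.
        return "5.2 7.5 8.6"
    # CUDA 10.x predates Ampere, so arch 8.6 must be left out.
    return "5.2 7.5"

assert cmake_cuda_arch((11, 1)) == "5.2 7.5 8.6"  # matches CI_CMAKE_CUDA_ARCH
assert cmake_cuda_arch((10, 2)) == "5.2 7.5"      # matches CI_CMAKE_CUDA10_ARCH
```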
16 changes: 16 additions & 0 deletions ci/jenkins/Jenkins_steps.groovy
@@ -716,6 +716,22 @@ def test_unix_python3_gpu(lib_name) {
}]
}

def test_unix_python3_ampere_gpu(lib_name) {
return ['Python3: Ampere-GPU': {
node(NODE_LINUX_GPU_G5) {
ws('workspace/ut-python3-gpu') {
try {
utils.unpack_and_init(lib_name, mx_lib_cython)
python3_gpu_ut_cython('ubuntu_gpu_cu111')
utils.publish_test_coverage()
} finally {
utils.collect_test_results_unix('tests_gpu.xml', 'tests_python3_ampere_gpu.xml')
}
}
}
}]
}

def test_unix_python3_debug_cpu() {
return ['Python3: CPU debug': {
node(NODE_LINUX_CPU) {
3 changes: 2 additions & 1 deletion ci/jenkins/Jenkinsfile_unix_gpu
@@ -29,7 +29,7 @@ node('utility') {
utils = load('ci/Jenkinsfile_utils.groovy')
custom_steps = load('ci/jenkins/Jenkins_steps.groovy')
}
utils.assign_node_labels(utility: 'utility', linux_cpu: 'mxnetlinux-cpu', linux_gpu: 'mxnetlinux-gpu', linux_gpu_p3: 'mxnetlinux-gpu-p3', linux_gpu_g4: 'mxnetlinux-gpu-g4')
utils.assign_node_labels(utility: 'utility', linux_cpu: 'mxnetlinux-cpu', linux_gpu: 'mxnetlinux-gpu', linux_gpu_p3: 'mxnetlinux-gpu-p3', linux_gpu_g4: 'mxnetlinux-gpu-g4', linux_gpu_g5: 'mxnetlinux-gpu-g5')

utils.main_wrapper(
core_logic: {
@@ -44,6 +44,7 @@

utils.parallel_stage('Tests', [
custom_steps.test_unix_python3_gpu('gpu'),
custom_steps.test_unix_python3_ampere_gpu('gpu'),
custom_steps.test_unix_python3_onednn_gpu('onednn_gpu'),
custom_steps.test_unix_python3_onednn_nocudnn_gpu('onednn_gpu_nocudnn'),
custom_steps.test_unix_cpp_package_gpu('gpu'),
6 changes: 3 additions & 3 deletions python/mxnet/test_utils.py
@@ -112,15 +112,15 @@ def effective_dtype(dat):
----------
dat : np.ndarray or mx.nd.array or mx.np.ndarray
"""
# On arch 80 gpus, a float32-io gemm or conv op will trim the mantissa of data
# inputs to be of comparable precision to a float16, so float16 becomes the
# On arch 80 gpus or later, a float32-io gemm or conv op will trim the mantissa of
# data inputs to be of comparable precision to a float16, so float16 becomes the
# 'effective dtype' for tolerance tests involving such op outputs.

# Is TF32 enabled in the device (the default on arch 80 GPUs)
def is_TF32_enabled(device):
try:
return (device.device_type == 'gpu' and
get_cuda_compute_capability(device) == 80 and
get_cuda_compute_capability(device) >= 80 and
os.environ.get('NVIDIA_TF32_OVERRIDE') != '0')
except: # pylint: disable=bare-except
return False
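The effective_dtype()/is_TF32_enabled() change above widens the notion of "TF32-capable" from exactly arch 80 (A100) to arch 80 and later, which now includes the A10G (arch 86) on the g5 nodes. A usage sketch of the helper, with tolerance values that are illustrative rather than taken from MXNet:

```python
# Usage sketch for the helper touched above (tolerance numbers are illustrative).
import numpy as onp
import mxnet as mx
from mxnet.test_utils import effective_dtype

def tolerances_for(arr):
    # On SM >= 80 GPUs with TF32 enabled, float32 gemm/conv inputs only keep
    # roughly float16 mantissa precision, so effective_dtype() reports float16.
    if effective_dtype(arr) == onp.dtype(onp.float16):
        return dict(rtol=1e-2, atol=1e-2)
    return dict(rtol=1e-4, atol=1e-4)

x = mx.np.ones((4, 4), dtype='float32')  # allocated on the current default device
print(tolerances_for(x))
# Exporting NVIDIA_TF32_OVERRIDE=0 before starting the process disables TF32,
# and effective_dtype() then reports float32 again on Ampere-class GPUs.
```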
3 changes: 3 additions & 0 deletions src/operator/contrib/count_sketch.cu
@@ -93,6 +93,9 @@ __global__ void sketch_backward_kernel(const int nthreads,
// only calculate gradient regarding x
// can also calculate gradient regarding s if needed
const int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index >= nthreads) {
return;
}
const int i_indim = index % in_dim;
const int i_sample = index / in_dim;
const int i_outdim = i_sample * out_dim + h[i_indim];
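The added guard matters because CUDA launches round the block count up, so the last block usually contains threads whose global index falls past nthreads; without the early return those trailing threads would read and write past the ends of the kernel's arrays. The arithmetic, with illustrative launch numbers (the hunk above does not show the actual launch configuration):

```python
# Arithmetic sketch (launch numbers are illustrative, not from count_sketch.cu).
nthreads = 1000
threads_per_block = 256
blocks = (nthreads + threads_per_block - 1) // threads_per_block  # ceil division -> 4
launched = blocks * threads_per_block                              # 1024 threads
print(launched - nthreads, "trailing threads need the early-return guard")  # 24
```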
9 changes: 8 additions & 1 deletion tests/python/gpu/test_operator_gpu.py
@@ -27,7 +27,7 @@
import mxnet.ndarray.sparse as mxsps
from mxnet.test_utils import check_consistency, set_default_device, assert_almost_equal, assert_allclose
from mxnet.test_utils import check_symbolic_forward, check_symbolic_backward, discard_stderr
from mxnet.test_utils import default_device, rand_shape_2d, rand_ndarray, same, environment, get_rtc_compile_opts
from mxnet.test_utils import default_device, rand_shape_2d, rand_ndarray, same, environment, get_rtc_compile_opts, get_cuda_compute_capability
from mxnet.base import MXNetError
from mxnet import autograd

@@ -54,6 +54,13 @@

set_default_device(mx.gpu(0))

# For info purposes, log GPU compute capabilities. Run serially so output appears in log.
@pytest.mark.serial
def test_report_compute_capabilities(capsys):
with capsys.disabled():
sys.stdout.write('= {} '.format(
[get_cuda_compute_capability(mx.gpu(i)) for i in range(mx.device.num_gpus())] ))

def check_countsketch(in_dim,out_dim,n):
data = mx.sym.Variable("data")
h = mx.sym.Variable("h")
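The new test_report_compute_capabilities test only logs information. get_cuda_compute_capability returns an integer code per device, for example 86 for the A10G in a g5 node or 75 for the T4 in a g4 node, which is what makes the `>= 80` comparison in is_TF32_enabled work. A stand-alone usage sketch:

```python
# Stand-alone usage sketch of the helper imported above.
import mxnet as mx
from mxnet.test_utils import get_cuda_compute_capability

for i in range(mx.device.num_gpus()):
    # Prints an integer such as 86 on an A10G (g5) or 75 on a T4 (g4) node.
    print(i, get_cuda_compute_capability(mx.gpu(i)))
```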
26 changes: 13 additions & 13 deletions tests/python/unittest/test_gluon_rnn.py
@@ -606,7 +606,8 @@ def check_rnn_layer_forward(layer, inputs, states=None, run_only=False, device=m


@mx.util.use_np
def run_rnn_layers(dtype, dtype2, device=mx.cpu()):
def run_rnn_layers(dtype, dtype2):
device = default_device()

check_rnn_layer_forward(gluon.rnn.RNN(10, 2, dtype=dtype), mx.np.ones((8, 3, 20), dtype=dtype), device=device)
check_rnn_layer_forward(gluon.rnn.RNN(10, 2, dtype=dtype, bidirectional=True), mx.np.ones((8, 3, 20), dtype=dtype), mx.np.ones((4, 3, 10), dtype=dtype), device=device)
@@ -668,15 +669,18 @@ def run_rnn_layers(dtype, dtype2, device=mx.cpu()):
out.backward()
out = out.asnumpy()

@assert_raises_cudnn_not_satisfied(min_version='5.1.10')
@pytest.mark.serial
def test_rnn_layers_fp32():
run_rnn_layers('float32', 'float32')

@assert_raises_cudnn_not_satisfied(min_version='5.1.10')
@pytest.mark.skipif(mx.device.num_gpus() == 0, reason="RNN FP16 only implemented for GPU for now")
@pytest.mark.serial
def test_rnn_layers_fp16():
run_rnn_layers('float16', 'float32', mx.gpu())
# Dynamic skip condition is best handled this way, rather than with pytest.mark.skipIf
if default_device().device_type == 'cpu':
pytest.skip('RNN FP16 only implemented for GPU for now')
run_rnn_layers('float16', 'float32')


def check_rnn_consistency(fused_layer, stack_layer, loss, mode, num_layers, input_size, hidden_size, bidirectional=False, rtol=1e-2, atol=1e-4):
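The comment in the test_rnn_layers_fp16 change above explains the switch from a pytest.mark.skipif marker to an in-test pytest.skip(): the marker condition is evaluated once at collection time, while the in-test call runs against whatever default_device() resolves to when the test actually executes. A minimal sketch of the pattern (the test body here is a placeholder, not PR code):

```python
# Sketch of the dynamic-skip pattern adopted above (placeholder test body).
import pytest
from mxnet.test_utils import default_device

# Collection-time alternative (evaluated before any test runs):
#   @pytest.mark.skipif(mx.device.num_gpus() == 0, reason="GPU only")

def test_gpu_only_feature():
    # Run-time check against the device the test will actually use.
    if default_device().device_type == 'cpu':
        pytest.skip('only implemented for GPU for now')
    ...  # GPU-specific work would go here
```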
@@ -844,14 +848,12 @@ def test_rnn_unroll_variant_length():
inputs=data_nd[i:(i+1), :ele_length, :],
merge_outputs=True,
layout='NTC')
assert_allclose(ele_out.asnumpy(), outs[i:(i+1), :ele_length, :].asnumpy(),
atol=1E-4, rtol=1E-4)
assert_almost_equal(ele_out, outs[i:(i+1), :ele_length, :])
if ele_length < max_length:
# Check the padded outputs are all zero
assert_allclose(outs[i:(i+1), ele_length:max_length, :].asnumpy(), 0)
assert_almost_equal(outs[i:(i+1), ele_length:max_length, :], 0)
for valid_out_state, gt_state in zip(states, ele_states):
assert_allclose(valid_out_state[i:(i+1)].asnumpy(), gt_state.asnumpy(),
atol=1E-4, rtol=1E-4)
assert_almost_equal(valid_out_state[i:(i+1)], gt_state)

# Test for TNC layout
data_nd = mx.np.random.normal(0, 1, size=(max_length, batch_size, 20))
@@ -864,14 +866,12 @@ def test_rnn_unroll_variant_length():
inputs=data_nd[:ele_length, i:(i+1), :],
merge_outputs=True,
layout='TNC')
assert_allclose(ele_out.asnumpy(), outs[:ele_length, i:(i + 1), :].asnumpy(),
atol=1E-4, rtol=1E-4)
assert_almost_equal(ele_out, outs[:ele_length, i:(i + 1), :])
if ele_length < max_length:
# Check the padded outputs are all zero
assert_allclose(outs[ele_length:max_length, i:(i+1), :].asnumpy(), 0)
assert_almost_equal(outs[ele_length:max_length, i:(i+1), :], 0)
for valid_out_state, gt_state in zip(states, ele_states):
assert_allclose(valid_out_state[i:(i+1)].asnumpy(), gt_state.asnumpy(),
atol=1E-4, rtol=1E-4)
assert_almost_equal(valid_out_state[i:(i+1)], gt_state)


def test_cell_fill_shape():
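The assertion rewrites above drop the hand-tuned atol/rtol of assert_allclose in favor of mxnet.test_utils.assert_almost_equal, which accepts MXNet arrays directly and, with the effective_dtype change earlier in this PR, presumably derives default tolerances that already account for TF32-enabled hardware. A before/after sketch with stand-in arrays:

```python
# Before/after sketch of the assertion change (arrays are stand-ins).
import mxnet as mx
from mxnet.test_utils import assert_almost_equal
from numpy.testing import assert_allclose

a = mx.np.random.normal(0, 1, size=(2, 3))
b = a.copy()

# Old style: fixed tolerances and an explicit .asnumpy() round trip.
assert_allclose(a.asnumpy(), b.asnumpy(), atol=1e-4, rtol=1e-4)

# New style: assert_almost_equal takes the mx arrays directly and applies
# default tolerances derived from the inputs' dtypes.
assert_almost_equal(a, b)
```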
11 changes: 9 additions & 2 deletions tests/python/unittest/test_numpy_op.py
@@ -6477,6 +6477,9 @@ def check_qr(q, r, a_np):

data_np = onp.array(data_np, dtype=dtype)
data = np.array(data_np, dtype=dtype)
if effective_dtype(data) == onp.dtype(np.float16):
print('Skipping test on this platform: {} has a float16 effective dtype'.format(dtype))
pytest.skip()

data.attach_grad()
with mx.autograd.record():
@@ -11712,8 +11715,12 @@ def array_values(low, high, shape):
((3, 1), (3, 0)),
((0, 2), (1, 2)),
((2, 3, 4), (3, 1)),
((2, 3), ()),
((), (2, 3))
# MXNet numpy does not match original numpy behavior when broadcasting 0-dim arrays.
# See https://github.com/apache/incubator-mxnet/issues/20898.
# ((2, 3), ()),
# ((), (2, 3))
((2, 3), (1,)),
((1,), (2, 3))
])
def test_np_standard_binary_funcs(func, func2, promoted, dtypes, ref_grad_a, ref_grad_b, low, high, lshape, rshape):
class TestStandardBinary(HybridBlock):
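The commented-out cases remove 0-dim operands because MXNet's numpy does not match official NumPy when broadcasting 0-d arrays (see the linked issue #20898); the substituted (1,) shapes broadcast to the same result shape in NumPy, so coverage of the broadcast path is largely preserved. A quick check against official NumPy (broadcast_shapes requires NumPy >= 1.20):

```python
# Quick NumPy check that the substituted shapes broadcast like the removed ones.
import numpy as onp

assert onp.broadcast_shapes((2, 3), ()) == (2, 3)    # removed case
assert onp.broadcast_shapes((2, 3), (1,)) == (2, 3)  # replacement
assert onp.broadcast_shapes((), (2, 3)) == (2, 3)    # removed case
assert onp.broadcast_shapes((1,), (2, 3)) == (2, 3)  # replacement
```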
25 changes: 16 additions & 9 deletions tests/python/unittest/test_operator.py
@@ -1724,6 +1724,7 @@ def np_groupnorm_grad(ograd, data, gamma, beta, mean, std, num_groups, eps):
atol=5e-2 if dtype == np.float16 else 1e-4, dtype=dtype)


@pytest.mark.serial
def test_convolution_grouping():
for dim in [1, 2, 3]:
num_filter = 4
@@ -1745,15 +1746,15 @@ def test_convolution_grouping():
exe1 = y1._simple_bind(default_device(), x=shape)
exe2 = y2._simple_bind(default_device(), x=shape, w=(num_filter, shape[1]//num_group) + kernel, b=(num_filter,))
for arr1, arr2 in zip(exe1.arg_arrays, exe2.arg_arrays):
arr1[:] = np.float32(np.random.normal(size=arr1.shape))
arr1[:] = np.random.normal(size=arr1.shape).astype(effective_dtype(mx.nd.array([1.,])))
arr2[:] = arr1
exe1.forward(is_train=True)
exe1.backward(exe1.outputs[0])
exe2.forward(is_train=True)
exe2.backward(exe2.outputs[0])

for arr1, arr2 in zip(exe1.outputs + exe1.grad_arrays, exe2.outputs + exe2.grad_arrays):
np.testing.assert_allclose(arr1.asnumpy(), arr2.asnumpy(), rtol=1e-3, atol=1e-3)
assert_almost_equal(arr1, arr2)


@pytest.mark.skip(reason="Flaky test https://github.com/apache/incubator-mxnet/issues/14052")
@@ -2216,7 +2217,8 @@ def test_bxor(a, b):
test_bor(a, b)
test_bxor(a, b)

def test_run_convolution_dilated_impulse_response(dil=(1,1), kernel_shape=(3,3), verbose=False):

def run_convolution_dilated_impulse_response(dil, kernel_shape, tol):
dim = len(dil)
assert(len(kernel_shape) == dim)
# Input for spike response
@@ -2259,7 +2261,7 @@ def test_run_convolution_dilated_impulse_response(dil=(1,1), kernel_shape=(3,3),
out_o = be.outputs[0].asnumpy()
assert_allclose(out_o[center],np.prod(kernel_shape),atol=1e-5)

rnd_kernel_s = np.random.uniform(low=0.0, high=1.0, size=tuple([1,1]+list(kernel_shape))).astype(np.float32)
rnd_kernel_s = np.random.uniform(low=-0.5, high=0.5, size=tuple([1,1]+list(kernel_shape))).astype(np.float32)
impulse_error = mx.nd.array(out_o/np.sum(out_o)) # This should be 1.0 at [0,0,16,16]
rnd_kernel = mx.nd.array(rnd_kernel_s)

@@ -2282,22 +2284,27 @@ def test_run_convolution_dilated_impulse_response(dil=(1,1), kernel_shape=(3,3),
be.forward(True)
out = be.outputs[0].asnumpy()
# Now do a simple check of the kernel gradient
assert(out[center] - np.sum(kernel_gradient) - out_orig[center] < 0.001)

d = np.abs(out[center] - np.sum(kernel_gradient) - out_orig[center])
assert d < tol, f'd: {d}'

@pytest.mark.serial
def test_convolution_dilated_impulse_response():
tol = 1e-3
# 1D
for dil in [ (1,), (2,), (3,) ]:
for ks in [ (1,), (2,), (3,), (4,)]:
test_run_convolution_dilated_impulse_response(dil=dil, kernel_shape=ks)
run_convolution_dilated_impulse_response(dil=dil, kernel_shape=ks, tol=tol)
# 2D
for dil in [ (1,1), (2,2), (3,3) ]:
for ks in [ (3,3), (4,4), (2,3), (3,2), (1,1) ]:
test_run_convolution_dilated_impulse_response(dil=dil, kernel_shape=ks)
run_convolution_dilated_impulse_response(dil=dil, kernel_shape=ks, tol=tol)
# 3D
# On Ampere, autotuning might select a TensorCore conv engine, which effectively
# does a cast to fp16 of the weights and data. Expand tol in these 3D cases.
tol3D = 1e-2 if effective_dtype(mx.nd.array([1.,])) == np.float16 else tol
for dil in [ (1,1,1), (2,2,2), (3,3,3) ]:
for ks in [ (3,3,3), (4,4,4), (2,3,4), (3,2,4), (1,1,1) ]:
test_run_convolution_dilated_impulse_response(dil=dil, kernel_shape=ks)
run_convolution_dilated_impulse_response(dil=dil, kernel_shape=ks, tol=tol3D)


@pytest.mark.serial
Expand Down