From 9733134115838026de280b08380bdb307cd39820 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Mon, 8 Jun 2020 22:56:43 +0000 Subject: [PATCH 01/36] remove xxOutput operators used in Module --- src/operator/regression_output.cc | 159 ------------------ src/operator/regression_output.cu | 53 ------ tests/python/unittest/test_operator.py | 221 ------------------------- 3 files changed, 433 deletions(-) delete mode 100644 src/operator/regression_output.cc delete mode 100644 src/operator/regression_output.cu diff --git a/src/operator/regression_output.cc b/src/operator/regression_output.cc deleted file mode 100644 index a337ec1ca1ad..000000000000 --- a/src/operator/regression_output.cc +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file regression_ouput.cc - * \brief Regression output operator. -*/ - -#include "./regression_output-inl.h" -#include "./elemwise_op_common.h" - - -#define MXNET_OPERATOR_REGISTER_REGRESSION_FWD(__name$, __kernel$, __bwdop$) \ - NNVM_REGISTER_OP(__name$) \ - MXNET_ADD_SPARSE_OP_ALIAS(__name$) \ - .set_num_inputs(2) \ - .set_num_outputs(1) \ - .set_attr("FListInputNames", \ - [](const NodeAttrs& attrs) { \ - return std::vector{"data", "label"}; \ - }) \ - .set_attr("FInferShape", RegressionOpShape) \ - .set_attr("FGradient", RegressionOpGrad{__bwdop$}) \ - .set_attr("FInferType", ElemwiseType<2, 1>) \ - .set_attr("FInplaceOption", \ - [](const NodeAttrs& attrs){ \ - return std::vector >{{0, 0}}; \ - }) \ - .set_attr("FCompute", RegressionForward) \ - .add_argument("data", "NDArray-or-Symbol", "Input data to the function.") \ - .add_argument("label", "NDArray-or-Symbol", "Input label to the function.") \ - .add_arguments(RegressionOutputParam::__FIELDS__()) - -#define MXNET_OPERATOR_REGISTER_REGRESSION_BWD(__name$, __kernel$) \ - NNVM_REGISTER_OP(__name$) \ - .set_num_inputs(2) \ - .set_num_outputs(2) \ - .set_attr_parser(ParamParser) \ - .set_attr("TIsBackward", true) \ - .set_attr("FInferType", ElemwiseType<2, 2>) \ - .set_attr("FInplaceOption", \ - [](const NodeAttrs& attrs){ \ - return std::vector >{{1, 0}}; \ - }) \ - .set_attr("FCompute", RegressionBackward) - -namespace mxnet { -namespace op { - - -DMLC_REGISTER_PARAMETER(RegressionOutputParam); - -MXNET_OPERATOR_REGISTER_REGRESSION_FWD(LinearRegressionOutput, - mshadow_op::identity, "_backward_linear_reg_out") -.set_attr("FInferStorageType", RegressionInferStorageType) -.set_attr("FComputeEx", RegressionForwardEx) -.describe(R"code(Computes and optimizes for squared loss during backward propagation. -Just outputs ``data`` during forward propagation. 
- -If :math:`\hat{y}_i` is the predicted value of the i-th sample, and :math:`y_i` is the corresponding target value, -then the squared loss estimated over :math:`n` samples is defined as - -:math:`\text{SquaredLoss}(\textbf{Y}, \hat{\textbf{Y}} ) = \frac{1}{n} \sum_{i=0}^{n-1} \lVert \textbf{y}_i - \hat{\textbf{y}}_i \rVert_2` - -.. note:: - Use the LinearRegressionOutput as the final output layer of a net. - -The storage type of ``label`` can be ``default`` or ``csr`` - -- LinearRegressionOutput(default, default) = default -- LinearRegressionOutput(default, csr) = default - -By default, gradients of this loss function are scaled by factor `1/m`, where m is the number of regression outputs of a training example. -The parameter `grad_scale` can be used to change this scale to `grad_scale/m`. - -)code" ADD_FILELINE); - -MXNET_OPERATOR_REGISTER_REGRESSION_BWD(_backward_linear_reg_out, mshadow_op::minus) -.set_attr("FInferStorageType", RegressionInferStorageType) -.set_attr("FComputeEx", RegressionBackwardEx); - -MXNET_OPERATOR_REGISTER_REGRESSION_FWD(MAERegressionOutput, - mshadow_op::identity, "_backward_mae_reg_out") -.describe(R"code(Computes mean absolute error of the input. - -MAE is a risk metric corresponding to the expected value of the absolute error. - -If :math:`\hat{y}_i` is the predicted value of the i-th sample, and :math:`y_i` is the corresponding target value, -then the mean absolute error (MAE) estimated over :math:`n` samples is defined as - -:math:`\text{MAE}(\textbf{Y}, \hat{\textbf{Y}} ) = \frac{1}{n} \sum_{i=0}^{n-1} \lVert \textbf{y}_i - \hat{\textbf{y}}_i \rVert_1` - -.. note:: - Use the MAERegressionOutput as the final output layer of a net. - -The storage type of ``label`` can be ``default`` or ``csr`` - -- MAERegressionOutput(default, default) = default -- MAERegressionOutput(default, csr) = default - -By default, gradients of this loss function are scaled by factor `1/m`, where m is the number of regression outputs of a training example. -The parameter `grad_scale` can be used to change this scale to `grad_scale/m`. - -)code" ADD_FILELINE); - -MXNET_OPERATOR_REGISTER_REGRESSION_BWD(_backward_mae_reg_out, mshadow_op::minus_sign); - -MXNET_OPERATOR_REGISTER_REGRESSION_FWD(LogisticRegressionOutput, - mshadow_op::sigmoid, "_backward_logistic_reg_out") -.set_attr("FInferStorageType", RegressionInferStorageType) -.set_attr("FComputeEx", RegressionForwardEx) -.describe(R"code(Applies a logistic function to the input. - -The logistic function, also known as the sigmoid function, is computed as -:math:`\frac{1}{1+exp(-\textbf{x})}`. - -Commonly, the sigmoid is used to squash the real-valued output of a linear model -:math:`wTx+b` into the [0,1] range so that it can be interpreted as a probability. -It is suitable for binary classification or probability prediction tasks. - -.. note:: - Use the LogisticRegressionOutput as the final output layer of a net. - -The storage type of ``label`` can be ``default`` or ``csr`` - -- LogisticRegressionOutput(default, default) = default -- LogisticRegressionOutput(default, csr) = default - -The loss function used is the Binary Cross Entropy Loss: - -:math:`-{(y\log(p) + (1 - y)\log(1 - p))}` - -Where `y` is the ground truth probability of positive outcome for a given example, and `p` the probability predicted by the model. By default, gradients of this loss function are scaled by factor `1/m`, where m is the number of regression outputs of a training example. -The parameter `grad_scale` can be used to change this scale to `grad_scale/m`. 
- -)code" ADD_FILELINE); - -MXNET_OPERATOR_REGISTER_REGRESSION_BWD(_backward_logistic_reg_out, mshadow_op::minus) -.set_attr("FInferStorageType", RegressionInferStorageType) -.set_attr("FComputeEx", RegressionBackwardEx); - -} // namespace op -} // namespace mxnet diff --git a/src/operator/regression_output.cu b/src/operator/regression_output.cu deleted file mode 100644 index ca11b84a212d..000000000000 --- a/src/operator/regression_output.cu +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file regression_ouput.cu - * \brief Regression output operator. -*/ -#include "./regression_output-inl.h" - - -namespace mxnet { -namespace op { - -NNVM_REGISTER_OP(LinearRegressionOutput) -.set_attr("FCompute", RegressionForward) -.set_attr("FComputeEx", RegressionForwardEx); - -NNVM_REGISTER_OP(_backward_linear_reg_out) -.set_attr("FCompute", RegressionBackward) -.set_attr("FComputeEx", RegressionBackwardEx); - -NNVM_REGISTER_OP(MAERegressionOutput) -.set_attr("FCompute", RegressionForward); - -NNVM_REGISTER_OP(_backward_mae_reg_out) -.set_attr("FCompute", RegressionBackward); - -NNVM_REGISTER_OP(LogisticRegressionOutput) -.set_attr("FCompute", RegressionForward) -.set_attr("FComputeEx", RegressionForwardEx); - -NNVM_REGISTER_OP(_backward_logistic_reg_out) -.set_attr("FCompute", RegressionBackward) -.set_attr("FComputeEx", RegressionBackwardEx); - -} // namespace op -} // namespace mxnet diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 0e4405379c13..5d6233e7274f 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -389,157 +389,6 @@ def check_slice_channel(data_ndim, axis, num_outputs, squeeze_axis): check_slice_channel(data_ndim=3, axis=-1, num_outputs=2, squeeze_axis=False) check_slice_channel(data_ndim=5, axis=-2, num_outputs=3, squeeze_axis=True) -@with_seed() -def test_regression(): - ''' test regression operator ''' - def check_regression(symbol, forward, backward, shape, stype='default', densities=[0, 0.5, 1]): - # init executor - data = mx.symbol.Variable('data') - label = mx.symbol.Variable('label', stype=stype) - out = symbol(data, label) - grad_req = {'data': 'write', 'label': 'null'} - out_exec = out.simple_bind(default_context(), grad_req=grad_req, - data=shape, label=shape) - arg_map = dict(zip(out.list_arguments(), out_exec.arg_arrays)) - grad_map = dict(zip(out.list_arguments(), out_exec.grad_arrays)) - # init data - arr_data = mx.random.uniform(-1, 1, shape) - arg_map["data"][:] = arr_data - # init label based on density - arr_label = arg_map["label"] - atol = 1e-5 - for density in densities: - arr_label[:] = rand_ndarray(shape, stype, density=density) - out_exec.forward(is_train=True) - 
out_exec.backward() - np_out = forward(arr_data.asnumpy()) - out_grad = backward(np_out, arr_label.asnumpy().reshape(np_out.shape)) / shape[1] - assert_almost_equal(out_exec.outputs[0], np_out, atol=atol) - assert_almost_equal(grad_map["data"], out_grad, atol=atol) - - shape = (50, 30) - - check_regression(mx.symbol.LogisticRegressionOutput, - lambda x: 1.0 / (1.0 + np.exp(-x)), - lambda x, y : x - y, - shape) - check_regression(mx.symbol.LinearRegressionOutput, - lambda x: x, - lambda x, y : x - y, - shape) - check_regression(mx.symbol.MAERegressionOutput, - lambda x: x, - lambda x, y : np.where(x > y, np.ones(x.shape), -np.ones(x.shape)), - shape) - check_regression(mx.symbol.LogisticRegressionOutput, - lambda x: 1.0 / (1.0 + np.exp(-x)), - lambda x, y : x - y, - shape, stype='csr') - check_regression(mx.symbol.LinearRegressionOutput, - lambda x: x, - lambda x, y : x - y, - shape, stype='csr') - - -def check_softmax_grad(xpu): - x = mx.sym.Variable('x') - label = mx.sym.Variable('label') - x_nd = mx.nd.array([[1, 6, 4, 2]], ctx=xpu) - grad_x = mx.nd.zeros((1,4), ctx=xpu) - label_nd = mx.nd.array([1], ctx=xpu) - - sym = mx.sym.SoftmaxOutput(data=x, label=label, ignore_label=0, use_ignore=False) - ex = sym.bind(ctx=xpu, args={'x': x_nd, 'label': label_nd}, args_grad={'x': grad_x}) - - ex.forward(is_train=True) - softmax_out = ex.outputs[0].asnumpy() - expected_softmax_out = [[0.005806628, 0.861780069, 0.116629249, 0.015784052]] - assert np.isclose(softmax_out, expected_softmax_out).all() - - ex.backward(is_train=True) - grad_out = ex.grad_arrays[0].asnumpy() - k = int(label_nd[0].asscalar()) - expected_grad_out = np.zeros((1,4)) - expected_grad_out[0, k] = -1 - assert np.isclose(grad_out - softmax_out, expected_grad_out).all() - - -def check_smoothed_softmax_grad(xpu): - alpha = 0.2 - x = mx.sym.Variable('x') - label = mx.sym.Variable('label') - x_nd = mx.nd.array([[1, 6, 4, 2]], ctx=xpu) - grad_x = mx.nd.zeros((1,4), ctx=xpu) - label_nd = mx.nd.array([1], ctx=xpu) - - sym = mx.sym.SoftmaxOutput(data=x, label=label, ignore_label=0, use_ignore=False, smooth_alpha=alpha) - ex = sym.bind(ctx=xpu, args={'x': x_nd, 'label': label_nd}, args_grad={'x': grad_x}) - - ex.forward(is_train=True) - softmax_out = ex.outputs[0].asnumpy() - expected_softmax_out = [[0.005806628, 0.861780069, 0.116629249, 0.015784052]] - assert np.isclose(softmax_out, expected_softmax_out).all() - - ex.backward(is_train=True) - grad_out = ex.grad_arrays[0].asnumpy() - k = int(label_nd[0].asscalar()) - expected_grad_out = np.full((1,4), fill_value=-alpha/float(4-1)) - expected_grad_out[0, k] = - (1 - alpha) - assert np.isclose(grad_out - softmax_out, expected_grad_out).all() - - -def check_softmax_with_ignore_label(xpu): - X = mx.symbol.Variable('X') - L = mx.symbol.Variable('L') - Y = mx.symbol.SoftmaxOutput(data=X, label=L, ignore_label=0, use_ignore=True) - - shape = (20, 10) - x = mx.nd.empty(shape, ctx = xpu) - l = mx.nd.empty((shape[0],), ctx = xpu) - x_np = np.random.rand(*shape) - l_np = np.random.randint(0, shape[1]-1, (shape[0],)) - x[:] = x_np - l[:] = l_np - - grad = mx.nd.empty(shape, ctx = xpu) - - exec1 = Y.bind(xpu, args = [x, l], args_grad = {'X': grad}) - exec1.forward(is_train=True) - exec1.backward() - - grad0 = grad.asnumpy() - - for i in range(int(shape[0]/2)): - l_np[i] = 0 - l[:] = l_np - - exec1.forward(is_train=True) - exec1.backward() - grad1 = grad.asnumpy() - - assert abs(np.sum(grad1[:int(shape[0]/2)])) < 1e-5 - assert_almost_equal(grad0[int(shape[0]/2):], grad1[int(shape[0]/2):]) - - -def 
check_softmax_with_shape(shape, xpu, preserve_shape=False): - # bind with label - X = mx.symbol.Variable('X') - L = mx.symbol.Variable('L') - Y = mx.symbol.SoftmaxOutput(data=X, label=L, preserve_shape=preserve_shape) - x = mx.random.uniform(-1, 1, shape, ctx=xpu) - l = mx.random.uniform(-1, 1, shape, ctx=xpu) - l[:] = np_softmax(l.asnumpy()) - grad = mx.nd.empty(shape, ctx = xpu) - exec1 = Y.bind(xpu, args = [x, l], args_grad = {'X': grad}) - exec1.forward(is_train=True) - out = exec1.outputs[0].asnumpy() - # Non-zero atol required by test_softmax with seed 781663739 - rtol = 1e-4 - atol = 1e-6 - assert_almost_equal(out, np_softmax(x.asnumpy()), rtol=rtol, atol=atol) - exec1.backward() - assert_almost_equal(grad, np_softmax(x.asnumpy()) - l.asnumpy(), rtol=rtol, atol=atol) - def test_python_op(): X = mx.symbol.Variable('X') @@ -7391,76 +7240,6 @@ def test_softmax(): check_smoothed_softmax_grad(default_context()) -@xfail_when_nonstandard_decimal_separator -@with_seed() -def test_softmax_output_normalization(): - def _softmaxoutput_normalization(multi_output, use_ignore, normalization): - grad_scale = np.random.random() - batch_size = 8 - num_labels = 6 - H, W = 3, 3 - ignore_label = np.random.randint(0, num_labels) if use_ignore else -1 - - if multi_output: - data_shape = (batch_size, num_labels, H, W) - label_shape = (batch_size, H, W) - else: - data_shape = (batch_size, num_labels) - label_shape = (batch_size, ) - - data = mx.nd.random.uniform(-1, 1, shape=data_shape) - label = mx.nd.random.randint( - 0, num_labels, shape=label_shape).astype('float32') - data.attach_grad() - - kwargs = dict(grad_scale=grad_scale, - normalization=normalization, multi_output=multi_output) - if use_ignore: - kwargs.update(use_ignore=True, ignore_label=ignore_label) - - with mx.autograd.record(): - out = mx.nd.SoftmaxOutput(data=data, label=label, **kwargs) - out.backward(mx.nd.ones_like(data)) - - exp_data = mx.nd.exp(data) - softmax_data = exp_data / exp_data.sum(1, keepdims=True) - argmax_data = mx.nd.argmax(data, axis=1) - - assert_almost_equal(out.asnumpy(), softmax_data.asnumpy()) - one_hot_label = mx.nd.one_hot(label, num_labels) - if multi_output: - one_hot_label = one_hot_label.transpose((0, 3, 1, 2)) - data_grad = softmax_data - one_hot_label - - if use_ignore: - if multi_output: - data_grad *= (label != - ignore_label).reshape((batch_size, 1, H, W)) - else: - data_grad *= (label != ignore_label).reshape((batch_size, 1)) - - valid_cnt = 1 - if normalization == 'batch': - valid_cnt = batch_size - elif normalization == 'valid': - valid_cnt = mx.nd.maximum(1, (label != ignore_label).sum()) - scale = grad_scale / valid_cnt - - if multi_output: - if normalization != 'valid': - scale /= H * W - - data_grad *= scale - - assert_almost_equal(data.grad.asnumpy(), data_grad.asnumpy()) - - for multi_output in [False, True]: - for use_ignore in [False, True]: - for normalization in ['null', 'batch', 'valid']: - _softmaxoutput_normalization( - multi_output, use_ignore, normalization) - - @with_seed() @pytest.mark.serial def test_slice(): From d3d1a97cb007840d18d300e54c47cd1551aaa60b Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Mon, 8 Jun 2020 22:59:55 +0000 Subject: [PATCH 02/36] remove SVMOutput --- tests/python/gpu/test_operator_gpu.py | 12 ----- tests/python/unittest/test_operator.py | 61 -------------------------- 2 files changed, 73 deletions(-) diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index b6d0011f1a2f..dd3cd1b0388c 100644 --- 
a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -1659,18 +1659,6 @@ def test_embedding_helper(data_types, weight_types, low_pad, high_pad): test_embedding_helper(data_types, weight_types, 0, 5) -@with_seed() -def test_svmoutput_with_type(): - sym = mx.sym.SVMOutput(name='svmoutput', use_linear=True) - ctx_list = [{'ctx': mx.gpu(0), 'svmoutput_data': (20, 10), 'type_dict': {'svmoutput_data': np.float64}}, - {'ctx': mx.gpu(0), 'svmoutput_data': (20, 10), 'type_dict': {'svmoutput_data': np.float32}}, - {'ctx': mx.gpu(0), 'svmoutput_data': (20, 10), 'type_dict': {'svmoutput_data': np.float16}}, - {'ctx': mx.cpu(0), 'svmoutput_data': (20, 10), 'type_dict': {'svmoutput_data': np.float64}}, - {'ctx': mx.cpu(0), 'svmoutput_data': (20, 10), 'type_dict': {'svmoutput_data': np.float32}}, - {'ctx': mx.cpu(0), 'svmoutput_data': (20, 10), 'type_dict': {'svmoutput_data': np.float16}}] - check_consistency(sym, ctx_list, use_uniform=True) - - @with_seed() def test_take_with_type(): sym = mx.sym.take(name='take') diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 5d6233e7274f..ec8506a4f1e4 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -3275,67 +3275,6 @@ def test_infer_type(dtype): unittest_correlation((5,1,11,11), kernel_size = 5,max_displacement = 1,stride1 = 1,stride2 = 1,pad_size = 2,is_multiply = False, dtype = dtype) -@with_seed() -def test_support_vector_machine_l1_svm(): - xpu = default_context() - shape = (20, 10) - - X = mx.symbol.Variable('X') - L = mx.symbol.Variable('L') - Y = mx.symbol.SVMOutput(data=X, label=L, use_linear=True) - x = mx.nd.empty(shape, ctx = xpu) - l = mx.nd.empty((shape[0],), ctx = xpu) - x_np = np.random.rand(*shape) - l_np = np.random.randint(0, shape[1], (shape[0],)) - x[:] = x_np - l[:] = l_np - - grad = mx.nd.empty(shape, ctx = xpu) - exec1 = Y.bind(xpu, args = [x, l], args_grad = {'X': grad}) - exec1.forward(is_train=True) - - assert_almost_equal(x_np, exec1.outputs[0]) - - exec1.backward() - - l_mask = np.equal(l_np.reshape(shape[0],1),range(shape[1])) - l_mask = np.array(l_mask, dtype=np.float32)*2 -1 - grad_np = (-1) * l_mask * np.greater(1 - l_mask * x_np, 0) - - assert_almost_equal(grad_np, grad) - - -@with_seed() -def test_support_vector_machine_l2_svm(): - xpu = default_context() - shape = (20, 10) - - X = mx.symbol.Variable('X') - L = mx.symbol.Variable('L') - Y = mx.symbol.SVMOutput(data=X, label=L) - x = mx.nd.empty(shape, ctx = xpu) - l = mx.nd.empty((shape[0],), ctx = xpu) - x_np = np.random.rand(*shape) - x_np = x_np.astype(np.float32) - l_np = np.random.randint(0, shape[1], (shape[0],)) - x[:] = x_np - l[:] = l_np - - grad = mx.nd.empty(shape, ctx = xpu) - exec1 = Y.bind(xpu, args = [x, l], args_grad = {'X': grad}) - exec1.forward(is_train=True) - - assert_almost_equal(x_np, exec1.outputs[0]) - - exec1.backward() - - l_mask = np.equal(l_np.reshape(shape[0],1),range(shape[1])) - l_mask = np.array(l_mask, dtype=np.float32)*2 -1 - grad_np = (-2)*l_mask*np.maximum(1-l_mask*x_np,0) - grad_np = grad_np.astype(np.float32) - assert_almost_equal(grad_np, grad) - - # Seed set because the test is not robust enough to operate on random data @with_seed(1234) def test_roipooling(): From 7c92f119b5f79f9f2c6e5112b80c2f70c43d8df4 Mon Sep 17 00:00:00 2001 From: Lin Date: Wed, 10 Jun 2020 10:52:06 -0700 Subject: [PATCH 03/36] remove RegressionOutput in language binding --- R-package/R/mlp.R | 82 --- R-package/R/rnn.graph.R | 372 
------------ R-package/R/viz.graph.R | 5 - R-package/tests/testthat/test_model.R | 200 ------- R-package/tests/testthat/test_optimizer.R | 251 -------- R-package/tests/testthat/test_symbol.R | 11 - R-package/vignettes/CallbackFunction.Rmd | 160 ----- R-package/vignettes/CustomLossFunction.Rmd | 151 ----- .../vignettes/fiveMinutesNeuralNetwork.Rmd | 173 ------ .../nd_operations/nn_basic_operators.py | 5 - benchmark/opperf/utils/op_registry_utils.py | 8 +- cpp-package/example/CMakeLists.txt | 3 - cpp-package/example/test_regress_label.cpp | 57 -- .../packages/ndarray/sparse/train.md | 117 +--- .../tutorials/performance/backend/amp.md | 5 +- .../src/pages/api/faq/visualize_graph.md | 88 --- .../api/r/docs/tutorials/callback_function.md | 278 --------- .../r/docs/tutorials/custom_loss_function.md | 231 ------- .../tutorials/five_minutes_neural_network.md | 341 ----------- .../src/pages/api/r/docs/tutorials/ndarray.md | 2 - .../src/pages/api/r/docs/tutorials/symbol.md | 2 - example/recommenders/demo-MF.R | 84 --- julia/examples/regression-example.jl | 101 ---- julia/test/unittest/symbolic-node.jl | 22 - .../sparse/matrix_factorization/README.md | 26 - .../sparse/matrix_factorization/get_data.sh | 33 - .../sparse/matrix_factorization/train.pl | 184 ------ perl-package/AI-MXNet/t/test_module.t | 565 ------------------ python/mxnet/contrib/amp/lists/symbol_bf16.py | 12 - python/mxnet/contrib/amp/lists/symbol_fp16.py | 12 - .../scala/org/apache/mxnet/ModuleSuite.scala | 205 ------- .../org/apache/mxnet/OperatorSuite.scala | 35 -- tests/nightly/test_large_array.py | 68 --- tests/nightly/test_large_vector.py | 29 - tests/tutorials/test_sanity_tutorials.py | 3 - 35 files changed, 7 insertions(+), 3914 deletions(-) delete mode 100644 R-package/R/mlp.R delete mode 100644 R-package/R/rnn.graph.R delete mode 100644 R-package/tests/testthat/test_optimizer.R delete mode 100644 R-package/vignettes/CallbackFunction.Rmd delete mode 100644 R-package/vignettes/CustomLossFunction.Rmd delete mode 100644 R-package/vignettes/fiveMinutesNeuralNetwork.Rmd delete mode 100644 cpp-package/example/test_regress_label.cpp delete mode 100644 docs/static_site/src/pages/api/faq/visualize_graph.md delete mode 100644 docs/static_site/src/pages/api/r/docs/tutorials/callback_function.md delete mode 100644 docs/static_site/src/pages/api/r/docs/tutorials/custom_loss_function.md delete mode 100644 docs/static_site/src/pages/api/r/docs/tutorials/five_minutes_neural_network.md delete mode 100644 example/recommenders/demo-MF.R delete mode 100644 julia/examples/regression-example.jl delete mode 100644 perl-package/AI-MXNet/examples/sparse/matrix_factorization/README.md delete mode 100755 perl-package/AI-MXNet/examples/sparse/matrix_factorization/get_data.sh delete mode 100755 perl-package/AI-MXNet/examples/sparse/matrix_factorization/train.pl diff --git a/R-package/R/mlp.R b/R-package/R/mlp.R deleted file mode 100644 index 3cfa06e967ce..000000000000 --- a/R-package/R/mlp.R +++ /dev/null @@ -1,82 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -#' Convenience interface for multiple layer perceptron -#' -#' @param data the input matrix. Only mx.io.DataIter and R array/matrix types supported. -#' @param label the training label. Only R array type supported. -#' @param hidden_node a vector containing number of hidden nodes on each hidden layer as well as the output layer. -#' @param out_node the number of nodes on the output layer. -#' @param dropout a number in [0,1) containing the dropout ratio from the last hidden layer to the output layer. -#' @param activation either a single string or a vector containing the names of the activation functions. -#' @param out_activation a single string containing the name of the output activation function. -#' @param ctx whether train on cpu (default) or gpu. -#' @param eval.metric the evaluation metric/ -#' @param ... other parameters passing to \code{mx.model.FeedForward.create}/ -#' -#' @examples -#' -#' require(mlbench) -#' data(Sonar, package="mlbench") -#' Sonar[,61] = as.numeric(Sonar[,61])-1 -#' train.ind = c(1:50, 100:150) -#' train.x = data.matrix(Sonar[train.ind, 1:60]) -#' train.y = Sonar[train.ind, 61] -#' test.x = data.matrix(Sonar[-train.ind, 1:60]) -#' test.y = Sonar[-train.ind, 61] -#' model = mx.mlp(train.x, train.y, hidden_node = 10, out_node = 2, out_activation = "softmax", -#' learning.rate = 0.1) -#' preds = predict(model, test.x) -#' -#' @export -mx.mlp <- function(data, label, hidden_node = 1, out_node, dropout = NULL, - activation = "tanh", out_activation = "softmax", - ctx = mx.ctx.default(), ...) { - - m <- length(hidden_node) - if (!is.null(dropout)) { - if (length(dropout) != 1) { - stop("only accept dropout ratio of length 1.") - } - dropout = max(0,min(dropout, 1-1e-7)) - } - - # symbol construction - act <- mx.symbol.Variable("data") - if (length(activation) == 1) { - activation <- rep(activation, m) - } else { - if (length(activation) != m) { - stop(paste("Length of activation should be",m)) - } - } - for (i in seq_len(m)) { - fc <- mx.symbol.FullyConnected(act, num_hidden=hidden_node[i]) - act <- mx.symbol.Activation(fc, act_type=activation[i]) - if (i == m && !is.null(dropout)) { - act <- mx.symbol.Dropout(act, p = dropout) - } - } - fc <- mx.symbol.FullyConnected(act, num_hidden=out_node) - out <- switch(out_activation, - "rmse" = mx.symbol.LinearRegressionOutput(fc), - "softmax" = mx.symbol.SoftmaxOutput(fc), - "logistic" = mx.symbol.LogisticRegressionOutput(fc), - stop("Not supported yet.")) - model <- mx.model.FeedForward.create(out, X=data, y=label, ctx = ctx, ...) - return(model) -} diff --git a/R-package/R/rnn.graph.R b/R-package/R/rnn.graph.R deleted file mode 100644 index 1225fa511b51..000000000000 --- a/R-package/R/rnn.graph.R +++ /dev/null @@ -1,372 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -#' Generate a RNN symbolic model - requires CUDA -#' -#' @param config Either seq-to-one or one-to-one -#' @param cell_type Type of RNN cell: either gru or lstm -#' @param num_rnn_layer int, number of stacked layers -#' @param num_hidden int, size of the state in each RNN layer -#' @param num_embed int, default = NULL - no embedding. Dimension of the embedding vectors -#' @param num_decode int, number of output variables in the decoding layer -#' @param input_size int, number of levels in the data - only used for embedding -#' @param dropout -#' -#' @export -rnn.graph <- function (num_rnn_layer, input_size = NULL, num_embed = NULL, - num_hidden, num_decode, dropout = 0, ignore_label = -1, bidirectional = F, - loss_output = NULL, config, cell_type, masking = F, output_last_state = F, - rnn.state = NULL, rnn.state.cell = NULL, prefix = "") { - - data <- mx.symbol.Variable("data") - label <- mx.symbol.Variable("label") - seq.mask <- mx.symbol.Variable("seq.mask") - if (!is.null(num_embed)) - embed.weight <- mx.symbol.Variable("embed.weight") - rnn.params.weight <- mx.symbol.Variable("rnn.params.weight") - - if (is.null(rnn.state)) rnn.state <- mx.symbol.Variable("rnn.state") - if (cell_type == "lstm" & is.null(rnn.state.cell)) { - rnn.state.cell <- mx.symbol.Variable("rnn.state.cell") - } - - cls.weight <- mx.symbol.Variable("cls.weight") - cls.bias <- mx.symbol.Variable("cls.bias") - if (!is.null(num_embed)) { - data <- mx.symbol.Embedding(data = data, input_dim = input_size, - weight = embed.weight, output_dim = num_embed, name = "embed") - } - - data = mx.symbol.swapaxes(data = data, dim1 = 0, dim2 = 1, name = paste0(prefix, "swap_pre")) - - if (cell_type == "lstm") { - rnn <- mx.symbol.RNN(data = data, state = rnn.state, - state_cell = rnn.state.cell, parameters = rnn.params.weight, - state.size = num_hidden, num.layers = num_rnn_layer, - bidirectional = bidirectional, mode = cell_type, state.outputs = output_last_state, - p = dropout, name = paste0(prefix, "RNN")) - } else { - rnn <- mx.symbol.RNN(data = data, state = rnn.state, - parameters = rnn.params.weight, state.size = num_hidden, - num.layers = num_rnn_layer, bidirectional = bidirectional, mode = cell_type, - state.outputs = output_last_state, p = dropout, - name = paste0(prefix, "RNN")) - } - - if (config == "seq-to-one") { - if (masking) mask <- mx.symbol.SequenceLast(data = rnn[[1]], use.sequence.length = T, sequence_length = seq.mask, name = "mask") else - mask <- mx.symbol.SequenceLast(data = rnn[[1]], use.sequence.length = F, name = "mask") - - if (!is.null(loss_output)) { - decode <- mx.symbol.FullyConnected(data = mask, weight = cls.weight, bias = cls.bias, num_hidden = num_decode, name = "decode") - out <- switch(loss_output, softmax = mx.symbol.SoftmaxOutput(data = decode, label = label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, name = "loss"), - linear = mx.symbol.LinearRegressionOutput(data = decode, label = label, name = "loss"), - logistic = mx.symbol.LogisticRegressionOutput(data = decode, label = label, name = "loss"), - MAE = mx.symbol.MAERegressionOutput(data = decode, label = 
label, name = "loss")) - } - else out <- mask - } - - else if (config == "one-to-one") { - - if (masking) mask <- mx.symbol.SequenceMask(data = rnn[[1]], use.sequence.length = T, sequence_length = seq.mask, value = 0, name = "mask") else - mask <- mx.symbol.identity(data = rnn[[1]], name = "mask") - - mask = mx.symbol.swapaxes(data = mask, dim1 = 0, dim2 = 1, name = paste0(prefix, "swap_post")) - - if (!is.null(loss_output)) { - - mask <- mx.symbol.reshape(data = mask, shape = c(0, -1), reverse = TRUE) - label <- mx.symbol.reshape(data = label, shape = c(-1)) - - decode <- mx.symbol.FullyConnected(data = mask, weight = cls.weight, bias = cls.bias, num_hidden = num_decode, - flatten = TRUE, name = paste0(prefix, "decode")) - - out <- switch(loss_output, softmax = mx.symbol.SoftmaxOutput(data = decode, label = label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, name = "loss"), - linear = mx.symbol.LinearRegressionOutput(data = decode, label = label, name = "loss"), - logistic = mx.symbol.LogisticRegressionOutput(data = decode, label = label, name = "loss"), - MAE = mx.symbol.MAERegressionOutput(data = decode, label = label, name = "loss")) - } else out <- mask - } - return(out) -} - -# LSTM cell symbol -lstm.cell <- function(num_hidden, indata, prev.state, param, seqidx, layeridx, dropout = 0, prefix = "") { - - if (dropout > 0 && layeridx > 1) - indata <- mx.symbol.Dropout(data = indata, p = dropout) - - i2h <- mx.symbol.FullyConnected(data = indata, weight = param$i2h.weight, bias = param$i2h.bias, - num_hidden = num_hidden * 4, name = paste0(prefix, "t", seqidx, ".l", layeridx, ".i2h")) - - if (!is.null(prev.state)) { - h2h <- mx.symbol.FullyConnected(data = prev.state$h, weight = param$h2h.weight, - bias = param$h2h.bias, num_hidden = num_hidden * 4, - name = paste0(prefix, "t", seqidx, ".l", layeridx, ".h2h")) - gates <- i2h + h2h - } else { - gates <- i2h - } - - split.gates <- mx.symbol.split(gates, num.outputs = 4, axis = 1, squeeze.axis = F, - name = paste0(prefix, "t", seqidx, ".l", layeridx, ".slice")) - - in.gate <- mx.symbol.Activation(split.gates[[1]], act.type = "sigmoid") - in.transform <- mx.symbol.Activation(split.gates[[2]], act.type = "tanh") - forget.gate <- mx.symbol.Activation(split.gates[[3]], act.type = "sigmoid") - out.gate <- mx.symbol.Activation(split.gates[[4]], act.type = "sigmoid") - - if (is.null(prev.state)) { - next.c <- in.gate * in.transform - } else { - next.c <- (forget.gate * prev.state$c) + (in.gate * in.transform) - } - - next.h <- out.gate * mx.symbol.Activation(next.c, act.type = "tanh") - - return(list(h = next.h, c = next.c)) -} - - -# GRU cell symbol -gru.cell <- function(num_hidden, indata, prev.state, param, seqidx, layeridx, dropout = 0, prefix) -{ - if (dropout > 0 && layeridx > 1) - indata <- mx.symbol.Dropout(data = indata, p = dropout) - - i2h <- mx.symbol.FullyConnected(data = indata, weight = param$gates.i2h.weight, - bias = param$gates.i2h.bias, num_hidden = num_hidden * 2, - name = paste0(prefix, "t", seqidx, ".l", layeridx, ".gates.i2h")) - - if (!is.null(prev.state)) { - h2h <- mx.symbol.FullyConnected(data = prev.state$h, weight = param$gates.h2h.weight, - bias = param$gates.h2h.bias, num_hidden = num_hidden * 2, - name = paste0(prefix, "t", seqidx, ".l", layeridx, ".gates.h2h")) - gates <- i2h + h2h - } else { - gates <- i2h - } - - split.gates <- mx.symbol.split(gates, num.outputs = 2, axis = 1, squeeze.axis = F, - name = paste0(prefix, "t", seqidx, ".l", layeridx, ".split")) - - update.gate <- 
mx.symbol.Activation(split.gates[[1]], act.type = "sigmoid") - reset.gate <- mx.symbol.Activation(split.gates[[2]], act.type = "sigmoid") - - htrans.i2h <- mx.symbol.FullyConnected(data = indata, weight = param$trans.i2h.weight, - bias = param$trans.i2h.bias, num_hidden = num_hidden, - name = paste0(prefix, "t", seqidx, ".l", layeridx, ".trans.i2h")) - - if (is.null(prev.state)) { - h.after.reset <- reset.gate * 0 - } else { - h.after.reset <- prev.state$h * reset.gate - } - - htrans.h2h <- mx.symbol.FullyConnected(data = h.after.reset, weight = param$trans.h2h.weight, - bias = param$trans.h2h.bias, num_hidden = num_hidden, - name = paste0(prefix, "t", seqidx, ".l", layeridx, ".trans.h2h")) - - h.trans <- htrans.i2h + htrans.h2h - h.trans.active <- mx.symbol.Activation(h.trans, act.type = "tanh") - - if (is.null(prev.state)) { - next.h <- update.gate * h.trans.active - } else { - next.h <- prev.state$h + update.gate * (h.trans.active - prev.state$h) - } - - return(list(h = next.h)) -} - - -#' Unroll representation of RNN running on non CUDA device -#' -#' @param config Either seq-to-one or one-to-one -#' @param cell_type Type of RNN cell: either gru or lstm -#' @param num_rnn_layer int, number of stacked layers -#' @param seq_len int, number of time steps to unroll -#' @param num_hidden int, size of the state in each RNN layer -#' @param num_embed int, default = NULL - no embedding. Dimension of the embedding vectors -#' @param num_decode int, number of output variables in the decoding layer -#' @param input_size int, number of levels in the data - only used for embedding -#' @param dropout -#' -#' @export -rnn.graph.unroll <- function(num_rnn_layer, - seq_len, - input_size = NULL, - num_embed = NULL, - num_hidden, - num_decode, - dropout = 0, - ignore_label = -1, - loss_output = NULL, - init.state = NULL, - config, - cell_type = "lstm", - masking = F, - output_last_state = F, - prefix = "", - data_name = "data", - label_name = "label") { - - if (!is.null(num_embed)) embed.weight <- mx.symbol.Variable(paste0(prefix, "embed.weight")) - - # Initial state - if (is.null(init.state) & output_last_state) { - init.state <- lapply(1:num_rnn_layer, function(i) { - if (cell_type=="lstm") { - state <- list(h = mx.symbol.Variable(paste0("init_", prefix, i, "_h")), - c = mx.symbol.Variable(paste0("init_", prefix, i, "_c"))) - } else if (cell_type=="gru") { - state <- list(h = mx.symbol.Variable(paste0("init_", prefix, i, "_h"))) - } - return (state) - }) - } - - cls.weight <- mx.symbol.Variable(paste0(prefix, "cls.weight")) - cls.bias <- mx.symbol.Variable(paste0(prefix, "cls.bias")) - - param.cells <- lapply(1:num_rnn_layer, function(i) { - - if (cell_type=="lstm") { - cell <- list(i2h.weight = mx.symbol.Variable(paste0(prefix, "l", i, ".i2h.weight")), - i2h.bias = mx.symbol.Variable(paste0(prefix, "l", i, ".i2h.bias")), - h2h.weight = mx.symbol.Variable(paste0(prefix, "l", i, ".h2h.weight")), - h2h.bias = mx.symbol.Variable(paste0(prefix, "l", i, ".h2h.bias"))) - } else if (cell_type=="gru") { - cell <- list(gates.i2h.weight = mx.symbol.Variable(paste0(prefix, "l", i, ".gates.i2h.weight")), - gates.i2h.bias = mx.symbol.Variable(paste0(prefix, "l", i, ".gates.i2h.bias")), - gates.h2h.weight = mx.symbol.Variable(paste0(prefix, "l", i, ".gates.h2h.weight")), - gates.h2h.bias = mx.symbol.Variable(paste0(prefix, "l", i, ".gates.h2h.bias")), - trans.i2h.weight = mx.symbol.Variable(paste0(prefix, "l", i, ".trans.i2h.weight")), - trans.i2h.bias = mx.symbol.Variable(paste0(prefix, "l", i, ".trans.i2h.bias")), 
- trans.h2h.weight = mx.symbol.Variable(paste0(prefix, "l", i, ".trans.h2h.weight")), - trans.h2h.bias = mx.symbol.Variable(paste0(prefix, "l", i, ".trans.h2h.bias"))) - } - return (cell) - }) - - # embeding layer - data <- mx.symbol.Variable(data_name) - label <- mx.symbol.Variable(label_name) - seq.mask <- mx.symbol.Variable(paste0(prefix, "seq.mask")) - - data = mx.symbol.swapaxes(data = data, dim1 = 0, dim2 = 1, name = paste0(prefix, "swap_pre")) - - if (!is.null(num_embed)) { - data <- mx.symbol.Embedding(data = data, input_dim = input_size, - weight=embed.weight, output_dim = num_embed, name = paste0(prefix, "embed")) - } - - data <- mx.symbol.split(data = data, axis = 0, num.outputs = seq_len, squeeze_axis = T) - - last.hidden <- list() - last.states <- list() - - for (seqidx in 1:seq_len) { - hidden <- data[[seqidx]] - - for (i in 1:num_rnn_layer) { - - if (seqidx==1) prev.state <- init.state[[i]] else - prev.state <- last.states[[i]] - - if (cell_type=="lstm") { - cell.symbol <- lstm.cell - } else if (cell_type=="gru"){ - cell.symbol <- gru.cell - } - - next.state <- cell.symbol(num_hidden = num_hidden, - indata = hidden, - prev.state = prev.state, - param = param.cells[[i]], - seqidx = seqidx, - layeridx = i, - dropout = dropout, - prefix = prefix) - - hidden <- next.state$h - last.states[[i]] <- next.state - } - - # Aggregate outputs from each timestep - last.hidden <- c(last.hidden, hidden) - } - - if (output_last_state) { - out.states = mx.symbol.Group(unlist(last.states)) - } - - # concat hidden units - concat seq_len blocks of dimension num_hidden x batch.size - concat <- mx.symbol.concat(data = last.hidden, num.args = seq_len, dim = 0, name = paste0(prefix, "concat")) - concat <- mx.symbol.reshape(data = concat, shape = c(num_hidden, -1, seq_len), name = paste0(prefix, "rnn_reshape")) - - if (config=="seq-to-one") { - - if (masking) mask <- mx.symbol.SequenceLast(data=concat, use.sequence.length = T, sequence_length = seq.mask, name = paste0(prefix, "mask")) else - mask <- mx.symbol.SequenceLast(data=concat, use.sequence.length = F, name = paste0(prefix, "mask")) - - if (!is.null(loss_output)) { - - decode <- mx.symbol.FullyConnected(data = mask, - weight = cls.weight, - bias = cls.bias, - num_hidden = num_decode, - name = paste0(prefix, "decode")) - - out <- switch(loss_output, - softmax = mx.symbol.SoftmaxOutput(data=decode, label=label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, name = paste0(prefix, "loss")), - linear = mx.symbol.LinearRegressionOutput(data=decode, label=label, name = paste0(prefix, "loss")), - logistic = mx.symbol.LogisticRegressionOutput(data=decode, label=label, paste0(prefix, name = "loss")), - MAE = mx.symbol.MAERegressionOutput(data=decode, label=label, paste0(prefix, name = "loss")) - ) - } else out <- mask - - } else if (config=="one-to-one"){ - - if (masking) mask <- mx.symbol.SequenceMask(data = concat, use.sequence.length = T, sequence_length = seq.mask, value = 0, name = paste0(prefix, "mask")) else - mask <- mx.symbol.identity(data = concat, name = paste0(prefix, "mask")) - - mask = mx.symbol.swapaxes(data = mask, dim1 = 0, dim2 = 1, name = paste0(prefix, "swap_post")) - - if (!is.null(loss_output)) { - - mask <- mx.symbol.reshape(data = mask, shape = c(0, -1), reverse = TRUE) - label <- mx.symbol.reshape(data = label, shape = c(-1)) - - decode <- mx.symbol.FullyConnected(data = mask, weight = cls.weight, bias = cls.bias, num_hidden = num_decode, - flatten = T, name = paste0(prefix, "decode")) - - out <- 
switch(loss_output, - softmax = mx.symbol.SoftmaxOutput(data=decode, label=label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, - name = paste0(prefix, "loss")), - linear = mx.symbol.LinearRegressionOutput(data=decode, label=label, name = paste0(prefix, "loss")), - logistic = mx.symbol.LogisticRegressionOutput(data=decode, label=label, name = paste0(prefix, "loss")), - MAE = mx.symbol.MAERegressionOutput(data=decode, label=label, name = paste0(prefix, "loss")) - ) - } else out <- mask - } - - if (output_last_state) { - return(mx.symbol.Group(c(out, out.states))) - } else return(out) -} diff --git a/R-package/R/viz.graph.R b/R-package/R/viz.graph.R index ab876afdfa1e..488ed24818a1 100644 --- a/R-package/R/viz.graph.R +++ b/R-package/R/viz.graph.R @@ -57,12 +57,7 @@ graph.viz <- function(symbol, shape=NULL, direction="TD", type="graph", graph.wi "Flatten" = , "Reshape" = , "Concat" = "#fdb462", - "LinearRegressionOutput"=, - "MAERegressionOutput"=, - "SVMOutput"=, - "LogisticRegressionOutput"=, "MakeLoss"=, - "SoftmaxOutput" = "#b3de69", "#fccde5" # default value ) } diff --git a/R-package/tests/testthat/test_model.R b/R-package/tests/testthat/test_model.R index e62f334a4ab8..1cc21db87d6e 100644 --- a/R-package/tests/testthat/test_model.R +++ b/R-package/tests/testthat/test_model.R @@ -73,42 +73,6 @@ test_that("MNIST", { file.remove("chkpt-symbol.json") }) -test_that("Regression", { - data(BostonHousing, package = "mlbench") - train.ind <- seq(1, 506, 3) - train.x <- data.matrix(BostonHousing[train.ind, -14]) - train.y <- BostonHousing[train.ind, 14] - test.x <- data.matrix(BostonHousing[-train.ind, -14]) - test.y <- BostonHousing[-train.ind, 14] - data <- mx.symbol.Variable("data") - fc1 <- mx.symbol.FullyConnected(data, num_hidden = 1) - lro <- mx.symbol.LinearRegressionOutput(fc1) - - demo.metric.mae <- mx.metric.custom("mae", function(label, pred) { - pred <- mx.nd.reshape(pred, shape = 0) - res <- mx.nd.mean(mx.nd.abs(label - pred)) - return(as.array(res)) - }) - mx.set.seed(0) - model <- mx.model.FeedForward.create(lro, X = train.x, y = train.y, ctx = mx.ctx.default(), - num.round = 5, array.batch.size = 20, learning.rate = 2e-06, momentum = 0.9, - eval.metric = demo.metric.mae) - - train.x <- data.matrix(BostonHousing[train.ind, -(13:14)]) - train.y <- BostonHousing[train.ind, c(13:14)] - test.x <- data.matrix(BostonHousing[-train.ind, -(13:14)]) - test.y <- BostonHousing[-train.ind, c(13:14)] - - data <- mx.symbol.Variable("data") - fc2 <- mx.symbol.FullyConnected(data, num_hidden = 2) - lro2 <- mx.symbol.LinearRegressionOutput(fc2) - - mx.set.seed(0) - train_iter <- mx.io.arrayiter(data = t(train.x), label = t(train.y)) - - model <- mx.model.FeedForward.create(lro2, X = train_iter, ctx = mx.ctx.default(), - num.round = 50, array.batch.size = 20, learning.rate = 2e-06, momentum = 0.9) -}) test_that("Classification", { @@ -124,167 +88,3 @@ test_that("Classification", { num.round = 5, array.batch.size = 15, learning.rate = 0.07, momentum = 0.9, eval.metric = mx.metric.accuracy) }) - -test_that("Fine-tune", { - GetInception() - GetCatDog() - train_iter <- mx.io.ImageRecordIter(path.imgrec = "./data/cats_dogs/cats_dogs_train.rec", - batch.size = 8, data.shape = c(224, 224, 3), rand.crop = TRUE, rand.mirror = TRUE) - val_iter <- mx.io.ImageRecordIter(path.imgrec = "./data/cats_dogs/cats_dogs_val.rec", - batch.size = 8, data.shape = c(224, 224, 3), rand.crop = FALSE, rand.mirror = FALSE) - inception_bn <- mx.model.load("./model/Inception-BN", iteration = 126) - symbol <- 
inception_bn$symbol - internals <- symbol$get.internals() - outputs <- internals$outputs - - flatten <- internals$get.output(which(outputs == "flatten_output")) - - new_fc <- mx.symbol.FullyConnected(data = flatten, num_hidden = 2, name = "fc1") - new_soft <- mx.symbol.SoftmaxOutput(data = new_fc, name = "softmax") - arg_params_new <- mx.model.init.params(symbol = new_soft, input.shape = list(data = c(224, - 224, 3, 8)), output.shape = NULL, initializer = mx.init.uniform(0.1), ctx = mx.cpu())$arg.params - fc1_weights_new <- arg_params_new[["fc1_weight"]] - fc1_bias_new <- arg_params_new[["fc1_bias"]] - - arg_params_new <- inception_bn$arg.params - - arg_params_new[["fc1_weight"]] <- fc1_weights_new - arg_params_new[["fc1_bias"]] <- fc1_bias_new - - # model <- mx.model.FeedForward.create(symbol = new_soft, X = train_iter, - # eval.data = val_iter, ctx = mx.ctx.default(), eval.metric = mx.metric.accuracy, - # num.round = 2, learning.rate = 0.05, momentum = 0.9, wd = 0.00001, kvstore = - # 'local', batch.end.callback = mx.callback.log.train.metric(50), initializer = - # mx.init.Xavier(factor_type = 'in', magnitude = 2.34), optimizer = 'sgd', - # arg.params = arg_params_new, aux.params = inception_bn$aux.params) -}) - -test_that("Matrix Factorization", { - - # Use fake random data instead of GetMovieLens() to remove external dependency - set.seed(123) - user <- sample(943, size = 1e+05, replace = T) - item <- sample(1682, size = 1e+05, replace = T) - score <- sample(5, size = 1e+05, replace = T) - DF <- data.frame(user, item, score) - - max_user <- max(DF$user) - max_item <- max(DF$item) - DF_mat_x <- data.matrix(t(DF[, 1:2])) - DF_y <- DF[, 3] - k <- 64 - user <- mx.symbol.Variable("user") - item <- mx.symbol.Variable("item") - score <- mx.symbol.Variable("score") - user1 <- mx.symbol.Embedding(data = mx.symbol.BlockGrad(user), input_dim = max_user, - output_dim = k, name = "user1") - item1 <- mx.symbol.Embedding(data = mx.symbol.BlockGrad(item), input_dim = max_item, - output_dim = k, name = "item1") - pred <- user1 * item1 - pred1 <- mx.symbol.sum_axis(pred, axis = 1, name = "pred1") - pred2 <- mx.symbol.Flatten(pred1, name = "pred2") - pred3 <- mx.symbol.LinearRegressionOutput(data = pred2, label = score, name = "pred3") - - mx.set.seed(123) - - CustomIter <- setRefClass("CustomIter", fields = c("iter1", "iter2"), contains = "Rcpp_MXArrayDataIter", - methods = list(initialize = function(iter1, iter2) { - .self$iter1 <- iter1 - .self$iter2 <- iter2 - .self - }, value = function() { - user <- .self$iter1$value()$data - item <- .self$iter2$value()$data - score <- .self$iter1$value()$label - list(user = user, item = item, score = score) - }, iter.next = function() { - .self$iter1$iter.next() - .self$iter2$iter.next() - }, reset = function() { - .self$iter1$reset() - .self$iter2$reset() - }, num.pad = function() { - .self$iter1$num.pad() - }, finalize = function() { - .self$iter1$finalize() - .self$iter2$finalize() - })) - - user_iter <- mx.io.arrayiter(data = DF[, 1], label = DF[, 3], batch.size = k) - - item_iter <- mx.io.arrayiter(data = DF[, 2], label = DF[, 3], batch.size = k) - - train_iter <- CustomIter$new(user_iter, item_iter) - - model <- mx.model.FeedForward.create(pred3, X = train_iter, ctx = mx.ctx.default(), - num.round = 5, initializer = mx.init.uniform(0.07), learning.rate = 0.07, - eval.metric = mx.metric.rmse, momentum = 0.9, epoch.end.callback = mx.callback.log.train.metric(1), - input.names = c("user", "item"), output.names = "score") -}) - -test_that("Captcha", { - 
GetCaptcha_data() - data.shape <- c(80, 30, 3) - batch_size <- 40 - train <- mx.io.ImageRecordIter(path.imgrec = "./data/captcha_example/captcha_train.rec", - path.imglist = "./data/captcha_example/captcha_train.lst", batch.size = batch_size, - label.width = 4, data.shape = data.shape, mean.img = "mean.bin") - - val <- mx.io.ImageRecordIter(path.imgrec = "./data/captcha_example/captcha_test.rec", - path.imglist = "./data/captcha_example/captcha_test.lst", batch.size = batch_size, - label.width = 4, data.shape = data.shape, mean.img = "mean.bin") - - data <- mx.symbol.Variable("data") - label <- mx.symbol.Variable("label") - conv1 <- mx.symbol.Convolution(data = data, kernel = c(5, 5), num_filter = 32) - pool1 <- mx.symbol.Pooling(data = conv1, pool_type = "max", kernel = c(2, 2), - stride = c(1, 1)) - relu1 <- mx.symbol.Activation(data = pool1, act_type = "relu") - - conv2 <- mx.symbol.Convolution(data = relu1, kernel = c(5, 5), num_filter = 32) - pool2 <- mx.symbol.Pooling(data = conv2, pool_type = "avg", kernel = c(2, 2), - stride = c(1, 1)) - relu2 <- mx.symbol.Activation(data = pool2, act_type = "relu") - - flatten <- mx.symbol.Flatten(data = relu2) - fc1 <- mx.symbol.FullyConnected(data = flatten, num_hidden = 120) - fc21 <- mx.symbol.FullyConnected(data = fc1, num_hidden = 10) - fc22 <- mx.symbol.FullyConnected(data = fc1, num_hidden = 10) - fc23 <- mx.symbol.FullyConnected(data = fc1, num_hidden = 10) - fc24 <- mx.symbol.FullyConnected(data = fc1, num_hidden = 10) - fc2 <- mx.symbol.concat(c(fc21, fc22, fc23, fc24), dim = 0, num.args = 4) - label <- mx.symbol.transpose(data = label) - label <- mx.symbol.Reshape(data = label, target_shape = c(0)) - captcha_net <- mx.symbol.SoftmaxOutput(data = fc2, label = label, name = "softmax") - - mx.metric.acc2 <- mx.metric.custom("accuracy", function(label, pred) { - label <- as.array(label) - pred <- as.array(pred) - ypred <- max.col(t(pred)) - 1 - ypred <- matrix(ypred, nrow = nrow(label), ncol = ncol(label), byrow = TRUE) - return(sum(colSums(label == ypred) == 4)/ncol(label)) - }) - - mx.set.seed(42) - - train$reset() - train$iter.next() - - input.names <- "data" - input.shape <- sapply(input.names, function(n) { - dim(train$value()[[n]]) - }, simplify = FALSE) - arg_names <- arguments(captcha_net) - output.names <- "label" - output.shape <- sapply(output.names, function(n) { - dim(train$value()[[n]]) - }, simplify = FALSE) - params <- mx.model.init.params(captcha_net, input.shape, output.shape, mx.init.Xavier(factor_type = "in", - magnitude = 2.34), mx.cpu()) - - # model <- mx.model.FeedForward.create( X = train, eval.data = val, ctx = - # mx.ctx.default(), symbol = captcha_net, eval.metric = mx.metric.acc2, num.round - # = 1, learning.rate = 1e-04, momentum = 0.9, wd = 1e-05, batch.end.callback = - # mx.callback.log.train.metric(50), initializer = mx.init.Xavier(factor_type = - # 'in', magnitude = 2.34), optimizer = 'sgd', clip_gradient = 10) -}) diff --git a/R-package/tests/testthat/test_optimizer.R b/R-package/tests/testthat/test_optimizer.R deleted file mode 100644 index cbe9575c90ca..000000000000 --- a/R-package/tests/testthat/test_optimizer.R +++ /dev/null @@ -1,251 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -context("optimizer") - -if (Sys.getenv("R_GPU_ENABLE") != "" & as.integer(Sys.getenv("R_GPU_ENABLE")) == - 1) { - mx.ctx.default(new = mx.gpu()) - message("Using GPU for testing.") -} - -test_that("sgd", { - - data <- mx.symbol.Variable("data") - label <- mx.symbol.Variable("label") - fc_weight <- mx.symbol.Variable("fc_weight") - fc <- mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T, - name = "fc1", num_hidden = 1) - loss <- mx.symbol.LinearRegressionOutput(data = fc, label = label, name = "loss") - - x <- mx.nd.array(array(1:6, dim = 2:3)) - y <- mx.nd.array(c(5, 11, 16)) - w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2, 1))) - - exec <- mxnet:::mx.symbol.bind(symbol = loss, ctx = mx.ctx.default(), arg.arrays = list(data = x, - fc1_weight = w1, label = y), aux.arrays = NULL, grad.reqs = c("null", "write", - "null")) - - optimizer <- mx.opt.create("sgd", learning.rate = 1, momentum = 0, wd = 0, rescale.grad = 1, - clip_gradient = -1) - - updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.ctx.default()) - - mx.exec.forward(exec, is.train = T) - mx.exec.backward(exec) - - arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays) - mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE) - - expect_equal(as.array(arg.blocks[[2]]), array(c(1.4, 2.6), dim = c(2, 1)), tolerance = 0.1) - -}) - - -test_that("rmsprop", { - - data <- mx.symbol.Variable("data") - label <- mx.symbol.Variable("label") - fc_weight <- mx.symbol.Variable("fc_weight") - fc <- mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T, - name = "fc1", num_hidden = 1) - loss <- mx.symbol.LinearRegressionOutput(data = fc, label = label, name = "loss") - - x <- mx.nd.array(array(1:6, dim = 2:3)) - y <- mx.nd.array(c(5, 11, 16)) - w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2, 1))) - - exec <- mxnet:::mx.symbol.bind(symbol = loss, ctx = mx.ctx.default(), arg.arrays = list(data = x, - fc1_weight = w1, label = y), aux.arrays = NULL, grad.reqs = c("null", "write", - "null")) - - optimizer <- mx.opt.create("rmsprop", learning.rate = 1, centered = TRUE, rho = 0.95, - momentum = 0.9, epsilon = 1e-04, wd = 0, rescale.grad = 1, clip_gradient = -1) - - updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.ctx.default()) - - mx.exec.forward(exec, is.train = T) - mx.exec.backward(exec) - - arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays) - mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE) - - expect_equal(as.array(arg.blocks[[2]]), array(c(5.64, 6.38), dim = c(2, 1)), - tolerance = 0.1) - -}) - - -test_that("adam", { - - data <- mx.symbol.Variable("data") - label <- mx.symbol.Variable("label") - fc_weight <- mx.symbol.Variable("fc_weight") - fc <- mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T, - name = "fc1", num_hidden = 1) - loss <- mx.symbol.LinearRegressionOutput(data = fc, label = label, name = "loss") - - x <- 
mx.nd.array(array(1:6, dim = 2:3)) - y <- mx.nd.array(c(5, 11, 16)) - w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2, 1))) - - exec <- mxnet:::mx.symbol.bind(symbol = loss, ctx = mx.ctx.default(), arg.arrays = list(data = x, - fc1_weight = w1, label = y), aux.arrays = NULL, grad.reqs = c("null", "write", - "null")) - - optimizer <- mx.opt.create("adam", learning.rate = 1, beta1 = 0.9, beta2 = 0.999, - epsilon = 1e-08, wd = 0, rescale.grad = 1, clip_gradient = -1) - - updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.ctx.default()) - - mx.exec.forward(exec, is.train = T) - mx.exec.backward(exec) - - arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays) - mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE) - - expect_equal(as.array(arg.blocks[[2]]), array(c(4.26, 4.96), dim = c(2, 1)), - tolerance = 0.1) - -}) - - -test_that("adagrad", { - - data <- mx.symbol.Variable("data") - label <- mx.symbol.Variable("label") - fc_weight <- mx.symbol.Variable("fc_weight") - fc <- mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T, - name = "fc1", num_hidden = 1) - loss <- mx.symbol.LinearRegressionOutput(data = fc, label = label, name = "loss") - - x <- mx.nd.array(array(1:6, dim = 2:3)) - y <- mx.nd.array(c(5, 11, 16)) - w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2, 1))) - - exec <- mxnet:::mx.symbol.bind(symbol = loss, ctx = mx.ctx.default(), arg.arrays = list(data = x, - fc1_weight = w1, label = y), aux.arrays = NULL, grad.reqs = c("null", "write", - "null")) - - optimizer <- mx.opt.create("adagrad", learning.rate = 1, epsilon = 1e-08, wd = 0, - rescale.grad = 1, clip_gradient = -1) - - updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.ctx.default()) - - mx.exec.forward(exec, is.train = T) - mx.exec.backward(exec) - - arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays) - mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE) - - expect_equal(as.array(arg.blocks[[2]]), array(c(2.1, 2.8), dim = c(2, 1)), tolerance = 0.1) - -}) - - -test_that("adadelta", { - - data <- mx.symbol.Variable("data") - label <- mx.symbol.Variable("label") - fc_weight <- mx.symbol.Variable("fc_weight") - fc <- mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T, - name = "fc1", num_hidden = 1) - loss <- mx.symbol.LinearRegressionOutput(data = fc, label = label, name = "loss") - - x <- mx.nd.array(array(1:6, dim = 2:3)) - y <- mx.nd.array(c(5, 11, 16)) - w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2, 1))) - - exec <- mxnet:::mx.symbol.bind(symbol = loss, ctx = mx.ctx.default(), arg.arrays = list(data = x, - fc1_weight = w1, label = y), aux.arrays = NULL, grad.reqs = c("null", "write", - "null")) - - optimizer <- mx.opt.create("adadelta", rho = 0.9, epsilon = 1e-05, wd = 0, rescale.grad = 1, - clip_gradient = -1) - - updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.ctx.default()) - - mx.exec.forward(exec, is.train = T) - mx.exec.backward(exec) - - arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays) - mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE) - - expect_equal(as.array(arg.blocks[[2]]), array(c(1.11, 1.81), dim = c(2, 1)), - tolerance = 0.1) - -}) - - -test_that("nag_no_momentum", { - data <- mx.symbol.Variable("data") - label <- mx.symbol.Variable("label") - fc_weight <- mx.symbol.Variable("fc_weight") - fc <- mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T, - name = "fc1", num_hidden = 1) - loss <- 
mx.symbol.LinearRegressionOutput(data = fc, label = label, name = "loss") - - x <- mx.nd.array(array(1:6, dim = 2:3)) - y <- mx.nd.array(c(5, 11, 16)) - w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2, 1))) - - exec <- mxnet:::mx.symbol.bind(symbol = loss, ctx = mx.ctx.default(), arg.arrays = list(data = x, - fc1_weight = w1, label = y), aux.arrays = NULL, grad.reqs = c("null", "write", "null")) - - optimizer <- mx.opt.create("nag", learning.rate = 1, momentum = 0, wd = 0, rescale.grad = 1, - clip_gradient = -1) - - updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.ctx.default()) - - mx.exec.forward(exec, is.train = T) - mx.exec.backward(exec) - - arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays) - mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE) - - expect_equal(as.array(arg.blocks[[2]]), array(c(1.4, 2.6), dim = c(2, 1)), tolerance = 0.05) -}) - - -test_that("nag_momentum", { - data <- mx.symbol.Variable("data") - label <- mx.symbol.Variable("label") - fc_weight <- mx.symbol.Variable("fc_weight") - fc <- mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T, - name = "fc1", num_hidden = 1) - loss <- mx.symbol.LinearRegressionOutput(data = fc, label = label, name = "loss") - - x <- mx.nd.array(array(1:6, dim = 2:3)) - y <- mx.nd.array(c(5, 11, 16)) - w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2, 1))) - - exec <- mxnet:::mx.symbol.bind(symbol = loss, ctx = mx.ctx.default(), arg.arrays = list(data = x, - fc1_weight = w1, label = y), aux.arrays = NULL, grad.reqs = c("null", "write", "null")) - - optimizer <- mx.opt.create("nag", learning.rate = 1, momentum = 0.1, wd = 0, rescale.grad = 1, - clip_gradient = 5) - - updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.ctx.default()) - - mx.exec.forward(exec, is.train = T) - mx.exec.backward(exec) - - arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays) - mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE) - - expect_equal(as.array(arg.blocks[[2]]), array(c(1.45, 2.65), dim = c(2, 1)), tolerance = 0.1) -}) diff --git a/R-package/tests/testthat/test_symbol.R b/R-package/tests/testthat/test_symbol.R index c93118a7db1f..acad98ac7b1f 100644 --- a/R-package/tests/testthat/test_symbol.R +++ b/R-package/tests/testthat/test_symbol.R @@ -91,17 +91,6 @@ test_that("symbol infer type", { expect_equal(ret, NULL) }) -test_that("symbol save/load", { - data <- mx.symbol.Variable("data") - fc1 <- mx.symbol.FullyConnected(data, num_hidden = 1) - lro <- mx.symbol.LinearRegressionOutput(fc1) - mx.symbol.save(lro, "tmp_r_sym.json") - data2 <- mx.symbol.load("tmp_r_sym.json") - - expect_equal(data2$as.json(), lro$as.json()) - file.remove("tmp_r_sym.json") -}) - test_that("symbol attributes access", { str <- "(1, 1, 1, 1)" x <- mx.symbol.Variable("x") diff --git a/R-package/vignettes/CallbackFunction.Rmd b/R-package/vignettes/CallbackFunction.Rmd deleted file mode 100644 index 12b7e28247e9..000000000000 --- a/R-package/vignettes/CallbackFunction.Rmd +++ /dev/null @@ -1,160 +0,0 @@ -# Customized callback function - -This vignette gives users a guideline for using and writing callback functions, -which can be very useful in model training. - -## Model training example - -Let's begin from a small example. 
We can build and train a model using the following code:
-
-```{r}
-library(mxnet)
-data(BostonHousing, package="mlbench")
-train.ind = seq(1, 506, 3)
-train.x = data.matrix(BostonHousing[train.ind, -14])
-train.y = BostonHousing[train.ind, 14]
-test.x = data.matrix(BostonHousing[-train.ind, -14])
-test.y = BostonHousing[-train.ind, 14]
-data <- mx.symbol.Variable("data")
-fc1 <- mx.symbol.FullyConnected(data, num_hidden=1)
-lro <- mx.symbol.LinearRegressionOutput(fc1)
-mx.set.seed(0)
-model <- mx.model.FeedForward.create(
-  lro, X=train.x, y=train.y,
-  eval.data=list(data=test.x, label=test.y),
-  ctx=mx.cpu(), num.round=10, array.batch.size=20,
-  learning.rate=2e-6, momentum=0.9, eval.metric=mx.metric.rmse)
-```
-
-In addition, we provide two optional parameters, `batch.end.callback` and `epoch.end.callback`, which can provide great flexibility in model training.
-
-## How to use callback functions
-
-
-Two callback functions are provided in this package:
-
-- `mx.callback.save.checkpoint` is used to save a model checkpoint to files every `period` iterations.
-
-```{r}
-model <- mx.model.FeedForward.create(
-  lro, X=train.x, y=train.y,
-  eval.data=list(data=test.x, label=test.y),
-  ctx=mx.cpu(), num.round=10, array.batch.size=20,
-  learning.rate=2e-6, momentum=0.9, eval.metric=mx.metric.rmse,
-  epoch.end.callback = mx.callback.save.checkpoint("boston"))
-list.files(pattern = "^boston")
-```
-
-
-- `mx.callback.log.train.metric` is used to log the training metric every `period` iterations.
-You can use it either as a `batch.end.callback` or an `epoch.end.callback`.
-
-```{r}
-model <- mx.model.FeedForward.create(
-  lro, X=train.x, y=train.y,
-  eval.data=list(data=test.x, label=test.y),
-  ctx=mx.cpu(), num.round=10, array.batch.size=20,
-  learning.rate=2e-6, momentum=0.9, eval.metric=mx.metric.rmse,
-  batch.end.callback = mx.callback.log.train.metric(5))
-```
-
-You can also save the training and evaluation errors for later use by passing a reference class.
-
-```{r}
-logger <- mx.metric.logger$new()
-model <- mx.model.FeedForward.create(
-  lro, X=train.x, y=train.y,
-  eval.data=list(data=test.x, label=test.y),
-  ctx=mx.cpu(), num.round=10, array.batch.size=20,
-  learning.rate=2e-6, momentum=0.9, eval.metric=mx.metric.rmse,
-  epoch.end.callback = mx.callback.log.train.metric(5, logger))
-head(logger$train)
-head(logger$eval)
-```
-
-## How to write your own callback functions
-
-
-You can find the source code of the two callback functions [here](https://github.com/dmlc/mxnet/blob/master/R-package/R/callback.R) and use them as templates:
-
-Basically, all callback functions follow the structure below:
-
-```{r, eval=FALSE}
-mx.callback.fun <- function() {
-  function(iteration, nbatch, env, verbose) {
-  }
-}
-```
-
-The `mx.callback.save.checkpoint` function below is stateless. It simply gets the model from the environment and saves it.
-
-```{r, eval=FALSE}
-mx.callback.save.checkpoint <- function(prefix, period=1) {
-  function(iteration, nbatch, env, verbose=TRUE) {
-    if (iteration %% period == 0) {
-      mx.model.save(env$model, prefix, iteration)
-      if(verbose) message(sprintf("Model checkpoint saved to %s-%04d.params\n", prefix, iteration))
-    }
-    return(TRUE)
-  }
-}
-```
-
-The `mx.callback.log.train.metric` function is a little more complex. It holds a reference class and updates it during the training process.
- -```{r, eval=FALSE} -mx.callback.log.train.metric <- function(period, logger=NULL) { - function(iteration, nbatch, env, verbose=TRUE) { - if (nbatch %% period == 0 && !is.null(env$metric)) { - result <- env$metric$get(env$train.metric) - if (nbatch != 0 & verbose) - message(paste0("Batch [", nbatch, "] Train-", result$name, "=", result$value)) - if (!is.null(logger)) { - if (class(logger) != "mx.metric.logger") { - stop("Invalid mx.metric.logger.") - } - logger$train <- c(logger$train, result$value) - if (!is.null(env$eval.metric)) { - result <- env$metric$get(env$eval.metric) - if (nbatch != 0 & verbose) - message(paste0("Batch [", nbatch, "] Validation-", result$name, "=", result$value)) - logger$eval <- c(logger$eval, result$value) - } - } - } - return(TRUE) - } -} -``` - -Now you might be curious why both callback functions `return(TRUE)`. -Can we `return(FALSE)`? - -Yes! You can stop the training early by `return(FALSE)`. See the examples below. - -```{r} -mx.callback.early.stop <- function(eval.metric) { - function(iteration, nbatch, env, verbose) { - if (!is.null(env$metric)) { - if (!is.null(eval.metric)) { - result <- env$metric$get(env$eval.metric) - if (result$value < eval.metric) { - return(FALSE) - } - } - } - return(TRUE) - } -} -model <- mx.model.FeedForward.create( - lro, X=train.x, y=train.y, - eval.data=list(data=test.x, label=test.y), - ctx=mx.cpu(), num.round=10, array.batch.size=20, - learning.rate=2e-6, momentum=0.9, eval.metric=mx.metric.rmse, - epoch.end.callback = mx.callback.early.stop(10)) -``` - -You can see once the validation metric goes below the threshold we set, the training process will stop early. - - - diff --git a/R-package/vignettes/CustomLossFunction.Rmd b/R-package/vignettes/CustomLossFunction.Rmd deleted file mode 100644 index 85e882567f8e..000000000000 --- a/R-package/vignettes/CustomLossFunction.Rmd +++ /dev/null @@ -1,151 +0,0 @@ -# Customized loss function - -This tutorial provides guidelines for using customized loss function in network construction. - -## Model Training Example - -Let's begin with a small regression example. We can build and train a regression model with the following code: - -```{r} -data(BostonHousing, package = "mlbench") -BostonHousing[, sapply(BostonHousing, is.factor)] <- - as.numeric(as.character(BostonHousing[, sapply(BostonHousing, is.factor)])) -BostonHousing <- data.frame(scale(BostonHousing)) - -test.ind = seq(1, 506, 5) # 1 pt in 5 used for testing -train.x = data.matrix(BostonHousing[-test.ind,-14]) -train.y = BostonHousing[-test.ind, 14] -test.x = data.matrix(BostonHousing[--test.ind,-14]) -test.y = BostonHousing[--test.ind, 14] - -require(mxnet) - -data <- mx.symbol.Variable("data") -label <- mx.symbol.Variable("label") -fc1 <- mx.symbol.FullyConnected(data, num_hidden = 14, name = "fc1") -tanh1 <- mx.symbol.Activation(fc1, act_type = "tanh", name = "tanh1") -fc2 <- mx.symbol.FullyConnected(tanh1, num_hidden = 1, name = "fc2") -lro <- mx.symbol.LinearRegressionOutput(fc2, name = "lro") - -mx.set.seed(0) -model <- mx.model.FeedForward.create(lro, X = train.x, y = train.y, - ctx = mx.cpu(), - num.round = 5, - array.batch.size = 60, - optimizer = "rmsprop", - verbose = TRUE, - array.layout = "rowmajor", - batch.end.callback = NULL, - epoch.end.callback = NULL) - -pred <- predict(model, test.x) -sum((test.y - pred[1,])^2) / length(test.y) -``` - -Besides the `LinearRegressionOutput`, we also provide `LogisticRegressionOutput` and `MAERegressionOutput`. -However, this might not be enough for real-world models. 
You can provide your own loss function -by using `mx.symbol.MakeLoss` when constructing the network. - -## How to Use Your Own Loss Function - -We still use our previous example, but this time we use `mx.symbol.MakeLoss` to minimize the `(pred-label)^2` - -```{r} -data <- mx.symbol.Variable("data") -label <- mx.symbol.Variable("label") -fc1 <- mx.symbol.FullyConnected(data, num_hidden = 14, name = "fc1") -tanh1 <- mx.symbol.Activation(fc1, act_type = "tanh", name = "tanh1") -fc2 <- mx.symbol.FullyConnected(tanh1, num_hidden = 1, name = "fc2") -lro2 <- mx.symbol.MakeLoss(mx.symbol.square(mx.symbol.Reshape(fc2, shape = 0) - label), name="lro2") -``` - -Then we can train the network just as usual. - -```{r} -mx.set.seed(0) -model2 <- mx.model.FeedForward.create(lro2, X = train.x, y = train.y, - ctx = mx.cpu(), - num.round = 5, - array.batch.size = 60, - optimizer = "rmsprop", - verbose = TRUE, - array.layout = "rowmajor", - batch.end.callback = NULL, - epoch.end.callback = NULL) -``` - -We should get very similar results because we are actually minimizing the same loss function. -However, the result is quite different. - -```{r} -pred2 <- predict(model2, test.x) -sum((test.y - pred2)^2) / length(test.y) -``` - -This is because output of `mx.symbol.MakeLoss` is the gradient of loss with respect to the input data. -We can get the real prediction as below. - -```{r} -internals = internals(model2$symbol) -fc_symbol = internals[[match("fc2_output", outputs(internals))]] - -model3 <- list(symbol = fc_symbol, - arg.params = model2$arg.params, - aux.params = model2$aux.params) - -class(model3) <- "MXFeedForwardModel" - -pred3 <- predict(model3, test.x) -sum((test.y - pred3[1,])^2) / length(test.y) -``` - -We have provided many operations on the symbols. An example of `|pred-label|` can be found below. - -```{r} -lro_abs <- mx.symbol.MakeLoss(mx.symbol.abs(mx.symbol.Reshape(fc2, shape = 0) - label)) -mx.set.seed(0) -model4 <- mx.model.FeedForward.create(lro_abs, X = train.x, y = train.y, - ctx = mx.cpu(), - num.round = 20, - array.batch.size = 60, - optimizer = "sgd", - learning.rate = 0.001, - verbose = TRUE, - array.layout = "rowmajor", - batch.end.callback = NULL, - epoch.end.callback = NULL) - -internals = internals(model4$symbol) -fc_symbol = internals[[match("fc2_output", outputs(internals))]] - -model5 <- list(symbol = fc_symbol, - arg.params = model4$arg.params, - aux.params = model4$aux.params) - -class(model5) <- "MXFeedForwardModel" - -pred5 <- predict(model5, test.x) -sum(abs(test.y - pred5[1,])) / length(test.y) -``` - - -```{r} -lro_mae <- mx.symbol.MAERegressionOutput(fc2, name = "lro") -mx.set.seed(0) -model6 <- mx.model.FeedForward.create(lro_mae, X = train.x, y = train.y, - ctx = mx.cpu(), - num.round = 20, - array.batch.size = 60, - optimizer = "sgd", - learning.rate = 0.001, - verbose = TRUE, - array.layout = "rowmajor", - batch.end.callback = NULL, - epoch.end.callback = NULL) -pred6 <- predict(model6, test.x) -sum(abs(test.y - pred6[1,])) / length(test.y) -``` - -We got the same result as expected. - - diff --git a/R-package/vignettes/fiveMinutesNeuralNetwork.Rmd b/R-package/vignettes/fiveMinutesNeuralNetwork.Rmd deleted file mode 100644 index fb023bb5435f..000000000000 --- a/R-package/vignettes/fiveMinutesNeuralNetwork.Rmd +++ /dev/null @@ -1,173 +0,0 @@ -# Neural Network with MXNet in Five Minutes - -This is the first tutorial for new users of the R package `mxnet`. You will learn to construct a neural network to do regression in 5 minutes. 
-
-We will show you how to do classification and regression tasks, respectively. The data we use comes from the package `mlbench`.
-
-## Classification
-
-First of all, let us load in the data and preprocess it:
-
-```{r}
-require(mlbench)
-require(mxnet)
-
-data(Sonar, package = "mlbench")
-
-Sonar[,61] <- as.numeric(Sonar[,61])-1
-train.ind <- c(1:50, 100:150)
-train.x <- data.matrix(Sonar[train.ind, 1:60])
-train.y <- Sonar[train.ind, 61]
-test.x <- data.matrix(Sonar[-train.ind, 1:60])
-test.y <- Sonar[-train.ind, 61]
-```
-
-Next we are going to use a multi-layer perceptron (MLP) as our classifier.
-In `mxnet`, we have a function called `mx.mlp` so that users can build a general multi-layer neural network to do classification (`out_activation="softmax"`) or regression (`out_activation="rmse"`).
-Note that for the `softmax` activation, the output is zero-indexed, not one-indexed. In the data we use:
-
-```{r}
-table(train.y)
-table(test.y)
-```
-
-There are several parameters we have to feed to `mx.mlp`:
-
-- Training data and label.
-- Number of hidden nodes in each hidden layer.
-- Number of nodes in the output layer.
-- Type of the activation.
-- Type of the output loss.
-- The device to train on: `mx.gpu()` for GPU or `mx.cpu()` for CPU.
-- Other parameters for `mx.model.FeedForward.create`.
-
-The following code shows a possible usage of `mx.mlp`:
-
-```{r}
-mx.set.seed(0)
-model <- mx.mlp(train.x, train.y, hidden_node=10, out_node=2, out_activation="softmax",
-                num.round=20, array.batch.size=15, learning.rate=0.07, momentum=0.9,
-                eval.metric=mx.metric.accuracy)
-```
-
-Note that `mx.set.seed` is the correct function to control the random process in `mxnet`. You can see the accuracy in each round during training. It is also easy to make predictions and evaluate them.
-
-To get an idea of what is happening, we can easily view the computation graph from R.
-
-```{r}
-graph.viz(model$symbol)
-```
-
-```{r}
-preds <- predict(model, test.x)
-pred.label <- max.col(t(preds)) - 1
-table(pred.label, test.y)
-```
-
-Note that for multi-class prediction, mxnet outputs `nclass` x `nexamples`, with each row corresponding to the probability of that class.
-
-## Regression
-
-Again, let us preprocess the data first.
-
-```{r}
-data(BostonHousing, package="mlbench")
-
-train.ind <- seq(1, 506, 3)
-train.x <- data.matrix(BostonHousing[train.ind, -14])
-train.y <- BostonHousing[train.ind, 14]
-test.x <- data.matrix(BostonHousing[-train.ind, -14])
-test.y <- BostonHousing[-train.ind, 14]
-```
-
-Although we can use `mx.mlp` again to do regression by changing the `out_activation`, this time we are going to introduce a flexible way to configure neural networks in `mxnet`. The configuration is done by the "Symbol" system in `mxnet`, which takes care of the links among nodes, the activation, dropout ratio, etc. To configure a multi-layer neural network, we can do it in the following way:
-
-```{r}
-# Define the input data
-data <- mx.symbol.Variable("data")
-# A fully connected hidden layer
-# data: input source
-# num_hidden: number of neurons in this hidden layer
-fc1 <- mx.symbol.FullyConnected(data, num_hidden=1)
-
-# Use linear regression for the output layer
-lro <- mx.symbol.LinearRegressionOutput(fc1)
-```
-
-What matters for a regression task is mainly the last function: it enables the new network to optimize for squared loss. We can now train on this simple data set. In this configuration, we dropped the hidden layer so the input layer is directly connected to the output layer; a sketch with the hidden layer added back is shown below.
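For comparison, the following is a minimal sketch of the same regression network with one hidden layer added back. The hidden-layer size (10) and the `tanh` activation are illustrative choices, not values taken from this tutorial; the rest of the tutorial continues to use the simpler `lro` network defined above.

```{r}
# The same regression network, with one hidden layer added back.
# The hidden-layer size and the activation type are illustrative choices.
data <- mx.symbol.Variable("data")
hidden <- mx.symbol.FullyConnected(data, num_hidden=10)
act <- mx.symbol.Activation(hidden, act_type="tanh")
out <- mx.symbol.FullyConnected(act, num_hidden=1)
lro_hidden <- mx.symbol.LinearRegressionOutput(out)
```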
-Next, we can make predictions with the simple `lro` structure defined above, passing the remaining parameters to `mx.model.FeedForward.create`:
-
-```{r}
-mx.set.seed(0)
-model <- mx.model.FeedForward.create(lro, X=train.x, y=train.y,
-                                     ctx=mx.cpu(), num.round=50, array.batch.size=20,
-                                     learning.rate=2e-6, momentum=0.9, eval.metric=mx.metric.rmse)
-```
-
-It is also easy to make predictions and evaluate them:
-
-```{r}
-preds <- predict(model, test.x)
-sqrt(mean((preds-test.y)^2))
-```
-
-Currently we have four pre-defined metrics: "accuracy", "rmse", "mae" and "rmsle". One might wonder how to customize the evaluation metric. `mxnet` provides the interface for users to define their own metric of interest:
-
-```{r}
-demo.metric.mae <- mx.metric.custom("mae", function(label, pred) {
-  res <- mean(abs(label-pred))
-  return(res)
-})
-```
-
-This is an example for mean absolute error. We can simply plug it into the training function:
-
-```{r}
-mx.set.seed(0)
-model <- mx.model.FeedForward.create(lro, X=train.x, y=train.y,
-                                     ctx=mx.cpu(), num.round=50, array.batch.size=20,
-                                     learning.rate=2e-6, momentum=0.9, eval.metric=demo.metric.mae)
-```
-
-In the previous example, our target is to predict the last column ("medv") in the dataset.
-It is also possible to build a regression model with multiple outputs.
-This time we use the last two columns as the targets:
-
-```{r}
-train.x <- data.matrix(BostonHousing[train.ind, -(13:14)])
-train.y <- BostonHousing[train.ind, c(13:14)]
-test.x <- data.matrix(BostonHousing[-train.ind, -(13:14)])
-test.y <- BostonHousing[-train.ind, c(13:14)]
-```
-
-and build a similar network symbol:
-
-```{r}
-data <- mx.symbol.Variable("data")
-fc2 <- mx.symbol.FullyConnected(data, num_hidden=2)
-lro2 <- mx.symbol.LinearRegressionOutput(fc2)
-```
-
-We use `mx.io.arrayiter` to build an iterator for our training set and train the model using `mx.model.FeedForward.create`:
-
-```{r}
-mx.set.seed(0)
-train_iter = mx.io.arrayiter(data = t(train.x), label = t(train.y))
-
-model <- mx.model.FeedForward.create(lro2, X=train_iter,
-                                     ctx=mx.cpu(), num.round=50, array.batch.size=20,
-                                     learning.rate=2e-6, momentum=0.9)
-```
-
-After training, we can see that the dimension of the prediction is the same as that of our target.
-
-```{r}
-preds <- t(predict(model, test.x))
-dim(preds)
-dim(test.y)
-```
-Congratulations! Now you have learned the basics of using `mxnet`. Please check the other tutorials for advanced features.
-
-
-
diff --git a/benchmark/opperf/nd_operations/nn_basic_operators.py b/benchmark/opperf/nd_operations/nn_basic_operators.py
index f3007bac188c..d669b83d653e 100644
--- a/benchmark/opperf/nd_operations/nn_basic_operators.py
+++ b/benchmark/opperf/nd_operations/nn_basic_operators.py
@@ -29,11 +29,6 @@
 1. FullyConnected
 2. Dropout
 3. BatchNorm
-4. SoftmaxOutput
-5. LinearRegressionOutput
-6. LogisticRegressionOutput
-7. MAERegressionOutput
-8. SVMOutput
 9. L2Normalization
 10. LayerNorm
 11.
InstanceNorm diff --git a/benchmark/opperf/utils/op_registry_utils.py b/benchmark/opperf/utils/op_registry_utils.py index 65eb6aab2aac..d3cf1a418334 100644 --- a/benchmark/opperf/utils/op_registry_utils.py +++ b/benchmark/opperf/utils/op_registry_utils.py @@ -121,8 +121,8 @@ def prepare_op_inputs(op, arg_params, int64_tensor): # For ops with args that need to change shape/value for different ops custom_data = {'Activation', 'LeakyReLU', 'Softmax', 'BilinearSampler', 'GridGenerator', 'sample_multinomial', 'linalg_maketrian', 'SpatialTransformer', 'col2im', 'GroupNorm', 'Dropout', 'FullyConnected', - 'SoftmaxOutput', 'LinearRegressionOutput', 'BatchNorm', 'LogisticRegressionOutput', - 'MAERegressionOutput', 'SVMOutput', 'L2Normalization', 'LayerNorm', 'InstanceNorm', + 'BatchNorm', + 'L2Normalization', 'LayerNorm', 'InstanceNorm', 'Embedding', 'Correlation', 'im2col', 'LRN', 'squeeze', 'fill_element_0index'} custom_data_int64 = {'random_pdf_dirichlet', 'random_pdf_exponential', 'random_pdf_gamma', @@ -366,8 +366,8 @@ def get_all_nn_basic_operators(): ------- {"operator_name": {"has_backward", "nd_op_handle", "params"}} """ - nn_basic_ops = ['FullyConnected', 'Dropout', 'BatchNorm', 'SoftmaxOutput', 'LinearRegressionOutput', - 'LogisticRegressionOutput', 'MAERegressionOutput', 'SVMOutput', 'L2Normalization', + nn_basic_ops = ['FullyConnected', 'Dropout', 'BatchNorm', + 'L2Normalization', 'LayerNorm', 'InstanceNorm', 'Embedding', 'Correlation', 'SpatialTransformer', 'im2col', 'col2im', 'GroupNorm', 'LRN'] diff --git a/cpp-package/example/CMakeLists.txt b/cpp-package/example/CMakeLists.txt index bf9427af03ee..d682a88c7760 100644 --- a/cpp-package/example/CMakeLists.txt +++ b/cpp-package/example/CMakeLists.txt @@ -18,9 +18,6 @@ # Explicitly set GENERATED property https://gitlab.kitware.com/cmake/cmake/issues/18399 set_property(SOURCE ${CMAKE_CURRENT_LIST_DIR}/../include/mxnet-cpp/op.h PROPERTY GENERATED 1) -add_executable(test_regress_label test_regress_label.cpp) -target_link_libraries(test_regress_label mxnet_cpp) - add_executable(lenet lenet.cpp) target_link_libraries(lenet mxnet_cpp) diff --git a/cpp-package/example/test_regress_label.cpp b/cpp-package/example/test_regress_label.cpp deleted file mode 100644 index 8ef9d000922c..000000000000 --- a/cpp-package/example/test_regress_label.cpp +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- * - * This file is used for testing LinearRegressionOutput can - * still bind if label is not provided - */ - -#include -#include -#include -#include "dmlc/logging.h" -#include "mxnet-cpp/MxNetCpp.h" - -using namespace mxnet::cpp; - -int main() { - LOG(INFO) << "Running LinearRegressionOutput symbol testing, " - "executor should be able to bind without label."; - Symbol data = Symbol::Variable("data"); - Symbol label = Symbol::Variable("regress_label"); - Symbol symbol = LinearRegressionOutput(data, label); - std::map opReqMap; - for (const auto& iter : symbol.ListArguments()) { - opReqMap[iter] = mxnet::cpp::OpReqType::kNullOp; - } - std::map argMap({ - {"data", NDArray(Shape{1, 3}, Context::cpu(), true)} - }); - - try { - symbol.SimpleBind(Context::cpu(), - argMap, - std::map(), - opReqMap, - std::map()); - } catch (const std::exception& e) { - LOG(ERROR) << "Error binding the symbol: " << MXGetLastError() << " " << e.what(); - throw; - } - MXNotifyShutdown(); - return 0; -} diff --git a/docs/python_docs/python/tutorials/packages/ndarray/sparse/train.md b/docs/python_docs/python/tutorials/packages/ndarray/sparse/train.md index 23654fc6a33a..794c27e2f20a 100644 --- a/docs/python_docs/python/tutorials/packages/ndarray/sparse/train.md +++ b/docs/python_docs/python/tutorials/packages/ndarray/sparse/train.md @@ -212,122 +212,7 @@ fallback_log = fallback_exec.outputs[1] When the environment variable `MXNET_INFER_STORAGE_TYPE_VERBOSE_LOGGING` is set to `1`, MXNet will log the storage type information of operators' inputs and outputs in the computation graph. For example, we can inspect the storage types of -a linear classification network with sparse operators. Uncomment the line below and inspect your console.: - - -```python -# Set logging level for executor -import mxnet as mx -import os -#os.environ['MXNET_INFER_STORAGE_TYPE_VERBOSE_LOGGING'] = "1" -# Data in csr format -data = mx.sym.var('data', stype='csr', shape=(32, 10000)) -# Weight in row_sparse format -weight = mx.sym.var('weight', stype='row_sparse', shape=(10000, 2)) -bias = mx.symbol.Variable("bias", shape=(2,)) -dot = mx.symbol.sparse.dot(data, weight) -pred = mx.symbol.broadcast_add(dot, bias) -y = mx.symbol.Variable("label") -output = mx.symbol.SoftmaxOutput(data=pred, label=y, name="output") -executor = output.simple_bind(ctx=mx.cpu()) -``` - -## Training with Module APIs - -In the following section we'll walk through how one can implement **linear regression** using sparse symbols and sparse optimizers. - -The function you will explore is: *y = x1 + 2x2 + ... 100x100*, where *(x1,x2, ..., x100)* are input features and *y* is the corresponding label. - -### Preparing the Data - -In MXNet, both [mx.io.LibSVMIter](/api/python/docs/api/mxnet/io/index.html#mxnet.io.LibSVMIter) -and [mx.io.NDArrayIter](/api/python/docs/api/mxnet/io/index.html#mxnet.io.NDArrayIter) -support loading sparse data in CSR format. In this example, we'll use the `NDArrayIter`. - -You may see some warnings from SciPy. You don't need to worry about those for this example. 
- - -```python -# Random training data -feature_dimension = 100 -train_data = mx.test_utils.rand_ndarray((1000, feature_dimension), 'csr', 0.01) -target_weight = mx.nd.arange(1, feature_dimension + 1).reshape((feature_dimension, 1)) -train_label = mx.nd.dot(train_data, target_weight) -batch_size = 1 -train_iter = mx.io.NDArrayIter(train_data, train_label, batch_size, last_batch_handle='discard', label_name='label') -``` - -### Defining the Model - -Below is an example of a linear regression model specifying the storage type of the variables. - - -```python -initializer = mx.initializer.Normal(sigma=0.01) -X = mx.sym.Variable('data', stype='csr') -Y = mx.symbol.Variable('label') -weight = mx.symbol.Variable('weight', stype='row_sparse', shape=(feature_dimension, 1), init=initializer) -bias = mx.symbol.Variable('bias', shape=(1, )) -pred = mx.sym.broadcast_add(mx.sym.sparse.dot(X, weight), bias) -lro = mx.sym.LinearRegressionOutput(data=pred, label=Y, name="lro") -``` - -The above network uses the following symbols: - -1. `Variable X`: The placeholder for sparse data inputs. The `csr` stype indicates that the array to hold is in CSR format. - -2. `Variable Y`: The placeholder for dense labels. - -3. `Variable weight`: The placeholder for the weight to learn. The `stype` of weight is specified as `row_sparse` so that it is initialized as RowSparseNDArray, - and the optimizer will perform sparse update rules on it. The `init` attribute specifies what initializer to use for this variable. - -4. `Variable bias`: The placeholder for the bias to learn. - -5. `sparse.dot`: The dot product operation of `X` and `weight`. The sparse implementation will be invoked to handle `csr` and `row_sparse` inputs. - -6. `broadcast_add`: The broadcasting add operation to apply `bias`. - -7. `LinearRegressionOutput`: The output layer which computes *l2* loss against its input and the labels provided to it. - -### Training the model - -Once we have defined the model structure, the next step is to create a module and initialize the parameters and optimizer. - - -```python -# Create module -mod = mx.mod.Module(symbol=lro, data_names=['data'], label_names=['label']) -# Allocate memory by giving the input data and label shapes -mod.bind(data_shapes=train_iter.provide_data, label_shapes=train_iter.provide_label) -# Initialize parameters by random numbers -mod.init_params(initializer=initializer) -# Use SGD as the optimizer, which performs sparse update on "row_sparse" weight -sgd = mx.optimizer.SGD(learning_rate=0.05, rescale_grad=1.0/batch_size, momentum=0.9) -mod.init_optimizer(optimizer=sgd) -``` - -Finally, we train the parameters of the model to fit the training data by using the `forward`, `backward`, and `update` methods in Module. - - -```python -# Use mean square error as the metric -metric = mx.metric.create('MSE') -# Train 10 epochs -for epoch in range(10): - train_iter.reset() - metric.reset() - for batch in train_iter: - mod.forward(batch, is_train=True) # compute predictions - mod.update_metric(metric, batch.label) # accumulate prediction accuracy - mod.backward() # compute gradients - mod.update() # update parameters - print('Epoch %d, Metric = %s' % (epoch, metric.get())) -assert metric.get()[1] < 1, "Achieved MSE (%f) is larger than expected (1.0)" % metric.get()[1] -``` - -`Epoch 9, Metric = ('mse', 0.35979430613957991)` - - +a linear classification network with sparse operators. 
### Training the model with multiple machines or multiple devices diff --git a/docs/python_docs/python/tutorials/performance/backend/amp.md b/docs/python_docs/python/tutorials/performance/backend/amp.md index c862b51131f4..247582801468 100644 --- a/docs/python_docs/python/tutorials/performance/backend/amp.md +++ b/docs/python_docs/python/tutorials/performance/backend/amp.md @@ -317,13 +317,13 @@ with mx.Context(mx.gpu(0)): prefix, epoch = mx.test_utils.download_model("imagenet1k-resnet-18", dst_dir=model_path) sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) - # All Convolution ops should run in FP16, SoftmaxOutput and FullyConnected should run in FP32 + # All Convolution ops should run in FP16, FullyConnected should run in FP32 # cast_optional_params=True: Force cast params to FP16 wherever possible result_sym, result_arg_params, result_aux_params = amp.convert_model(sym, arg_params, aux_params, target_dtype_ops=["Convolution"], - fp32_ops=["SoftmaxOutput", "FullyConnected"], + fp32_ops=["FullyConnected"], cast_optional_params=True) # Run dummy inference with the converted symbolic model @@ -348,4 +348,3 @@ with mx.Context(mx.gpu(0)): ## Current limitations of AMP - AMP's dynamic loss scaling currently supports only Gluon trainer with `update_on_kvstore=False` option set -- Using `SoftmaxOutput`, `LinearRegressionOutput`, `LogisticRegressionOutput`, `MAERegressionOutput` with dynamic loss scaling does not work when training networks with multiple Gluon trainers and so multiple loss scales \ No newline at end of file diff --git a/docs/static_site/src/pages/api/faq/visualize_graph.md b/docs/static_site/src/pages/api/faq/visualize_graph.md deleted file mode 100644 index 8d477779b54f..000000000000 --- a/docs/static_site/src/pages/api/faq/visualize_graph.md +++ /dev/null @@ -1,88 +0,0 @@ ---- -layout: page_category -title: Visualize Neural Networks -category: faq -faq_c: Model -question: How do I visualize neural networks as computation graphs? -permalink: /api/faq/visualize_graph ---- - - - - - - - - - - - - - - - - -# How to visualize Neural Networks as computation graph - -Here, we'll demonstrate how to use ```mx.viz.plot_network``` -for visualizing your neural networks. ```mx.viz.plot_network``` -represents the neural network as a computation graph consisting of nodes and edges. -The visualizations make clear which nodes correspond to inputs, -where the computation starts, -and which correspond to output nodes, -from which the result can be read. - -## Prerequisites -You need the [Jupyter Notebook](http://jupyter.readthedocs.io/en/latest/) -and [Graphviz](https://www.graphviz.org/) libraries to visualize the network. -Please make sure you have followed [installation instructions]({{'get_started'|relative_url}}) -in setting up above dependencies along with setting up MXNet. - -## Visualize the sample Neural Network - -```mx.viz.plot_network``` takes [Symbol]({{'/api/python/docs/api/symbol/index'|relative}}), with your Network definition, and optional node_attrs, parameters for the shape of the node in the graph, as input and generates a computation graph. - -We will now try to visualize a sample Neural Network for linear matrix factorization: -- Start Jupyter notebook server -```bash - $ jupyter notebook -``` -- Access Jupyter notebook in your browser - http://localhost:8888/. -- Create a new notebook - "File -> New Notebook -> Python 2" -- Copy and run below code to visualize a simple network. 
-
-```python
-import mxnet as mx
-user = mx.symbol.Variable('user')
-item = mx.symbol.Variable('item')
-score = mx.symbol.Variable('score')
-
-# Set dummy dimensions
-k = 64
-max_user = 100
-max_item = 50
-
-# user feature lookup
-user = mx.symbol.Embedding(data = user, input_dim = max_user, output_dim = k)
-
-# item feature lookup
-item = mx.symbol.Embedding(data = item, input_dim = max_item, output_dim = k)
-
-# predict by the inner product, which is elementwise product and then sum
-net = user * item
-net = mx.symbol.sum_axis(data = net, axis = 1)
-net = mx.symbol.Flatten(data = net)
-
-# loss layer
-net = mx.symbol.LinearRegressionOutput(data = net, label = score)
-
-# Visualize your network
-mx.viz.plot_network(net)
-```
-You should see a computation graph something like the following image:
-
-
-# References
-* [Example MXNet Matrix Factorization](https://github.com/dmlc/mxnet/blob/master/example/recommenders/demo1-MF.ipynb)
-* [Visualizing CNN Architecture of MXNet Tutorials](http://josephpcohen.com/w/visualizing-cnn-architectures-side-by-side-with-mxnet/)
diff --git a/docs/static_site/src/pages/api/r/docs/tutorials/callback_function.md b/docs/static_site/src/pages/api/r/docs/tutorials/callback_function.md
deleted file mode 100644
index d74112db98b5..000000000000
--- a/docs/static_site/src/pages/api/r/docs/tutorials/callback_function.md
+++ /dev/null
@@ -1,278 +0,0 @@
----
-layout: page_api
-title: Callback Function
-is_tutorial: true
-tag: r
-permalink: /api/r/docs/tutorials/callback_function
----
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-Callback Function
-======================================
-
-This tutorial provides guidelines for using and writing callback functions,
-which can be very useful in model training.
-
-Model Training Example
-----------
-
-Let's begin with a small example. We can build and train a model with the following code:
-
-
- ```r
- library(mxnet)
- data(BostonHousing, package="mlbench")
- train.ind = seq(1, 506, 3)
- train.x = data.matrix(BostonHousing[train.ind, -14])
- train.y = BostonHousing[train.ind, 14]
- test.x = data.matrix(BostonHousing[-train.ind, -14])
- test.y = BostonHousing[-train.ind, 14]
- data <- mx.symbol.Variable("data")
- fc1 <- mx.symbol.FullyConnected(data, num_hidden=1)
- lro <- mx.symbol.LinearRegressionOutput(fc1)
- mx.set.seed(0)
- model <- mx.model.FeedForward.create(
- lro, X=train.x, y=train.y,
- eval.data=list(data=test.x, label=test.y),
- ctx=mx.cpu(), num.round=10, array.batch.size=20,
- learning.rate=2e-6, momentum=0.9, eval.metric=mx.metric.rmse)
- ```
-
- ```
- ## Auto detect layout of input matrix, use row major..
- ## Start training with 1 devices - ## [1] Train-rmse=16.063282524034 - ## [1] Validation-rmse=10.1766446093622 - ## [2] Train-rmse=12.2792375712573 - ## [2] Validation-rmse=12.4331776190813 - ## [3] Train-rmse=11.1984634005885 - ## [3] Validation-rmse=10.3303041888193 - ## [4] Train-rmse=10.2645236892904 - ## [4] Validation-rmse=8.42760407903415 - ## [5] Train-rmse=9.49711005504284 - ## [5] Validation-rmse=8.44557808483234 - ## [6] Train-rmse=9.07733734175182 - ## [6] Validation-rmse=8.33225500266177 - ## [7] Train-rmse=9.07884450847991 - ## [7] Validation-rmse=8.38827833418459 - ## [8] Train-rmse=9.10463850277417 - ## [8] Validation-rmse=8.37394452365264 - ## [9] Train-rmse=9.03977049028532 - ## [9] Validation-rmse=8.25927979725672 - ## [10] Train-rmse=8.96870685004475 - ## [10] Validation-rmse=8.19509291481822 - ``` - -We also provide two optional parameters, `batch.end.callback` and `epoch.end.callback`, which can provide great flexibility in model training. - -How to Use Callback Functions ---------- - -This package provides two callback functions: - -- `mx.callback.save.checkpoint` saves a checkpoint to files during each period iteration. - -```r - model <- mx.model.FeedForward.create( - lro, X=train.x, y=train.y, - eval.data=list(data=test.x, label=test.y), - ctx=mx.cpu(), num.round=10, array.batch.size=20, - learning.rate=2e-6, momentum=0.9, eval.metric=mx.metric.rmse, - epoch.end.callback = mx.callback.save.checkpoint("boston")) -``` - -``` - ## Auto detect layout of input matrix, use row major.. - ## Start training with 1 devices - ## [1] Train-rmse=19.1621424021617 - ## [1] Validation-rmse=20.721515592165 - ## Model checkpoint saved to boston-0001.params - ## [2] Train-rmse=13.5127391952367 - ## [2] Validation-rmse=14.1822123675007 - ## Model checkpoint saved to boston-0002.params -``` - - -- `mx.callback.log.train.metric` logs a training metric each period. You can use it either as a `batch.end.callback` or an -`epoch.end.callback`. - - -```r - model <- mx.model.FeedForward.create( - lro, X=train.x, y=train.y, - eval.data=list(data=test.x, label=test.y), - ctx=mx.cpu(), num.round=10, array.batch.size=20, - learning.rate=2e-6, momentum=0.9, eval.metric=mx.metric.rmse, - batch.end.callback = mx.callback.log.train.metric(5)) - ``` - -``` - ## Auto detect layout of input matrix, use row major.. - ## Start training with 1 devices - ## Batch [5] Train-rmse=17.6514558545416 - ## [1] Train-rmse=15.2879610219001 - ## [1] Validation-rmse=12.3332062820921 - ## Batch [5] Train-rmse=11.939392828565 - ## [2] Train-rmse=11.4382242547217 - ## [2] Validation-rmse=9.91176550103181 - ............ -``` - -You also can save the training and evaluation errors for later use by passing a reference class: - - - ```r - logger <- mx.metric.logger$new() - model <- mx.model.FeedForward.create( - lro, X=train.x, y=train.y, - eval.data=list(data=test.x, label=test.y), - ctx=mx.cpu(), num.round=10, array.batch.size=20, - learning.rate=2e-6, momentum=0.9, eval.metric=mx.metric.rmse, - epoch.end.callback = mx.callback.log.train.metric(5, logger)) - ``` - - ``` - ## Auto detect layout of input matrix, use row major.. - ## Start training with 1 devices - ## [1] Train-rmse=19.1083228733256 - ## [1] Validation-rmse=12.7150687428974 - ## [2] Train-rmse=15.7684378116157 - ## [2] Validation-rmse=14.8105319420491 - ............ 
- ``` - - ```r - head(logger$train) - ``` - - ``` - ## [1] 19.108323 15.768438 13.531470 11.386050 9.555477 9.351324 - ``` - - ```r - head(logger$eval) - ``` - - ``` - ## [1] 12.715069 14.810532 15.840361 10.898733 9.349706 9.363087 - ``` - -How to Write Your Own Callback Functions ----------- - -You can find the source code for the two callback functions on [GitHub](https://github.com/dmlc/mxnet/blob/master/R-package/R/callback.R) and use it as a template: - -Basically, all callback functions follow the following structure: - - - ```r - mx.callback.fun <- function() { - function(iteration, nbatch, env) { - } - } - ``` - -The following `mx.callback.save.checkpoint` function is stateless. It gets the model from the environment and saves it:. - - - ```r - mx.callback.save.checkpoint <- function(prefix, period=1) { - function(iteration, nbatch, env) { - if (iteration %% period == 0) { - mx.model.save(env$model, prefix, iteration) - cat(sprintf("Model checkpoint saved to %s-%04d.params\n", prefix, iteration)) - } - return(TRUE) - } - } - ``` - -The `mx.callback.log.train.metric` is a little more complex. It holds a reference class and updates it during the training -process: - - - ```r - mx.callback.log.train.metric <- function(period, logger=NULL) { - function(iteration, nbatch, env) { - if (nbatch %% period == 0 && !is.null(env$metric)) { - result <- env$metric$get(env$train.metric) - if (nbatch != 0) - cat(paste0("Batch [", nbatch, "] Train-", result$name, "=", result$value, "\n")) - if (!is.null(logger)) { - if (class(logger) != "mx.metric.logger") { - stop("Invalid mx.metric.logger.") - } - logger$train <- c(logger$train, result$value) - if (!is.null(env$eval.metric)) { - result <- env$metric$get(env$eval.metric) - if (nbatch != 0) - cat(paste0("Batch [", nbatch, "] Validation-", result$name, "=", result$value, "\n")) - logger$eval <- c(logger$eval, result$value) - } - } - } - return(TRUE) - } - } - ``` - -Now you might be curious why both callback functions `return(TRUE)`. - -Can we `return(FALSE)`? - -Yes! You can stop the training early with `return(FALSE)`. See the following examples. - - - ```r - mx.callback.early.stop <- function(eval.metric) { - function(iteration, nbatch, env) { - if (!is.null(env$metric)) { - if (!is.null(eval.metric)) { - result <- env$metric$get(env$eval.metric) - if (result$value < eval.metric) { - return(FALSE) - } - } - } - return(TRUE) - } - } - model <- mx.model.FeedForward.create( - lro, X=train.x, y=train.y, - eval.data=list(data=test.x, label=test.y), - ctx=mx.cpu(), num.round=10, array.batch.size=20, - learning.rate=2e-6, momentum=0.9, eval.metric=mx.metric.rmse, - epoch.end.callback = mx.callback.early.stop(10)) - ``` - - ``` - ## Auto detect layout of input matrix, use row major.. - ## Start training with 1 devices - ## [1] Train-rmse=18.5897984387033 - ## [1] Validation-rmse=13.5555213820571 - ## [2] Train-rmse=12.5867564040256 - ## [2] Validation-rmse=9.76304967080928 - ``` - -When the validation metric dips below the threshold we set, the training process stops. 
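Building on the structure above, the following is a minimal sketch of a patience-based variant: instead of comparing the validation metric to a fixed threshold, it stops when the metric has not improved for a given number of consecutive epochs. It assumes only the callback environment fields already used in this tutorial (`env$metric` and `env$eval.metric`) and that a lower metric value is better (as with RMSE); the `patience` argument and the function name are illustrative, not part of the MXNet R API.

```r
# A sketch of patience-based early stopping: stop when the validation metric
# has not improved for `patience` consecutive epochs (lower is assumed better).
# Only env$metric and env$eval.metric, as used above, are assumed to exist.
mx.callback.early.stop.patience <- function(patience = 3) {
  best <- Inf
  wait <- 0
  function(iteration, nbatch, env) {
    if (!is.null(env$metric) && !is.null(env$eval.metric)) {
      value <- env$metric$get(env$eval.metric)$value
      if (value < best) {
        best <<- value   # record the best validation metric seen so far
        wait <<- 0
      } else {
        wait <<- wait + 1
      }
      if (wait >= patience) {
        return(FALSE)    # returning FALSE stops training early
      }
    }
    return(TRUE)
  }
}
```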
- -## Next Steps -* [Neural Networks with MXNet in Five Minutes](/api/r/docs/tutorials/five_minutes_neural_network) -* [Classify Real-World Images with a Pretrained Model](/api/r/docs/tutorials/classify_real_image_with_pretrained_model) -* [Handwritten Digits Classification Competition](/api/r/docs/tutorials/mnist_competition) -* [Character Language Model Using RNN](/api/r/docs/tutorials/char_rnn_model) diff --git a/docs/static_site/src/pages/api/r/docs/tutorials/custom_loss_function.md b/docs/static_site/src/pages/api/r/docs/tutorials/custom_loss_function.md deleted file mode 100644 index a4ca967d8e2c..000000000000 --- a/docs/static_site/src/pages/api/r/docs/tutorials/custom_loss_function.md +++ /dev/null @@ -1,231 +0,0 @@ ---- -layout: page_api -title: Custom Loss Function -is_tutorial: true -tag: r -permalink: /api/r/docs/tutorials/custom_loss_function ---- - - - - - - - - - - - - - - - - - - -Customized loss function -====================================== - -This tutorial provides guidelines for using customized loss function in network construction. - -Model Training Example ----------------------- - -Let's begin with a small regression example. We can build and train a regression model with the following code: - -``` r -data(BostonHousing, package = "mlbench") -BostonHousing[, sapply(BostonHousing, is.factor)] <- - as.numeric(as.character(BostonHousing[, sapply(BostonHousing, is.factor)])) -BostonHousing <- data.frame(scale(BostonHousing)) - -test.ind = seq(1, 506, 5) # 1 pt in 5 used for testing -train.x = data.matrix(BostonHousing[-test.ind,-14]) -train.y = BostonHousing[-test.ind, 14] -test.x = data.matrix(BostonHousing[--test.ind,-14]) -test.y = BostonHousing[--test.ind, 14] - -require(mxnet) -``` - - ## Loading required package: mxnet - -``` r -data <- mx.symbol.Variable("data") -label <- mx.symbol.Variable("label") -fc1 <- mx.symbol.FullyConnected(data, num_hidden = 14, name = "fc1") -tanh1 <- mx.symbol.Activation(fc1, act_type = "tanh", name = "tanh1") -fc2 <- mx.symbol.FullyConnected(tanh1, num_hidden = 1, name = "fc2") -lro <- mx.symbol.LinearRegressionOutput(fc2, name = "lro") - -mx.set.seed(0) -model <- mx.model.FeedForward.create(lro, X = train.x, y = train.y, - ctx = mx.cpu(), - num.round = 5, - array.batch.size = 60, - optimizer = "rmsprop", - verbose = TRUE, - array.layout = "rowmajor", - batch.end.callback = NULL, - epoch.end.callback = NULL) -``` - - ## Start training with 1 devices - -``` r -pred <- predict(model, test.x) -``` - - ## Warning in mx.model.select.layout.predict(X, model): Auto detect layout of input matrix, use rowmajor.. - -``` r -sum((test.y - pred[1,])^2) / length(test.y) -``` - - ## [1] 0.2485236 - -Besides the `LinearRegressionOutput`, we also provide `LogisticRegressionOutput` and `MAERegressionOutput`. However, this might not be enough for real-world models. You can provide your own loss function by using `mx.symbol.MakeLoss` when constructing the network. 
- -How to Use Your Own Loss Function ---------------------------------- - -We still use our previous example, but this time we use `mx.symbol.MakeLoss` to minimize the `(pred-label)^2` - -``` r -data <- mx.symbol.Variable("data") -label <- mx.symbol.Variable("label") -fc1 <- mx.symbol.FullyConnected(data, num_hidden = 14, name = "fc1") -tanh1 <- mx.symbol.Activation(fc1, act_type = "tanh", name = "tanh1") -fc2 <- mx.symbol.FullyConnected(tanh1, num_hidden = 1, name = "fc2") -lro2 <- mx.symbol.MakeLoss(mx.symbol.square(mx.symbol.Reshape(fc2, shape = 0) - label), name="lro2") -``` - -Then we can train the network just as usual. - -``` r -mx.set.seed(0) -model2 <- mx.model.FeedForward.create(lro2, X = train.x, y = train.y, - ctx = mx.cpu(), - num.round = 5, - array.batch.size = 60, - optimizer = "rmsprop", - verbose = TRUE, - array.layout = "rowmajor", - batch.end.callback = NULL, - epoch.end.callback = NULL) -``` - - ## Start training with 1 devices - -We should get very similar results because we are actually minimizing the same loss function. However, the result is quite different. - -``` r -pred2 <- predict(model2, test.x) -``` - - ## Warning in mx.model.select.layout.predict(X, model): Auto detect layout of input matrix, use rowmajor.. - -``` r -sum((test.y - pred2)^2) / length(test.y) -``` - - ## [1] 1.234584 - -This is because output of `mx.symbol.MakeLoss` is the gradient of loss with respect to the input data. We can get the real prediction as below. - -``` r -internals = internals(model2$symbol) -fc_symbol = internals[[match("fc2_output", outputs(internals))]] - -model3 <- list(symbol = fc_symbol, - arg.params = model2$arg.params, - aux.params = model2$aux.params) - -class(model3) <- "MXFeedForwardModel" - -pred3 <- predict(model3, test.x) -``` - - ## Warning in mx.model.select.layout.predict(X, model): Auto detect layout of input matrix, use rowmajor.. - -``` r -sum((test.y - pred3[1,])^2) / length(test.y) -``` - - ## [1] 0.248294 - -We have provided many operations on the symbols. An example of `|pred-label|` can be found below. - -``` r -lro_abs <- mx.symbol.MakeLoss(mx.symbol.abs(mx.symbol.Reshape(fc2, shape = 0) - label)) -mx.set.seed(0) -model4 <- mx.model.FeedForward.create(lro_abs, X = train.x, y = train.y, - ctx = mx.cpu(), - num.round = 20, - array.batch.size = 60, - optimizer = "sgd", - learning.rate = 0.001, - verbose = TRUE, - array.layout = "rowmajor", - batch.end.callback = NULL, - epoch.end.callback = NULL) -``` - - ## Start training with 1 devices - -``` r -internals = internals(model4$symbol) -fc_symbol = internals[[match("fc2_output", outputs(internals))]] - -model5 <- list(symbol = fc_symbol, - arg.params = model4$arg.params, - aux.params = model4$aux.params) - -class(model5) <- "MXFeedForwardModel" - -pred5 <- predict(model5, test.x) -``` - - ## Warning in mx.model.select.layout.predict(X, model): Auto detect layout of input matrix, use rowmajor.. 
- -``` r -sum(abs(test.y - pred5[1,])) / length(test.y) -``` - - ## [1] 0.7056902 - -``` r -lro_mae <- mx.symbol.MAERegressionOutput(fc2, name = "lro") -mx.set.seed(0) -model6 <- mx.model.FeedForward.create(lro_mae, X = train.x, y = train.y, - ctx = mx.cpu(), - num.round = 20, - array.batch.size = 60, - optimizer = "sgd", - learning.rate = 0.001, - verbose = TRUE, - array.layout = "rowmajor", - batch.end.callback = NULL, - epoch.end.callback = NULL) -``` - - ## Start training with 1 devices - -``` r -pred6 <- predict(model6, test.x) -``` - - ## Warning in mx.model.select.layout.predict(X, model): Auto detect layout of input matrix, use rowmajor.. - -``` r -sum(abs(test.y - pred6[1,])) / length(test.y) -``` - - ## [1] 0.7056902 - - -## Next Steps -* [Neural Networks with MXNet in Five Minutes](/api/r/docs/tutorials/five_minutes_neural_network) -* [Classify Real-World Images with a PreTrained Model](/api/r/docs/tutorials/classify_real_image_with_pretrained_model) -* [Handwritten Digits Classification Competition](/api/r/docs/tutorials/mnist_competition) -* [Character Language Model Using RNN](/api/r/docs/tutorials/char_rnn_model) diff --git a/docs/static_site/src/pages/api/r/docs/tutorials/five_minutes_neural_network.md b/docs/static_site/src/pages/api/r/docs/tutorials/five_minutes_neural_network.md deleted file mode 100644 index f7121407892e..000000000000 --- a/docs/static_site/src/pages/api/r/docs/tutorials/five_minutes_neural_network.md +++ /dev/null @@ -1,341 +0,0 @@ ---- -layout: page_api -title: Five Minutes Neural Network -is_tutorial: true -tag: r -permalink: /api/r/docs/tutorials/five_minutes_neural_network ---- - - - - - - - - - - - - - - - - - -Develop a Neural Network with MXNet in Five Minutes -============================================= - -This tutorial is designed for new users of the `mxnet` package for R. It shows how to construct a neural network to do regression in 5 minutes. It shows how to perform classification and regression tasks, respectively. The data we use is in the `mlbench` package. Instructions to install R and MXNet's R package in different environments can be found [here](/get_started?version=master&platform=linux&language=r&environ=pip&processor=cpu). - -## Classification - - ``` - ## Loading required package: mlbench - ``` - ```r - if (!require(mlbench)) { - install.packages('mlbench') - } - ``` - - ``` - ## Loading required package: mxnet - ``` - - ```r - require(mxnet) - ``` - - ``` - ## Loading required datasets - ``` - - ```r - data(Sonar, package="mlbench") - - Sonar[,61] = as.numeric(Sonar[,61])-1 - train.ind = c(1:50, 100:150) - train.x = data.matrix(Sonar[train.ind, 1:60]) - train.y = Sonar[train.ind, 61] - test.x = data.matrix(Sonar[-train.ind, 1:60]) - test.y = Sonar[-train.ind, 61] - ``` - -We are going to use a multi-layer perceptron as our classifier. In `mxnet`, we have a function called `mx.mlp` for building a general multi-layer neural network to do classification or regression. 
-
-`mx.mlp` requires the following parameters:
-
-- Training data and label
-- Number of hidden nodes in each hidden layer
-- Number of nodes in the output layer
-- Type of the activation
-- Type of the output loss
-- The device to train (GPU or CPU)
-- Other parameters for `mx.model.FeedForward.create`
-
-The following code shows an example usage of `mx.mlp`:
-
-
- ```r
- mx.set.seed(0)
- model <- mx.mlp(train.x, train.y, hidden_node=10, out_node=2, out_activation="softmax",
- num.round=20, array.batch.size=15, learning.rate=0.07, momentum=0.9,
- eval.metric=mx.metric.accuracy)
- ```
-
- ```
- ## Auto detect layout of input matrix, use rowmajor..
- ## Start training with 1 devices
- ## [1] Train-accuracy=0.488888888888889
- ## [2] Train-accuracy=0.514285714285714
- ## [3] Train-accuracy=0.514285714285714
- ## [4] Train-accuracy=0.514285714285714
- ## [5] Train-accuracy=0.514285714285714
- ## [6] Train-accuracy=0.523809523809524
- ## [7] Train-accuracy=0.619047619047619
- ## [8] Train-accuracy=0.695238095238095
- ## [9] Train-accuracy=0.695238095238095
- ## [10] Train-accuracy=0.761904761904762
- ## [11] Train-accuracy=0.828571428571429
- ## [12] Train-accuracy=0.771428571428571
- ## [13] Train-accuracy=0.742857142857143
- ## [14] Train-accuracy=0.733333333333333
- ## [15] Train-accuracy=0.771428571428571
- ## [16] Train-accuracy=0.847619047619048
- ## [17] Train-accuracy=0.857142857142857
- ## [18] Train-accuracy=0.838095238095238
- ## [19] Train-accuracy=0.838095238095238
- ## [20] Train-accuracy=0.838095238095238
- ```
-
-Note that `mx.set.seed` controls the random process in `mxnet`. You can see the accuracy in each round during training. It's also easy to make predictions and evaluate.
-
-To get an idea of what is happening, view the computation graph from R:
-
- ```r
- graph.viz(model$symbol)
- ```
-
-[](https://github.com/dmlc/mxnet)
-
- ```r
- preds = predict(model, test.x)
- ```
-
- ```
- ## Auto detect layout of input matrix, use rowmajor.
- ```
-
- ```r
- pred.label = max.col(t(preds))-1
- table(pred.label, test.y)
- ```
-
- ```
- ## test.y
- ## pred.label 0 1
- ## 0 24 14
- ## 1 36 33
- ```
-
-Note that for multi-class predictions, mxnet outputs `nclass` x `nexamples`, with each row corresponding to the probability of the class.
-
-## Regression
-
-Again, let us preprocess the data:
-
-
- ```r
- data(BostonHousing, package="mlbench")
-
- train.ind = seq(1, 506, 3)
- train.x = data.matrix(BostonHousing[train.ind, -14])
- train.y = BostonHousing[train.ind, 14]
- test.x = data.matrix(BostonHousing[-train.ind, -14])
- test.y = BostonHousing[-train.ind, 14]
- ```
-
-Although we can use `mx.mlp` again to do regression by changing the `out_activation`, this time we are going to introduce a flexible way to configure neural networks in `mxnet`. Configuration is done by the "Symbol" system in `mxnet`. The Symbol system takes care of the links among nodes, activation, dropout ratio, etc. Configure a multi-layer neural network as follows:
-
-
- ```r
- # Define the input data
- data <- mx.symbol.Variable("data")
- # A fully connected hidden layer
- # data: input source
- # num_hidden: number of neurons in this hidden layer
- fc1 <- mx.symbol.FullyConnected(data, num_hidden=1)
-
- # Use linear regression for the output layer
- lro <- mx.symbol.LinearRegressionOutput(fc1)
- ```
-
-What matters for a regression task is mainly the last function. It enables the new network to optimize for squared loss. Now let's train on this simple data set.
In this configuration, we dropped the hidden layer so that the input layer is directly connected to the output layer. - -Next, make prediction with this structure and other parameters with `mx.model.FeedForward.create`: - - - ```r - mx.set.seed(0) - model <- mx.model.FeedForward.create(lro, X=train.x, y=train.y, - ctx=mx.cpu(), num.round=50, array.batch.size=20, - learning.rate=2e-6, momentum=0.9, eval.metric=mx.metric.rmse) - ``` - - ``` - ## Auto detect layout of input matrix, use rowmajor. - ## Start training with 1 devices - ## [1] Train-rmse=16.063282524034 - ## [2] Train-rmse=12.2792375712573 - ## [3] Train-rmse=11.1984634005885 - ## [4] Train-rmse=10.2645236892904 - ## [5] Train-rmse=9.49711005504284 - ## [6] Train-rmse=9.07733734175182 - ## [7] Train-rmse=9.07884450847991 - ## [8] Train-rmse=9.10463850277417 - ## [9] Train-rmse=9.03977049028532 - ## [10] Train-rmse=8.96870685004475 - ## [11] Train-rmse=8.93113287361574 - ## [12] Train-rmse=8.89937257821847 - ## [13] Train-rmse=8.87182096922953 - ## [14] Train-rmse=8.84476075083586 - ## [15] Train-rmse=8.81464673014974 - ## [16] Train-rmse=8.78672567900196 - ## [17] Train-rmse=8.76265872846474 - ## [18] Train-rmse=8.73946101419974 - ## [19] Train-rmse=8.71651926303267 - ## [20] Train-rmse=8.69457600919277 - ## [21] Train-rmse=8.67354928674563 - ## [22] Train-rmse=8.65328755392436 - ## [23] Train-rmse=8.63378039680078 - ## [24] Train-rmse=8.61488162586984 - ## [25] Train-rmse=8.5965105183022 - ## [26] Train-rmse=8.57868133563275 - ## [27] Train-rmse=8.56135851937663 - ## [28] Train-rmse=8.5444819772098 - ## [29] Train-rmse=8.52802114610432 - ## [30] Train-rmse=8.5119504512622 - ## [31] Train-rmse=8.49624261719241 - ## [32] Train-rmse=8.48087453238701 - ## [33] Train-rmse=8.46582689119887 - ## [34] Train-rmse=8.45107881002491 - ## [35] Train-rmse=8.43661331401712 - ## [36] Train-rmse=8.42241575909639 - ## [37] Train-rmse=8.40847217331365 - ## [38] Train-rmse=8.39476931796395 - ## [39] Train-rmse=8.38129658373974 - ## [40] Train-rmse=8.36804269059018 - ## [41] Train-rmse=8.35499817678397 - ## [42] Train-rmse=8.34215505742154 - ## [43] Train-rmse=8.32950441908131 - ## [44] Train-rmse=8.31703985777311 - ## [45] Train-rmse=8.30475363906755 - ## [46] Train-rmse=8.29264031506106 - ## [47] Train-rmse=8.28069372820073 - ## [48] Train-rmse=8.26890902770415 - ## [49] Train-rmse=8.25728089053853 - ## [50] Train-rmse=8.24580511500735 - ``` - -It's also easy to make a prediction and evaluate it: - - - ```r - preds = predict(model, test.x) - ``` - - ``` - ## Auto detect layout of input matrix, use rowmajor.. - ``` - - ```r - sqrt(mean((preds-test.y)^2)) - ``` - - ``` - ## [1] 7.800502 - ``` - -Currently, we have four predefined metrics: "accuracy", "rmse", "mae", and "rmsle". MXNet provides the interface for defining your own metrics: - - - ```r - demo.metric.mae <- mx.metric.custom("mae", function(label, pred) { - pred <- mx.nd.reshape(pred, shape = 0) - res <- mx.nd.mean(mx.nd.abs(label-pred)) - return(res) - }) - ``` - -This is an example of the mean absolute error metric. Simply plug it into the training function: - - - ```r - mx.set.seed(0) - model <- mx.model.FeedForward.create(lro, X=train.x, y=train.y, - ctx=mx.cpu(), num.round=50, array.batch.size=20, - learning.rate=2e-6, momentum=0.9, eval.metric=demo.metric.mae) - ``` - - ``` - ## Auto detect layout of input matrix, use rowmajor. 
- ## Start training with 1 devices - ## [1] Train-mae=14.953625731998 - ## [2] Train-mae=11.4802955521478 - ## [3] Train-mae=8.50700579749213 - ## [4] Train-mae=7.30591265360514 - ## [5] Train-mae=7.38049803839789 - ## [6] Train-mae=7.36036252975464 - ## [7] Train-mae=7.06519222259521 - ## [8] Train-mae=6.9962231847975 - ## [9] Train-mae=6.96296903822157 - ## [10] Train-mae=6.9046172036065 - ## [11] Train-mae=6.87867620256212 - ## [12] Train-mae=6.85872554779053 - ## [13] Train-mae=6.81936407089233 - ## [14] Train-mae=6.79135354359945 - ## [15] Train-mae=6.77438741260105 - ## [16] Train-mae=6.75365140702989 - ## [17] Train-mae=6.73369296391805 - ## [18] Train-mae=6.71600982877943 - ## [19] Train-mae=6.69932826360067 - ## [20] Train-mae=6.6852519777086 - ## [21] Train-mae=6.67343420452542 - ## [22] Train-mae=6.66315894656711 - ## [23] Train-mae=6.65314838621351 - ## [24] Train-mae=6.64388704299927 - ## [25] Train-mae=6.63480265935262 - ## [26] Train-mae=6.62583245171441 - ## [27] Train-mae=6.61697626113892 - ## [28] Train-mae=6.60842116673787 - ## [29] Train-mae=6.60040124257406 - ## [30] Train-mae=6.59264140658908 - ## [31] Train-mae=6.58551020092434 - ## [32] Train-mae=6.57864215638902 - ## [33] Train-mae=6.57178926467896 - ## [34] Train-mae=6.56495311525133 - ## [35] Train-mae=6.55813185373942 - ## [36] Train-mae=6.5513252152337 - ## [37] Train-mae=6.54453214009603 - ## [38] Train-mae=6.53775374094645 - ## [39] Train-mae=6.53098879920112 - ## [40] Train-mae=6.52423816257053 - ## [41] Train-mae=6.51764053768582 - ## [42] Train-mae=6.51121346155802 - ## [43] Train-mae=6.5047902001275 - ## [44] Train-mae=6.49837123023139 - ## [45] Train-mae=6.49216641320123 - ## [46] Train-mae=6.48598252402412 - ## [47] Train-mae=6.4798010720147 - ## [48] Train-mae=6.47362396452162 - ## [49] Train-mae=6.46745183732775 - ## [50] Train-mae=6.46128723356459 - ``` - -Congratulations! You've learned the basics for using MXNet in R. To learn how to use MXNet's advanced features, see the other tutorials. - - -## Next Steps -* [Classify Real-World Images with Pre-trained Model](https://mxnet.io/tutorials/r/classifyRealImageWithPretrainedModel.html) -* [Handwritten Digits Classification Competition](https://mxnet.io/tutorials/r/mnistCompetition.html) -* [Character Language Model using RNN](https://mxnet.io/tutorials/r/charRnnModel.html) diff --git a/docs/static_site/src/pages/api/r/docs/tutorials/ndarray.md b/docs/static_site/src/pages/api/r/docs/tutorials/ndarray.md index dc3d1c5a028e..92f5b09c4571 100644 --- a/docs/static_site/src/pages/api/r/docs/tutorials/ndarray.md +++ b/docs/static_site/src/pages/api/r/docs/tutorials/ndarray.md @@ -224,8 +224,6 @@ the results. 
## Next Steps * [Symbol](/api/r/docs/tutorials/symbol) -* [Write and use callback functions](/api/r/docs/tutorials/callback_function) -* [Neural Networks with MXNet in Five Minutes](/api/r/docs/tutorials/five_minutes_neural_network) * [Classify Real-World Images with Pre-trained Model](/api/r/docs/tutorials/classify_real_image_with_pretrained_model) * [Handwritten Digits Classification Competition](/api/r/docs/tutorials/mnist_competition) * [Character Language Model using RNN](/api/r/docs/tutorials/char_rnn_model) diff --git a/docs/static_site/src/pages/api/r/docs/tutorials/symbol.md b/docs/static_site/src/pages/api/r/docs/tutorials/symbol.md index 5827d6f9b50c..554ccb77ecbf 100644 --- a/docs/static_site/src/pages/api/r/docs/tutorials/symbol.md +++ b/docs/static_site/src/pages/api/r/docs/tutorials/symbol.md @@ -147,8 +147,6 @@ be more memory efficient than CXXNet and gets to the same runtime with greater flexibility. ## Next Steps -* [Write and use callback functions](/api/r/docs/tutorials/callback_function) -* [Neural Networks with MXNet in Five Minutes](/api/r/docs/tutorials/five_minutes_neural_network) * [Classify Real-World Images with Pre-trained Model](/api/r/docs/tutorials/classify_real_image_with_pretrained_model) * [Handwritten Digits Classification Competition](/api/r/docs/tutorials/mnist_competition) * [Character Language Model using RNN](/api/r/docs/tutorials/char_rnn_model) diff --git a/example/recommenders/demo-MF.R b/example/recommenders/demo-MF.R deleted file mode 100644 index 82c0aae9c62c..000000000000 --- a/example/recommenders/demo-MF.R +++ /dev/null @@ -1,84 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
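# Summary of this example: a matrix-factorization recommender trained on the
# MovieLens 100k ratings file (./ml-100k/u.data). It learns k = 64 dimensional
# user and item embeddings, predicts a rating as the inner product of the two
# embeddings, and optimizes squared loss via LinearRegressionOutput, reporting
# RMSE during training. A small custom iterator pairs one array iterator for
# users with another for items so that the network receives both inputs.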
- -library(mxnet) -DF <- read.table("./ml-100k/u.data", header = F, sep = "\t") -names(DF) <- c("user", "item", "score", "time") -max_user <- max(DF$user) -max_item <- max(DF$item) -DF_mat_x <- data.matrix(t(DF[, 1:2])) -DF_y <- DF[, 3] -k <- 64 -user <- mx.symbol.Variable("user") -item <- mx.symbol.Variable("item") -score <- mx.symbol.Variable("label") -user1 <-mx.symbol.Embedding(data = mx.symbol.BlockGrad(user), input_dim = max_user, - output_dim = k, name = "user1") -item1 <- mx.symbol.Embedding(data = mx.symbol.BlockGrad(item), input_dim = max_item, - output_dim = k, name = "item1") -pred <- user1 * item1 -pred1 <- mx.symbol.sum_axis(pred, axis = 1, name = "pred1") -pred2 <- mx.symbol.Flatten(pred1, name = "pred2") -pred3 <- mx.symbol.LinearRegressionOutput(data = pred2, label = score, name = "pred3") -devices <- mx.cpu() -mx.set.seed(123) - -CustomIter <- setRefClass("CustomIter", fields = c("iter1", "iter2"), - contains = "Rcpp_MXArrayDataIter", - methods = list( - initialize = function(iter1, iter2) { - .self$iter1 <- iter1 - .self$iter2 <- iter2 - .self - }, - value = function() { - user <- .self$iter1$value()$data - item <- .self$iter2$value()$data - label <- .self$iter1$value()$label - list(user = user, - item = item, - label = label) - }, - iter.next = function() { - .self$iter1$iter.next() - .self$iter2$iter.next() - }, - reset = function() { - .self$iter1$reset() - .self$iter2$reset() - }, - num.pad = function() { - .self$iter1$num.pad() - }, - finalize = function() { - .self$iter1$finalize() - .self$iter2$finalize() - } - ) -) - -user_iter = mx.io.arrayiter(data = DF[, 1], label = DF[, 3], batch.size = k) - -item_iter = mx.io.arrayiter(data = DF[, 2], label = DF[, 3], batch.size = k) - -train_iter <- CustomIter$new(user_iter, item_iter) - -model <- mx.model.FeedForward.create(pred3, X = train_iter, ctx = devices, - num.round = 10, initializer = mx.init.uniform(0.07), - learning.rate = 0.07, eval.metric = mx.metric.rmse, - momentum = 0.9, epoch.end.callback = mx.callback.log.train.metric(1), - input.names = c("user", "item"), output.names = "label") diff --git a/julia/examples/regression-example.jl b/julia/examples/regression-example.jl deleted file mode 100644 index bbbb415fe664..000000000000 --- a/julia/examples/regression-example.jl +++ /dev/null @@ -1,101 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -#= -This script shows how a simple MLP net may be used -for regression. It shows how data in memory may be -used for training and evaluation, and how to obtain -the predictions from the trained net. 
-=# -using MXNet -using Distributions -#using Plots - -# data generating process -generate_inputs(mean, var, size) = rand(MvNormal(mean, var), size) -output(data) = sin.(data[1:1,:]).*sin.(data[2:2,:])./(data[1:1,:].*data[2:2,:]) - -# create training and evaluation data sets -mean=[0.0; 0.0] -var=[1.0 0.0; 0.0 1.0] -samplesize = 5000 -TrainInput = generate_inputs(mean, var, samplesize) -TrainOutput = output(TrainInput) -ValidationInput = generate_inputs(mean, var, samplesize) -ValidationOutput = output(ValidationInput) - -# how to set up data providers using data in memory -function data_source(batchsize = 100) - train = mx.ArrayDataProvider( - :data => TrainInput, - :label => TrainOutput, - batch_size = batchsize, - shuffle = true, - ) - valid = mx.ArrayDataProvider( - :data => ValidationInput, - :label => ValidationOutput, - batch_size = batchsize, - shuffle = true, - ) - - train, valid -end - -# create a two hidden layer MPL: try varying num_hidden, and change tanh to relu, -# or add/remove a layer -data = mx.Variable(:data) -label = mx.Variable(:label) -net = @mx.chain mx.Variable(:data) => - mx.FullyConnected(num_hidden=10) => - mx.Activation(act_type=:tanh) => - mx.FullyConnected(num_hidden=3) => - mx.Activation(act_type=:tanh) => - mx.FullyConnected(num_hidden=1) => - mx.LinearRegressionOutput(mx.Variable(:label)) - -# final model definition, don't change, except if using gpu -model = mx.FeedForward(net, context=mx.cpu()) - -# set up the optimizer: select one, explore parameters, if desired -#optimizer = mx.SGD(η=0.01, μ=0.9, λ=0.00001) -optimizer = mx.ADAM() - -# train, reporting loss for training and evaluation sets -# initial training with small batch size, to get to a good neighborhood -trainprovider, evalprovider = data_source(#= batchsize =# 200) -mx.fit(model, optimizer, trainprovider, - initializer = mx.NormalInitializer(0.0, 0.1), - eval_metric = mx.MSE(), - eval_data = evalprovider, - n_epoch = 20, - callbacks = [mx.speedometer()]) -# more training with the full sample -trainprovider, evalprovider = data_source(#= batchsize =# samplesize) -mx.fit(model, optimizer, trainprovider, - initializer = mx.NormalInitializer(0.0, 0.1), - eval_metric = mx.MSE(), - eval_data = evalprovider, - n_epoch = 500, # previous setting is batchsize = 200, epoch = 20 - # implies we did (5000 / 200) * 20 times update in previous `fit` - callbacks = [mx.speedometer()]) - -# obtain predictions -plotprovider = mx.ArrayDataProvider(:data => ValidationInput, :label => ValidationOutput) -fit = mx.predict(model, plotprovider) -println("correlation between fitted values and true regression line: ", cor(vec(fit), vec(ValidationOutput))) -#scatter(ValidationOutput',fit',w = 3, xlabel="true", ylabel="predicted", title="45º line is what we hope for", show=true) diff --git a/julia/test/unittest/symbolic-node.jl b/julia/test/unittest/symbolic-node.jl index 07ef05f704db..9c09515f365c 100644 --- a/julia/test/unittest/symbolic-node.jl +++ b/julia/test/unittest/symbolic-node.jl @@ -34,28 +34,6 @@ function test_basic() @test mx.list_auxiliary_states(model) == Symbol[] end -function test_chain() - @info("SymbolicNode::chain") - - model = mlpchain() - @test mx.list_arguments(model) == [:data,:fc1_weight,:fc1_bias,:fc2_weight,:fc2_bias] - @test mx.list_outputs(model) == [:fc2_output] - @test mx.list_auxiliary_states(model) == Symbol[] - - let layerconfig = [20, 10, 6] - model = @mx.chain mx.Variable(:data) => - mx.MLP(layerconfig, prefix=:magic_) => - mx.LinearRegressionOutput(mx.Variable(:label)) - - @test 
mx.list_arguments(model) == [ - :data, - :magic_fc1_weight, :magic_fc1_bias, - :magic_fc2_weight, :magic_fc2_bias, - :magic_fc3_weight, :magic_fc3_bias, - :label] - end -end - function test_internal() @info("SymbolicNode::internal") diff --git a/perl-package/AI-MXNet/examples/sparse/matrix_factorization/README.md b/perl-package/AI-MXNet/examples/sparse/matrix_factorization/README.md deleted file mode 100644 index 3eb1bab508e5..000000000000 --- a/perl-package/AI-MXNet/examples/sparse/matrix_factorization/README.md +++ /dev/null @@ -1,26 +0,0 @@ - - - - - - - - - - - - - - - - - -Matrix Factorization w/ Sparse Embedding -=========== -The example demonstrates the basic usage of the SparseEmbedding operator in MXNet, adapted based on @leopd's recommender examples. -The operator is available on both CPU and GPU. This is for demonstration purpose only. - -- get_data.sh -- perl train.pl -- To compare the training speed with (dense) Embedding, run perl train.pl --use-dense 1 -- To run the example on the GPU, run perl train.pl --use-gpu 1 diff --git a/perl-package/AI-MXNet/examples/sparse/matrix_factorization/get_data.sh b/perl-package/AI-MXNet/examples/sparse/matrix_factorization/get_data.sh deleted file mode 100755 index b8b14e136d17..000000000000 --- a/perl-package/AI-MXNet/examples/sparse/matrix_factorization/get_data.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -DIR=$(cd `dirname $0`; pwd) -DATA_DIR="${DIR}/data/" - -if [[ ! -d "${DATA_DIR}" ]]; then - echo "${DATA_DIR} doesn't exist, will create one"; - mkdir -p ${DATA_DIR} -fi - -wget -P ${DATA_DIR} http://files.grouplens.org/datasets/movielens/ml-10m.zip -cd ${DATA_DIR} -unzip ml-10m.zip -cd ml-10M100K -chmod +x allbut.pl -sh split_ratings.sh \ No newline at end of file diff --git a/perl-package/AI-MXNet/examples/sparse/matrix_factorization/train.pl b/perl-package/AI-MXNet/examples/sparse/matrix_factorization/train.pl deleted file mode 100755 index fa6a76376b8e..000000000000 --- a/perl-package/AI-MXNet/examples/sparse/matrix_factorization/train.pl +++ /dev/null @@ -1,184 +0,0 @@ -#!/usr/bin/env perl - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -use strict; -use warnings; -use AI::MXNet qw(mx); -use Getopt::Long qw(HelpMessage); - -GetOptions( - 'print-every=i' => \(my $print_every = 100), - 'factor-size=i' => \(my $factor_size = 128), - 'use-gpu=i' => \(my $use_gpu = 0 ), - 'num-epoch=i' => \(my $num_epoch = 3 ), - 'batch-size=i' => \(my $batch_size = 128), - 'use-dense=i' => \(my $use_dense = 0 ), - 'help' => sub { HelpMessage(0) }, -) or HelpMessage(1); - -=head1 NAME - - train.pl - Run matrix factorization with sparse embedding - -=head1 SYNOPSIS - - --print-every logging frequency, 100 - --factor-size the factor size of the embedding operation, 128 - --use-gpu use gpu, 0 - --num-epoch number of epochs to train, 3 - --batch-size number of examples per batch, 128 - --use-dense use the dense embedding operator, 0 - -=cut - -my %MOVIELENS = ( - dataset => 'ml-10m', - train => './data/ml-10M100K/r1.train', - val => './data/ml-10M100K/r1.test', - max_user => 71569, - max_movie => 65135, -); - -sub get_movielens_iter -{ - my ($filename, $batch_size) = @_; - print "Preparing data iterators for $filename ... \n"; - my @user; - my @item; - my @score; - open(F, $filename) or die $!; - my $num_samples = 0; - while(my $line = ) - { - my @tks = split('::', $line); - next unless @tks == 4; - $num_samples++; - push @user, [$tks[0]]; - push @item, [$tks[1]]; - push @score, [$tks[2]]; - } - # convert to ndarrays - my $user = mx->nd->array(\@user, dtype=>'int32'); - my $item = mx->nd->array(\@item); - my $score = mx->nd->array(\@score); - return mx->io->NDArrayIter( - data => Hash::Ordered->new(user => $user, item => $item), - label => Hash::Ordered->new(score => $score), - batch_size => $batch_size, - shuffle => 1 - ); -} - -sub matrix_fact_net -{ - my ($factor_size, $num_hidden, $max_user, $max_item, $sparse_embed) = @_; - $sparse_embed //= 1; - # input - my $user = mx->symbol->Variable('user'); - my $item = mx->symbol->Variable('item'); - my $score = mx->symbol->Variable('score'); - if($sparse_embed) - { - # user feature lookup - my $user_weight = mx->symbol->Variable('user_weight', stype=>'row_sparse'); - $user = mx->symbol->contrib->SparseEmbedding(data=>$user, weight=>$user_weight, - input_dim=>$max_user, output_dim=>$factor_size); - # item feature lookup - my $item_weight = mx->symbol->Variable('item_weight', stype=>'row_sparse'); - $item = mx->symbol->contrib->SparseEmbedding(data=>$item, weight=>$item_weight, - input_dim=>$max_item, output_dim=>$factor_size); - } - else - { - # user feature lookup - $user = mx->symbol->Embedding(data=>$user, input_dim=>$max_user, output_dim=>$factor_size); - # item feature lookup - $item = mx->symbol->Embedding(data=>$item, input_dim=>$max_item, output_dim=>$factor_size); - } - # non-linear transformation of user features - $user = mx->symbol->Activation(data=>$user, act_type=>'relu'); - $user = mx->symbol->FullyConnected(data=>$user, num_hidden=>$num_hidden); - # non-linear transformation of item features - $item = mx->symbol->Activation(data=>$item, act_type=>'relu'); - $item = mx->symbol->FullyConnected(data=>$item, num_hidden=>$num_hidden); - # predict by the inner product, which is 
elementwise product and then sum - my $pred = $user * $item; - $pred = mx->symbol->sum(data=>$pred, axis => 1); - $pred = mx->symbol->Flatten(data=>$pred); - # loss layer - $pred = mx->symbol->LinearRegressionOutput(data=>$pred, label=>$score); - return $pred; -} - -my $optimizer = 'sgd'; -my $use_sparse = not $use_dense; - -my $momentum = 0.9; -my $ctx = $use_gpu ? mx->gpu(0) : mx->cpu(0); -my $learning_rate = 0.1; - -# prepare dataset and iterators -my $max_user = $MOVIELENS{max_user}; -my $max_movies = $MOVIELENS{max_movie}; -my $train_iter = get_movielens_iter($MOVIELENS{train}, $batch_size); -my $val_iter = get_movielens_iter($MOVIELENS{val} , $batch_size); - -# construct the model -my $net = matrix_fact_net($factor_size, $factor_size, $max_user, $max_movies, $use_sparse); - -# initialize the module -my $mod = mx->module->Module(symbol=>$net, context=>$ctx, data_names=>['user', 'item'], - label_names=>['score']); -$mod->bind(data_shapes=>$train_iter->provide_data, label_shapes=>$train_iter->provide_label); -$mod->init_params(initializer=>mx->init->Xavier(factor_type=>"in", magnitude=>2.34)); -my $optim = mx->optimizer->create($optimizer, learning_rate=>$learning_rate, momentum=>$momentum, - wd=>1e-4, rescale_grad=>1.0/$batch_size); -$mod->init_optimizer(optimizer=>$optim); - -# use MSE as the metric -my $metric = mx->metric->create(['MSE']); -my $speedometer = mx->callback->Speedometer($batch_size, $print_every); -print "Training started ...\n"; -for my $epoch (0..$num_epoch-1) -{ - my $nbatch = 0; - $metric->reset(); - while(my $batch = <$train_iter>) - { - $nbatch += 1; - $mod->forward_backward($batch); - # update all parameters - $mod->update(); - # update training metric - $mod->update_metric($metric, $batch->label); - my $speedometer_param = AI::MXNet::BatchEndParam->new( - epoch=>$epoch, nbatch=>$nbatch, - eval_metric=>$metric - ); - $speedometer->($speedometer_param); - } - # evaluate metric on validation dataset - my $score = $mod->score($val_iter, ['MSE']); - printf("epoch %d, eval MSE = %s \n", $epoch, $score->{mse}); - # reset the iterator for next pass of data - $train_iter->reset(); - $val_iter->reset(); -} -print "Training completed.\n"; - diff --git a/perl-package/AI-MXNet/t/test_module.t b/perl-package/AI-MXNet/t/test_module.t index 55e098683399..41ace1b391da 100644 --- a/perl-package/AI-MXNet/t/test_module.t +++ b/perl-package/AI-MXNet/t/test_module.t @@ -173,124 +173,6 @@ sub test_module_states } } -sub test_module_switch_bucket -{ - my $vocab_dim = 5000; - my $num_hidden = 100; - my $num_embedding = 100; - my $num_layer = 2; - my $default_key = 10; - my $test_key = 5; - my $batch_size = 32; - my $contexts = [mx->cpu(0)]; - my $initializer = mx->init->Xavier(factor_type=>"in", magnitude=>2.34); - - #generate symbols for an LSTM network - my $gen_sym = sub { - my $seq_len = shift; - my $data = mx->sym->Variable('data'); - my $label = mx->sym->Variable('softmax_label'); - my $embed = mx->sym->Embedding(data=>$data, input_dim=>$vocab_dim, - output_dim=>$num_embedding, name=>'embed'); - my $stack = mx->rnn->SequentialRNNCell(); - for my $i (0..$num_layer-1) - { - $stack->add(mx->rnn->LSTMCell(num_hidden=>$num_hidden, prefix=>"lstm_l${i}_")); - } - my ($outputs, $states) = $stack->unroll($seq_len, inputs=>$embed, merge_outputs=>1); - - my $pred = mx->sym->Reshape($outputs, shape=>[-1, $num_hidden]); - $pred = mx->sym->FullyConnected(data=>$pred, num_hidden=>$vocab_dim, name=>'pred'); - - $label = mx->sym->Reshape($label, shape=>[-1]); - $pred = 
mx->sym->SoftmaxOutput(data=>$pred, label=>$label, name=>'softmax'); - - return ($pred, ['data'], ['softmax_label']); - }; - my $create_bucketing_module = sub { my $key = shift; - my $model = mx->mod->BucketingModule( - sym_gen => $gen_sym, - default_bucket_key => $key, - context => $contexts - ); - $model->bind(data_shapes=>[['data', [$batch_size, $key]]], - label_shapes=>[['softmax_label', [$batch_size, $key]]] - ); - $model->init_params(initializer=>$initializer); - return $model; - }; - #initialize the bucketing module with the default bucket key - my $bucketing_model = $create_bucketing_module->($default_key); - #switch to test_key - $bucketing_model->switch_bucket( - bucket_key => $test_key, - data_shapes => [['data', [$batch_size, $test_key]]], - label_shapes => [['softmax_label', [$batch_size, $test_key]]] - ); - - delete $bucketing_model->_buckets->{$test_key}; - - $bucketing_model->switch_bucket( - bucket_key => $test_key, - data_shapes => [['data', [$batch_size, $test_key]]], - label_shapes => [['softmax_label', [$batch_size, $test_key]]] - ); -} - -sub test_monitor -{ - mx->random->seed(11); - my $data = mx->nd->array([[0.05, .10]]); - my $label = mx->nd->array([[.01, 0.99]]); - my $train_data = mx->io->NDArrayIter($data, label => $label, batch_size=>1); - - # symbols - my $x = mx->symbol->Variable('data'); - $x = mx->symbol->FullyConnected(name=>'fc_0', data=>$x, num_hidden=>2); - $x = mx->symbol->Activation(name=>"act_0", data=>$x, act_type=>'sigmoid'); - $x = mx->symbol->FullyConnected(name=>'fc_1', data=>$x, num_hidden=>2); - $x = mx->symbol->Activation(name=>"act_1", data=>$x, act_type=>'sigmoid'); - $x = mx->symbol->LinearRegressionOutput(data=>$x, name=>'softmax', grad_scale=>2); - - # create monitor - my $mean_abs = sub { my ($x) = @_; - return $x->abs->sum/$x->size; - }; - my $mon = mx->mon->Monitor(1, stat_func=>$mean_abs, pattern=>'.*', sort=>1); - - # create module - my $mod = mx->mod->Module($x, context=>[mx->cpu()]); - $mod->bind(data_shapes=>$train_data->provide_data, label_shapes=>$train_data->provide_label, - for_training=>1); - $mod->install_monitor($mon); - my $arg_params = {fc_0_weight => mx->nd->array([[.15, .20], [.25, .30]]), - fc_0_bias => mx->nd->array([.35, .35]), - fc_1_weight => mx->nd->array([[.40, .45], [.50, .55]]), - fc_1_bias => mx->nd->array([.60, .60])}; - $mod->init_params(arg_params=>$arg_params); - - my $data_batch = <$train_data>; - $mon->tic(); - $mod->forward_backward($data_batch); - my $res = $mon->toc(); - my $keys = ['act_0', 'act_1', 'data', 'fc_0', 'fc_1', 'softmax']; - my $mon_result_counts = [0, 0, 0, 0, 0, 0]; - ok(@$res == 21); - for my $r (@$res) - { - my ($n, $k, $v) = @$r; - enumerate(sub { - my ($idx, $key) = @_; - if($k =~ /^$key/) - { - $mon_result_counts->[$idx] += 1; - return; - } - }, $keys); - } - is_deeply($mon_result_counts, [2, 2, 1, 6, 6, 4]); -} - sub test_module_dtype { my $dtype = 'float16'; @@ -351,445 +233,6 @@ sub test_module_input_grads ok(($c_grad == 3)->all); } -sub test_executor_group -{ - my $get_rnn_sym = sub { my ($num_layers, $num_words, $num_hidden, $num_embed, $seq_len, $sparse_embedding) = @_; - my $stack = mx->rnn->SequentialRNNCell(); - for my $i (0..$num_layers-1) - { - $stack->add(mx->rnn->LSTMCell(num_hidden=>$num_hidden, prefix=>"lstm_l${i}_")); - } - my $data = mx->sym->Variable('data'); - my $label = mx->sym->Variable('softmax_label'); - my $embed; - if($sparse_embedding) - { - my $embed_weight = mx->sym->Variable('embed_weight', stype=>'row_sparse'); - $embed = 
mx->sym->contrib->SparseEmbedding(data=>$data, input_dim=>$num_words, - weight=>$embed_weight, output_dim=>$num_embed, - name=>'embed'); - - } - else - { - $embed = mx->sym->Embedding(data=>$data, input_dim=>$num_words, - output_dim=>$num_embed, name=>'embed'); - } - - $stack->reset(); - my ($outputs, $states) = $stack->unroll($seq_len, inputs=>$embed, merge_outputs=>1); - - my $pred = mx->sym->Reshape($outputs, shape=>[-1, $num_hidden]); - $pred = mx->sym->FullyConnected(data=>$pred, num_hidden=>$num_words, name=>'pred'); - - $label = mx->sym->Reshape($label, shape=>[-1]); - $pred = mx->sym->SoftmaxOutput(data=>$pred, label=>$label, name=>'softmax'); - return $pred; - }; - - my $test_shared_exec_group = sub { my ($exec_grp_shared, $exec_grp_created, $shared_arg_names, $extra_args, $check_grads) = @_; - # Test shared data arrays - for my $i (0..@{ $exec_grp_shared->execs }-1) - { - # test same shared_data_arrays for two exec groups - my $shared_data_array1 = $exec_grp_shared->shared_data_arrays->[$i]; - my $shared_data_array2 = $exec_grp_created->shared_data_arrays->[$i]; - if(defined $extra_args) - { - ok(keys(%$shared_data_array1) == @$extra_args); - } - ok(keys(%$shared_data_array1) == keys(%$shared_data_array2)); - while(my ($k, $v) = each %{ $shared_data_array1 }) - { - if(defined $extra_args) - { - ok(grep { $_ eq $k } @$extra_args); - } - ok(exists $shared_data_array2->{$k}); - ok(same_array($v, $shared_data_array2->{$k})); - } - # Test shared argument arrays and gradient arrays - my $exec_shared = $exec_grp_shared->execs->[$i]; - my $exec_created = $exec_grp_created->execs->[$i]; - if(defined $shared_arg_names) - { - # test shared arguments - for my $arg_name (@$shared_arg_names) - { - ok(exists $exec_created->arg_dict->{$arg_name}); - ok(same_array($exec_shared->arg_dict->{$arg_name}, $exec_created->arg_dict->{$arg_name})); - } - # test shared argument gradients - for my $arg_name (@$shared_arg_names) - { - if($check_grads) - { - ok(exists $exec_created->grad_dict->{$arg_name}); - ok(same_array($exec_shared->grad_dict->{$arg_name}, $exec_created->grad_dict->{$arg_name})); - } - } - } - my $grad_req = $exec_grp_shared->grad_req; - while(my ($arg_name, $grad) = each %{ $grad_req }) - { - ok($grad eq $exec_grp_created->grad_req->{$arg_name}); - } - } - }; - - for my $sparse_embedding (0, 1) - { - my $contexts = [mx->cpu(0), mx->cpu(1)]; - my $workload = [(1) x scalar(@$contexts)]; - my $batch_size = 32; - my $max_bucket_size = 80; - my $num_words = 1000; - my $num_hidden = 100; - my $num_embed = 200; - my $data_shapes = [['data', [$batch_size, $max_bucket_size]]]; - my $label_shapes = [['softmax_label', [$batch_size, $max_bucket_size]]]; - - # generate an rnn sym with #layers=5 - my $sym = $get_rnn_sym->(3, $num_words, $num_hidden, - $num_embed, $max_bucket_size, $sparse_embedding); - my $arg_names1 = $sym->list_arguments(); - my $input_names = ['data', 'softmax_label']; - my $shared_arg_names = [grep { !/^(?:data|softmax_label)$/ } @$arg_names1]; - my $exec_group1 = AI::MXNet::DataParallelExecutorGroup->new( - symbol=>$sym, contexts=>$contexts, - workload=>$workload, data_shapes=>$data_shapes, - label_shapes=>$label_shapes, param_names=>$shared_arg_names, - for_training=>1, inputs_need_grad=>0 - ); - # shared_data_arrays should only have input "data" and "softmax_label" arrays - for my $i (0..@{$contexts}-1) - { - ok(keys(%{$exec_group1->shared_data_arrays->[$i]}) == @$input_names); - for my $name (@$input_names) - { - ok(exists $exec_group1->shared_data_arrays->[$i]->{$name}); - } 
- } - # generate an rnn sym with #layers=5 - $sym = $get_rnn_sym->(5, $num_words, $num_hidden, - $num_embed, $max_bucket_size, $sparse_embedding); - my $arg_names2 = $sym->list_arguments(); - my $exec_group2 = AI::MXNet::DataParallelExecutorGroup->new(symbol=>$sym, contexts=>$contexts, - workload=>$workload, data_shapes=>$data_shapes, - label_shapes=>$label_shapes, param_names=>$shared_arg_names, - for_training=>1, inputs_need_grad=>0, - shared_group=>$exec_group1); - my %shared_arg_names = map { $_ => 1 } @$shared_arg_names; - my $extra_args = [grep { not exists $shared_arg_names{$_} } @$arg_names2]; - $test_shared_exec_group->( - $exec_group1, $exec_group2, - $shared_arg_names, $extra_args, not $sparse_embedding - ); - } -} - -sub test_factorization_machine_module -{ - mx->random->seed(11); - my $check_factorization_machine_module = sub { my ($optimizer, $num_epochs) = @_; - my $fm = sub { my ($factor_size, $feature_dim, $init) = @_; - my $x = mx->symbol->Variable("data", stype=>'csr'); - my $v = mx->symbol->Variable("v", shape=>[$feature_dim, $factor_size], - init=>$init, stype=>'row_sparse'); - - my $w1_weight = mx->symbol->var('w1_weight', shape=>[$feature_dim, 1], - init=>$init, stype=>'row_sparse'); - my $w1_bias = mx->symbol->var('w1_bias', shape=>[1]); - my $w1 = mx->symbol->broadcast_add(mx->symbol->dot($x, $w1_weight), $w1_bias); - - my $v_s = mx->symbol->_square_sum(data=>$v, axis=>1, keepdims=>1); - my $x_s = mx->symbol->square(data=>$x); - my $bd_sum = mx->sym->dot($x_s, $v_s); - - my $w2 = mx->symbol->dot($x, $v); - my $w2_squared = 0.5 * mx->symbol->square(data=>$w2); - - my $w_all = mx->symbol->Concat($w1, $w2_squared, dim=>1); - my $sum1 = mx->symbol->sum(data=>$w_all, axis=>1, keepdims=>1); - my $sum2 = 0.5 * mx->symbol->negative($bd_sum); - my $model = mx->sym->elemwise_add($sum1, $sum2); - - my $y = mx->symbol->Variable("label"); - $model = mx->symbol->LinearRegressionOutput(data=>$model, label=>$y); - return $model - }; - - # model - my $init = mx->initializer->Normal(sigma=>0.01); - my $factor_size = 4; - my $feature_dim = 10000; - my $model = $fm->($factor_size, $feature_dim, $init); - - # data iter - my $num_batches = 5; - my $batch_size = 64; - my $num_samples = $batch_size * $num_batches; - # generate some random csr data - my $csr_nd = rand_ndarray([$num_samples, $feature_dim], 'csr', 0.1); - my $label = mx->nd->ones([$num_samples,1]); - # the alternative is to use LibSVMIter - my $train_iter = mx->io->NDArrayIter(data=>$csr_nd, - label=>Hash::Ordered->new(label => $label), - batch_size=>$batch_size, - last_batch_handle=>'discard'); - # create module - my $mod = mx->mod->Module(symbol=>$model, data_names=>['data'], label_names=>['label']); - # allocate memory by given the input data and lable shapes - $mod->bind(data_shapes=>$train_iter->provide_data, label_shapes=>$train_iter->provide_label); - # initialize parameters by uniform random numbers - $mod->init_params(initializer=>$init); - my $expected_accuracy; - if($optimizer eq 'sgd') - { - # use Sparse SGD with learning rate 0.1 to train - my $sgd = mx->optimizer->SGD(momentum=>0.1, clip_gradient=>5.0, learning_rate=>0.01, - rescale_grad=>1.0/$batch_size); - $mod->init_optimizer(optimizer=>$sgd); - $num_epochs //= 10; - $expected_accuracy = 0.02; - } - elsif($optimizer eq 'adam') - { - # use Sparse Adam to train - my $adam = mx->optimizer->Adam(clip_gradient=>5.0, learning_rate=>0.0005, - rescale_grad=>1.0/$batch_size); - $mod->init_optimizer(optimizer=>$adam); - $num_epochs //= 10; - $expected_accuracy = 0.05; 
- } - elsif($optimizer eq 'adagrad') - { - # use Sparse AdaGrad with learning rate 0.1 to train - my $adagrad = mx->optimizer->AdaGrad(clip_gradient=>5.0, learning_rate=>0.01, - rescale_grad=>1.0/$batch_size); - $mod->init_optimizer(optimizer=>$adagrad); - $num_epochs //= 20; - $expected_accuracy = 0.09; - } - else - { - die "Unsupported optimizer type $optimizer specified"; - } - # use accuracy as the metric - my $metric = mx->metric->create('MSE'); - # train 'num_epochs' epoch - for my $epoch (1..$num_epochs) - { - $train_iter->reset(); - $metric->reset(); - while(my $batch = <$train_iter>) - { - $mod->forward($batch, is_train=>1); # compute predictions - $mod->update_metric($metric, $batch->label); # accumulate prediction accuracy - $mod->backward(); # compute gradients - $mod->update(); # update parameters - } - } - if($num_epochs > 1) - { - ok(($metric->get)[1] < $expected_accuracy); - } - }; - - $check_factorization_machine_module->('sgd'); - $check_factorization_machine_module->('adam'); - $check_factorization_machine_module->('adagrad'); -} - - -sub test_module_initializer -{ - my $regression_model = sub { my ($m) = @_; - my $x = mx->symbol->var("data", stype=>'csr'); - my $v = mx->symbol->var("v", shape=>[$m, 1], init=>mx->init->Uniform(scale=>.1), - stype=>'row_sparse'); - my $model = mx->symbol->dot(lhs=>$x, rhs=>$v); - my $y = mx->symbol->Variable("label"); - $model = mx->symbol->LinearRegressionOutput(data=>$model, label=>$y, name=>"out"); - return $model - }; - - my ($n, $m) = (128, 100); - my $model = $regression_model->($m); - - my $data = mx->nd->zeros([$n, $m], stype=>'csr'); - my $label = mx->nd->zeros([$n, 1]); - my $iterator = mx->io->NDArrayIter(data=>$data, label=>Hash::Ordered->new(label => $label), - batch_size=>$n, last_batch_handle=>'discard'); - - # create module - my $mod = mx->mod->Module(symbol=>$model, data_names=>['data'], label_names=>['label']); - $mod->bind(data_shapes=>$iterator->provide_data, label_shapes=>$iterator->provide_label); - $mod->init_params(); - my $v = $mod->_arg_params->{v}; - ok($v->stype eq 'row_sparse'); - ok($v->aspdl->sum != 0); -} - -sub test_module_set_params -{ - # data iter - mx->random->seed(11); - my $data = mx->nd->array([[0.05, .10]]); - my $label = mx->nd->array([[.01, 0.99]]); - my $train_data = mx->io->NDArrayIter(data => $data, label => $label, batch_size => 1); - - # symbols - my $x = mx->symbol->Variable('data'); - $x = mx->symbol->FullyConnected(name=>'fc_0', data=>$x, num_hidden=>2); - $x = mx->symbol->Activation(name=>"act_0", data=>$x, act_type=>'sigmoid'); - $x = mx->symbol->FullyConnected(name=>'fc_1', data=>$x, num_hidden=>2); - $x = mx->symbol->Activation(name=>"act_1", data=>$x, act_type=>'sigmoid'); - $x = mx->symbol->LinearRegressionOutput(data=>$x, name=>'softmax', grad_scale=>2); - - # create module - my $mod = mx->mod->Module($x, context=>[mx->cpu()]); - $mod->bind(data_shapes => $train_data->provide_data, label_shapes=>$train_data->provide_label, - for_training=>1); - - my $arg_params_correct = {fc_0_weight => mx->nd->array([[.15, .20], [.25, .30]]), - fc_0_bias => mx->nd->array([.35, .35]), - fc_1_weight => mx->nd->array([[.40, .45], [.50, .55]]), - fc_1_bias => mx->nd->array([.60, .60])}; - - my $arg_params_missing = {fc_0_weight => mx->nd->array([[.15, .20], [.25, .30]]), - fc_0_bias => mx->nd->array([.35, .35]), - fc_1_weight => mx->nd->array([[.40, .45], [.50, .55]])}; - - my $arg_params_extra = {fc_0_weight => mx->nd->array([[.15, .20], [.25, .30]]), - fc_0_bias => mx->nd->array([.35, .35]), - 
fc_1_weight=> mx->nd->array([[.40, .45], [.50, .55]]), - fc_1_bias => mx->nd->array([.60, .60]), - fc_2_weight => mx->nd->array([.60, .60])}; - - my $arg_params_missing_extra = {fc_3_weight => mx->nd->array([.60, .60])}; - - # test regular set_params - $mod->set_params($arg_params_correct, {}, force_init=>1); - - # test allow missing - $mod->set_params($arg_params_missing, {}, allow_missing=>1, force_init=>1); - ok(dies_like(sub { $mod->set_params($arg_params_missing, {}, force_init=>1, allow_missing=>0); }, qr/fc_/)); - - # test allow extra - $mod->set_params($arg_params_extra, {}, force_init=>1, allow_missing=>1, allow_extra=>1); - ok(dies_like(sub { $mod->set_params($arg_params_extra, {}, force_init=>1, allow_missing=>1, allow_extra=>0); }, qr/fc_/)); - - # test allow missing + extra, this will throw a runtime error - ok(dies_like(sub { $mod->set_params($arg_params_missing_extra, {}, force_init=>1, allow_missing=>1, allow_extra=>0); }, qr/fc_/)); -} - -sub test_forward_reshape -{ - my $num_class = 10; - my $data1 = mx->sym->Variable('data1'); - my $data2 = mx->sym->Variable('data2'); - my $conv1 = mx->sym->Convolution(data=>$data1, kernel=>[2, 2], num_filter=>2, stride=>[2, 2]); - my $conv2 = mx->sym->Convolution(data=>$data2, kernel=>[3, 3], num_filter=>3, stride=>[1, 1]); - my $pooling1 = mx->sym->Pooling(data=>$conv1, kernel=>[2, 2], stride=>[1, 1], pool_type=>"avg"); - my $pooling2 = mx->sym->Pooling(data=>$conv2, kernel=>[2, 2], stride=>[1, 1], pool_type=>"max"); - my $flatten1 = mx->sym->flatten(data=>$pooling1); - my $flatten2 = mx->sym->flatten(data=>$pooling2); - my $sum = mx->sym->sum(data=>$flatten1, axis=>1) + mx->sym->sum(data=>$flatten2, axis=>1); - my $fc = mx->sym->FullyConnected(data=>$sum, num_hidden=>$num_class); - my $sym = mx->sym->SoftmaxOutput(data=>$fc, name=>'softmax'); - - my $dshape1 = [10, 3, 64, 64]; - my $dshape2 = [10, 3, 32, 32]; - my $lshape = [10]; - - my $mod = mx->mod->Module(symbol=>$sym, data_names=>['data1', 'data2'], - label_names=>['softmax_label']); - $mod->bind(data_shapes=>[['data1', $dshape1], ['data2', $dshape2]], - label_shapes=>[['softmax_label', $lshape]]); - $mod->init_params(); - $mod->init_optimizer(optimizer_params=>{learning_rate => 0.01}); - - # Train with original data shapes - my $data_batch = mx->io->DataBatch(data=>[mx->nd->random_uniform(0, 9, $dshape1), - mx->nd->random_uniform(5, 15, $dshape2)], - label=>[mx->nd->ones($lshape)]); - $mod->forward($data_batch); - is_deeply($mod->get_outputs->[0]->shape, [$lshape->[0], $num_class]); - $mod->backward(); - $mod->update(); - - # Train with different batch size - $dshape1 = [3, 3, 64, 64]; - $dshape2 = [3, 3, 32, 32]; - $lshape = [3]; - $data_batch = mx->io->DataBatch(data=>[mx->nd->random_uniform(0, 9, $dshape1), - mx->nd->random_uniform(5, 15, $dshape2)], - label=>[mx->nd->ones($lshape)]); - $mod->forward($data_batch); - is_deeply($mod->get_outputs->[0]->shape, [$lshape->[0], $num_class]); - $mod->backward(); - $mod->update(); - - $dshape1 = [20, 3, 64, 64]; - $dshape2 = [20, 3, 32, 32]; - $lshape = [20]; - $data_batch = mx->io->DataBatch(data=>[mx->nd->random_uniform(3, 5, $dshape1), - mx->nd->random_uniform(10, 25, $dshape2)], - label=>[mx->nd->ones($lshape)]); - $mod->forward($data_batch); - is_deeply($mod->get_outputs->[0]->shape, [$lshape->[0], $num_class]); - $mod->backward(); - $mod->update(); - - #Train with both different batch size and data shapes - $dshape1 = [20, 3, 120, 120]; - $dshape2 = [20, 3, 32, 64]; - $lshape = [20]; - $data_batch = 
mx->io->DataBatch(data=>[mx->nd->random_uniform(0, 9, $dshape1), - mx->nd->random_uniform(5, 15, $dshape2)], - label=>[mx->nd->ones($lshape)]); - $mod->forward($data_batch); - is_deeply($mod->get_outputs->[0]->shape, [$lshape->[0], $num_class]); - $mod->backward(); - $mod->update(); - - $dshape1 = [5, 3, 28, 40]; - $dshape2 = [5, 3, 24, 16]; - $lshape = [5]; - $data_batch = mx->io->DataBatch(data=>[mx->nd->random_uniform(0, 9, $dshape1), - mx->nd->random_uniform(15, 25, $dshape2)], - label=>[mx->nd->ones($lshape)]); - $mod->forward($data_batch); - is_deeply($mod->get_outputs->[0]->shape, [$lshape->[0], $num_class]); - $mod->backward(); - $mod->update(); - - #Test score - my $dataset_shape1 = [30, 3, 30, 30]; - my $dataset_shape2 = [30, 3, 20, 40]; - my $labelset_shape = [30]; - - my $eval_dataiter = mx->io->NDArrayIter(data=>[mx->nd->random_uniform(0, 9, $dataset_shape1), - mx->nd->random_uniform(15, 25, $dataset_shape2)], - label=>[mx->nd->ones($labelset_shape)], - batch_size=>5); - ok(keys %{ $mod->score($eval_dataiter, 'acc') } == 1); - - #Test prediction - $dshape1 = [1, 3, 30, 30]; - $dshape2 = [1, 3, 20, 40]; - $dataset_shape1 = [10, 3, 30, 30]; - $dataset_shape2 = [10, 3, 20, 40]; - - my $pred_dataiter = mx->io->NDArrayIter(data=>[mx->nd->random_uniform(0, 9, $dataset_shape1), - mx->nd->random_uniform(15, 25, $dataset_shape2)]); - $mod->bind(data_shapes=>[['data1', $dshape1], ['data2', $dshape2]], - for_training=>0, force_rebind=>1); - is_deeply($mod->predict($pred_dataiter)->shape, [10, $num_class]); - -} - sub test_forward_acceptable_input { my $data = mx->sym->Variable('data'); @@ -803,15 +246,7 @@ sub test_forward_acceptable_input test_module_input_grads(); test_module_dtype(); -test_monitor(); -test_module_switch_bucket(); test_module_layout(); test_module_states(); -test_module_reshape(); test_save_load(); -test_executor_group(); -test_module_set_params(); -test_forward_reshape(); -test_module_initializer(); -test_factorization_machine_module(); test_forward_acceptable_input(); diff --git a/python/mxnet/contrib/amp/lists/symbol_bf16.py b/python/mxnet/contrib/amp/lists/symbol_bf16.py index 86edfe6fde8d..da01e6198be5 100644 --- a/python/mxnet/contrib/amp/lists/symbol_bf16.py +++ b/python/mxnet/contrib/amp/lists/symbol_bf16.py @@ -478,7 +478,6 @@ 'topk', # Neural network - 'SoftmaxOutput', 'softmax', 'Softmax', 'log_softmax', @@ -487,13 +486,6 @@ 'GroupNorm', 'L2Normalization', 'SoftmaxActivation', - 'LinearRegressionOutput', - 'LogisticRegressionOutput', - 'MAERegressionOutput', - '_sparse_LinearRegressionOutput', - '_sparse_LogisticRegressionOutput', - '_sparse_MAERegressionOutput', - 'SVMOutput', 'softmax_cross_entropy', 'smooth_l1', 'MakeLoss', @@ -630,8 +622,4 @@ ] LOSS_OUTPUT_FUNCTIONS = [ - 'SoftmaxOutput', - 'LinearRegressionOutput', - 'LogisticRegressionOutput', - 'MAERegressionOutput', ] diff --git a/python/mxnet/contrib/amp/lists/symbol_fp16.py b/python/mxnet/contrib/amp/lists/symbol_fp16.py index d501a7d6c5b5..ae812fbac2d7 100644 --- a/python/mxnet/contrib/amp/lists/symbol_fp16.py +++ b/python/mxnet/contrib/amp/lists/symbol_fp16.py @@ -472,7 +472,6 @@ 'topk', # Neural network - 'SoftmaxOutput', 'softmax', 'Softmax', 'log_softmax', @@ -482,13 +481,6 @@ 'L2Normalization', 'LRN', 'SoftmaxActivation', - 'LinearRegressionOutput', - 'LogisticRegressionOutput', - 'MAERegressionOutput', - '_sparse_LinearRegressionOutput', - '_sparse_LogisticRegressionOutput', - '_sparse_MAERegressionOutput', - 'SVMOutput', 'softmax_cross_entropy', 'smooth_l1', 'MakeLoss', @@ -629,8 +621,4 @@ 
] LOSS_OUTPUT_FUNCTIONS = [ - 'SoftmaxOutput', - 'LinearRegressionOutput', - 'LogisticRegressionOutput', - 'MAERegressionOutput', ] diff --git a/scala-package/core/src/test/scala/org/apache/mxnet/ModuleSuite.scala b/scala-package/core/src/test/scala/org/apache/mxnet/ModuleSuite.scala index 4d95f43751ba..90956f6983e2 100644 --- a/scala-package/core/src/test/scala/org/apache/mxnet/ModuleSuite.scala +++ b/scala-package/core/src/test/scala/org/apache/mxnet/ModuleSuite.scala @@ -208,209 +208,4 @@ class ModuleSuite extends FunSuite with BeforeAndAfterAll { mod.update() assert(mod.getOutputsMerged()(0).shape == dShape) assert(mod.getParams._1("fc_bias").toArray.forall(x => (x - -3f) < 1e-3)) - } - - test ("module setParams") { - val data = NDArray.array(Array(0.05f, 0.1f), Shape(1, 1, 1, 2)) - val label = NDArray.array(Array(0.01f, 0.99f), Shape(1, 1, 1, 2)) - val trainData = new NDArrayIter( - IndexedSeq(data), IndexedSeq(label), labelName = "softmax_label") - - // symbols - var x = Symbol.Variable("data") - x = Symbol.FullyConnected(name = "fc_0")()(Map("data" -> x, "num_hidden" -> 2)) - x = Symbol.Activation(name = "act_0")()(Map("data" -> x, "act_type" -> "sigmoid")) - x = Symbol.FullyConnected(name = "fc_1")()(Map("data" -> x, "num_hidden" -> 2)) - x = Symbol.Activation(name = "act_1")()(Map("data" -> x, "act_type" -> "sigmoid")) - x = Symbol.LinearRegressionOutput(name = "softmax")()(Map("data" -> x, "grad_scale" -> 2)) - - // create module - val mod = new Module(x, contexts = Array(Context.cpu())) - mod.bind(dataShapes = trainData.provideDataDesc, - Option(trainData.provideLabelDesc)) - val argParamsCorrect = Map( - "fc_0_weight" -> NDArray.array(Array(0.15f, 0.2f, 0.25f, 0.3f), Shape(2, 2)), - "fc_0_bias" -> NDArray.array(Array(0.35f, 0.35f), Shape(2)), - "fc_1_weight" -> NDArray.array(Array(0.4f, 0.45f, 0.5f, 0.55f), Shape(2, 2)), - "fc_1_bias" -> NDArray.array(Array(0.6f, 0.6f), Shape(2)) - ) - val argParamsMissing = Map( - "fc_0_weight" -> NDArray.array(Array(0.15f, 0.2f, 0.25f, 0.3f), Shape(2, 2)), - "fc_0_bias" -> NDArray.array(Array(0.35f, 0.35f), Shape(2)), - "fc_1_weight" -> NDArray.array(Array(0.4f, 0.45f, 0.5f, 0.55f), Shape(2, 2)) - ) - val argParamsExtra = Map( - "fc_0_weight" -> NDArray.array(Array(0.15f, 0.2f, 0.25f, 0.3f), Shape(2, 2)), - "fc_0_bias" -> NDArray.array(Array(0.35f, 0.35f), Shape(2)), - "fc_1_weight" -> NDArray.array(Array(0.4f, 0.45f, 0.5f, 0.55f), Shape(2, 2)), - "fc_1_bias" -> NDArray.array(Array(0.6f, 0.6f), Shape(2)), - "fc_2_weight" -> NDArray.array(Array(0.6f, 0.6f), Shape(2)) - ) - - mod.setParams(forceInit = true, argParams = argParamsCorrect, - auxParams = null) - - // test allow missing - mod.setParams(forceInit = true, argParams = argParamsMissing, - auxParams = null, allowMissing = true) - - // test allow extra - mod.setParams(forceInit = true, argParams = argParamsExtra, auxParams = null, - allowMissing = true, allowExtra = true) - } - - test ("monitor") { - // data iter - val data = NDArray.array(Array(0.05f, 0.1f), Shape(1, 1, 1, 2)) - val label = NDArray.array(Array(0.01f, 0.99f), Shape(1, 1, 1, 2)) - val trainData = new NDArrayIter( - IndexedSeq(data), IndexedSeq(label), labelName = "softmax_label") - - // symbols - var x = Symbol.Variable("data") - x = Symbol.FullyConnected(name = "fc_0")()(Map("data" -> x, "num_hidden" -> 2)) - x = Symbol.Activation(name = "act_0")()(Map("data" -> x, "act_type" -> "sigmoid")) - x = Symbol.FullyConnected(name = "fc_1")()(Map("data" -> x, "num_hidden" -> 2)) - x = Symbol.Activation(name = 
"act_1")()(Map("data" -> x, "act_type" -> "sigmoid")) - x = Symbol.LinearRegressionOutput(name = "softmax")()(Map("data" -> x, "grad_scale" -> 2)) - - // create monitor - def meanAbs(x: NDArray): NDArray = { - val sumAbs = NDArray.sum(NDArray.abs(x)) - sumAbs / x.shape.product - } - val mon = new Monitor(1, statFunc = meanAbs) - - // create module - val mod = new Module(x, contexts = Array(Context.cpu())) - mod.bind(dataShapes = trainData.provideDataDesc, - Option(trainData.provideLabelDesc)) - mod.installMonitor(mon) - val argParams = Map( - "fc_0_weight" -> NDArray.array(Array(0.15f, 0.2f, 0.25f, 0.3f), Shape(2, 2)), - "fc_0_bias" -> NDArray.array(Array(0.35f, 0.35f), Shape(2)), - "fc_1_weight" -> NDArray.array(Array(0.4f, 0.45f, 0.5f, 0.55f), Shape(2, 2)), - "fc_1_bias" -> NDArray.array(Array(0.6f, 0.6f), Shape(2)) - ) - mod.initParams(argParams = argParams) - - val dataBatch = trainData.next() - mon.tic() - mod.forwardBackward(dataBatch) - val res = mon.toc() - val keys = Array("act_0", "act_1", "data", "fc_0", "fc_1", "softmax") - val monResultCounts = Array(0, 0, 0, 0, 0, 0) - assert(res.length == 21) - for ((n, k, v) <- res) { - var break = false - for ((key, idx) <- keys.zipWithIndex) { - if (!break && k.startsWith(key)) { - monResultCounts(idx) += 1 - break = true - } - } - } - assert(monResultCounts.zip(Array(2, 2, 1, 6, 6, 4)).forall(x => x._1 == x._2)) - } - - test ("forward reshape") { - val numClass = 10 - val data1 = Symbol.Variable("data1") - val data2 = Symbol.Variable("data2") - val conv1 = Symbol.Convolution()()(Map("data" -> data1, - "kernel" -> "(2, 2)", "num_filter" -> 2, "stride" -> "(2, 2)")) - val conv2 = Symbol.Convolution()()(Map("data" -> data2, - "kernel" -> "(3, 3)", "num_filter" -> 3, "stride" -> "(1, 1)")) - val pooling1 = Symbol.Pooling()()(Map("data" -> conv1, - "kernel" -> "(2, 2)", "pool_type" -> "avg", "stride" -> "(1, 1)")) - val pooling2 = Symbol.Pooling()()(Map("data" -> conv2, - "kernel" -> "(2, 2)", "pool_type" -> "max", "stride" -> "(1, 1)")) - val flatten1 = Symbol.flatten()()(Map("data" -> pooling1)) - val flatten2 = Symbol.flatten()()(Map("data" -> pooling2)) - val sum = Symbol.sum()()(Map("data" -> flatten1, "axis" -> 1)) + - Symbol.sum()()(Map("data" -> flatten2, "axis" -> 1)) - val fc = Symbol.FullyConnected()()( - Map("data" -> sum, "num_hidden" -> numClass)) - val sym = Symbol.SoftmaxOutput(name = "softmax")()(Map("data" -> fc)) - - var dShape1 = Shape(10, 3, 64, 64) - var dShape2 = Shape(10, 3, 32, 32) - var lShape = Shape(10) - - val mod = new Module(sym, IndexedSeq("data1", "data2")) - mod.bind(dataShapes = IndexedSeq( - DataDesc("data1", dShape1), DataDesc("data2", dShape2, layout = "NCHW")), - labelShapes = Option(IndexedSeq(DataDesc("softmax_label", lShape, layout = "N"))) - ) - mod.initParams() - mod.initOptimizer(optimizer = new SGD(learningRate = 0.01f)) - - // Train with original data shapes - var dataBatch = new DataBatch( - data = IndexedSeq( - NDArray.random_uniform(Map("low" -> 0, "high" -> 9, "shape" -> dShape1.toString()))(), - NDArray.random_uniform(Map("low" -> 5, "high" -> 15, "shape" -> dShape2.toString()))()), - label = IndexedSeq(NDArray.ones(lShape)), index = null, pad = 0) - mod.forward(dataBatch) - assert(mod.getOutputsMerged()(0).shape == Shape(lShape(0), numClass)) - mod.backward() - mod.update() - - dShape1 = Shape(3, 3, 64, 64) - dShape2 = Shape(3, 3, 32, 32) - lShape = Shape(3) - dataBatch = new DataBatch( - data = IndexedSeq( - NDArray.random_uniform(Map("low" -> 0, "high" -> 9, "shape" -> 
dShape1.toString()))(), - NDArray.random_uniform(Map("low" -> 5, "high" -> 15, "shape" -> dShape2.toString()))()), - label = IndexedSeq(NDArray.ones(lShape)), index = null, pad = 0) - mod.forward(dataBatch) - assert(mod.getOutputsMerged()(0).shape == Shape(lShape(0), numClass)) - mod.backward() - mod.update() - - dShape1 = Shape(20, 3, 64, 64) - dShape2 = Shape(20, 3, 32, 32) - lShape = Shape(20) - dataBatch = new DataBatch( - data = IndexedSeq( - NDArray.random_uniform(Map("low" -> 3, "high" -> 5, "shape" -> dShape1.toString()))(), - NDArray.random_uniform(Map("low" -> 10, "high" -> 25, "shape" -> dShape2.toString()))()), - label = IndexedSeq(NDArray.ones(lShape)), index = null, pad = 0) - mod.forward(dataBatch) - assert(mod.getOutputsMerged()(0).shape == Shape(lShape(0), numClass)) - mod.backward() - mod.update() - - // Train with both different batch size and data shapes - dShape1 = Shape(20, 3, 120, 120) - dShape2 = Shape(20, 3, 32, 64) - lShape = Shape(20) - dataBatch = new DataBatch.Builder() - .setData( - NDArray.random_uniform(Map("low" -> 0, "high" -> 9, "shape" -> dShape1.toString()))(), - NDArray.random_uniform(Map("low" -> 5, "high" -> 15, "shape" -> dShape2.toString()))()) - .setLabel(NDArray.ones(lShape)) - .setPad(0) - .build() - mod.forward(dataBatch) - assert(mod.getOutputsMerged()(0).shape == Shape(lShape(0), numClass)) - mod.backward() - mod.update() - - dShape1 = Shape(5, 3, 28, 40) - dShape2 = Shape(5, 3, 24, 16) - lShape = Shape(5) - dataBatch = new DataBatch.Builder() - .setData( - NDArray.random_uniform(Map("low" -> 0, "high" -> 9, "shape" -> dShape1.toString()))(), - NDArray.random_uniform(Map("low" -> 15, "high" -> 25, "shape" -> dShape2.toString()))()) - .setLabel(NDArray.ones(lShape)) - .setPad(0) - .build() - mod.forward(dataBatch) - assert(mod.getOutputsMerged()(0).shape == Shape(lShape(0), numClass)) - mod.backward() - mod.update() - } } diff --git a/scala-package/core/src/test/scala/org/apache/mxnet/OperatorSuite.scala b/scala-package/core/src/test/scala/org/apache/mxnet/OperatorSuite.scala index 7c0b009ac8e6..dc11b7bfb9b7 100644 --- a/scala-package/core/src/test/scala/org/apache/mxnet/OperatorSuite.scala +++ b/scala-package/core/src/test/scala/org/apache/mxnet/OperatorSuite.scala @@ -112,41 +112,6 @@ class OperatorSuite extends FunSuite with BeforeAndAfterAll } } - private def checkRegression(model: Symbol, - forward: Float => Float, - backward: (Float, Float) => Float) = { - val shape = Shape(3, 1) - val arrData = Random.uniform(-1, 1, shape) - val arrLabel = Random.uniform(0, 1, Shape(shape.head)) - val arrGrad = NDArray.empty(shape) - val exec1 = model.bind(Context.cpu(), - args = Array(arrData, arrLabel), argsGrad = Map("data" -> arrGrad)) - exec1.forward() - assert(exec1.outputs(0).shape === shape) - val out1 = exec1.outputs(0).toArray - val npout = arrData.toArray.map(forward(_)) - assert(CheckUtils.reldiff(npout, out1) < 1e-6f) - - exec1.backward() - // arrData shape: Vector(3, 1) - // arrLabel shape: Vector(3) - val npoutBack = (npout zip arrLabel.toArray).map { case (data, label) => - backward(data, label) - } - assert(CheckUtils.reldiff(npoutBack, arrGrad.toArray) < 1e-6f) - } - - test("regression") { - checkRegression(Symbol.LogisticRegressionOutput()()( - Map("data" -> Symbol.Variable("data"), "label" -> Symbol.Variable("label"))), - (x: Float) => 1.0f / (1.0f + Math.exp(-x).toFloat), - (x: Float, y: Float) => x - y) - checkRegression(Symbol.LinearRegressionOutput()()( - Map("data" -> Symbol.Variable("data"), "label" -> 
Symbol.Variable("label"))), - (x: Float) => x, - (x: Float, y: Float) => x - y) - } - // TODO: test softmax test("swap axes") { diff --git a/tests/nightly/test_large_array.py b/tests/nightly/test_large_array.py index 0c9c6f905dd6..377a8709bfbc 100644 --- a/tests/nightly/test_large_array.py +++ b/tests/nightly/test_large_array.py @@ -100,36 +100,6 @@ def check_softmax_cross_entropy(): assert_almost_equal(mx_softmax_cross_entropy.asnumpy(), true_softmax_cross_entropy, rtol=1e-3, atol=1e-5) - def check_softmax_output(): - x = mx.sym.Variable('x') - label = mx.sym.Variable('label') - x_nd = mx.nd.ones((LARGE_X, SMALL_Y)) - grad_x = mx.nd.zeros((LARGE_X, SMALL_Y)) - label_nd = mx.nd.ones((LARGE_X)) - sym = mx.sym.SoftmaxOutput(data=x, label=label, ignore_label=0, - use_ignore=False) - - ex = sym.bind(ctx=default_context(), args={'x': x_nd, 'label': label_nd}, - args_grad=None) - ex.forward(is_train=False) - softmax_out = ex.outputs[0][0].asnumpy() - expected_softmax_out = (1 / SMALL_Y) * mx.nd.ones((SMALL_Y)).asnumpy() - assert np.isclose(softmax_out, expected_softmax_out).all() - - ex = sym.bind(ctx=default_context(), args={'x': x_nd, 'label': label_nd}, - args_grad={'x': grad_x}) - ex.forward(is_train=True) - softmax_out = ex.outputs[0][0].asnumpy() - expected_softmax_out = (1 / SMALL_Y) * mx.nd.ones((SMALL_Y)).asnumpy() - assert np.isclose(softmax_out, expected_softmax_out).all() - - ex.backward(is_train=True) - grad_out = ex.grad_arrays[0][0].asnumpy() - k = int(label_nd[0].asscalar()) - expected_grad_out = np.zeros((SMALL_Y,)) - expected_grad_out[k] = -1 - assert np.isclose(grad_out - softmax_out, expected_grad_out).all() - def check_softmax_activation(): data = nd.random_normal(shape=(2**29, 2, 2, 2)) out = nd.random_normal(shape=(2**29, 2, 2, 2)) @@ -358,42 +328,6 @@ def fsigmoid(a): ya = fsigmoid(xa) check_symbolic_forward(y, [xa], [ya]) - def check_linear_and_logistic_regression(): - shape = (LARGE_X, SMALL_Y) - - def check_regression(symbol, forward, backward, shape): - # init executor - data_s = mx.symbol.Variable('data') - label_s = mx.symbol.Variable('label') - out_s = symbol(data=data_s, label=label_s) - grad_req = {'data': 'write', 'label': 'null'} - exe = out_s.simple_bind(ctx=default_context(), data=shape, label=shape, grad_req=grad_req) - arg_map = dict(zip(out_s.list_arguments(), exe.arg_arrays)) - grad_map = dict(zip(out_s.list_arguments(), exe.grad_arrays)) - # init data - data = mx.random.uniform(-1, -1, shape) - arg_map["data"][:] = data - atol = 1e-5 - density = 0.5 - stype = 'default' - label = arg_map["label"] - label[:] = rand_ndarray(shape, stype, density=density) - exe.forward(is_train=True) - exe.backward() - np_out = forward(data.asnumpy()) - out_grad = backward(np_out, label.asnumpy().reshape(np_out.shape)) / shape[1] - assert_almost_equal(exe.outputs[0].asnumpy(), np_out, atol=atol) - assert_almost_equal(grad_map["data"].asnumpy(), out_grad, atol=atol) - - check_regression(mx.symbol.LogisticRegressionOutput, - lambda x: 1.0 / (1.0 + np.exp(-x)), - lambda x, y: x - y, - shape) - check_regression(mx.symbol.LinearRegressionOutput, - lambda x: x, - lambda x, y: x - y, - shape) - def check_l2_normalization(): x = nd.ones((2, LARGE_X*2)) x[0] = 3 @@ -570,7 +504,6 @@ def check_rnn(): check_dense() check_softmax() check_softmax_cross_entropy() - check_softmax_output() check_softmax_activation() check_log_softmax() check_leaky_relu() @@ -581,7 +514,6 @@ def check_rnn(): check_batchnorm() check_relu() check_sigmoid() - check_linear_and_logistic_regression() 
check_l2_normalization() check_instance_norm() check_col2im() diff --git a/tests/nightly/test_large_vector.py b/tests/nightly/test_large_vector.py index 74f015cabf0f..e95b411974b2 100644 --- a/tests/nightly/test_large_vector.py +++ b/tests/nightly/test_large_vector.py @@ -45,34 +45,6 @@ def check_dense(): res = linear(data) assert res.shape == (LARGE_X, 2) - def check_regression(): - shape = (LARGE_X, ) - def check_regression(symbol, forward, shape): - # init executor - data_s = mx.symbol.Variable('data') - label_s = mx.symbol.Variable('label') - out_s = symbol(data=data_s, label=label_s) - exe = out_s.simple_bind(ctx=mx.cpu(0), data=shape, label=shape) - arg_map = dict(zip(out_s.list_arguments(), exe.arg_arrays)) - # init data - data = mx.random.uniform(-1, -1, shape) - arg_map["data"][:] = data - atol = 1e-5 - density = 0.5 - stype = 'default' - label = arg_map["label"] - label[:] = rand_ndarray(shape, stype, density=density) - exe.forward(is_train=True) - exe.backward() - np_out = forward(data.asnumpy()) - assert_almost_equal(exe.outputs[0].asnumpy(), np_out, atol=atol) - check_regression(mx.symbol.LogisticRegressionOutput, - lambda x: 1.0 / (1.0 + np.exp(-x)), - shape) - check_regression(mx.symbol.LinearRegressionOutput, - lambda x: x, - shape) - def check_sign(): a = mx.nd.random.normal(-1, 1, shape=LARGE_X) mx_res = mx.nd.sign(a) @@ -155,7 +127,6 @@ def check_sequence_last(): check_sequence_last() check_dense() - check_regression() check_sign() check_layer_norm() check_batchnorm() diff --git a/tests/tutorials/test_sanity_tutorials.py b/tests/tutorials/test_sanity_tutorials.py index fb751b4ddade..e927cdc3ca07 100644 --- a/tests/tutorials/test_sanity_tutorials.py +++ b/tests/tutorials/test_sanity_tutorials.py @@ -39,12 +39,9 @@ 'nlp/index.md', 'onnx/index.md', 'python/index.md', - 'r/CallbackFunction.md', 'r/charRnnModel.md', 'r/classifyRealImageWithPretrainedModel.md', 'r/CustomIterator.md', - 'r/CustomLossFunction.md', - 'r/fiveMinutesNeuralNetwork.md', 'r/index.md', 'r/mnistCompetition.md', 'r/MultidimLstm.md', From 8029d4ace3d8ad56afdbe13c7139486acf62754c Mon Sep 17 00:00:00 2001 From: Lin Date: Wed, 10 Jun 2020 15:32:33 -0700 Subject: [PATCH 04/36] remove more examples --- R-package/tests/testthat/test_img_seg.R | 165 ---- R-package/vignettes/CatsDogsFinetune.Rmd | 272 ------- R-package/vignettes/CustomIterator.Rmd | 207 ----- R-package/vignettes/mnistCompetition.Rmd | 246 ------ benchmark/opperf/rules/default_params.py | 13 - .../imclassification/train_mnist_test.clj | 7 - .../test/test-symbol.json.ref | 105 --- cpp-package/example/CMakeLists.txt | 31 - cpp-package/example/README.md | 94 --- cpp-package/example/alexnet.cpp | 358 --------- cpp-package/example/charRNN.cpp | 756 ------------------ cpp-package/example/googlenet.cpp | 198 ----- cpp-package/example/inception_bn.cpp | 261 ------ cpp-package/example/lenet.cpp | 267 ------- cpp-package/example/lenet_with_mxdataiter.cpp | 203 ----- cpp-package/example/mlp.cpp | 182 ----- cpp-package/example/mlp_cpu.cpp | 147 ---- cpp-package/example/mlp_csv.cpp | 276 ------- cpp-package/example/mlp_gpu.cpp | 163 ---- cpp-package/example/resnet.cpp | 283 ------- cpp-package/example/test_score.cpp | 164 ---- .../example/unittests/unit_test_mlp_csv.sh | 63 -- cpp-package/tests/ci_test.sh | 29 - .../pages/api/cpp/docs/tutorials/basics.md | 114 --- docs/static_site/src/pages/api/faq/caffe.md | 1 - docs/static_site/src/pages/api/faq/float16.md | 81 +- .../src/pages/api/perl/docs/tutorials/io.md | 16 - tests/tutorials/test_sanity_tutorials.py | 2 
- 28 files changed, 2 insertions(+), 4702 deletions(-) delete mode 100644 R-package/tests/testthat/test_img_seg.R delete mode 100644 R-package/vignettes/CatsDogsFinetune.Rmd delete mode 100644 R-package/vignettes/CustomIterator.Rmd delete mode 100644 R-package/vignettes/mnistCompetition.Rmd delete mode 100644 contrib/clojure-package/examples/imclassification/test/test-symbol.json.ref delete mode 100644 cpp-package/example/alexnet.cpp delete mode 100644 cpp-package/example/charRNN.cpp delete mode 100644 cpp-package/example/googlenet.cpp delete mode 100644 cpp-package/example/inception_bn.cpp delete mode 100644 cpp-package/example/lenet.cpp delete mode 100644 cpp-package/example/lenet_with_mxdataiter.cpp delete mode 100644 cpp-package/example/mlp.cpp delete mode 100644 cpp-package/example/mlp_cpu.cpp delete mode 100644 cpp-package/example/mlp_csv.cpp delete mode 100644 cpp-package/example/mlp_gpu.cpp delete mode 100644 cpp-package/example/resnet.cpp delete mode 100644 cpp-package/example/test_score.cpp delete mode 100755 cpp-package/example/unittests/unit_test_mlp_csv.sh diff --git a/R-package/tests/testthat/test_img_seg.R b/R-package/tests/testthat/test_img_seg.R deleted file mode 100644 index 4af7d62cf533..000000000000 --- a/R-package/tests/testthat/test_img_seg.R +++ /dev/null @@ -1,165 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -require(mxnet) - -source("get_data.R") - -if (Sys.getenv("R_GPU_ENABLE") != "" & as.integer(Sys.getenv("R_GPU_ENABLE")) == - 1) { - mx.ctx.default(new = mx.gpu()) - message("Using GPU for testing.") -} - -print_inferred_shape <- function(net) { - slist <- mx.symbol.infer.shape(symbol = net, data = c(168, 168, 1, 2)) - print(slist$out.shapes) -} - -convolution_module <- function(net, kernel_size, pad_size, filter_count, stride = c(1, - 1), work_space = 2048, batch_norm = TRUE, down_pool = FALSE, up_pool = FALSE, - act_type = "relu", convolution = TRUE) { - if (up_pool) { - net <- mx.symbol.Deconvolution(net, kernel = c(2, 2), pad = c(0, 0), stride = c(2, - 2), num_filter = filter_count, workspace = work_space) - net <- mx.symbol.BatchNorm(net) - if (act_type != "") { - net <- mx.symbol.Activation(net, act_type = act_type) - } - } - if (convolution) { - conv <- mx.symbol.Convolution(data = net, kernel = kernel_size, stride = stride, - pad = pad_size, num_filter = filter_count, workspace = work_space) - net <- conv - } - if (batch_norm) { - net <- mx.symbol.BatchNorm(net) - } - - if (act_type != "") { - net <- mx.symbol.Activation(net, act_type = act_type) - } - - if (down_pool) { - pool <- mx.symbol.Pooling(net, pool_type = "max", kernel = c(2, 2), stride = c(2, - 2)) - net <- pool - } - print_inferred_shape(net) - return(net) -} - -get_unet <- function() { - data <- mx.symbol.Variable("data") - kernel_size <- c(3, 3) - pad_size <- c(1, 1) - filter_count <- 32 - pool1 <- convolution_module(data, kernel_size, pad_size, filter_count = filter_count, - down_pool = TRUE) - net <- pool1 - pool2 <- convolution_module(net, kernel_size, pad_size, filter_count = filter_count * - 2, down_pool = TRUE) - net <- pool2 - pool3 <- convolution_module(net, kernel_size, pad_size, filter_count = filter_count * - 4, down_pool = TRUE) - net <- pool3 - pool4 <- convolution_module(net, kernel_size, pad_size, filter_count = filter_count * - 4, down_pool = TRUE) - net <- pool4 - net <- mx.symbol.Dropout(net) - pool5 <- convolution_module(net, kernel_size, pad_size, filter_count = filter_count * - 8, down_pool = TRUE) - net <- pool5 - net <- convolution_module(net, kernel_size, pad_size, filter_count = filter_count * - 4, up_pool = TRUE) - net <- convolution_module(net, kernel_size, pad_size = c(2, 2), filter_count = filter_count * - 4, up_pool = TRUE) - net <- mx.symbol.Crop(net, pool3, num.args = 2) - net <- mx.symbol.concat(c(pool3, net), num.args = 2) - net <- mx.symbol.Dropout(net) - net <- convolution_module(net, kernel_size, pad_size, filter_count = filter_count * - 4) - net <- convolution_module(net, kernel_size, pad_size, filter_count = filter_count * - 4, up_pool = TRUE) - - net <- mx.symbol.concat(c(pool2, net), num.args = 2) - net <- mx.symbol.Dropout(net) - net <- convolution_module(net, kernel_size, pad_size, filter_count = filter_count * - 4) - net <- convolution_module(net, kernel_size, pad_size, filter_count = filter_count * - 4, up_pool = TRUE) - convolution_module(net, kernel_size, pad_size, filter_count = filter_count * - 4) - net <- mx.symbol.concat(c(pool1, net), num.args = 2) - net <- mx.symbol.Dropout(net) - net <- convolution_module(net, kernel_size, pad_size, filter_count = filter_count * - 2) - net <- convolution_module(net, kernel_size, pad_size, filter_count = filter_count * - 2, up_pool = TRUE) - net <- convolution_module(net, kernel_size, pad_size, filter_count = 1, batch_norm = FALSE, - act_type = "") - net <- mx.symbol.SoftmaxOutput(data = net, name = "sm") - return(net) -} - 
-context("Image segmentation") - -test_that("UNET", { - list.of.packages <- c("imager") - new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[, - "Package"])] - if (length(new.packages)) - install.packages(new.packages, repos = "https://cloud.r-project.org/") - GetISBI_data() - library(imager) - IMG_SIZE <- 168 - files <- list.files(path = "data/ISBI/train-volume/") - a <- "data/ISBI/train-volume/" - filess <- paste(a, files, sep = "") - list_of_images <- lapply(filess, function(x) { - x <- load.image(x) - y <- resize(x, size_x = IMG_SIZE, size_y = IMG_SIZE) - }) - - train.x <- do.call("cbind", lapply(list_of_images, as.vector)) - train.array <- train.x - dim(train.array) <- c(IMG_SIZE, IMG_SIZE, 1, 30) - - files <- list.files(path = "data/ISBI/train-labels") - b <- "data/ISBI/train-labels/" - filess <- paste(b, files, sep = "") - list_of_images <- lapply(filess, function(x) { - x <- load.image(x) - y <- resize(x, size_x = IMG_SIZE, size_y = IMG_SIZE) - }) - - train.y <- do.call("cbind", lapply(list_of_images, as.vector)) - - train.y[which(train.y < 0.5)] <- 0 - train.y[which(train.y > 0.5)] <- 1 - train.y.array <- train.y - dim(train.y.array) <- c(IMG_SIZE, IMG_SIZE, 1, 30) - - devices <- mx.ctx.default() - mx.set.seed(0) - - net <- get_unet() - - model <- mx.model.FeedForward.create(net, X = train.array, y = train.y.array, - ctx = devices, num.round = 2, initializer = mx.init.normal(sqrt(2/576)), - learning.rate = 0.05, momentum = 0.99, array.batch.size = 2) -}) diff --git a/R-package/vignettes/CatsDogsFinetune.Rmd b/R-package/vignettes/CatsDogsFinetune.Rmd deleted file mode 100644 index 726bb1a43c77..000000000000 --- a/R-package/vignettes/CatsDogsFinetune.Rmd +++ /dev/null @@ -1,272 +0,0 @@ -# Dogs vs. Cats classification with mxnet and R - -## Packages and prerequisites - -In this tutorial, we mainly use the following three packages: - -* `mxnet`: model training -* `imager`: image processing -* `abind`: manipulations with arrays. - -It is an end-to-end R solution for the dogs vs cats Kaggle competition (https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition/) -and it can be used as an example for fine-tuning. -All the code has been test on Ubuntu 16.04. 
- -```{r, echo=FALSE} -knitr::opts_chunk$set(eval = FALSE) -``` - - -```{r} -library(imager) -library(mxnet) -library(abind) -``` - - -## Image processing - -### Renaming train files - -```{r} -files <- list.files("./train/") -old_names <- sapply(files, strsplit, split = ".", fixed = TRUE) -max_length <- max(sapply(old_names, function(x) nchar(x[[2]]))) -zeros <- max_length - sapply(old_names, function(x) nchar(x[[2]])) -zeros <- sapply(zeros, function(x) paste(rep(0, x), collapse = "")) -new_names <- Map(function(x, y) {paste0("./train/", x[1], "/", y, x[2], ".jpg")}, - x = old_names, y = zeros) - -# Full names -files <- paste0("./train/", files) - -dir.create("./train/cat") -dir.create("./train/dog") - -# New names will be in 00001.jpg format -Map(function(x, y) file.rename(from = x, to = y), files, new_names) -``` - -### Training images: 224x224, padded with empty space - -```{r} -files <- list.files("./train/", recursive = TRUE) -new_names <- paste0("./train_pad_224x224/", files) -files <- paste0("./train/", files) -dir.create("./train_pad_224x224/") -dir.create("./train_pad_224x224/cat") -dir.create("./train_pad_224x224/dog") - -padImage <- function(x) { - long_side <- max(dim(x)[1:2]) - short_side <- min(dim(x)[1:2]) - pad_img <- pad(x, - nPix = long_side - short_side, - axes = ifelse(dim(x)[1] < dim(x)[2], "x", "y")) - return(pad_img) -} - -Map(function(x, y) { - pad_img <- padImage(load.image(x)) - res_img <- resize(pad_img, size_x = 224, size_y = 224) - imager::save.image(res_img, y) - }, x = files, y = new_names) -``` - -### Renaming test files - -```{r} -files <- list.files("./test/") -max_length <- max(sapply(files, nchar)) -zeros <- max_length - sapply(files, nchar) -zeros <- sapply(zeros, function(x) paste(rep(0, x), collapse = "")) -newnames <- paste0("./test/", zeros, files) - -files <- paste0("./test/", files) - -Map(function(x, y) file.rename(from = x, to = y), files, newnames) -``` - - -### Test images: 224x224, padded with empty space - -```{r} -files <- list.files("./test/") -new_names <- paste0("./test_pad_224x224/", files) -files <- paste0("./test/", files) -dir.create("./test_pad_224x224/") - -Map(function(x, y) { - pad_img <- padImage(load.image(x)) - res_img <- resize(pad_img, size_x = 224, size_y = 224) - imager::save.image(res_img, y) -}, x = files, y = new_names) -``` - -### Creating .rec files - -```{r} -cat_files <- list.files("train_pad_224x224/cat/", recursive=TRUE) -cat_files <- paste0("cat/", cat_files) - -dog_files <- list.files("train_pad_224x224/dog/", recursive=TRUE) -dog_files <- paste0("dog/", dog_files) - -train_ind <- sample(length(cat_files), length(cat_files) * 0.8) -train_data <- c(1:(length(train_ind) * 2)) -train_data <- cbind(train_data, c(rep(0, length(train_ind)), rep(1, length(train_ind)))) -train_data <- cbind(train_data, c(cat_files[train_ind], dog_files[train_ind])) -train_data <- train_data[sample(nrow(train_data)),] -write.table(train_data, "cats_dogs_train.lst", quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE) -im2rec("cats_dogs_train.lst", "train_pad_224x224/", "cats_dogs_train.rec") - -val_ind <- c(1:length(cat_files))[!c(1:length(cat_files)) %in% train_ind] -val_data <- c(1:(length(val_ind) * 2)) -val_data <- cbind(val_data, c(rep(0, length(val_ind)), rep(1, length(val_ind)))) -val_data <- cbind(val_data, c(cat_files[val_ind], dog_files[val_ind])) -val_data <- val_data[sample(nrow(val_data)),] -write.table(val_data, "cats_dogs_val.lst", quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE) 
-im2rec("cats_dogs_val.lst", "train_pad_224x224/", "cats_dogs_val.rec") -``` - -## The data iterator - -```{r} -get_iterator <- function(data_shape, train_data, val_data, batch_size = 128) { - train <- mx.io.ImageRecordIter(path.imgrec = train_data, - batch.size = batch_size, - data.shape = data_shape, - rand.crop = TRUE, - rand.mirror = TRUE) - - val <- mx.io.ImageRecordIter(path.imgrec = val_data, - batch.size = batch_size, - data.shape = data_shape, - rand.crop = FALSE, - rand.mirror = FALSE) - - return(list(train = train, val = val)) -} -``` - - -```{r} -data <- get_iterator(data_shape = c(224, 224, 3), - train_data = "cats_dogs_train.rec", - val_data = "cats_dogs_val.rec", - batch_size = 8) -train <- data$train -val <- data$val -``` - - -## Load pretrained model - -Here we use the pretrained model from http://data.mxnet.io/mxnet/data/. -There are 1000 classes in imagenet, -and we need to replace the last fully connected layer with a new layer for 2 classes. - - -```{r} -download.file('http://data.mxnet.io/mxnet/data/Inception.zip', destfile = 'Inception.zip') -unzip("Inception.zip") -inception_bn <- mx.model.load("./Inception-BN", iteration = 126) - -symbol <- inception_bn$symbol -# check symbol$arguments for layer names -internals <- symbol$get.internals() -outputs <- internals$outputs - -flatten <- internals$get.output(which(outputs == "flatten_output")) - -new_fc <- mx.symbol.FullyConnected(data = flatten, - num_hidden = 2, - name = "fc1") -# set name to original name in symbol$arguments -new_soft <- mx.symbol.SoftmaxOutput(data = new_fc, - name = "softmax") -# set name to original name in symbol$arguments - -arg_params_new <- mx.model.init.params(symbol = new_soft, - input.shape = list("data" = c(224, 224, 3, 8)), - output.shape = NULL, - initializer = mx.init.uniform(0.1), - ctx = mx.cpu())$arg.params -fc1_weights_new <- arg_params_new[["fc1_weight"]] -fc1_bias_new <- arg_params_new[["fc1_bias"]] - -arg_params_new <- inception_bn$arg.params - -arg_params_new[["fc1_weight"]] <- fc1_weights_new -arg_params_new[["fc1_bias"]] <- fc1_bias_new -``` - - -## Fine-tuning - -```{r} -model <- mx.model.FeedForward.create( - symbol = new_soft, - X = train, - eval.data = val, - ctx = mx.gpu(0), - eval.metric = mx.metric.accuracy, - num.round = 2, - learning.rate = 0.05, - momentum = 0.9, - wd = 0.00001, - kvstore = "local", - array.batch.size = 128, - epoch.end.callback = mx.callback.save.checkpoint("inception_bn"), - batch.end.callback = mx.callback.log.train.metric(150), - initializer = mx.init.Xavier(factor_type = "in", magnitude = 2.34), - optimizer = "sgd", - arg.params = arg_params_new, - aux.params = inception_bn$aux.params -) -``` -## Making predictions - -```{r} -preprocImage<- function(src, # URL or file location - height = 224, - width = 224, - num_channels = 3, # 3 for RGB, 1 for grayscale - mult_by = 1, # set to 255 for normalized image - crop = FALSE) { # no crop by default - im <- load.image(src) - - if (crop) { - shape <- dim(im) - short_edge <- min(shape[1:2]) - xx <- floor((shape[1] - short_edge) / 2) - yy <- floor((shape[2] - short_edge) / 2) - im <- crop.borders(im, xx, yy) - } - - resized <- resize(im, size_x = width, size_y = height) - arr <- as.array(resized) * mult_by - dim(arr) <- c(width, height, num_channels, 1) - return(arr) -} -``` - -```{r} -files <- list.files("./test_pad_224x224/") -files <- paste0("./test_pad_224x224/", files) - -files <- split(files, rep(1:1250, each = 10)) -probs <- lapply(files, function(x) { - images <- lapply(x, preprocImage, mult_by = 
255) - images <- do.call(abind, images) - probs <- predict(model, X = images, ctx = mx.gpu(0)) -}) -saveRDS(probs, "probs.rds") -probs <- t(do.call(cbind, probs)) - -preds <- data.frame(id = 1:12500, label = probs[, 2]) -write.csv(preds, "subm.csv", row.names = FALSE, quote = FALSE) -``` - - - diff --git a/R-package/vignettes/CustomIterator.Rmd b/R-package/vignettes/CustomIterator.Rmd deleted file mode 100644 index b5a6576a5bc6..000000000000 --- a/R-package/vignettes/CustomIterator.Rmd +++ /dev/null @@ -1,207 +0,0 @@ -# Customized iterator - - -This tutorial provides a guideline on how to use and write custom iterators, which can very useful when having a dataset that does not fit into memory. - -## Getting the data - -The data we are going to use is the [MNIST dataset](http://yann.lecun.com/exdb/mnist/) in CSV format, which can be found from [here](https://www.kaggle.com/c/digit-recognizer/data). - -To download the data: - -```{r} -download.file('https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/data/mnist_csv.zip', - destfile = 'mnist_csv.zip') -unzip('mnist_csv.zip', exdir = '.') -``` - -You'll get two files, `mnist_train.csv` that contains 60.000 examples of hand written numbers and `mxnist_test.csv` that contains 10.000 examples. The first element of each line in the CSV is the label, which is a number between 0 and 9. The rest of the line are 784 numbers between 0 and 255, corresponding to the levels of grey of a matrix of 28x28. Therefore, each line contains an image of 28x28 pixels of a hand written number and its true label. - -## Custom CSV Iterator - -Next we are going to create a custom CSV Iterator based on the [C++ CSVIterator class](https://github.com/dmlc/mxnet/blob/master/src/io/iter_csv.cc). - -For that we are going to use the R function `mx.io.CSVIter` as a base class. This class has as parameters `data.csv, data.shape, batch.size` and two main functions, `iter.next()` that calls the iterator in the next batch of data and `value()` that returns the train data and the label. - -The R Custom Iterator needs to inherit from the C++ data iterator class, for that we used the class `Rcpp_MXArrayDataIter` extracted with RCPP. Also, it needs to have the same parameters: `data.csv, data.shape, batch.size`. Apart from that, we can also add the field `iter`, which is the CSV Iterator that we are going to expand. - -```{r, eval=FALSE} -CustomCSVIter <- setRefClass("CustomCSVIter", - fields=c("iter", "data.csv", "data.shape", "batch.size"), - contains = "Rcpp_MXArrayDataIter", - #... - ) -``` - -The next step is to initialize the class. For that we call the base `mx.io.CSVIter` and fill the rest of the fields. - -```{r, eval=FALSE} -CustomCSVIter <- setRefClass("CustomCSVIter", - fields=c("iter", "data.csv", "data.shape", "batch.size"), - contains = "Rcpp_MXArrayDataIter", - methods=list( - initialize=function(iter, data.csv, data.shape, batch.size){ - feature_len <- data.shape*data.shape + 1 - csv_iter <- mx.io.CSVIter(data.csv=data.csv, data.shape=c(feature_len), batch.size=batch.size) - .self$iter <- csv_iter - .self$data.csv <- data.csv - .self$data.shape <- data.shape - .self$batch.size <- batch.size - .self - }, - #... - ) - ) -``` - -So far there is no difference between the original class and the custom class. Let's implement the function `value()`. In this case what we are going to do is transform the data that comes from the original class as an array of 785 numbers into a matrix of 28x28 and a label. We will also normalize the training data to be between 0 and 1. 
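The reshaping itself can be tried in isolation before reading the full method. Below is a minimal, self-contained sketch on a dummy batch; it only illustrates the transformation, and the actual `value()` implementation inside the class follows right after:

```{r, eval=FALSE}
# Dummy batch in the same layout the CSV iterator produces:
# one column per example, 785 rows = 1 label followed by 784 pixel values.
batch.size <- 4
data.shape <- 28
val <- matrix(sample(0:255, (data.shape * data.shape + 1) * batch.size, replace = TRUE),
              nrow = data.shape * data.shape + 1)
val[1, ] <- sample(0:9, batch.size, replace = TRUE)  # first row holds the labels

val.y <- val[1, ]            # labels
val.x <- val[-1, ] / 255     # pixels, scaled to [0, 1]
dim(val.x) <- c(data.shape, data.shape, 1, batch.size)  # 28 x 28 x 1 x batch

dim(val.x)  # 28 28 1 4
val.y
```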
- -```{r, eval=FALSE} -CustomCSVIter <- setRefClass("CustomCSVIter", - fields=c("iter", "data.csv", "data.shape", "batch.size"), - contains = "Rcpp_MXArrayDataIter", - methods=list( - initialize=function(iter, data.csv, data.shape, batch.size){ - feature_len <- data.shape*data.shape + 1 - csv_iter <- mx.io.CSVIter(data.csv=data.csv, data.shape=c(feature_len), batch.size=batch.size) - .self$iter <- csv_iter - .self$data.csv <- data.csv - .self$data.shape <- data.shape - .self$batch.size <- batch.size - .self - }, - value=function(){ - val <- as.array(.self$iter$value()$data) - val.x <- val[-1,] - val.y <- val[1,] - val.x <- val.x/255 - dim(val.x) <- c(data.shape, data.shape, 1, ncol(val.x)) - val.x <- mx.nd.array(val.x) - val.y <- mx.nd.array(val.y) - list(data=val.x, label=val.y) - }, - #... - ) - ) -``` -Finally we are going to add the rest of the functions needed for the training to work correctly. The final `CustomCSVIter` looks like this: - -```{r} -CustomCSVIter <- setRefClass("CustomCSVIter", - fields=c("iter", "data.csv", "data.shape", "batch.size"), - contains = "Rcpp_MXArrayDataIter", - methods=list( - initialize=function(iter, data.csv, data.shape, batch.size){ - feature_len <- data.shape*data.shape + 1 - csv_iter <- mx.io.CSVIter(data.csv=data.csv, data.shape=c(feature_len), batch.size=batch.size) - .self$iter <- csv_iter - .self$data.csv <- data.csv - .self$data.shape <- data.shape - .self$batch.size <- batch.size - .self - }, - value=function(){ - val <- as.array(.self$iter$value()$data) - val.x <- val[-1,] - val.y <- val[1,] - val.x <- val.x/255 - dim(val.x) <- c(data.shape, data.shape, 1, ncol(val.x)) - val.x <- mx.nd.array(val.x) - val.y <- mx.nd.array(val.y) - list(data=val.x, label=val.y) - }, - iter.next=function(){ - .self$iter$iter.next() - }, - reset=function(){ - .self$iter$reset() - }, - num.pad=function(){ - .self$iter$num.pad() - }, - finalize=function(){ - .self$iter$finalize() - } - ) - ) -``` - -To call the class we can just do: - -```{r} -batch.size <- 100 -train.iter <- CustomCSVIter$new(iter = NULL, data.csv = "mnist_train.csv", data.shape = 28, batch.size = batch.size) -``` - -## CNN Model - - -For this tutorial we are going to use the known LeNet architecture: - -```{r} -library(mxnet) -lenet.model <- function(){ - data <- mx.symbol.Variable('data') - conv1 <- mx.symbol.Convolution(data=data, kernel=c(5,5), num_filter=20) #first conv - tanh1 <- mx.symbol.Activation(data=conv1, act_type="tanh") - pool1 <- mx.symbol.Pooling(data=tanh1, pool_type="max", kernel=c(2,2), stride=c(2,2)) - conv2 <- mx.symbol.Convolution(data=pool1, kernel=c(5,5), num_filter=50)# second conv - tanh2 <- mx.symbol.Activation(data=conv2, act_type="tanh") - pool2 <- mx.symbol.Pooling(data=tanh2, pool_type="max", kernel=c(2,2), stride=c(2,2)) - flatten <- mx.symbol.Flatten(data=pool2) - fc1 <- mx.symbol.FullyConnected(data=flatten, num_hidden=100) # first fullc - tanh3 <- mx.symbol.Activation(data=fc1, act_type="tanh") - fc2 <- mx.symbol.FullyConnected(data=tanh3, num_hidden=10) # second fullc - network <- mx.symbol.SoftmaxOutput(data=fc2) # loss - network -} -network <- lenet.model() -``` - -## Training with the Custom Iterator - -Finally, we can directly add the custom iterator as the training data source. 
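Before handing the iterator to the training call below, it can be useful to pull a single batch and check the shapes by hand. This is a small optional sketch that only uses the methods defined above; the expected sizes assume the `batch.size <- 100` set earlier:

```{r, eval=FALSE}
# Sanity check: fetch one batch from the custom iterator and inspect its shapes.
train.iter$reset()
if (train.iter$iter.next()) {
  batch <- train.iter$value()
  print(dim(as.array(batch$data)))      # expected: 28 28 1 100
  print(length(as.array(batch$label)))  # expected: 100
}
train.iter$reset()  # rewind so training starts from the first batch
```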
- -```{r, eval=FALSE} -model <- mx.model.FeedForward.create(symbol=network, - X=train.iter, - ctx=mx.gpu(0), - num.round=10, - array.batch.size=batch.size, - learning.rate=0.1, - momentum=0.9, - eval.metric=mx.metric.accuracy, - wd=0.00001, - batch.end.callback=mx.callback.log.speedometer(batch.size, frequency = 100) - ) -``` - -The last 2 iterations with a K80 GPU looks like this: - -``` -## [8] Train-accuracy=0.998866666666667 -## Batch [100] Speed: 15413.0104454713 samples/sec Train-accuracy=0.999 -## Batch [200] Speed: 16629.3412459049 samples/sec Train-accuracy=0.99935 -## Batch [300] Speed: 18412.6900509319 samples/sec Train-accuracy=0.9995 -## Batch [400] Speed: 16757.2882328335 samples/sec Train-accuracy=0.999425 -## Batch [500] Speed: 17116.6529207406 samples/sec Train-accuracy=0.99946 -## Batch [600] Speed: 19627.589505195 samples/sec Train-accuracy=0.99945 -## [9] Train-accuracy=0.9991 -## Batch [100] Speed: 18971.5745536982 samples/sec Train-accuracy=0.9992 -## Batch [200] Speed: 15554.8822435383 samples/sec Train-accuracy=0.99955 -## Batch [300] Speed: 18327.6950115053 samples/sec Train-accuracy=0.9997 -## Batch [400] Speed: 17103.0705411788 samples/sec Train-accuracy=0.9997 -## Batch [500] Speed: 15104.8656902394 samples/sec Train-accuracy=0.99974 -## Batch [600] Speed: 13818.7899518255 samples/sec Train-accuracy=0.99975 -## [10] Train-accuracy=0.99975 -``` - -## Conclusion - - -We have shown how to create a custom CSV Iterator by extending the class `mx.io.CSVIter`. In our class, we iteratively read from a CSV file a batch of data that will be transformed and then processed in the stochastic gradient descent optimization. That way, we are able to manage CSV files that are bigger than the memory of the machine we are using. - -Based of this custom iterator, we can also create data loaders that internally transform or expand the data, allowing to manage files of any size. - - - diff --git a/R-package/vignettes/mnistCompetition.Rmd b/R-package/vignettes/mnistCompetition.Rmd deleted file mode 100644 index 055f1ae51d7e..000000000000 --- a/R-package/vignettes/mnistCompetition.Rmd +++ /dev/null @@ -1,246 +0,0 @@ -# Handwritten Digits Classification Competition - -[MNIST](http://yann.lecun.com/exdb/mnist/) is a handwritten digits image data set created by Yann LeCun. Every digit is represented by a 28x28 image. It has become a standard data set to test classifiers on simple image input. Neural network is no doubt a strong model for image classification tasks. There's a [long-term hosted competition](https://www.kaggle.com/c/digit-recognizer) on Kaggle using this data set. -We will present the basic usage of [mxnet](https://github.com/dmlc/mxnet/tree/master/R-package) to compete in this challenge. - -## Data Loading - -First, let us download the data from [here](https://www.kaggle.com/c/digit-recognizer/data), and put them under the `data/` folder in your working directory. - -Then we can read them in R and convert to matrices. - -```{r, echo=FALSE} -download.file('https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/data/mnist_csv.zip', destfile = 'mnist_csv.zip') -unzip('mnist_csv.zip', exdir = '.') -``` - - -```{r} -require(mxnet) -train <- read.csv("train.csv", header=TRUE) -test <- read.csv("test.csv", header=TRUE) -train <- data.matrix(train) -test <- data.matrix(test) - -train.x <- train[,-1] -train.y <- train[,1] -``` - -Besides using the csv files from kaggle, you can also read the orginal MNIST dataset into R. 
- -```{r, eval=FALSE} -load_image_file <- function(filename) { - f = file(filename, 'rb') - readBin(f, 'integer', n = 1, size = 4, endian = 'big') - n = readBin(f,'integer', n = 1, size = 4, endian = 'big') - nrow = readBin(f,'integer', n = 1, size = 4, endian = 'big') - ncol = readBin(f,'integer', n = 1, size = 4, endian = 'big') - x = readBin(f, 'integer', n = n * nrow * ncol, size = 1, signed = F) - x = matrix(x, ncol = nrow * ncol, byrow = T) - close(f) - x -} - -load_label_file <- function(filename) { - f = file(filename, 'rb') - readBin(f,'integer', n = 1, size = 4, endian = 'big') - n = readBin(f,'integer', n = 1, size = 4, endian = 'big') - y = readBin(f,'integer', n = n, size = 1, signed = F) - close(f) - y -} - -train.x <- load_image_file('mnist/train-images-idx3-ubyte') -test.y <- load_image_file('mnist/t10k-images-idx3-ubyte') - -train.y <- load_label_file('mnist/train-labels-idx1-ubyte') -test.y <- load_label_file('mnist/t10k-labels-idx1-ubyte') -``` - -Here every image is represented as a single row in train/test. The greyscale of each image falls in the range [0, 255], we can linearly transform it into [0,1] by - -```{r} -train.x <- t(train.x/255) -test <- t(test/255) -``` -We also transpose the input matrix to npixel x nexamples, which is the column major format accepted by mxnet (and the convention of R). - -In the label part, we see the number of each digit is fairly even: - -```{r} -table(train.y) -``` - -## Network Configuration - -Now we have the data. The next step is to configure the structure of our network. - -```{r} -data <- mx.symbol.Variable("data") -fc1 <- mx.symbol.FullyConnected(data, name="fc1", num_hidden=128) -act1 <- mx.symbol.Activation(fc1, name="relu1", act_type="relu") -fc2 <- mx.symbol.FullyConnected(act1, name="fc2", num_hidden=64) -act2 <- mx.symbol.Activation(fc2, name="relu2", act_type="relu") -fc3 <- mx.symbol.FullyConnected(act2, name="fc3", num_hidden=10) -softmax <- mx.symbol.SoftmaxOutput(fc3, name="sm") -``` - -1. In `mxnet`, we use its own data type `symbol` to configure the network. `data <- mx.symbol.Variable("data")` use `data` to represent the input data, i.e. the input layer. -2. Then we set the first hidden layer by `fc1 <- mx.symbol.FullyConnected(data, name="fc1", num_hidden=128)`. This layer has `data` as the input, its name and the number of hidden neurons. -3. The activation is set by `act1 <- mx.symbol.Activation(fc1, name="relu1", act_type="relu")`. The activation function takes the output from the first hidden layer `fc1`. -4. The second hidden layer takes the result from `act1` as the input, with its name as "fc2" and the number of hidden neurons as 64. -5. the second activation is almost the same as `act1`, except we have a different input source and name. -6. Here comes the output layer. Since there's only 10 digits, we set the number of neurons to 10. -7. Finally we set the activation to softmax to get a probabilistic prediction. - -If you are a big fan of the `%>%` operator, you can also define the network as below: - -```{r, eval=FALSE} -library(magrittr) -softmax <- mx.symbol.Variable("data") %>% - mx.symbol.FullyConnected(name = "fc1", num_hidden = 128) %>% - mx.symbol.Activation(name = "relu1", act_type = "relu") %>% - mx.symbol.FullyConnected(name = "fc2", num_hidden = 64) %>% - mx.symbol.Activation(name = "relu2", act_type = "relu") %>% - mx.symbol.FullyConnected(name="fc3", num_hidden=10) %>% - mx.symbol.SoftmaxOutput(name="sm") -``` - -## Training - -We are almost ready for the training process. 
Before we start the computation, let's decide what device should we use. - -```{r} -devices <- mx.cpu() -``` - -Here we assign CPU to `mxnet`. After all these preparation, you can run the following command to train the neural network! Note that `mx.set.seed` is the correct function to control the random process in `mxnet`. - -```{r} -mx.set.seed(0) -model <- mx.model.FeedForward.create(softmax, X = train.x, y = train.y, - ctx = devices, num.round = 5, - array.batch.size = 100, - learning.rate = 0.07, momentum = 0.9, - eval.metric = mx.metric.accuracy, - initializer = mx.init.uniform(0.07), - batch.end.callback = mx.callback.log.train.metric(100)) -``` - -## Prediction and Submission - -To make prediction, we can simply write - -```{r} -preds <- predict(model, test) -dim(preds) -``` - -It is a matrix with 28000 rows and 10 cols, containing the desired classification probabilities from the output layer. To extract the maximum label for each row, we can use the `max.col` in R: - -```{r} -pred.label <- max.col(t(preds)) - 1 -table(pred.label) -``` - -With a little extra effort in the csv format, we can have our submission to the competition! - -```{r, eval = FALSE} -submission <- data.frame(ImageId=1:ncol(test), Label=pred.label) -write.csv(submission, file='submission.csv', row.names=FALSE, quote=FALSE) -``` - -## LeNet - -Next we are going to introduce a new network structure: [LeNet](http://yann.lecun.com/exdb/lenet/). It is proposed by Yann LeCun to recognize handwritten digits. Now we are going to demonstrate how to construct and train an LeNet in `mxnet`. - - -First we construct the network: - -```{r} -require(mxnet) -# input -data <- mx.symbol.Variable('data') -# first conv -conv1 <- mx.symbol.Convolution(data=data, kernel=c(5,5), num_filter=20) -tanh1 <- mx.symbol.Activation(data=conv1, act_type="tanh") -pool1 <- mx.symbol.Pooling(data=tanh1, pool_type="max", - kernel=c(2,2), stride=c(2,2)) -# second conv -conv2 <- mx.symbol.Convolution(data=pool1, kernel=c(5,5), num_filter=50) -tanh2 <- mx.symbol.Activation(data=conv2, act_type="tanh") -pool2 <- mx.symbol.Pooling(data=tanh2, pool_type="max", - kernel=c(2,2), stride=c(2,2)) -# first fullc -flatten <- mx.symbol.Flatten(data=pool2) -fc1 <- mx.symbol.FullyConnected(data=flatten, num_hidden=500) -tanh3 <- mx.symbol.Activation(data=fc1, act_type="tanh") -# second fullc -fc2 <- mx.symbol.FullyConnected(data=tanh3, num_hidden=10) -# loss -lenet <- mx.symbol.SoftmaxOutput(data=fc2) -``` - -Then let us reshape the matrices into arrays: - -```{r} -train.array <- train.x -dim(train.array) <- c(28, 28, 1, ncol(train.x)) -test.array <- test -dim(test.array) <- c(28, 28, 1, ncol(test)) -``` - -Next we are going to compare the training speed on different devices, so the definition of the devices goes first: - -```{r} -n.gpu <- 1 -device.cpu <- mx.cpu() -device.gpu <- lapply(0:(n.gpu-1), function(i) { - mx.gpu(i) -}) -``` - -As you can see, we can pass a list of devices, to ask mxnet to train on multiple GPUs (you can do similar thing for cpu, -but since internal computation of cpu is already multi-threaded, there is less gain than using GPUs). - -We start by training on CPU first. Because it takes a bit time to do so, we will only run it for one iteration. 
- -```{r} -mx.set.seed(0) -tic <- proc.time() -model <- mx.model.FeedForward.create(lenet, X = train.array, y = train.y, - ctx = device.cpu, num.round = 1, - array.batch.size = 100, - learning.rate = 0.05, momentum = 0.9, wd = 0.00001, - eval.metric = mx.metric.accuracy, - batch.end.callback = mx.callback.log.train.metric(100)) -print(proc.time() - tic) -``` - -Training on GPU: - -```{r} -mx.set.seed(0) -tic <- proc.time() -model <- mx.model.FeedForward.create(lenet, X = train.array, y = train.y, - ctx = device.gpu, num.round = 5, - array.batch.size = 100, - learning.rate = 0.05, momentum = 0.9, wd = 0.00001, - eval.metric = mx.metric.accuracy, - batch.end.callback = mx.callback.log.train.metric(100)) -print(proc.time() - tic) -``` - -As you can see by using GPU, we can get a much faster speedup in training! -Finally we can submit the result to Kaggle again to see the improvement of our ranking! - -```{r, eval = FALSE} -preds <- predict(model, test.array) -pred.label <- max.col(t(preds)) - 1 -submission <- data.frame(ImageId=1:ncol(test), Label=pred.label) -write.csv(submission, file='submission.csv', row.names=FALSE, quote=FALSE) -``` - -![](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/knitr/mnistCompetition-kaggle-submission.png) - - diff --git a/benchmark/opperf/rules/default_params.py b/benchmark/opperf/rules/default_params.py index 04ec8100b1b4..fdb149345705 100644 --- a/benchmark/opperf/rules/default_params.py +++ b/benchmark/opperf/rules/default_params.py @@ -188,9 +188,6 @@ DEFAULT_DATA_SVM_LARGE_TENSOR = [(2**29, 2, 2, 2)] DEFAULT_LABEL_SVM_LARGE_TENSOR = [(2**29, 2, 2)] -# SoftmaxOutput -DEFAULT_LABEL_SM = [(32, 3, 256), (32, 3, 10000)] - DEFAULT_DATA_SO_LARGE_TENSOR = [(2**29, 2, 2, 2)] DEFAULT_LABEL_SO_LARGE_TENSOR = [(2**29, 2, 2)] @@ -537,16 +534,6 @@ "moving_mean_batchnorm": DEFAULT_MOVING_MEAN, "moving_var_batchnorm": DEFAULT_MOVING_VAR, "axis_batchnorm": DEFAULT_AXIS_BN, - "data_softmaxoutput": DEFAULT_DATA_NN_BASIC, - "label_softmaxoutput": DEFAULT_LABEL_SM, - "data_maeregressionoutput": DEFAULT_DATA_NN_BASIC, - "label_maeregressionoutput": DEFAULT_LABEL_REG, - "data_logisticregressionoutput": DEFAULT_DATA_NN_BASIC, - "label_logisticregressionoutput": DEFAULT_LABEL_REG, - "data_linearregressionoutput": DEFAULT_DATA_NN_BASIC, - "label_linearregressionoutput": DEFAULT_LABEL_REG, - "data_svmoutput": DEFAULT_DATA_NN_BASIC, - "label_svmoutput": DEFAULT_LABEL_SVM, "grad_scale": DEFAULT_GRAD_SCALE, "normalization": DEFAULT_NORMALIZATION, "margin": DEFAULT_MARGIN, diff --git a/contrib/clojure-package/examples/imclassification/test/imclassification/train_mnist_test.clj b/contrib/clojure-package/examples/imclassification/test/imclassification/train_mnist_test.clj index f185891ab31e..59d37c99ac93 100644 --- a/contrib/clojure-package/examples/imclassification/test/imclassification/train_mnist_test.clj +++ b/contrib/clojure-package/examples/imclassification/test/imclassification/train_mnist_test.clj @@ -31,10 +31,3 @@ (io/reader) (line-seq) (filter #(not (s/includes? 
% "mxnet_version"))))) - -(deftest mnist-two-epochs-test - (do - (mnist/start [(context/cpu)] 2) - (is (= - (file-to-filtered-seq "test/test-symbol.json.ref") - (file-to-filtered-seq "target/test-symbol.json"))))) diff --git a/contrib/clojure-package/examples/imclassification/test/test-symbol.json.ref b/contrib/clojure-package/examples/imclassification/test/test-symbol.json.ref deleted file mode 100644 index ba1d2fad3a8a..000000000000 --- a/contrib/clojure-package/examples/imclassification/test/test-symbol.json.ref +++ /dev/null @@ -1,105 +0,0 @@ -{ - "nodes": [ - { - "op": "null", - "name": "data", - "inputs": [] - }, - { - "op": "null", - "name": "fc1_weight", - "attrs": {"num_hidden": "128"}, - "inputs": [] - }, - { - "op": "null", - "name": "fc1_bias", - "attrs": {"num_hidden": "128"}, - "inputs": [] - }, - { - "op": "FullyConnected", - "name": "fc1", - "attrs": {"num_hidden": "128"}, - "inputs": [[0, 0, 0], [1, 0, 0], [2, 0, 0]] - }, - { - "op": "Activation", - "name": "relu1", - "attrs": {"act_type": "relu"}, - "inputs": [[3, 0, 0]] - }, - { - "op": "null", - "name": "fc2_weight", - "attrs": {"num_hidden": "64"}, - "inputs": [] - }, - { - "op": "null", - "name": "fc2_bias", - "attrs": {"num_hidden": "64"}, - "inputs": [] - }, - { - "op": "FullyConnected", - "name": "fc2", - "attrs": {"num_hidden": "64"}, - "inputs": [[4, 0, 0], [5, 0, 0], [6, 0, 0]] - }, - { - "op": "Activation", - "name": "relu2", - "attrs": {"act_type": "relu"}, - "inputs": [[7, 0, 0]] - }, - { - "op": "null", - "name": "fc3_weight", - "attrs": {"num_hidden": "10"}, - "inputs": [] - }, - { - "op": "null", - "name": "fc3_bias", - "attrs": {"num_hidden": "10"}, - "inputs": [] - }, - { - "op": "FullyConnected", - "name": "fc3", - "attrs": {"num_hidden": "10"}, - "inputs": [[8, 0, 0], [9, 0, 0], [10, 0, 0]] - }, - { - "op": "null", - "name": "softmax_label", - "inputs": [] - }, - { - "op": "SoftmaxOutput", - "name": "softmax", - "inputs": [[11, 0, 0], [12, 0, 0]] - } - ], - "arg_nodes": [0, 1, 2, 5, 6, 9, 10, 12], - "node_row_ptr": [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14 - ], - "heads": [[13, 0, 0]], - "attrs": {"mxnet_version": ["int", 10400]} -} \ No newline at end of file diff --git a/cpp-package/example/CMakeLists.txt b/cpp-package/example/CMakeLists.txt index d682a88c7760..d54843f319b4 100644 --- a/cpp-package/example/CMakeLists.txt +++ b/cpp-package/example/CMakeLists.txt @@ -18,37 +18,6 @@ # Explicitly set GENERATED property https://gitlab.kitware.com/cmake/cmake/issues/18399 set_property(SOURCE ${CMAKE_CURRENT_LIST_DIR}/../include/mxnet-cpp/op.h PROPERTY GENERATED 1) -add_executable(lenet lenet.cpp) -target_link_libraries(lenet mxnet_cpp) - -add_executable(lenet_with_mxdataiter lenet_with_mxdataiter.cpp) -target_link_libraries(lenet_with_mxdataiter mxnet_cpp) - -add_executable(alexnet alexnet.cpp) -target_link_libraries(alexnet mxnet_cpp) - -add_executable(charRNN charRNN.cpp) -target_link_libraries(charRNN mxnet_cpp) - -add_executable(googlenet googlenet.cpp) -target_link_libraries(googlenet mxnet_cpp) - -add_executable(inception_bn inception_bn.cpp) -target_link_libraries(inception_bn mxnet_cpp) - -add_executable(mlp mlp.cpp) -target_link_libraries(mlp mxnet_cpp) - -add_executable(mlp_cpu mlp_cpu.cpp) -target_link_libraries(mlp_cpu mxnet_cpp) - -add_executable(mlp_gpu mlp_gpu.cpp) -target_link_libraries(mlp_gpu mxnet_cpp) - -add_executable(resnet resnet.cpp) -target_link_libraries(resnet mxnet_cpp) - - if(MSVC) add_custom_target(cpp_package_deploy_library ALL DEPENDS 
mxnet diff --git a/cpp-package/example/README.md b/cpp-package/example/README.md index 555316dd1ac3..208532e0066e 100644 --- a/cpp-package/example/README.md +++ b/cpp-package/example/README.md @@ -35,97 +35,3 @@ By default, the examples are built to be run on GPU. To build examples to run on The examples that are built to be run on GPU may not work on the non-GPU machines. The makefile will also download the necessary data files and store in a data folder. (The download will take couple of minutes, but will be done only once on a fresh installation.) - - -## Examples demonstrating training workflow - -This directory contains following examples. In order to run the examples, ensure that the path to the MXNet shared library is added to the OS specific environment variable viz. **LD\_LIBRARY\_PATH** for Linux, Mac and Ubuntu OS and **PATH** for Windows OS. For example `export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/home/ubuntu/incubator-mxnet/lib` on ubuntu using gpu. - -### [alexnet.cpp]() - -The example implements the C++ version of AlexNet. The networks trains on MNIST data. The number of epochs can be specified as a command line argument. For example to train with 10 epochs use the following: - -``` -build/alexnet 10 -``` - -### [googlenet.cpp]() - -The code implements a GoogLeNet/Inception network using the C++ API. The example uses MNIST data to train the network. By default, the example trains the model for 100 epochs. The number of epochs can also be specified in the command line. For example, to train the model for 10 epochs use the following: - -``` -build/googlenet 10 -``` - -### [mlp.cpp]() - -The code implements a multilayer perceptron from scratch. The example creates its own dummy data to train the model. The example does not require command line parameters. It trains the model for 20,000 epochs. -To run the example use the following command: - -``` -build/mlp -``` - -### [mlp_cpu.cpp]() - -The code implements a multilayer perceptron to train the MNIST data. The code demonstrates the use of "SimpleBind" C++ API and MNISTIter. The example is designed to work on CPU. The example does not require command line parameters. -To run the example use the following command: - -``` -build/mlp_cpu -``` - -### [mlp_gpu.cpp]() - -The code implements a multilayer perceptron to train the MNIST data. The code demonstrates the use of the "SimpleBind" C++ API and MNISTIter. The example is designed to work on GPU. The example does not require command line arguments. To run the example execute following command: - -``` -build/mlp_gpu -``` - -### [mlp_csv.cpp]() - -The code implements a multilayer perceptron to train the MNIST data. The code demonstrates the use of the "SimpleBind" C++ API and CSVIter. The CSVIter can iterate data that is in CSV format. The example can be run on CPU or GPU. 
The example usage is as follows: - -``` -build/mlp_csv --train data/mnist_data/mnist_train.csv --test data/mnist_data/mnist_test.csv --epochs 10 --batch_size 100 --hidden_units "128 64 64" --gpu -``` -* To get the `mnist_training_set.csv` and `mnist_test_set.csv` please run the following command: -```python -# in incubator-mxnet/cpp-package/example directory -python mnist_to_csv.py ./data/mnist_data/train-images-idx3-ubyte ./data/mnist_data/train-labels-idx1-ubyte ./data/mnist_data/mnist_train.csv 60000 -python mnist_to_csv.py ./data/mnist_data/t10k-images-idx3-ubyte ./data/mnist_data/t10k-labels-idx1-ubyte ./data/mnist_data/mnist_test.csv 10000 -``` - -### [resnet.cpp]() - -The code implements a resnet model using the C++ API. The model is used to train MNIST data. The number of epochs for training the model can be specified on the command line. By default, model is trained for 100 epochs. For example, to train with 10 epochs use the following command: - -``` -build/resnet 10 -``` - -### [lenet.cpp]() - -The code implements a lenet model using the C++ API. It uses MNIST training data in CSV format to train the network. The example does not use built-in CSVIter to read the data from CSV file. The number of epochs can be specified on the command line. By default, the mode is trained for 100,000 epochs. For example, to train with 10 epochs use the following command: - -``` -build/lenet 10 -``` -### [lenet\_with\_mxdataiter.cpp]() - -The code implements a lenet model using the C++ API. It uses MNIST training data to train the network. The example uses built-in MNISTIter to read the data. The number of epochs can be specified on the command line. By default, the mode is trained for 100 epochs. For example, to train with 10 epochs use the following command: - -``` -build/lenet_with_mxdataiter 10 -``` - -In addition, there is `run_lenet_with_mxdataiter.sh` that downloads the mnist data and run `lenet_with_mxdataiter` example. - -### [inception_bn.cpp]() - -The code implements an Inception network using the C++ API with batch normalization. The example uses MNIST data to train the network. The model trains for 100 epochs. The example can be run by executing the following command: - -``` -build/inception_bn -``` diff --git a/cpp-package/example/alexnet.cpp b/cpp-package/example/alexnet.cpp deleted file mode 100644 index 1c182182c1a5..000000000000 --- a/cpp-package/example/alexnet.cpp +++ /dev/null @@ -1,358 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
- */ -#include -#include -#include -#include -#include -#include "utils.h" -#include "mxnet-cpp/MxNetCpp.h" - -using namespace mxnet::cpp; - -Symbol AlexnetSymbol(int num_classes) { - auto input_data = Symbol::Variable("data"); - auto target_label = Symbol::Variable("label"); - /*stage 1*/ - auto conv1 = Operator("Convolution") - .SetParam("kernel", Shape(11, 11)) - .SetParam("num_filter", 96) - .SetParam("stride", Shape(4, 4)) - .SetParam("dilate", Shape(1, 1)) - .SetParam("pad", Shape(0, 0)) - .SetParam("num_group", 1) - .SetParam("workspace", 512) - .SetParam("no_bias", false) - .SetInput("data", input_data) - .CreateSymbol("conv1"); - auto relu1 = Operator("Activation") - .SetParam("act_type", "relu") /*relu,sigmoid,softrelu,tanh */ - .SetInput("data", conv1) - .CreateSymbol("relu1"); - auto pool1 = Operator("Pooling") - .SetParam("kernel", Shape(3, 3)) - .SetParam("pool_type", "max") /*avg,max,sum */ - .SetParam("global_pool", false) - .SetParam("stride", Shape(2, 2)) - .SetParam("pad", Shape(0, 0)) - .SetInput("data", relu1) - .CreateSymbol("pool1"); - auto lrn1 = Operator("LRN") - .SetParam("nsize", 5) - .SetParam("alpha", 0.0001) - .SetParam("beta", 0.75) - .SetParam("knorm", 1) - .SetInput("data", pool1) - .CreateSymbol("lrn1"); - /*stage 2*/ - auto conv2 = Operator("Convolution") - .SetParam("kernel", Shape(5, 5)) - .SetParam("num_filter", 256) - .SetParam("stride", Shape(1, 1)) - .SetParam("dilate", Shape(1, 1)) - .SetParam("pad", Shape(2, 2)) - .SetParam("num_group", 1) - .SetParam("workspace", 512) - .SetParam("no_bias", false) - .SetInput("data", lrn1) - .CreateSymbol("conv2"); - auto relu2 = Operator("Activation") - .SetParam("act_type", "relu") /*relu,sigmoid,softrelu,tanh */ - .SetInput("data", conv2) - .CreateSymbol("relu2"); - auto pool2 = Operator("Pooling") - .SetParam("kernel", Shape(3, 3)) - .SetParam("pool_type", "max") /*avg,max,sum */ - .SetParam("global_pool", false) - .SetParam("stride", Shape(2, 2)) - .SetParam("pad", Shape(0, 0)) - .SetInput("data", relu2) - .CreateSymbol("pool2"); - auto lrn2 = Operator("LRN") - .SetParam("nsize", 5) - .SetParam("alpha", 0.0001) - .SetParam("beta", 0.75) - .SetParam("knorm", 1) - .SetInput("data", pool2) - .CreateSymbol("lrn2"); - /*stage 3*/ - auto conv3 = Operator("Convolution") - .SetParam("kernel", Shape(3, 3)) - .SetParam("num_filter", 384) - .SetParam("stride", Shape(1, 1)) - .SetParam("dilate", Shape(1, 1)) - .SetParam("pad", Shape(1, 1)) - .SetParam("num_group", 1) - .SetParam("workspace", 512) - .SetParam("no_bias", false) - .SetInput("data", lrn2) - .CreateSymbol("conv3"); - auto relu3 = Operator("Activation") - .SetParam("act_type", "relu") /*relu,sigmoid,softrelu,tanh */ - .SetInput("data", conv3) - .CreateSymbol("relu3"); - auto conv4 = Operator("Convolution") - .SetParam("kernel", Shape(3, 3)) - .SetParam("num_filter", 384) - .SetParam("stride", Shape(1, 1)) - .SetParam("dilate", Shape(1, 1)) - .SetParam("pad", Shape(1, 1)) - .SetParam("num_group", 1) - .SetParam("workspace", 512) - .SetParam("no_bias", false) - .SetInput("data", relu3) - .CreateSymbol("conv4"); - auto relu4 = Operator("Activation") - .SetParam("act_type", "relu") /*relu,sigmoid,softrelu,tanh */ - .SetInput("data", conv4) - .CreateSymbol("relu4"); - auto conv5 = Operator("Convolution") - .SetParam("kernel", Shape(3, 3)) - .SetParam("num_filter", 256) - .SetParam("stride", Shape(1, 1)) - .SetParam("dilate", Shape(1, 1)) - .SetParam("pad", Shape(1, 1)) - .SetParam("num_group", 1) - .SetParam("workspace", 512) - .SetParam("no_bias", false) - 
.SetInput("data", relu4) - .CreateSymbol("conv5"); - auto relu5 = Operator("Activation") - .SetParam("act_type", "relu") - .SetInput("data", conv5) - .CreateSymbol("relu5"); - auto pool3 = Operator("Pooling") - .SetParam("kernel", Shape(3, 3)) - .SetParam("pool_type", "max") - .SetParam("global_pool", false) - .SetParam("stride", Shape(2, 2)) - .SetParam("pad", Shape(0, 0)) - .SetInput("data", relu5) - .CreateSymbol("pool3"); - /*stage4*/ - auto flatten = - Operator("Flatten").SetInput("data", pool3).CreateSymbol("flatten"); - auto fc1 = Operator("FullyConnected") - .SetParam("num_hidden", 4096) - .SetParam("no_bias", false) - .SetInput("data", flatten) - .CreateSymbol("fc1"); - auto relu6 = Operator("Activation") - .SetParam("act_type", "relu") - .SetInput("data", fc1) - .CreateSymbol("relu6"); - auto dropout1 = Operator("Dropout") - .SetParam("p", 0.5) - .SetInput("data", relu6) - .CreateSymbol("dropout1"); - /*stage5*/ - auto fc2 = Operator("FullyConnected") - .SetParam("num_hidden", 4096) - .SetParam("no_bias", false) - .SetInput("data", dropout1) - .CreateSymbol("fc2"); - auto relu7 = Operator("Activation") - .SetParam("act_type", "relu") - .SetInput("data", fc2) - .CreateSymbol("relu7"); - auto dropout2 = Operator("Dropout") - .SetParam("p", 0.5) - .SetInput("data", relu7) - .CreateSymbol("dropout2"); - /*stage6*/ - auto fc3 = Operator("FullyConnected") - .SetParam("num_hidden", num_classes) - .SetParam("no_bias", false) - .SetInput("data", dropout2) - .CreateSymbol("fc3"); - auto softmax = Operator("SoftmaxOutput") - .SetParam("grad_scale", 1) - .SetParam("ignore_label", -1) - .SetParam("multi_output", false) - .SetParam("use_ignore", false) - .SetParam("normalization", "null") /*batch,null,valid */ - .SetInput("data", fc3) - .SetInput("label", target_label) - .CreateSymbol("softmax"); - return softmax; -} - -NDArray ResizeInput(NDArray data, const Shape new_shape) { - NDArray pic = data.Reshape(Shape(0, 1, 28, 28)); - NDArray pic_1channel; - Operator("_contrib_BilinearResize2D") - .SetParam("height", new_shape[2]) - .SetParam("width", new_shape[3]) - (pic).Invoke(pic_1channel); - NDArray output; - Operator("tile") - .SetParam("reps", Shape(1, 3, 1, 1)) - (pic_1channel).Invoke(output); - return output; -} - -int main(int argc, char const *argv[]) { - /*basic config*/ - int max_epo = argc > 1 ? 
strtol(argv[1], nullptr, 10) : 100; - float learning_rate = 1e-4; - float weight_decay = 1e-4; - - /*context*/ - auto ctx = Context::cpu(); - int num_gpu; - MXGetGPUCount(&num_gpu); - int batch_size = 32; -#if !MXNET_USE_CPU - if (num_gpu > 0) { - ctx = Context::gpu(); - batch_size = 256; - } -#endif - - TRY - /*net symbol*/ - auto Net = AlexnetSymbol(10); - - /*args_map and aux_map is used for parameters' saving*/ - std::map args_map; - std::map aux_map; - - /*we should tell mxnet the shape of data and label*/ - const Shape data_shape = Shape(batch_size, 3, 256, 256), - label_shape = Shape(batch_size); - args_map["data"] = NDArray(data_shape, ctx); - args_map["label"] = NDArray(label_shape, ctx); - - /*with data and label, executor can be generated automatically*/ - auto *exec = Net.SimpleBind(ctx, args_map); - auto arg_names = Net.ListArguments(); - aux_map = exec->aux_dict(); - args_map = exec->arg_dict(); - - /*if fine tune from some pre-trained model, we should load the parameters*/ - // NDArray::Load("./model/alex_params_3", nullptr, &args_map); - /*else, we should use initializer Xavier to init the params*/ - Xavier xavier = Xavier(Xavier::gaussian, Xavier::in, 2.34); - for (auto &arg : args_map) { - /*be careful here, the arg's name must has some specific ends or starts for - * initializer to call*/ - xavier(arg.first, &arg.second); - } - - /*these binary files should be generated using im2rc tools, which can be found - * in mxnet/bin*/ - std::vector data_files = { "./data/mnist_data/train-images-idx3-ubyte", - "./data/mnist_data/train-labels-idx1-ubyte", - "./data/mnist_data/t10k-images-idx3-ubyte", - "./data/mnist_data/t10k-labels-idx1-ubyte" - }; - - auto train_iter = MXDataIter("MNISTIter"); - if (!setDataIter(&train_iter, "Train", data_files, batch_size)) { - return 1; - } - - auto val_iter = MXDataIter("MNISTIter"); - if (!setDataIter(&val_iter, "Label", data_files, batch_size)) { - return 1; - } - - Optimizer* opt = OptimizerRegistry::Find("sgd"); - opt->SetParam("momentum", 0.9) - ->SetParam("rescale_grad", 1.0 / batch_size) - ->SetParam("clip_gradient", 10) - ->SetParam("lr", learning_rate) - ->SetParam("wd", weight_decay); - - Accuracy acu_train, acu_val; - LogLoss logloss_train, logloss_val; - for (int epoch = 0; epoch < max_epo; ++epoch) { - LG << "Train Epoch: " << epoch; - /*reset the metric every epoch*/ - acu_train.Reset(); - /*reset the data iter every epoch*/ - train_iter.Reset(); - int iter = 0; - while (train_iter.Next()) { - auto batch = train_iter.GetDataBatch(); - /*use copyto to feed new data and label to the executor*/ - ResizeInput(batch.data, data_shape).CopyTo(&args_map["data"]); - batch.label.CopyTo(&args_map["label"]); - exec->Forward(true); - exec->Backward(); - for (size_t i = 0; i < arg_names.size(); ++i) { - if (arg_names[i] == "data" || arg_names[i] == "label") continue; - opt->Update(i, exec->arg_arrays[i], exec->grad_arrays[i]); - } - - NDArray::WaitAll(); - acu_train.Update(batch.label, exec->outputs[0]); - logloss_train.Reset(); - logloss_train.Update(batch.label, exec->outputs[0]); - ++iter; - LG << "EPOCH: " << epoch << " ITER: " << iter - << " Train Accuracy: " << acu_train.Get() - << " Train Loss: " << logloss_train.Get(); - } - LG << "EPOCH: " << epoch << " Train Accuracy: " << acu_train.Get(); - - LG << "Val Epoch: " << epoch; - acu_val.Reset(); - val_iter.Reset(); - logloss_val.Reset(); - iter = 0; - while (val_iter.Next()) { - auto batch = val_iter.GetDataBatch(); - ResizeInput(batch.data, data_shape).CopyTo(&args_map["data"]); - 
batch.label.CopyTo(&args_map["label"]); - exec->Forward(false); - NDArray::WaitAll(); - acu_val.Update(batch.label, exec->outputs[0]); - logloss_val.Update(batch.label, exec->outputs[0]); - LG << "EPOCH: " << epoch << " ITER: " << iter << " Val Accuracy: " << acu_val.Get(); - ++iter; - } - LG << "EPOCH: " << epoch << " Val Accuracy: " << acu_val.Get(); - LG << "EPOCH: " << epoch << " Val LogLoss: " << logloss_val.Get(); - - /*save the parameters*/ - std::stringstream ss; - ss << epoch; - std::string epoch_str; - ss >> epoch_str; - std::string save_path_param = "alex_param_" + epoch_str; - auto save_args = args_map; - /*we do not want to save the data and label*/ - save_args.erase(save_args.find("data")); - save_args.erase(save_args.find("label")); - /*the alexnet does not get any aux array, so we do not need to save - * aux_map*/ - LG << "EPOCH: " << epoch << " Saving to..." << save_path_param; - NDArray::Save(save_path_param, save_args); - } - /*don't foget to release the executor*/ - delete exec; - delete opt; - MXNotifyShutdown(); - CATCH - return 0; -} diff --git a/cpp-package/example/charRNN.cpp b/cpp-package/example/charRNN.cpp deleted file mode 100644 index 3d90e2e9ed9f..000000000000 --- a/cpp-package/example/charRNN.cpp +++ /dev/null @@ -1,756 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * Hua Zhang mz24cn@hotmail.com - * The code implements C++ version charRNN for mxnet\example\rnn\char-rnn.ipynb with MXNet.cpp API. - * The generated params file is compatiable with python version. - * train() and predict() has been verified with original data samples. - * 2017/1/23: - * Add faster version charRNN based on built-in cuDNN RNN operator, 10 times faster. - * Add time major computation graph, although no substantial performance difference. - * Support continuing training from last params file. - * Rename params file epoch number starts from zero. - */ - -#if _MSC_VER -#pragma warning(disable: 4996) // VS2015 complains on 'std::copy' ... -#endif -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "mxnet-cpp/MxNetCpp.h" -#include "utils.h" - -using namespace mxnet::cpp; - -struct LSTMState { - Symbol C; - Symbol h; -}; - -struct LSTMParam { - Symbol i2h_weight; - Symbol i2h_bias; - Symbol h2h_weight; - Symbol h2h_bias; -}; - -bool TIME_MAJOR = true; - -// LSTM Cell symbol -LSTMState LSTM(int num_hidden, const Symbol& indata, const LSTMState& prev_state, - const LSTMParam& param, int seqidx, int layeridx, mx_float dropout = 0) { - auto input = dropout > 0? 
Dropout(indata, dropout) : indata; - auto prefix = std::string("t") + std::to_string(seqidx) + "_l" + std::to_string(layeridx); - auto i2h = FullyConnected(prefix + "_i2h", input, param.i2h_weight, param.i2h_bias, - num_hidden * 4); - auto h2h = FullyConnected(prefix + "_h2h", prev_state.h, param.h2h_weight, param.h2h_bias, - num_hidden * 4); - auto gates = i2h + h2h; - auto slice_gates = SliceChannel(prefix + "_slice", gates, 4); - auto in_gate = Activation(slice_gates[0], ActivationActType::kSigmoid); - auto in_transform = Activation(slice_gates[1], ActivationActType::kTanh); - auto forget_gate = Activation(slice_gates[2], ActivationActType::kSigmoid); - auto out_gate = Activation(slice_gates[3], ActivationActType::kSigmoid); - - LSTMState state; - state.C = (forget_gate * prev_state.C) + (in_gate * in_transform); - state.h = out_gate * Activation(state.C, ActivationActType::kTanh); - return state; -} - -Symbol LSTMUnroll(int num_lstm_layer, int sequence_length, int input_dim, - int num_hidden, int num_embed, mx_float dropout = 0) { - auto isTrain = sequence_length > 1; - auto data = Symbol::Variable("data"); - if (TIME_MAJOR && isTrain) - data = transpose(data); - auto embed_weight = Symbol::Variable("embed_weight"); - auto embed = Embedding("embed", data, embed_weight, input_dim, num_embed); - auto wordvec = isTrain? SliceChannel(embed, sequence_length, TIME_MAJOR? 0 : 1, true) : embed; - - std::vector last_states; - std::vector param_cells; - for (int l = 0; l < num_lstm_layer; l++) { - std::string layer = "l" + std::to_string(l); - LSTMParam param; - param.i2h_weight = Symbol::Variable(layer + "_i2h_weight"); - param.i2h_bias = Symbol::Variable(layer + "_i2h_bias"); - param.h2h_weight = Symbol::Variable(layer + "_h2h_weight"); - param.h2h_bias = Symbol::Variable(layer + "_h2h_bias"); - param_cells.push_back(param); - LSTMState state; - state.C = Symbol::Variable(layer + "_init_c"); - state.h = Symbol::Variable(layer + "_init_h"); - last_states.push_back(state); - } - - std::vector hidden_all; - for (int i = 0; i < sequence_length; i++) { - auto hidden = wordvec[i]; - for (int layer = 0; layer < num_lstm_layer; layer++) { - double dp_ratio = layer == 0? 0 : dropout; - auto next_state = LSTM(num_hidden, hidden, last_states[layer], param_cells[layer], - i, layer, dp_ratio); - hidden = next_state.h; - last_states[layer] = next_state; - } - if (dropout > 0) - hidden = Dropout(hidden, dropout); - hidden_all.push_back(hidden); - } - - auto hidden_concat = isTrain? Concat(hidden_all, hidden_all.size(), 0) : hidden_all[0]; - auto cls_weight = Symbol::Variable("cls_weight"); - auto cls_bias = Symbol::Variable("cls_bias"); - auto pred = FullyConnected("pred", hidden_concat, cls_weight, cls_bias, input_dim); - - auto label = Symbol::Variable("softmax_label"); - label = transpose(label); - label = Reshape(label, Shape(), false, Shape(0), false); // -1: infer from graph - auto sm = SoftmaxOutput("softmax", pred, label); - if (isTrain) - return sm; - - std::vector outputs = { sm }; - for (auto& state : last_states) { - outputs.push_back(state.C); - outputs.push_back(state.h); - } - return Symbol::Group(outputs); -} - -// Currently mxnet GPU version RNN operator is implemented via *fast* NVIDIA cuDNN. 
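// Illustrative sketch (not part of the original example; helper names are hypothetical): the plain
// arithmetic that the symbolic LSTM() above encodes for one cell step. `gates` holds the i2h + h2h
// pre-activations as four contiguous blocks of num_hidden values, in the order SliceChannel produces
// them: in_gate, in_transform, forget_gate, out_gate. Only <cmath> and <vector> are required.
#include <cmath>
#include <vector>

static inline float sigmoid_ref(float x) { return 1.0f / (1.0f + std::exp(-x)); }

static void LSTMCellReference(const std::vector<float>& gates, int num_hidden,
                              std::vector<float>* c, std::vector<float>* h) {
  for (int j = 0; j < num_hidden; ++j) {
    float i = sigmoid_ref(gates[0 * num_hidden + j]);  // in_gate
    float g = std::tanh(gates[1 * num_hidden + j]);    // in_transform (candidate)
    float f = sigmoid_ref(gates[2 * num_hidden + j]);  // forget_gate
    float o = sigmoid_ref(gates[3 * num_hidden + j]);  // out_gate
    (*c)[j] = f * (*c)[j] + i * g;                     // state.C = forget*prev.C + in*in_transform
    (*h)[j] = o * std::tanh((*c)[j]);                  // state.h = out*tanh(state.C)
  }
}
// The built-in RNN variant below fuses all of these per-step cell updates into a single cuDNN RNN call.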
-Symbol LSTMWithBuiltInRNNOp(int num_lstm_layer, int sequence_length, int input_dim, - int num_hidden, int num_embed, mx_float dropout = 0) { - auto isTrain = sequence_length > 1; - auto data = Symbol::Variable("data"); - if (TIME_MAJOR && isTrain) - data = transpose(data); - - auto embed_weight = Symbol::Variable("embed_weight"); - auto embed = Embedding("embed", data, embed_weight, input_dim, num_embed); - auto label = Symbol::Variable("softmax_label"); - label = transpose(label); - label = Reshape(label, Shape(), false, - Shape(0), false); // FullyConnected requires one dimension - if (!TIME_MAJOR && isTrain) - embed = SwapAxis(embed, 0, 1); // Change to time-major as cuDNN requires - - // We need not do the SwapAxis op as the Python version does; this is more direct and performs better in C++. - auto rnn_h_init = Symbol::Variable("LSTM_init_h"); - auto rnn_c_init = Symbol::Variable("LSTM_init_c"); - auto rnn_params = Symbol::Variable("LSTM_parameters"); // See explanations near RNNXavier class - auto variable_sequence_length = Symbol::Variable("sequence_length"); - auto rnn = RNN(embed, rnn_params, rnn_h_init, rnn_c_init, variable_sequence_length, num_hidden, - num_lstm_layer, RNNMode::kLstm, false, dropout, !isTrain); - auto hidden = Reshape(rnn[0], Shape(), false, Shape(0, num_hidden), false); - - auto cls_weight = Symbol::Variable("cls_weight"); - auto cls_bias = Symbol::Variable("cls_bias"); - auto pred = FullyConnected("pred", hidden, cls_weight, cls_bias, input_dim); - /*In rnn-time-major/rnn_cell_demo.py, the author claimed the time-major version is - * 1.5~2 times faster than the batch-major version. I doubt that conclusion. In my tests, the performance - * of the two versions is almost identical; in fact, there is no substantial difference between - * them. Both are based on time-major cuDNN, and the computation graphs differ only slightly - * in where the Reshape/SwapAxis/transpose operations are placed. Here I don't - * use Reshape on pred and keep the label shape in SoftmaxOutput like the time-major version, - * but Reshape the label instead, for simplicity. This has no noticeable effect on performance.
*/ - - auto sm = SoftmaxOutput("softmax", pred, label); - if (isTrain) - return sm; - else - return Symbol::Group({ sm, rnn[1/*RNNOpOutputs::kStateOut=1*/], - rnn[2/*RNNOpOutputs::kStateCellOut=2*/] }); -} - -class Shuffler { - std::vector sequence; - public: - explicit Shuffler(int size) : sequence(size) { - int* p = sequence.data(); - for (int i = 0; i < size; i++) - *p++ = i; - } - void shuffle(std::function lambda = nullptr) { - random_shuffle(sequence.begin(), sequence.end()); - int n = 0; - if (lambda != nullptr) - for (int i : sequence) - lambda(n++, i); - } - const int* data() { - return sequence.data(); - } -}; - -class BucketSentenceIter : public DataIter { - Shuffler* random; - int batch, current, end; - unsigned int sequence_length; - Context device; - std::vector> sequences; - std::vector index2chars; - std::unordered_map charIndices; - - public: - BucketSentenceIter(std::string filename, int minibatch, Context context) : batch(minibatch), - current(-1), device(context) { - auto content = readContent(filename); - buildCharIndex(content); - sequences = convertTextToSequences(content, '\n'); - - int N = sequences.size() / batch * batch; // total used samples - sequences.resize(N); - sort(sequences.begin(), sequences.end(), [](const std::vector& a, - const std::vector& b) { return a.size() < b.size(); }); - - sequence_length = sequences.back().size(); - random = new Shuffler(N); - // We still can get random results if call Reset() firstly -// std::vector>* target = &sequences; -// random->shuffle([target](int n, int i) { (*target)[n].swap((*target)[i]); }); - end = N / batch; - } - virtual ~BucketSentenceIter() { - delete random; - } - - unsigned int maxSequenceLength() { - return sequence_length; - } - - size_t characterSize() { - return charIndices.size(); - } - - virtual bool Next(void) { - return ++current < end; - } - virtual NDArray GetData(void) { - const int* indices = random->data(); - mx_float *data = new mx_float[sequence_length * batch], *pdata = data; - - for (int i = current * batch, end = i + batch; i < end; i++) { - memcpy(pdata, sequences[indices[i]].data(), sequences[indices[i]].size() * sizeof(mx_float)); - if (sequences[indices[i]].size() < sequence_length) - memset(pdata + sequences[indices[i]].size(), 0, - (sequence_length - sequences[indices[i]].size()) * sizeof(mx_float)); - pdata += sequence_length; - } - NDArray array(Shape(batch, sequence_length), device, false); - array.SyncCopyFromCPU(data, batch * sequence_length); - return array; - } - virtual NDArray GetLabel(void) { - const int* indices = random->data(); - mx_float *label = new mx_float[sequence_length * batch], *plabel = label; - - for (int i = current * batch, end = i + batch; i < end; i++) { - memcpy(plabel, sequences[indices[i]].data() + 1, - (sequences[indices[i]].size() - 1) * sizeof(mx_float)); - memset(plabel + sequences[indices[i]].size() - 1, 0, - (sequence_length - sequences[indices[i]].size() + 1) * sizeof(mx_float)); - plabel += sequence_length; - } - NDArray array(Shape(batch, sequence_length), device, false); - array.SyncCopyFromCPU(label, batch * sequence_length); - return array; - } - virtual int GetPadNum(void) { - return sequence_length - sequences[random->data()[current * batch]].size(); - } - virtual std::vector GetIndex(void) { - const int* indices = random->data(); - std::vector list(indices + current * batch, indices + current * batch + batch); - return list; - } - virtual void BeforeFirst(void) { - current = -1; - random->shuffle(nullptr); - } - - std::wstring 
readContent(const std::string file) { - std::wifstream ifs(file, std::ios::binary); - if (ifs) { - std::wostringstream os; - os << ifs.rdbuf(); - return os.str(); - } - return L""; - } - - void buildCharIndex(const std::wstring& content) { - // This version buildCharIndex() Compatiable with python version char_rnn dictionary - int n = 1; - charIndices['\0'] = 0; // padding character - index2chars.push_back(0); // padding character index - for (auto c : content) - if (charIndices.find(c) == charIndices.end()) { - charIndices[c] = n++; - index2chars.push_back(c); - } - } -// void buildCharIndex(wstring& content) { -// for (auto c : content) -// charIndices[c]++; // char-frequency map; then char-index map -// std::vector> characters; -// for (auto& iter : charIndices) -// characters.push_back(make_tuple(iter.first, iter.second)); -// sort(characters.begin(), characters.end(), [](const tuple& a, -// const tuple& b) { return get<1>(a) > get<1>(b); }); -// mx_float index = 1; //0 is left for zero-padding -// index2chars.clear(); -// index2chars.push_back(0); //zero-padding -// for (auto& t : characters) { -// charIndices[get<0>(t)] = index++; -// index2chars.push_back(get<0>(t)); -// }s -// } - - inline wchar_t character(int i) { - return index2chars[i]; - } - - inline mx_float index(wchar_t c) { - return charIndices[c]; - } - - void saveCharIndices(const std::string file) { - std::wofstream ofs(file, std::ios::binary); - if (ofs) { - ofs.write(index2chars.data() + 1, index2chars.size() - 1); - ofs.close(); - } - } - - static std::tuple, std::vector> loadCharIndices( - const std::string file) { - std::wifstream ifs(file, std::ios::binary); - std::unordered_map map; - std::vector chars; - if (ifs) { - std::wostringstream os; - os << ifs.rdbuf(); - int n = 1; - map[L'\0'] = 0; - chars.push_back(L'\0'); - for (auto c : os.str()) { - map[c] = (mx_float) n++; - chars.push_back(c); - } - } - // Note: Can't use {} because this would hit the explicit constructor - return std::tuple, std::vector>(map, chars); - } - - std::vector> - convertTextToSequences(const std::wstring& content, wchar_t spliter) { - std::vector> sequences; - sequences.push_back(std::vector()); - for (auto c : content) - if (c == spliter && !sequences.back().empty()) - sequences.push_back(std::vector()); - else - sequences.back().push_back(charIndices[c]); - return sequences; - } -}; - -void OutputPerplexity(NDArray* labels, NDArray* output) { - std::vector charIndices, a; - labels->SyncCopyToCPU(&charIndices, 0L); // 0L indicates all - output->SyncCopyToCPU(&a, 0L)/*4128*84*/; - mx_float loss = 0; - int batchSize = labels->GetShape()[0]/*32*/, sequenceLength = labels->GetShape()[1]/*129*/, - nSamples = output->GetShape()[0]/*4128*/, vocabSize = output->GetShape()[1]/*84*/; - for (int n = 0; n < nSamples; n++) { - int row = n % batchSize, column = n / batchSize, labelOffset = column + - row * sequenceLength; // Search based on column storage: labels.T - mx_float safe_value = std::max(1e-10f, a[vocabSize * n + - static_cast(charIndices[labelOffset])]); - loss += -log(safe_value); // Calculate negative log-likelihood - } - loss = exp(loss / nSamples); - std::cout << "Train-Perplexity=" << loss << std::endl; -} - -void SaveCheckpoint(const std::string filepath, Symbol net, Executor* exe) { - std::map params; - for (auto iter : exe->arg_dict()) - if (iter.first.find("_init_") == std::string::npos - && iter.first.rfind("data") != iter.first.length() - 4 - && iter.first.rfind("label") != iter.first.length() - 5) - params.insert({"arg:" + 
iter.first, iter.second}); - for (auto iter : exe->aux_dict()) - params.insert({"aux:" + iter.first, iter.second}); - NDArray::Save(filepath, params); -} - -void LoadCheckpoint(const std::string filepath, Executor* exe) { - std::map params = NDArray::LoadToMap(filepath); - for (auto iter : params) { - std::string type = iter.first.substr(0, 4); - std::string name = iter.first.substr(4); - NDArray target; - if (type == "arg:") - target = exe->arg_dict()[name]; - else if (type == "aux:") - target = exe->aux_dict()[name]; - else - continue; - iter.second.CopyTo(&target); - } -} - -int input_dim = 0;/*84*/ -int sequence_length_max = 0;/*129*/ -int num_embed = 256; -int num_lstm_layer = 3; -int num_hidden = 512; -mx_float dropout = 0.2; -void train(const std::string file, int batch_size, int max_epoch, int start_epoch) { - Context device(DeviceType::kGPU, 0); - BucketSentenceIter dataIter(file, batch_size, device); - std::string prefix = file.substr(0, file.rfind(".")); - dataIter.saveCharIndices(prefix + ".dictionary"); - - input_dim = static_cast(dataIter.characterSize()); - sequence_length_max = dataIter.maxSequenceLength(); - - auto RNN = LSTMUnroll(num_lstm_layer, sequence_length_max, input_dim, num_hidden, - num_embed, dropout); - std::map args_map; - args_map["data"] = NDArray(Shape(batch_size, sequence_length_max), device, false); - args_map["softmax_label"] = NDArray(Shape(batch_size, sequence_length_max), device, false); - for (int i = 0; i < num_lstm_layer; i++) { - std::string key = "l" + std::to_string(i) + "_init_"; - args_map[key + "c"] = NDArray(Shape(batch_size, num_hidden), device, false); - args_map[key + "h"] = NDArray(Shape(batch_size, num_hidden), device, false); - } - std::vector zeros(batch_size * num_hidden, 0); - // RNN.SimpleBind(device, args_map, {}, {{"data", kNullOp}}); - Executor* exe = RNN.SimpleBind(device, args_map); - - if (start_epoch == -1) { - Xavier xavier = Xavier(Xavier::gaussian, Xavier::in, 2.34); - for (auto &arg : exe->arg_dict()) - xavier(arg.first, &arg.second); - } else { - LoadCheckpoint(prefix + "-" + std::to_string(start_epoch) + ".params", exe); - } - start_epoch++; - - mx_float learning_rate = 0.0002; - mx_float weight_decay = 0.000002; - Optimizer* opt = OptimizerRegistry::Find("sgd"); - opt->SetParam("lr", learning_rate) - ->SetParam("wd", weight_decay); -// opt->SetParam("momentum", 0.9)->SetParam("rescale_grad", 1.0 / batch_size) -// ->SetParam("clip_gradient", 10); - - for (int epoch = start_epoch; epoch < max_epoch; ++epoch) { - dataIter.Reset(); - auto tic = std::chrono::system_clock::now(); - while (dataIter.Next()) { - auto data_batch = dataIter.GetDataBatch(); - data_batch.data.CopyTo(&exe->arg_dict()["data"]); - data_batch.label.CopyTo(&exe->arg_dict()["softmax_label"]); - for (int l = 0; l < num_lstm_layer; l++) { - std::string key = "l" + std::to_string(l) + "_init_"; - exe->arg_dict()[key + "c"].SyncCopyFromCPU(zeros); - exe->arg_dict()[key + "h"].SyncCopyFromCPU(zeros); - } - NDArray::WaitAll(); - - exe->Forward(true); - exe->Backward(); - for (size_t i = 0; i < exe->arg_arrays.size(); ++i) { - opt->Update(i, exe->arg_arrays[i], exe->grad_arrays[i]); - } - - NDArray::WaitAll(); - } - auto toc = std::chrono::system_clock::now(); - std::cout << "Epoch[" << epoch << "] Time Cost:" << - std::chrono::duration_cast< std::chrono::seconds>(toc - tic).count() << " seconds "; - OutputPerplexity(&exe->arg_dict()["softmax_label"], &exe->outputs[0]); - std::string filepath = prefix + "-" + std::to_string(epoch) + ".params"; - 
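/* SaveCheckpoint() (defined above) skips the *_init_* state arrays and the data/softmax_label inputs,
 * then stores every remaining argument under an "arg:" prefix (e.g. "arg:embed_weight", "arg:cls_weight")
 * and every auxiliary array under "aux:", which is how LoadCheckpoint() routes each entry back to
 * arg_dict() or aux_dict() by name. */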
SaveCheckpoint(filepath, RNN, exe); - } - - delete exe; - delete opt; -} - -/*The original example, rnn_cell_demo.py, uses default Xavier as initalizer, which relies on - * variable name, cannot initialize LSTM_parameters. Thus it was renamed to LSTM_bias, - * which can be initialized as zero. But it cannot converge after 100 epochs in this corpus - * example. Using RNNXavier, after 15 oscillating epochs, it rapidly converges like old - * LSTMUnroll version. */ -class RNNXavier : public Xavier { - public: - RNNXavier(RandType rand_type = gaussian, FactorType factor_type = avg, - float magnitude = 3) : Xavier(rand_type, factor_type, magnitude) { - } - virtual ~RNNXavier() {} - protected: - virtual void InitDefault(NDArray* arr) { - Xavier::InitWeight(arr); - } -}; - -void trainWithBuiltInRNNOp(const std::string file, int batch_size, int max_epoch, int start_epoch) { - Context device(DeviceType::kGPU, 0); - BucketSentenceIter dataIter(file, batch_size, device); - std::string prefix = file.substr(0, file.rfind(".")); - dataIter.saveCharIndices(prefix + ".dictionary"); - - input_dim = static_cast(dataIter.characterSize()); - sequence_length_max = dataIter.maxSequenceLength(); - - auto RNN = LSTMWithBuiltInRNNOp(num_lstm_layer, sequence_length_max, input_dim, num_hidden, - num_embed, dropout); - std::map args_map; - args_map["data"] = NDArray(Shape(batch_size, sequence_length_max), device, false); - // Avoiding SwapAxis, batch_size is of second dimension. - args_map["LSTM_init_c"] = NDArray(Shape(num_lstm_layer, batch_size, num_hidden), device, false); - args_map["LSTM_init_h"] = NDArray(Shape(num_lstm_layer, batch_size, num_hidden), device, false); - args_map["softmax_label"] = NDArray(Shape(batch_size, sequence_length_max), device, false); - std::vector zeros(batch_size * num_lstm_layer * num_hidden, 0); - Executor* exe = RNN.SimpleBind(device, args_map); - - if (start_epoch == -1) { - RNNXavier xavier = RNNXavier(Xavier::gaussian, Xavier::in, 2.34); - for (auto &arg : exe->arg_dict()) - xavier(arg.first, &arg.second); - } else { - LoadCheckpoint(prefix + "-" + std::to_string(start_epoch) + ".params", exe); - } - start_epoch++; - - Optimizer* opt = OptimizerRegistry::Find("sgd"); -// opt->SetParam("momentum", 0.9)->SetParam("rescale_grad", 1.0 / batch_size) -// ->SetParam("clip_gradient", 10); - - for (int epoch = start_epoch; epoch < max_epoch; ++epoch) { - dataIter.Reset(); - auto tic = std::chrono::system_clock::now(); - while (dataIter.Next()) { - auto data_batch = dataIter.GetDataBatch(); - data_batch.data.CopyTo(&exe->arg_dict()["data"]); - data_batch.label.CopyTo(&exe->arg_dict()["softmax_label"]); - exe->arg_dict()["LSTM_init_c"].SyncCopyFromCPU(zeros); - exe->arg_dict()["LSTM_init_h"].SyncCopyFromCPU(zeros); - NDArray::WaitAll(); - - exe->Forward(true); - exe->Backward(); - for (size_t i = 0; i < exe->arg_arrays.size(); ++i) { - opt->Update(i, exe->arg_arrays[i], exe->grad_arrays[i]); - } - NDArray::WaitAll(); - } - auto toc = std::chrono::system_clock::now(); - std::cout << "Epoch[" << epoch << "] Time Cost:" << - std::chrono::duration_cast< std::chrono::seconds>(toc - tic).count() << " seconds "; - OutputPerplexity(&exe->arg_dict()["softmax_label"], &exe->outputs[0]); - std::string filepath = prefix + "-" + std::to_string(epoch) + ".params"; - SaveCheckpoint(filepath, RNN, exe); - } - - delete exe; - delete opt; -} - -void predict(std::wstring* ptext, int sequence_length, const std::string param_file, - const std::string dictionary_file) { - Context device(DeviceType::kGPU, 0); - 
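/* predict() builds a single-step (sequence_length == 1) unrolled graph, restores the saved dictionary
 * and parameters, then primes the LSTM state by feeding the seed text one character at a time; after
 * that it greedily appends the arg-max character for the next `sequence_length` steps, copying each
 * step's output states back into the per-layer l<i>_init_c/h arrays. */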
auto results = BucketSentenceIter::loadCharIndices(dictionary_file); - auto dictionary = std::get<0>(results); - auto charIndices = std::get<1>(results); - input_dim = static_cast(charIndices.size()); - auto RNN = LSTMUnroll(num_lstm_layer, 1, input_dim, num_hidden, num_embed, 0); - - std::map args_map; - args_map["data"] = NDArray(Shape(1, 1), device, false); - args_map["softmax_label"] = NDArray(Shape(1, 1), device, false); - std::vector zeros(1 * num_hidden, 0); - for (int l = 0; l < num_lstm_layer; l++) { - std::string key = "l" + std::to_string(l) + "_init_"; - args_map[key + "c"] = NDArray(Shape(1, num_hidden), device, false); - args_map[key + "h"] = NDArray(Shape(1, num_hidden), device, false); - args_map[key + "c"].SyncCopyFromCPU(zeros); - args_map[key + "h"].SyncCopyFromCPU(zeros); - } - Executor* exe = RNN.SimpleBind(device, args_map); - LoadCheckpoint(param_file, exe); - - mx_float index; - wchar_t next = 0; - std::vector softmax; - softmax.resize(input_dim); - for (auto c : *ptext) { - exe->arg_dict()["data"].SyncCopyFromCPU(&dictionary[c], 1); - exe->Forward(false); - - exe->outputs[0].SyncCopyToCPU(softmax.data(), input_dim); - for (int l = 0; l < num_lstm_layer; l++) { - std::string key = "l" + std::to_string(l) + "_init_"; - exe->outputs[l * 2 + 1].CopyTo(&args_map[key + "c"]); - exe->outputs[l * 2 + 2].CopyTo(&args_map[key + "h"]); - } - - size_t n = max_element(softmax.begin(), softmax.end()) - softmax.begin(); - index = (mx_float) n; - next = charIndices[n]; - } - ptext->push_back(next); - - for (int i = 0; i < sequence_length; i++) { - exe->arg_dict()["data"].SyncCopyFromCPU(&index, 1); - exe->Forward(false); - - exe->outputs[0].SyncCopyToCPU(softmax.data(), input_dim); - for (int l = 0; l < num_lstm_layer; l++) { - std::string key = "l" + std::to_string(l) + "_init_"; - exe->outputs[l * 2 + 1].CopyTo(&args_map[key + "c"]); - exe->outputs[l * 2 + 2].CopyTo(&args_map[key + "h"]); - } - - size_t n = max_element(softmax.begin(), softmax.end()) - softmax.begin(); - index = (mx_float) n; - next = charIndices[n]; - ptext->push_back(next); - } - - delete exe; -} - -void predictWithBuiltInRNNOp(std::wstring* ptext, int sequence_length, const std::string param_file, - const std::string dictionary_file) { - Context device(DeviceType::kGPU, 0); - auto results = BucketSentenceIter::loadCharIndices(dictionary_file); - auto dictionary = std::get<0>(results); - auto charIndices = std::get<1>(results); - input_dim = static_cast(charIndices.size()); - auto RNN = LSTMWithBuiltInRNNOp(num_lstm_layer, 1, input_dim, num_hidden, num_embed, 0); - - std::map args_map; - args_map["data"] = NDArray(Shape(1, 1), device, false); - args_map["softmax_label"] = NDArray(Shape(1, 1), device, false); - std::vector zeros(1 * num_lstm_layer * num_hidden, 0); - // Avoiding SwapAxis, batch_size=1 is of second dimension. 
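/* With the fused RNN operator each recurrent state is a single (num_lstm_layer, 1, num_hidden) tensor
 * rather than the per-layer l<i>_init_c/h pairs used by predict(); the loops below therefore copy
 * outputs[1] (the hidden state) and outputs[2] (the cell state) back into LSTM_init_h and LSTM_init_c
 * after every forward step. */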
- args_map["LSTM_init_c"] = NDArray(Shape(num_lstm_layer, 1, num_hidden), device, false); - args_map["LSTM_init_h"] = NDArray(Shape(num_lstm_layer, 1, num_hidden), device, false); - args_map["LSTM_init_c"].SyncCopyFromCPU(zeros); - args_map["LSTM_init_h"].SyncCopyFromCPU(zeros); - Executor* exe = RNN.SimpleBind(device, args_map); - LoadCheckpoint(param_file, exe); - - mx_float index; - wchar_t next = 0; - std::vector softmax; - softmax.resize(input_dim); - for (auto c : *ptext) { - exe->arg_dict()["data"].SyncCopyFromCPU(&dictionary[c], 1); - exe->Forward(false); - - exe->outputs[0].SyncCopyToCPU(softmax.data(), input_dim); - exe->outputs[1].CopyTo(&args_map["LSTM_init_h"]); - exe->outputs[2].CopyTo(&args_map["LSTM_init_c"]); - - size_t n = max_element(softmax.begin(), softmax.end()) - softmax.begin(); - index = (mx_float) n; - next = charIndices[n]; - } - ptext->push_back(next); - - for (int i = 0; i < sequence_length; i++) { - exe->arg_dict()["data"].SyncCopyFromCPU(&index, 1); - exe->Forward(false); - - exe->outputs[0].SyncCopyToCPU(softmax.data(), input_dim); - exe->outputs[1].CopyTo(&args_map["LSTM_init_h"]); - exe->outputs[2].CopyTo(&args_map["LSTM_init_c"]); - - size_t n = max_element(softmax.begin(), softmax.end()) - softmax.begin(); - index = (mx_float) n; - next = charIndices[n]; - ptext->push_back(next); - } - - delete exe; -} - -int main(int argc, char** argv) { - if (argc < 5) { - std::cout << "Usage for training: charRNN train[BuiltIn][TimeMajor] {corpus file}" - " {batch size} {max epoch} [{starting epoch}]" << std::endl; - std::cout <<"Usage for prediction: charRNN predict[BuiltIn][TimeMajor] {params file}" - " {dictionary file} {beginning of text}" << std::endl; - std::cout <<"Note: The {params file} of train/trainBuiltIn/trainTimeMajor/trainBuiltInTimeMajor" - " are not compatible with each other." << std::endl; - return 0; - } - - std::string task = argv[1]; - bool builtIn = task.find("BuiltIn") != std::string::npos; - TIME_MAJOR = task.find("TimeMajor") != std::string::npos; - std::cout << "use BuiltIn cuDNN RNN: " << builtIn << std::endl - << "use data as TimeMajor: " << TIME_MAJOR << std::endl; - TRY - if (task.find("train") == 0) { - std::cout << "train batch size: " << argv[3] << std::endl - << "train max epoch: " << argv[4] << std::endl; - int start_epoch = argc > 5? atoi(argv[5]) : -1; - // this function will generate dictionary file and params file. - if (builtIn) - trainWithBuiltInRNNOp(argv[2], atoi(argv[3]), atoi(argv[4]), start_epoch); - else - train(argv[2], atoi(argv[3]), atoi(argv[4]), start_epoch); // ditto - } else if (task.find("predict") == 0) { - std::wstring text; // = L"If there is anyone out there who still doubts "; - // Considering of extending to Chinese samples in future, use wchar_t instead of char - for (char c : std::string(argv[4])) - text.push_back((wchar_t) c); - /*Python version predicts text default to random selecltions. Here I didn't write the random - code, always choose the 'best' character. So the text length reduced to 600. 
Longer size often - leads to repeated sentances, since training sequence length is only 129 for obama corpus.*/ - if (builtIn) - predictWithBuiltInRNNOp(&text, 600, argv[2], argv[3]); - else - predict(&text, 600, argv[2], argv[3]); - std::wcout << text << std::endl; - } - - MXNotifyShutdown(); - CATCH - return 0; -} diff --git a/cpp-package/example/googlenet.cpp b/cpp-package/example/googlenet.cpp deleted file mode 100644 index 7b51f4fde3a7..000000000000 --- a/cpp-package/example/googlenet.cpp +++ /dev/null @@ -1,198 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - */ -#include -#include -#include -#include -#include "utils.h" -#include "mxnet-cpp/MxNetCpp.h" - -using namespace mxnet::cpp; - -Symbol ConvFactory(Symbol data, int num_filter, - Shape kernel, - Shape stride = Shape(1, 1), - Shape pad = Shape(0, 0), - const std::string & name = "", - const std::string & suffix = "") { - Symbol conv_w("conv_" + name + suffix + "_w"), conv_b("conv_" + name + suffix + "_b"); - - Symbol conv = Convolution("conv_" + name + suffix, data, - conv_w, conv_b, kernel, - num_filter, stride, Shape(1, 1), pad); - return Activation("relu_" + name + suffix, conv, "relu"); -} - -Symbol InceptionFactory(Symbol data, int num_1x1, int num_3x3red, - int num_3x3, int num_d5x5red, int num_d5x5, - PoolingPoolType pool, int proj, const std::string & name) { - Symbol c1x1 = ConvFactory(data, num_1x1, Shape(1, 1), - Shape(1, 1), Shape(0, 0), name + "_1x1"); - - Symbol c3x3r = ConvFactory(data, num_3x3red, Shape(1, 1), - Shape(1, 1), Shape(0, 0), name + "_3x3", "_reduce"); - - Symbol c3x3 = ConvFactory(c3x3r, num_3x3, Shape(3, 3), - Shape(1, 1), Shape(1, 1), name + "_3x3"); - - Symbol cd5x5r = ConvFactory(data, num_d5x5red, Shape(1, 1), - Shape(1, 1), Shape(0, 0), name + "_5x5", "_reduce"); - - Symbol cd5x5 = ConvFactory(cd5x5r, num_d5x5, Shape(5, 5), - Shape(1, 1), Shape(2, 2), name + "_5x5"); - - Symbol pooling = Pooling(name + "_pool", data, Shape(3, 3), pool, - false, false, PoolingPoolingConvention::kValid, - Shape(1, 1), Shape(1, 1)); - - Symbol cproj = ConvFactory(pooling, proj, Shape(1, 1), - Shape(1, 1), Shape(0, 0), name + "_proj"); - - std::vector lst; - lst.push_back(c1x1); - lst.push_back(c3x3); - lst.push_back(cd5x5); - lst.push_back(cproj); - return Concat("ch_concat_" + name + "_chconcat", lst, lst.size()); -} - -Symbol GoogleNetSymbol(int num_classes) { - // data and label - Symbol data = Symbol::Variable("data"); - Symbol data_label = Symbol::Variable("data_label"); - - Symbol conv1 = ConvFactory(data, 64, Shape(7, 7), Shape(2, 2), Shape(3, 3), "conv1"); - Symbol pool1 = Pooling("pool1", conv1, Shape(3, 3), PoolingPoolType::kMax, - false, false, PoolingPoolingConvention::kValid, Shape(2, 2)); - Symbol conv2 = ConvFactory(pool1, 64, 
Shape(1, 1), Shape(1, 1), - Shape(0, 0), "conv2"); - Symbol conv3 = ConvFactory(conv2, 192, Shape(3, 3), Shape(1, 1), Shape(1, 1), "conv3"); - Symbol pool3 = Pooling("pool3", conv3, Shape(3, 3), PoolingPoolType::kMax, - false, false, PoolingPoolingConvention::kValid, Shape(2, 2)); - - Symbol in3a = InceptionFactory(pool3, 64, 96, 128, 16, 32, PoolingPoolType::kMax, 32, "in3a"); - Symbol in3b = InceptionFactory(in3a, 128, 128, 192, 32, 96, PoolingPoolType::kMax, 64, "in3b"); - Symbol pool4 = Pooling("pool4", in3b, Shape(3, 3), PoolingPoolType::kMax, - false, false, PoolingPoolingConvention::kValid, Shape(2, 2)); - Symbol in4a = InceptionFactory(pool4, 192, 96, 208, 16, 48, PoolingPoolType::kMax, 64, "in4a"); - Symbol in4b = InceptionFactory(in4a, 160, 112, 224, 24, 64, PoolingPoolType::kMax, 64, "in4b"); - Symbol in4c = InceptionFactory(in4b, 128, 128, 256, 24, 64, PoolingPoolType::kMax, 64, "in4c"); - Symbol in4d = InceptionFactory(in4c, 112, 144, 288, 32, 64, PoolingPoolType::kMax, 64, "in4d"); - Symbol in4e = InceptionFactory(in4d, 256, 160, 320, 32, 128, PoolingPoolType::kMax, 128, "in4e"); - Symbol pool5 = Pooling("pool5", in4e, Shape(3, 3), PoolingPoolType::kMax, - false, false, PoolingPoolingConvention::kValid, Shape(2, 2)); - Symbol in5a = InceptionFactory(pool5, 256, 160, 320, 32, 128, PoolingPoolType::kMax, 128, "in5a"); - Symbol in5b = InceptionFactory(in5a, 384, 192, 384, 48, 128, PoolingPoolType::kMax, 128, "in5b"); - Symbol pool6 = Pooling("pool6", in5b, Shape(7, 7), PoolingPoolType::kAvg, - false, false, PoolingPoolingConvention::kValid, Shape(1, 1)); - - Symbol flatten = Flatten("flatten", pool6); - - Symbol fc1_w("fc1_w"), fc1_b("fc1_b"); - Symbol fc1 = FullyConnected("fc1", flatten, fc1_w, fc1_b, num_classes); - - return SoftmaxOutput("softmax", fc1, data_label); -} - -int main(int argc, char const *argv[]) { - int batch_size = 50; - int max_epoch = argc > 1 ? 
strtol(argv[1], nullptr, 10) : 100; - float learning_rate = 1e-4; - float weight_decay = 1e-4; - - auto ctx = Context::gpu(); -#if MXNET_USE_CPU - ctx = Context::cpu();; -#endif - - TRY - auto googlenet = GoogleNetSymbol(10); - std::map args_map; - std::map aux_map; - - args_map["data"] = NDArray(Shape(batch_size, 3, 256, 256), ctx); - args_map["data_label"] = NDArray(Shape(batch_size), ctx); - googlenet.InferArgsMap(ctx, &args_map, args_map); - - std::vector data_files = { "./data/mnist_data/train-images-idx3-ubyte", - "./data/mnist_data/train-labels-idx1-ubyte", - "./data/mnist_data/t10k-images-idx3-ubyte", - "./data/mnist_data/t10k-labels-idx1-ubyte" - }; - - auto train_iter = MXDataIter("MNISTIter"); - if (!setDataIter(&train_iter, "Train", data_files, batch_size)) { - return 1; - } - - auto val_iter = MXDataIter("MNISTIter"); - if (!setDataIter(&val_iter, "Label", data_files, batch_size)) { - return 1; - } - - Optimizer* opt = OptimizerRegistry::Find("sgd"); - opt->SetParam("momentum", 0.9) - ->SetParam("rescale_grad", 1.0 / batch_size) - ->SetParam("clip_gradient", 10) - ->SetParam("lr", learning_rate) - ->SetParam("wd", weight_decay); - - - auto *exec = googlenet.SimpleBind(ctx, args_map); - auto arg_names = googlenet.ListArguments(); - - for (int iter = 0; iter < max_epoch; ++iter) { - LG << "Epoch: " << iter; - train_iter.Reset(); - while (train_iter.Next()) { - auto data_batch = train_iter.GetDataBatch(); - data_batch.data.CopyTo(&args_map["data"]); - data_batch.label.CopyTo(&args_map["data_label"]); - NDArray::WaitAll(); - exec->Forward(true); - exec->Backward(); - for (size_t i = 0; i < arg_names.size(); ++i) { - if (arg_names[i] == "data" || arg_names[i] == "data_label") continue; - opt->Update(i, exec->arg_arrays[i], exec->grad_arrays[i]); - } - } - - Accuracy acu; - val_iter.Reset(); - while (val_iter.Next()) { - auto data_batch = val_iter.GetDataBatch(); - data_batch.data.CopyTo(&args_map["data"]); - data_batch.label.CopyTo(&args_map["data_label"]); - NDArray::WaitAll(); - exec->Forward(false); - NDArray::WaitAll(); - acu.Update(data_batch.label, exec->outputs[0]); - } - LG << "Accuracy: " << acu.Get(); - } - - delete exec; - delete opt; - MXNotifyShutdown(); - CATCH - return 0; -} diff --git a/cpp-package/example/inception_bn.cpp b/cpp-package/example/inception_bn.cpp deleted file mode 100644 index 8fe6b070497c..000000000000 --- a/cpp-package/example/inception_bn.cpp +++ /dev/null @@ -1,261 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
- */ -#include -#include -#include -#include -#include "utils.h" -#include "mxnet-cpp/MxNetCpp.h" - -using namespace mxnet::cpp; - -Symbol ConvFactoryBN(Symbol data, int num_filter, - Shape kernel, Shape stride, Shape pad, - const std::string & name, - const std::string & suffix = "") { - Symbol conv_w("conv_" + name + suffix + "_w"), conv_b("conv_" + name + suffix + "_b"); - - Symbol conv = Convolution("conv_" + name + suffix, data, - conv_w, conv_b, kernel, - num_filter, stride, Shape(1, 1), pad); - std::string name_suffix = name + suffix; - Symbol gamma(name_suffix + "_gamma"); - Symbol beta(name_suffix + "_beta"); - Symbol mmean(name_suffix + "_mmean"); - Symbol mvar(name_suffix + "_mvar"); - Symbol bn = BatchNorm("bn_" + name + suffix, conv, gamma, beta, mmean, mvar); - return Activation("relu_" + name + suffix, bn, "relu"); -} - -Symbol InceptionFactoryA(Symbol data, int num_1x1, int num_3x3red, - int num_3x3, int num_d3x3red, int num_d3x3, - PoolingPoolType pool, int proj, - const std::string & name) { - Symbol c1x1 = ConvFactoryBN(data, num_1x1, Shape(1, 1), Shape(1, 1), - Shape(0, 0), name + "1x1"); - Symbol c3x3r = ConvFactoryBN(data, num_3x3red, Shape(1, 1), Shape(1, 1), - Shape(0, 0), name + "_3x3r"); - Symbol c3x3 = ConvFactoryBN(c3x3r, num_3x3, Shape(3, 3), Shape(1, 1), - Shape(1, 1), name + "_3x3"); - Symbol cd3x3r = ConvFactoryBN(data, num_d3x3red, Shape(1, 1), Shape(1, 1), - Shape(0, 0), name + "_double_3x3", "_reduce"); - Symbol cd3x3 = ConvFactoryBN(cd3x3r, num_d3x3, Shape(3, 3), Shape(1, 1), - Shape(1, 1), name + "_double_3x3_0"); - cd3x3 = ConvFactoryBN(data = cd3x3, num_d3x3, Shape(3, 3), Shape(1, 1), - Shape(1, 1), name + "_double_3x3_1"); - Symbol pooling = Pooling(name + "_pool", data, - Shape(3, 3), pool, false, false, - PoolingPoolingConvention::kValid, - Shape(1, 1), Shape(1, 1)); - Symbol cproj = ConvFactoryBN(pooling, proj, Shape(1, 1), Shape(1, 1), - Shape(0, 0), name + "_proj"); - std::vector lst; - lst.push_back(c1x1); - lst.push_back(c3x3); - lst.push_back(cd3x3); - lst.push_back(cproj); - return Concat("ch_concat_" + name + "_chconcat", lst, lst.size()); -} - -Symbol InceptionFactoryB(Symbol data, int num_3x3red, int num_3x3, - int num_d3x3red, int num_d3x3, const std::string & name) { - Symbol c3x3r = ConvFactoryBN(data, num_3x3red, Shape(1, 1), - Shape(1, 1), Shape(0, 0), - name + "_3x3", "_reduce"); - Symbol c3x3 = ConvFactoryBN(c3x3r, num_3x3, Shape(3, 3), Shape(2, 2), - Shape(1, 1), name + "_3x3"); - Symbol cd3x3r = ConvFactoryBN(data, num_d3x3red, Shape(1, 1), Shape(1, 1), - Shape(0, 0), name + "_double_3x3", "_reduce"); - Symbol cd3x3 = ConvFactoryBN(cd3x3r, num_d3x3, Shape(3, 3), Shape(1, 1), - Shape(1, 1), name + "_double_3x3_0"); - cd3x3 = ConvFactoryBN(cd3x3, num_d3x3, Shape(3, 3), Shape(2, 2), - Shape(1, 1), name + "_double_3x3_1"); - Symbol pooling = Pooling("max_pool_" + name + "_pool", data, - Shape(3, 3), PoolingPoolType::kMax, - false, false, PoolingPoolingConvention::kValid, - Shape(2, 2), Shape(1, 1)); - std::vector lst; - lst.push_back(c3x3); - lst.push_back(cd3x3); - lst.push_back(pooling); - return Concat("ch_concat_" + name + "_chconcat", lst, lst.size()); -} - -Symbol InceptionSymbol(int num_classes) { - // data and label - Symbol data = Symbol::Variable("data"); - Symbol data_label = Symbol::Variable("data_label"); - - // stage 1 - Symbol conv1 = ConvFactoryBN(data, 64, Shape(7, 7), Shape(2, 2), Shape(3, 3), "conv1"); - Symbol pool1 = Pooling("pool1", conv1, Shape(3, 3), PoolingPoolType::kMax, - false, false, 
PoolingPoolingConvention::kValid, Shape(2, 2)); - - // stage 2 - Symbol conv2red = ConvFactoryBN(pool1, 64, Shape(1, 1), Shape(1, 1), Shape(0, 0), "conv2red"); - Symbol conv2 = ConvFactoryBN(conv2red, 192, Shape(3, 3), Shape(1, 1), Shape(1, 1), "conv2"); - Symbol pool2 = Pooling("pool2", conv2, Shape(3, 3), PoolingPoolType::kMax, - false, false, PoolingPoolingConvention::kValid, Shape(2, 2)); - - // stage 3 - Symbol in3a = InceptionFactoryA(pool2, 64, 64, 64, 64, 96, PoolingPoolType::kAvg, 32, "3a"); - Symbol in3b = InceptionFactoryA(in3a, 64, 64, 96, 64, 96, PoolingPoolType::kAvg, 64, "3b"); - Symbol in3c = InceptionFactoryB(in3b, 128, 160, 64, 96, "3c"); - - // stage 4 - Symbol in4a = InceptionFactoryA(in3c, 224, 64, 96, 96, 128, PoolingPoolType::kAvg, 128, "4a"); - Symbol in4b = InceptionFactoryA(in4a, 192, 96, 128, 96, 128, PoolingPoolType::kAvg, 128, "4b"); - Symbol in4c = InceptionFactoryA(in4b, 160, 128, 160, 128, 160, PoolingPoolType::kAvg, 128, "4c"); - Symbol in4d = InceptionFactoryA(in4c, 96, 128, 192, 160, 192, PoolingPoolType::kAvg, 128, "4d"); - Symbol in4e = InceptionFactoryB(in4d, 128, 192, 192, 256, "4e"); - - // stage 5 - Symbol in5a = InceptionFactoryA(in4e, 352, 192, 320, 160, 224, PoolingPoolType::kAvg, 128, "5a"); - Symbol in5b = InceptionFactoryA(in5a, 352, 192, 320, 192, 224, PoolingPoolType::kMax, 128, "5b"); - - // average pooling - Symbol avg = Pooling("global_pool", in5b, Shape(7, 7), PoolingPoolType::kAvg); - - // classifier - Symbol flatten = Flatten("flatten", avg); - Symbol conv1_w("conv1_w"), conv1_b("conv1_b"); - Symbol fc1 = FullyConnected("fc1", flatten, conv1_w, conv1_b, num_classes); - return SoftmaxOutput("softmax", fc1, data_label); -} - -NDArray ResizeInput(NDArray data, const Shape new_shape) { - NDArray pic = data.Reshape(Shape(0, 1, 28, 28)); - NDArray pic_1channel; - Operator("_contrib_BilinearResize2D") - .SetParam("height", new_shape[2]) - .SetParam("width", new_shape[3]) - (pic).Invoke(pic_1channel); - NDArray output; - Operator("tile") - .SetParam("reps", Shape(1, 3, 1, 1)) - (pic_1channel).Invoke(output); - return output; -} - -int main(int argc, char const *argv[]) { - int batch_size = 40; - int max_epoch = argc > 1 ? 
strtol(argv[1], nullptr, 10) : 100; - float learning_rate = 1e-2; - float weight_decay = 1e-4; - - /*context*/ - auto ctx = Context::cpu(); - int num_gpu; - MXGetGPUCount(&num_gpu); -#if !MXNET_USE_CPU - if (num_gpu > 0) { - ctx = Context::gpu(); - } -#endif - - TRY - auto inception_bn_net = InceptionSymbol(10); - std::map args_map; - std::map aux_map; - - const Shape data_shape = Shape(batch_size, 3, 224, 224), - label_shape = Shape(batch_size); - args_map["data"] = NDArray(data_shape, ctx); - args_map["data_label"] = NDArray(label_shape, ctx); - inception_bn_net.InferArgsMap(ctx, &args_map, args_map); - - std::vector data_files = { "./data/mnist_data/train-images-idx3-ubyte", - "./data/mnist_data/train-labels-idx1-ubyte", - "./data/mnist_data/t10k-images-idx3-ubyte", - "./data/mnist_data/t10k-labels-idx1-ubyte" - }; - - auto train_iter = MXDataIter("MNISTIter"); - if (!setDataIter(&train_iter, "Train", data_files, batch_size)) { - return 1; - } - - auto val_iter = MXDataIter("MNISTIter"); - if (!setDataIter(&val_iter, "Label", data_files, batch_size)) { - return 1; - } - - // initialize parameters - Xavier xavier = Xavier(Xavier::gaussian, Xavier::in, 2); - for (auto &arg : args_map) { - xavier(arg.first, &arg.second); - } - - Optimizer* opt = OptimizerRegistry::Find("sgd"); - opt->SetParam("momentum", 0.9) - ->SetParam("rescale_grad", 1.0 / batch_size) - ->SetParam("clip_gradient", 10) - ->SetParam("lr", learning_rate) - ->SetParam("wd", weight_decay); - - auto *exec = inception_bn_net.SimpleBind(ctx, args_map); - auto arg_names = inception_bn_net.ListArguments(); - - // Create metrics - Accuracy train_acc, val_acc; - for (int iter = 0; iter < max_epoch; ++iter) { - LG << "Epoch: " << iter; - train_iter.Reset(); - train_acc.Reset(); - while (train_iter.Next()) { - auto data_batch = train_iter.GetDataBatch(); - ResizeInput(data_batch.data, data_shape).CopyTo(&args_map["data"]); - data_batch.label.CopyTo(&args_map["data_label"]); - NDArray::WaitAll(); - - exec->Forward(true); - exec->Backward(); - // Update parameters - for (size_t i = 0; i < arg_names.size(); ++i) { - if (arg_names[i] == "data" || arg_names[i] == "data_label") continue; - opt->Update(i, exec->arg_arrays[i], exec->grad_arrays[i]); - } - - NDArray::WaitAll(); - train_acc.Update(data_batch.label, exec->outputs[0]); - } - - val_iter.Reset(); - val_acc.Reset(); - while (val_iter.Next()) { - auto data_batch = val_iter.GetDataBatch(); - ResizeInput(data_batch.data, data_shape).CopyTo(&args_map["data"]); - data_batch.label.CopyTo(&args_map["data_label"]); - NDArray::WaitAll(); - exec->Forward(false); - NDArray::WaitAll(); - val_acc.Update(data_batch.label, exec->outputs[0]); - } - LG << "Train Accuracy: " << train_acc.Get(); - LG << "Validation Accuracy: " << val_acc.Get(); - } - delete exec; - delete opt; - MXNotifyShutdown(); - CATCH - return 0; -} diff --git a/cpp-package/example/lenet.cpp b/cpp-package/example/lenet.cpp deleted file mode 100644 index 3e34dbb486ab..000000000000 --- a/cpp-package/example/lenet.cpp +++ /dev/null @@ -1,267 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - */ -#include -#include -#include -#include -#include -#include "mxnet-cpp/MxNetCpp.h" -#include "utils.h" - -using namespace mxnet::cpp; - -class Lenet { - public: - Lenet() - : ctx_cpu(Context(DeviceType::kCPU, 0)), -#if MXNET_USE_CPU - ctx_dev(Context(DeviceType::kCPU, 0)) -#else - ctx_dev(Context(DeviceType::kGPU, 0)) -#endif - {} - - void Run(int max_epoch) { - /* - * LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick Haffner. - * "Gradient-based learning applied to document recognition." - * Proceedings of the IEEE (1998) - * */ - - /*define the symbolic net*/ - Symbol data = Symbol::Variable("data"); - Symbol data_label = Symbol::Variable("data_label"); - Symbol conv1_w("conv1_w"), conv1_b("conv1_b"); - Symbol conv2_w("conv2_w"), conv2_b("conv2_b"); - Symbol conv3_w("conv3_w"), conv3_b("conv3_b"); - Symbol fc1_w("fc1_w"), fc1_b("fc1_b"); - Symbol fc2_w("fc2_w"), fc2_b("fc2_b"); - - Symbol conv1 = - Convolution("conv1", data, conv1_w, conv1_b, Shape(5, 5), 20); - Symbol tanh1 = Activation("tanh1", conv1, ActivationActType::kTanh); - Symbol pool1 = Pooling("pool1", tanh1, Shape(2, 2), PoolingPoolType::kMax, - false, false, PoolingPoolingConvention::kValid, Shape(2, 2)); - - Symbol conv2 = Convolution("conv2", pool1, conv2_w, conv2_b, - Shape(5, 5), 50); - Symbol tanh2 = Activation("tanh2", conv2, ActivationActType::kTanh); - Symbol pool2 = Pooling("pool2", tanh2, Shape(2, 2), PoolingPoolType::kMax, - false, false, PoolingPoolingConvention::kValid, Shape(2, 2)); - - Symbol conv3 = Convolution("conv3", pool2, conv3_w, conv3_b, - Shape(2, 2), 500); - Symbol tanh3 = Activation("tanh3", conv3, ActivationActType::kTanh); - Symbol pool3 = Pooling("pool3", tanh3, Shape(2, 2), PoolingPoolType::kMax, - false, false, PoolingPoolingConvention::kValid, Shape(1, 1)); - - Symbol flatten = Flatten("flatten", pool3); - Symbol fc1 = FullyConnected("fc1", flatten, fc1_w, fc1_b, 500); - Symbol tanh4 = Activation("tanh4", fc1, ActivationActType::kTanh); - Symbol fc2 = FullyConnected("fc2", tanh4, fc2_w, fc2_b, 10); - - Symbol lenet = SoftmaxOutput("softmax", fc2, data_label); - - for (auto s : lenet.ListArguments()) { - LG << s; - } - - /*setup basic configs*/ - int val_fold = 1; - int W = 28; - int H = 28; - int batch_size = 42; - float learning_rate = 1e-4; - float weight_decay = 1e-4; - - /*prepare the data*/ - std::vector data_vec, label_vec; - size_t data_count = GetData(&data_vec, &label_vec); - const float *dptr = data_vec.data(); - const float *lptr = label_vec.data(); - NDArray data_array = NDArray(Shape(data_count, 1, W, H), ctx_cpu, - false); // store in main memory, and copy to - // device memory while training - NDArray label_array = - NDArray(Shape(data_count), ctx_cpu, - false); // it's also ok if just store them all in device memory - data_array.SyncCopyFromCPU(dptr, data_count * W * H); - label_array.SyncCopyFromCPU(lptr, data_count); - data_array.WaitToRead(); - label_array.WaitToRead(); - - size_t train_num = data_count * (1 - val_fold / 10.0); - train_data = data_array.Slice(0, train_num); - train_label = label_array.Slice(0, train_num); - val_data = 
data_array.Slice(train_num, data_count); - val_label = label_array.Slice(train_num, data_count); - - LG << "here read fin"; - - /*init some of the args*/ - // map args_map; - args_map["data"] = data_array.Slice(0, batch_size).Copy(ctx_dev); - args_map["data_label"] = label_array.Slice(0, batch_size).Copy(ctx_dev); - NDArray::WaitAll(); - - LG << "here slice fin"; - /* - * we can also feed in some of the args other than the input all by - * ourselves, - * fc2-w , fc1-b for example: - * */ - // args_map["fc2_w"] = - // NDArray(mshadow::Shape2(500, 4 * 4 * 50), ctx_dev, false); - // NDArray::SampleGaussian(0, 1, &args_map["fc2_w"]); - // args_map["fc1_b"] = NDArray(mshadow::Shape1(10), ctx_dev, false); - // args_map["fc1_b"] = 0; - - lenet.InferArgsMap(ctx_dev, &args_map, args_map); - Optimizer* opt = OptimizerRegistry::Find("sgd"); - opt->SetParam("momentum", 0.9) - ->SetParam("rescale_grad", 1.0) - ->SetParam("clip_gradient", 10) - ->SetParam("lr", learning_rate) - ->SetParam("wd", weight_decay); - - Executor *exe = lenet.SimpleBind(ctx_dev, args_map); - auto arg_names = lenet.ListArguments(); - - for (int ITER = 0; ITER < max_epoch; ++ITER) { - size_t start_index = 0; - while (start_index < train_num) { - if (start_index + batch_size > train_num) { - start_index = train_num - batch_size; - } - args_map["data"] = - train_data.Slice(start_index, start_index + batch_size) - .Copy(ctx_dev); - args_map["data_label"] = - train_label.Slice(start_index, start_index + batch_size) - .Copy(ctx_dev); - start_index += batch_size; - NDArray::WaitAll(); - - exe->Forward(true); - exe->Backward(); - // Update parameters - for (size_t i = 0; i < arg_names.size(); ++i) { - if (arg_names[i] == "data" || arg_names[i] == "data_label") continue; - opt->Update(i, exe->arg_arrays[i], exe->grad_arrays[i]); - } - } - - LG << "Iter " << ITER - << ", accuracy: " << ValAccuracy(batch_size * 10, lenet); - } - delete exe; - delete opt; - } - - private: - Context ctx_cpu; - Context ctx_dev; - std::map args_map; - NDArray train_data; - NDArray train_label; - NDArray val_data; - NDArray val_label; - - size_t GetData(std::vector *data, std::vector *label) { - const char *train_data_path = "./data/mnist_data/mnist_train.csv"; - std::ifstream inf(train_data_path); - std::string line; - inf >> line; // ignore the header - size_t _N = 0; - while (inf >> line) { - for (auto &c : line) c = (c == ',') ? 
' ' : c; - std::stringstream ss; - ss << line; - float _data; - ss >> _data; - label->push_back(_data); - while (ss >> _data) data->push_back(_data / 256.0); - _N++; - } - inf.close(); - return _N; - } - - float ValAccuracy(int batch_size, Symbol lenet) { - size_t val_num = val_data.GetShape()[0]; - - size_t correct_count = 0; - size_t all_count = 0; - - size_t start_index = 0; - while (start_index < val_num) { - if (start_index + batch_size > val_num) { - start_index = val_num - batch_size; - } - args_map["data"] = - val_data.Slice(start_index, start_index + batch_size).Copy(ctx_dev); - args_map["data_label"] = - val_label.Slice(start_index, start_index + batch_size).Copy(ctx_dev); - start_index += batch_size; - NDArray::WaitAll(); - - Executor *exe = lenet.SimpleBind(ctx_dev, args_map); - exe->Forward(false); - - const auto &out = exe->outputs; - NDArray out_cpu = out[0].Copy(ctx_cpu); - NDArray label_cpu = - val_label.Slice(start_index - batch_size, start_index).Copy(ctx_cpu); - - NDArray::WaitAll(); - - const mx_float *dptr_out = out_cpu.GetData(); - const mx_float *dptr_label = label_cpu.GetData(); - for (int i = 0; i < batch_size; ++i) { - float label = dptr_label[i]; - int cat_num = out_cpu.GetShape()[1]; - float p_label = 0, max_p = dptr_out[i * cat_num]; - for (int j = 0; j < cat_num; ++j) { - float p = dptr_out[i * cat_num + j]; - if (max_p < p) { - p_label = j; - max_p = p; - } - } - if (label == p_label) correct_count++; - } - all_count += batch_size; - - delete exe; - } - return correct_count * 1.0 / all_count; - } -}; - -int main(int argc, char const *argv[]) { - TRY - Lenet lenet; - lenet.Run(argc > 1 ? strtol(argv[1], nullptr, 10) : 100000); - MXNotifyShutdown(); - CATCH - return 0; -} diff --git a/cpp-package/example/lenet_with_mxdataiter.cpp b/cpp-package/example/lenet_with_mxdataiter.cpp deleted file mode 100644 index 6b37693cda59..000000000000 --- a/cpp-package/example/lenet_with_mxdataiter.cpp +++ /dev/null @@ -1,203 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - */ -#include -#include -#include -#include -#include -#include -#include "utils.h" -#include "mxnet-cpp/MxNetCpp.h" - -using namespace mxnet::cpp; - -Symbol LenetSymbol() { - /* - * LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick Haffner. - * "Gradient-based learning applied to document recognition." 
- * Proceedings of the IEEE (1998) - * */ - - /*define the symbolic net*/ - Symbol data = Symbol::Variable("data"); - Symbol data_label = Symbol::Variable("data_label"); - Symbol conv1_w("conv1_w"), conv1_b("conv1_b"); - Symbol conv2_w("conv2_w"), conv2_b("conv2_b"); - Symbol conv3_w("conv3_w"), conv3_b("conv3_b"); - Symbol fc1_w("fc1_w"), fc1_b("fc1_b"); - Symbol fc2_w("fc2_w"), fc2_b("fc2_b"); - - Symbol conv1 = Convolution("conv1", data, conv1_w, conv1_b, Shape(5, 5), 20); - Symbol tanh1 = Activation("tanh1", conv1, ActivationActType::kTanh); - Symbol pool1 = Pooling("pool1", tanh1, Shape(2, 2), PoolingPoolType::kMax, - false, false, PoolingPoolingConvention::kValid, Shape(2, 2)); - - Symbol conv2 = Convolution("conv2", pool1, conv2_w, conv2_b, Shape(5, 5), 50); - Symbol tanh2 = Activation("tanh2", conv2, ActivationActType::kTanh); - Symbol pool2 = Pooling("pool2", tanh2, Shape(2, 2), PoolingPoolType::kMax, - false, false, PoolingPoolingConvention::kValid, Shape(2, 2)); - - Symbol flatten = Flatten("flatten", pool2); - Symbol fc1 = FullyConnected("fc1", flatten, fc1_w, fc1_b, 500); - Symbol tanh3 = Activation("tanh3", fc1, ActivationActType::kTanh); - Symbol fc2 = FullyConnected("fc2", tanh3, fc2_w, fc2_b, 10); - - Symbol lenet = SoftmaxOutput("softmax", fc2, data_label); - - return lenet; -} - -NDArray ResizeInput(NDArray data, const Shape new_shape) { - NDArray pic = data.Reshape(Shape(0, 1, 28, 28)); - NDArray output; - Operator("_contrib_BilinearResize2D") - .SetParam("height", new_shape[2]) - .SetParam("width", new_shape[3]) - (pic).Invoke(output); - return output; -} - -int main(int argc, char const *argv[]) { - /*setup basic configs*/ - int W = 28; - int H = 28; - int batch_size = 128; - int max_epoch = argc > 1 ? strtol(argv[1], nullptr, 10) : 100; - float learning_rate = 1e-4; - float weight_decay = 1e-4; - - auto dev_ctx = Context::cpu(); - int num_gpu; - MXGetGPUCount(&num_gpu); -#if !MXNET_USE_CPU - if (num_gpu > 0) { - dev_ctx = Context::gpu(); - } -#endif - - TRY - auto lenet = LenetSymbol(); - std::map args_map; - - const Shape data_shape = Shape(batch_size, 1, H, W), - label_shape = Shape(batch_size); - args_map["data"] = NDArray(data_shape, dev_ctx); - args_map["data_label"] = NDArray(label_shape, dev_ctx); - lenet.InferArgsMap(dev_ctx, &args_map, args_map); - - args_map["fc1_w"] = NDArray(Shape(500, 4 * 4 * 50), dev_ctx); - NDArray::SampleGaussian(0, 1, &args_map["fc1_w"]); - args_map["fc2_b"] = NDArray(Shape(10), dev_ctx); - args_map["fc2_b"] = 0; - - std::vector data_files = { "./data/mnist_data/train-images-idx3-ubyte", - "./data/mnist_data/train-labels-idx1-ubyte", - "./data/mnist_data/t10k-images-idx3-ubyte", - "./data/mnist_data/t10k-labels-idx1-ubyte" - }; - - auto train_iter = MXDataIter("MNISTIter"); - if (!setDataIter(&train_iter, "Train", data_files, batch_size)) { - return 1; - } - - auto val_iter = MXDataIter("MNISTIter"); - if (!setDataIter(&val_iter, "Label", data_files, batch_size)) { - return 1; - } - - Optimizer* opt = OptimizerRegistry::Find("sgd"); - opt->SetParam("momentum", 0.9) - ->SetParam("rescale_grad", 1.0) - ->SetParam("clip_gradient", 10) - ->SetParam("lr", learning_rate) - ->SetParam("wd", weight_decay); - - - auto *exec = lenet.SimpleBind(dev_ctx, args_map); - auto arg_names = lenet.ListArguments(); - - // Create metrics - Accuracy train_acc, val_acc; - - for (int iter = 0; iter < max_epoch; ++iter) { - int samples = 0; - train_iter.Reset(); - train_acc.Reset(); - - auto tic = std::chrono::system_clock::now(); - - while 
-      samples += batch_size;
-      auto data_batch = train_iter.GetDataBatch();
-
-      ResizeInput(data_batch.data, data_shape).CopyTo(&args_map["data"]);
-      data_batch.label.CopyTo(&args_map["data_label"]);
-      NDArray::WaitAll();
-
-      // Compute gradients
-      exec->Forward(true);
-      exec->Backward();
-
-      // Update parameters
-      for (size_t i = 0; i < arg_names.size(); ++i) {
-        if (arg_names[i] == "data" || arg_names[i] == "data_label") continue;
-        opt->Update(i, exec->arg_arrays[i], exec->grad_arrays[i]);
-      }
-
-      // Update metric
-      train_acc.Update(data_batch.label, exec->outputs[0]);
-    }
-
-    // one epoch of training is finished
-    auto toc = std::chrono::system_clock::now();
-    float duration = std::chrono::duration_cast
-        (toc - tic).count() / 1000.0;
-    LG << "Epoch[" << iter << "] " << samples / duration \
-       << " samples/sec " << "Train-Accuracy=" << train_acc.Get();;
-
-    val_iter.Reset();
-    val_acc.Reset();
-
-    Accuracy acu;
-    val_iter.Reset();
-    while (val_iter.Next()) {
-      auto data_batch = val_iter.GetDataBatch();
-      ResizeInput(data_batch.data, data_shape).CopyTo(&args_map["data"]);
-      data_batch.label.CopyTo(&args_map["data_label"]);
-      NDArray::WaitAll();
-
-      // Only forward pass is enough as no gradient is needed when evaluating
-      exec->Forward(false);
-      NDArray::WaitAll();
-      acu.Update(data_batch.label, exec->outputs[0]);
-      val_acc.Update(data_batch.label, exec->outputs[0]);
-    }
-    LG << "Epoch[" << iter << "] Val-Accuracy=" << val_acc.Get();
-  }
-
-  delete exec;
-  delete opt;
-  MXNotifyShutdown();
-  CATCH
-  return 0;
-}
diff --git a/cpp-package/example/mlp.cpp b/cpp-package/example/mlp.cpp
deleted file mode 100644
index 970dad74e727..000000000000
--- a/cpp-package/example/mlp.cpp
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- */
-
-#include
-#include
-#include
-#include "mxnet-cpp/MxNetCpp.h"
-#include "utils.h"
-
-using namespace mxnet::cpp;
-
-/*
- * In this example,
- * we make by hand some data in 10 classes with some pattern
- * and try to use MLP to recognize the pattern.
- */
-
-void OutputAccuracy(mx_float* pred, mx_float* target) {
-  int right = 0;
-  for (int i = 0; i < 128; ++i) {
-    float mx_p = pred[i * 10 + 0];
-    float p_y = 0;
-    for (int j = 0; j < 10; ++j) {
-      if (pred[i * 10 + j] > mx_p) {
-        mx_p = pred[i * 10 + j];
-        p_y = j;
-      }
-    }
-    if (p_y == target[i]) right++;
-  }
-  std::cout << "Accuracy: " << right / 128.0 << std::endl;
-}
-
-void MLP(int max_epoch) {
-  auto sym_x = Symbol::Variable("X");
-  auto sym_label = Symbol::Variable("label");
-
-  const int nLayers = 2;
-  std::vector layerSizes({512, 10});
-  std::vector weights(nLayers);
-  std::vector biases(nLayers);
-  std::vector outputs(nLayers);
-
-  Symbol null_sym;
-  for (int i = 0; i < nLayers; i++) {
-    std::string istr = std::to_string(i);
-    weights[i] = Symbol::Variable(std::string("w") + istr);
-    biases[i] = Symbol::Variable(std::string("b") + istr);
-    Symbol fc = FullyConnected(std::string("fc") + istr,
-                               i == 0? sym_x : outputs[i-1],
-                               weights[i], biases[i], layerSizes[i]);
-    outputs[i] = LeakyReLU(std::string("act") + istr, fc, null_sym, LeakyReLUActType::kLeaky);
-  }
-  auto sym_out = SoftmaxOutput("softmax", outputs[nLayers - 1], sym_label);
-
-  Context ctx_dev(DeviceType::kCPU, 0);
-
-  NDArray array_x(Shape(128, 28), ctx_dev, false);
-  NDArray array_y(Shape(128), ctx_dev, false);
-
-  mx_float* aptr_x = new mx_float[128 * 28];
-  mx_float* aptr_y = new mx_float[128];
-
-  // we make the data by hand, in 10 classes, with some pattern
-  for (int i = 0; i < 128; i++) {
-    for (int j = 0; j < 28; j++) {
-      aptr_x[i * 28 + j] = i % 10 * 1.0f;
-    }
-    aptr_y[i] = i % 10;
-  }
-  array_x.SyncCopyFromCPU(aptr_x, 128 * 28);
-  array_x.WaitToRead();
-  array_y.SyncCopyFromCPU(aptr_y, 128);
-  array_y.WaitToRead();
-
-  // init the parameters
-  NDArray array_w_1(Shape(512, 28), ctx_dev, false);
-  NDArray array_b_1(Shape(512), ctx_dev, false);
-  NDArray array_w_2(Shape(10, 512), ctx_dev, false);
-  NDArray array_b_2(Shape(10), ctx_dev, false);
-
-  // the parameters should be initialized in some kind of distribution,
-  // so it learns fast
-  // but here just give a const value by hand
-  array_w_1 = 0.5f;
-  array_b_1 = 0.0f;
-  array_w_2 = 0.5f;
-  array_b_2 = 0.0f;
-
-  // the grads
-  NDArray array_w_1_g(Shape(512, 28), ctx_dev, false);
-  NDArray array_b_1_g(Shape(512), ctx_dev, false);
-  NDArray array_w_2_g(Shape(10, 512), ctx_dev, false);
-  NDArray array_b_2_g(Shape(10), ctx_dev, false);
-
-  // Bind the symolic network with the ndarray
-  // all the input args
-  std::vector in_args;
-  in_args.push_back(array_x);
-  in_args.push_back(array_w_1);
-  in_args.push_back(array_b_1);
-  in_args.push_back(array_w_2);
-  in_args.push_back(array_b_2);
-  in_args.push_back(array_y);
-  // all the grads
-  std::vector arg_grad_store;
-  arg_grad_store.push_back(NDArray());  // we don't need the grad of the input
-  arg_grad_store.push_back(array_w_1_g);
-  arg_grad_store.push_back(array_b_1_g);
-  arg_grad_store.push_back(array_w_2_g);
-  arg_grad_store.push_back(array_b_2_g);
-  arg_grad_store.push_back(
-      NDArray());  // neither do we need the grad of the loss
-  // how to handle the grad
-  std::vector grad_req_type;
-  grad_req_type.push_back(kNullOp);
-  grad_req_type.push_back(kWriteTo);
-  grad_req_type.push_back(kWriteTo);
-  grad_req_type.push_back(kWriteTo);
-  grad_req_type.push_back(kWriteTo);
-  grad_req_type.push_back(kNullOp);
-  std::vector aux_states;
-
-  std::cout << "make the Executor" << std::endl;
-  Executor* exe = new Executor(sym_out, ctx_dev, in_args, arg_grad_store,
-                               grad_req_type, aux_states);
-
-  std::cout << "Training" << std::endl;
-  mx_float learning_rate = 0.0001;
-  for (int epoch_num = 0; epoch_num < max_epoch; ++epoch_num) {
-    exe->Forward(true);
-    // print accuracy every 100 epoch
-    if (epoch_num % 100 == 0) {
-      std::cout << "epoch " << epoch_num << std::endl;
-      std::vector& out = exe->outputs;
-      float* cptr = new float[128 * 10];
-      out[0].SyncCopyToCPU(cptr, 128 * 10);
-      NDArray::WaitAll();
-      OutputAccuracy(cptr, aptr_y);
-      delete[] cptr;
-    }
-
-    // update the parameters
-    exe->Backward();
-    for (int i = 1; i < 5; ++i) {
-      in_args[i] -= arg_grad_store[i] * learning_rate;
-    }
-    NDArray::WaitAll();
-  }
-
-  delete exe;
-  delete[] aptr_x;
-  delete[] aptr_y;
-}
-
-int main(int argc, char** argv) {
-  int max_epoch = argc > 1 ? strtol(argv[1], nullptr, 10) : 15000;
-  TRY
-  MLP(max_epoch);
-  MXNotifyShutdown();
-  CATCH
-  return 0;
-}
diff --git a/cpp-package/example/mlp_cpu.cpp b/cpp-package/example/mlp_cpu.cpp
deleted file mode 100644
index 7ea6946dd8c2..000000000000
--- a/cpp-package/example/mlp_cpu.cpp
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Xin Li yakumolx@gmail.com
- */
-#include
-#include "utils.h"
-#include "mxnet-cpp/MxNetCpp.h"
-
-using namespace mxnet::cpp;
-
-Symbol mlp(const std::vector &layers) {
-  auto x = Symbol::Variable("X");
-  auto label = Symbol::Variable("label");
-
-  std::vector weights(layers.size());
-  std::vector biases(layers.size());
-  std::vector outputs(layers.size());
-
-  for (size_t i = 0; i < layers.size(); ++i) {
-    weights[i] = Symbol::Variable("w" + std::to_string(i));
-    biases[i] = Symbol::Variable("b" + std::to_string(i));
-    Symbol fc = FullyConnected(
-        i == 0? x : outputs[i-1],  // data
-        weights[i],
-        biases[i],
-        layers[i]);
-    outputs[i] = i == layers.size()-1 ? fc : Activation(fc, ActivationActType::kRelu);
-  }
-
-  return SoftmaxOutput(outputs.back(), label);
-}
-
-int main(int argc, char** argv) {
-  const int image_size = 28;
-  const std::vector layers{128, 64, 10};
-  const int batch_size = 100;
-  const int max_epoch = 10;
-  const float learning_rate = 0.1;
-  const float weight_decay = 1e-2;
-
-  std::vector data_files = { "./data/mnist_data/train-images-idx3-ubyte",
-                             "./data/mnist_data/train-labels-idx1-ubyte",
-                             "./data/mnist_data/t10k-images-idx3-ubyte",
-                             "./data/mnist_data/t10k-labels-idx1-ubyte"
-                           };
-
-  auto train_iter = MXDataIter("MNISTIter");
-  if (!setDataIter(&train_iter, "Train", data_files, batch_size)) {
-    return 1;
-  }
-
-  auto val_iter = MXDataIter("MNISTIter");
-  if (!setDataIter(&val_iter, "Label", data_files, batch_size)) {
-    return 1;
-  }
-
-  TRY
-  auto net = mlp(layers);
-
-  Context ctx = Context::cpu();  // Use CPU for training
-
-  std::map args;
-  args["X"] = NDArray(Shape(batch_size, image_size*image_size), ctx);
-  args["label"] = NDArray(Shape(batch_size), ctx);
-  // Let MXNet infer shapes other parameters such as weights
-  net.InferArgsMap(ctx, &args, args);
-
-  // Initialize all parameters with uniform distribution U(-0.01, 0.01)
-  auto initializer = Uniform(0.01);
-  for (auto& arg : args) {
-    // arg.first is parameter name, and arg.second is the value
-    initializer(arg.first, &arg.second);
-  }
-
-  // Create sgd optimizer
-  Optimizer* opt = OptimizerRegistry::Find("sgd");
-  opt->SetParam("rescale_grad", 1.0/batch_size)
-     ->SetParam("lr", learning_rate)
-     ->SetParam("wd", weight_decay);
-
-  // Create executor by binding parameters to the model
-  auto *exec = net.SimpleBind(ctx, args);
-  auto arg_names = net.ListArguments();
-
-  // Start training
-  for (int iter = 0; iter < max_epoch; ++iter) {
-    int samples = 0;
-    train_iter.Reset();
-
-    auto tic = std::chrono::system_clock::now();
-    while (train_iter.Next()) {
-      samples += batch_size;
-      auto data_batch = train_iter.GetDataBatch();
-      // Set data and label
-      data_batch.data.CopyTo(&args["X"]);
-      data_batch.label.CopyTo(&args["label"]);
-
-      // Compute gradients
-      exec->Forward(true);
-      exec->Backward();
-      // Update parameters
-      for (size_t i = 0; i < arg_names.size(); ++i) {
-        if (arg_names[i] == "X" || arg_names[i] == "label") continue;
-        opt->Update(i, exec->arg_arrays[i], exec->grad_arrays[i]);
-      }
-    }
-    auto toc = std::chrono::system_clock::now();
-
-    Accuracy acc;
-    val_iter.Reset();
-    while (val_iter.Next()) {
-      auto data_batch = val_iter.GetDataBatch();
-      data_batch.data.CopyTo(&args["X"]);
-      data_batch.label.CopyTo(&args["label"]);
-      // Forward pass is enough as no gradient is needed when evaluating
-      exec->Forward(false);
-      acc.Update(data_batch.label, exec->outputs[0]);
-    }
-    float duration = std::chrono::duration_cast
-        (toc - tic).count() / 1000.0;
-    LG << "Epoch: " << iter << " " << samples/duration << " samples/sec Accuracy: " << acc.Get();
-  }
-
-  delete exec;
-  delete opt;
-  MXNotifyShutdown();
-  CATCH
-  return 0;
-}
diff --git a/cpp-package/example/mlp_csv.cpp b/cpp-package/example/mlp_csv.cpp
deleted file mode 100644
index 8db6638a90d3..000000000000
--- a/cpp-package/example/mlp_csv.cpp
+++ /dev/null
@@ -1,276 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
- * Example: mlp_csv
- * Description:
- * The following example demonstrates how to use CSVIter. This example creates
- * mlp (multi-layer perceptron) model and trains the MNIST data which is in
- * CSV format.
- */
-#include
-#include
-#include "utils.h"
-#include "mxnet-cpp/MxNetCpp.h"
-
-using namespace mxnet::cpp;
-
-/*
- * Implementing the mlp symbol with given hidden units configuration.
- */
-Symbol mlp(const std::vector &hidden_units) {
-  auto data = Symbol::Variable("data");
-  auto label = Symbol::Variable("label");
-
-  std::vector weights(hidden_units.size());
-  std::vector biases(hidden_units.size());
-  std::vector outputs(hidden_units.size());
-
-  for (size_t i = 0; i < hidden_units.size(); ++i) {
-    weights[i] = Symbol::Variable("w" + std::to_string(i));
-    biases[i] = Symbol::Variable("b" + std::to_string(i));
-    Symbol fc = FullyConnected(
-        i == 0? data : outputs[i-1],  // data
-        weights[i],
-        biases[i],
-        hidden_units[i]);
-    outputs[i] = i == hidden_units.size()-1 ? fc : Activation(fc, ActivationActType::kRelu);
-  }
-  return SoftmaxOutput(outputs.back(), label);
-}
-
-/*
- * Convert the input string of number of hidden units into the vector of integers.
- */
-std::vector getLayers(const std::string& hidden_units_string) {
-  std::vector hidden_units;
-  char *pNext;
-  int num_unit = strtol(hidden_units_string.c_str(), &pNext, 10);
-  hidden_units.push_back(num_unit);
-  while (*pNext) {
-    num_unit = strtol(pNext, &pNext, 10);
-    hidden_units.push_back(num_unit);
-  }
-  return hidden_units;
-}
-
-void printUsage() {
-  std::cout << "Usage:" << std::endl;
-  std::cout << "mlp_csv --train mnist_training_set.csv --test mnist_test_set.csv --epochs 10 "
-            << "--batch_size 100 --hidden_units \"128 64 64\" --gpu" << std::endl;
-  std::cout << "The example uses mnist data in CSV format. The MNIST data in CSV format assumes "
-            << "the column 0 to be label and the rest 784 column to be data." << std::endl;
-  std::cout << "By default, the example uses 'cpu' context. If '--gpu' is specified, "
-            << "program uses 'gpu' context." <